# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece evaluate sacrebleu bert-score peft

In [None]:
!conda update -n base -c conda-forge conda -y

In [None]:
!conda install conda-forge::rouge-score -y

In [5]:
import torch
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
from IPython.display import display
from tqdm.notebook import tqdm
from datasets import load_from_disk, Dataset
from torch.utils.data import DataLoader, Dataset as tDataset
from transformers import (
    logging,
    AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoModelForMaskedLM,
    Trainer, TrainingArguments,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model
from evaluate import load

2025-02-08 19:43:00.858188: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-08 19:43:03.877668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-08 19:43:04.326385: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-08 19:43:04.328190: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-08 19:43:05.131920: I tensorflow/core/platform/cpu_feature_gua

# Set GPU

## Mac

In [6]:
# for mac
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        details = tf.config.experimental.get_device_details(gpu)
        print("GPU details: ", details)
else:
    print("No GPU found. Using CPU.")

# set GPU device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

2025-02-08 18:52:13.478109: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-08 18:52:25.442651: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-08 18:52:25.445253: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'compute_capability': (7, 5), 'device_name': 'Tesla T4'}
Using device: cpu


## Window / Linux

In [6]:
# for window
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

2025-02-08 19:43:56.112952: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-08 19:44:07.939907: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-08 19:44:07.942270: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Tensorflow GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using PyTorch device: cuda
GPU Name: Tesla T4


# Class

In [7]:
# Custom dataset class for batching
class TextDataset(tDataset):
    def __init__(self, texts):
        self.texts = [str(text) if text is not None else "" for text in texts] 

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

# Functions

In [8]:
# function to save model-variants df
def save_model_variants_df(df, df_name):
    df.to_csv(f"model-variants/{df_name}.csv", index=False, encoding="utf-8")

In [9]:
def save_model_variants_df_arrow(df, df_name): 
    df.save_to_disk(f"model-variants/{df_name}_hf_dataset")

In [10]:
# function to load gen df
def load_gen_df(df_name):
    return pd.read_csv(f"gen/{df_name}.csv", header=0, encoding="utf-8")

In [11]:
# function to load model-variants df
def load_model_variants_df(df_name):
    return pd.read_csv(f"model-variants/{df_name}.csv", header=0, encoding="utf-8")

In [12]:
# function to load model-variants df with arrow
def load_model_variants_df_arrow(df_name):
    return load_from_disk(f"model-variants/{df_name}_hf_dataset")

# Set settings

In [13]:
tqdm.pandas()

In [14]:
# Suppress specific warnings from the transformers library
logging.set_verbosity_error()

# Fine-Tuning Transformer Models for Burmese
This notebook fine-tunes three transformer models:
- mBERT (Multilingual BERT)
- mT5 (Multilingual T5)
- XLM-RoBERTa

Apply:
- Sentence-Piece Tokenization for Burmese segmentation
- LoRA for efficient fine-tuning
- Prefix-Tuning for lightweight adaptations
- Mixed Precision Training for speed improvements

In [15]:
# spt models
spt_models = {
    "bpe": spm.SentencePieceProcessor("spt/spt_bpe.model"),
    #"unigram": spm.SentencePieceProcessor("spt/spt_unigram.model"),
}

In [16]:
# model names
train_model_names = {
    "mBERT": "bert-base-multilingual-cased",
    #"mT5": "google/mt5-small",
    #"XLM-R": "xlm-roberta-base"
}

In [17]:
# train tokenizers
train_tokenizers = {
    "mBERT": AutoTokenizer.from_pretrained(train_model_names["mBERT"]),
    #"mT5": AutoTokenizer.from_pretrained(train_model_names["mT5"], use_fast=False, legacy=True),
    #"XLM-R": AutoTokenizer.from_pretrained(train_model_names["XLM-R"])
}

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

## Data Preprocessing
Datasets used for training:
- myXNLI & ALT Corpus (normalized)
- Back-translated datasets (NLLB, Seamless M4T)
- Pseudo-parallel datasets (MiniLM, LaBSE)

### Data Preparation

In [14]:
# Load and process dataset
def load_and_rename_columns_multilingual(file_name):
    df = load_gen_df(f"{file_name}")

    column_mapping = {
        "english": "source",
        "burmese": "target",
        "english_back_translated": "source",
        "burmese_translated": "target",
    }
    
    df = df.rename(columns=column_mapping)
    
    # Ensure only required columns exist
    df = df[["source", "target"]]

    return df

In [15]:
# datasets
datasets = {
    "normal": [
        "myxnli_normalized_1", 
        "myxnli_normalized_2", 
        "alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "myxnli_nllb_back_translated_final_1", 
        "myxnli_nllb_back_translated_final_2", 
        "alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "myxnli_seamless_m4t_back_translated_final_1", 
        "myxnli_seamless_m4t_back_translated_final_2", 
        "alt_combined_seamless_m4t_back_translated_final"
    ]
}

In [16]:
# Load and process datasets
loaded_datasets = {}
for key, file_list in datasets.items():
    loaded_datasets[key] = [load_and_rename_columns_multilingual(file) for file in file_list]

In [17]:
# combine all datasets
combined = pd.concat(
    loaded_datasets["normal"] + 
    loaded_datasets["nllb_back_translated"] + 
    loaded_datasets["seamless_m4t_back_translated"],
    ignore_index=True
)

In [18]:
# Shuffle the data to prevent order bias
combined = combined.sample(frac=1).reset_index(drop=True)

In [19]:
# display combined dataset
display(combined.head())

Unnamed: 0,source,target
0,archaeologists think that a fire broke out in ...,ရှေးဟောင်းသုတေသီတွေက Knossos မှာ မီးလောင်တာ BC...
1,there are political meetings in every neighbor...,ရပ်ကွက်တိုင်းမှာ နိုင်ငံရေး အစည်းအဝေးတွေရှိတယ်။
2,the lawyer said that in article 712 (1) gao wa...,ရှေ့နေက ပုဒ်မ ၇၁၂ (၁) မှာ Gao ကို ငွေကြေးဆိုင်...
3,things can get confusing when talking about do...,Dordogne အကြောင်းပြောသောအခါ၊ ဝေးကွာသောနေရာများ...
4,making financial management a top priority acr...,ဘဏ္ဍာရေး စီမံခန့်ခွဲမှုကို ပြည်ထောင်စု အစိုးရတ...


In [23]:
# print length
print(f"Combined dataset length: {len(combined)}")

Combined dataset length: 1627576


In [20]:
# save data
save_model_variants_df(combined, "combined")

### Tokenize

In [14]:
#  function to tokenize Burmese text using the selected SentencePiece model before applying Transformer tokenization.
def tokenize(examples, tokenizer, spt_tokenizer, model_name):
    spt_burmese = [spt_tokenizer.encode_as_pieces(text) for text in examples["target"]]
    examples["target"] = [" ".join(tokens) for tokens in spt_burmese]

    if "t5" in model_name:  # mT5: text-to-text format
        return tokenizer(
            examples["source"], 
            padding="max_length", 
            truncation=True, 
            max_length=512,
        )
    
    # BERT-based models: Masked/Causal LM
    return tokenizer(
        examples["source"],
        examples["target"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

In [None]:
# tokenize for each model and spt
for model_name, tokenizer in train_tokenizers.items():
    for spt_name, spt_tokenizer in spt_models.items():
        dataset = load_model_variants_df("combined")

        # Convert to Hugging Face Dataset
        dataset = Dataset.from_pandas(dataset)

        # apply tokenize
        dataset = dataset.map(
            lambda x, _: tokenize(x, tokenizer, spt_tokenizer, model_name),
            batched=True,
            desc=f"Tokenizing dataset for {model_name} with {spt_name}",
            with_indices=True,  # Passing index as a second argument
            num_proc=10
        )

        # save
        save_model_variants_df_arrow(dataset, f"{model_name.lower()}_{spt_name}")

Tokenizing dataset for mT5 with bpe (num_proc=10):   0%|          | 0/1627576 [00:00<?, ? examples/s]

## Fine Tuning
Fine-tuning for:
- mBERT (best perplexity, but weak BLEU/ROUGE)
- mT5 (best for generation, but requires more data)
- XLM-R (good BLEU/ROUGE, but poor perplexity)

In [28]:
# train models
train_models = {
    "mBERT": AutoModelForMaskedLM.from_pretrained(train_model_names["mBERT"], num_labels=1).to(device),
    #"mT5": AutoModelForSeq2SeqLM.from_pretrained(train_model_names["mT5"]).to(device),
    #"XLM-R": AutoModelForSequenceClassification.from_pretrained(train_model_names["XLM-R"], num_labels=1).to(device)
}

In [19]:
tokenized_datasets = {
    model_name: {
        spt_name: load_model_variants_df_arrow(f"{model_name.lower()}_{spt_name}")
        for spt_name in spt_models.keys()
    }
    for model_name in train_tokenizers.keys()
}

In [20]:
# Training Hyperparameters
train_args = {
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 2,
    "learning_rate": 3e-5,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "save_strategy": "epoch",
    "save_total_limit": 3,
    "fp16": True,  # Mixed Precision Training
    "evaluation_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,  # Lower loss is better
    "logging_dir": "./logs",
    "logging_steps": 100,
    "optim": "adamw_torch_fused",  # Optimized for GPU
    "use_cpu": False if torch.cuda.is_available() else True
}

In [21]:
def apply_lora(model):
    """
    LoRA reduces memory and computational costs.
    It fine-tunes only attention layers instead of the whole model.
    """
    lora_config = LoraConfig(
        r=8, lora_alpha=16, target_modules=["query", "value"], lora_dropout=0.1
    )
    return get_peft_model(model, lora_config)

In [22]:
# function to fine tune model
def fine_tune_model(model_name, spt_name):
    print(f"Fine-tuning {model_name} on using SPT-{spt_name.upper()}...")

    # Load model and tokenizer
    tokenizer = train_tokenizers[model_name]
    model = train_models[model_name]

    # Apply LoRA for efficient fine-tuning
    model = apply_lora(model)

    # tokenize dataset
    tokenized_dataset = tokenized_datasets[model_name][spt_name]

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"model-variants/results/{model_name}_SPT-{spt_name.upper()}",
        **train_args
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        eval_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stops training if no improvement for 3 epochs
    )

    # Train the model
    trainer.train()

    # Save model
    name = f"model-variants/models/{model_name}_SPT-{spt_name.upper()}"
    model.save_pretrained(name)
    tokenizer.save_pretrained(name)

    print(f"Model {model_name} fine-tuned using SPT-{spt_name.upper()}.")

### mBert

In [23]:
# function to add 'labels' in dataset
def add_labels(example):
    example["labels"] = example["input_ids"].copy()
    return example

In [24]:
tokenized_datasets["mBERT"]["bpe"] = tokenized_datasets["mBERT"]["bpe"].map(
    add_labels, 
    batched=True,
    desc=f"Tokenizing dataset"
)

Tokenizing dataset:   0%|          | 0/1627576 [00:00<?, ? examples/s]

In [None]:
# fine tune with SPT-BPE
fine_tune_model("mBERT", "bpe")

Fine-tuning mBERT on using SPT-BPE...


  trainer = Trainer(


{'loss': 18.655, 'grad_norm': 8.106327056884766, 'learning_rate': 6e-06, 'epoch': 0.0009830570123914337}


In [None]:
# fine tune with SPT-Unigram
fine_tune_model("mBERT", "unigram")

### mT5

In [None]:
# fine tune with SPT-BPE
fine_tune_model("mT5", "bpe")

In [None]:
# fine tune with SPT-Unigram
fine_tune_model("mT5", "unigram")

### XLM-R

In [None]:
# fine tune with SPT-BPE
fine_tune_model("XLM-R", "bpe")

In [None]:
# fine tune with SPT-Unigram
fine_tune_model("XLM-R", "unigram")

## Generate Predictions

### mBERT and XLM-R

In [None]:
# dataset class for masked
class MaskedTextDataset(tDataset):
    def __init__(self, texts, tokenizer, mask_ratio=0.15, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.mask_ratio = mask_ratio
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx] if isinstance(self.texts[idx], str) else ""

        # Tokenize and move tensors to GPU
        inputs = self.tokenizer(
            text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length
        )
        
        input_ids = inputs["input_ids"].squeeze(0).to(device)
        attention_mask = inputs["attention_mask"].squeeze(0).to(device)

        # Apply random masking
        seq_length = input_ids.shape[0]
        num_to_mask = max(1, int(self.mask_ratio * (seq_length - 2)))  # Avoid CLS/SEP
        mask_indices = torch.randperm(seq_length - 2)[:num_to_mask] + 1  # Avoid first and last token

        masked_input_ids = input_ids.clone()
        masked_input_ids[mask_indices] = self.tokenizer.mask_token_id  # Replace with [MASK] token

        return masked_input_ids, attention_mask, input_ids

In [None]:
# Function to generate masked predictions
def generate_masked_predictions_batch(dataloader, model, tokenizer):
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Masked Predictions"):
            # Move batch data to GPU
            masked_input_ids, attention_mask, _ = [x.to(device) for x in batch]

            # Run model inference on GPU
            outputs = model(input_ids=masked_input_ids, attention_mask=attention_mask)

            # Replace masked tokens with predicted tokens
            predicted_tokens_batch = masked_input_ids.clone()
            for i in range(masked_input_ids.shape[0]):  # Loop over batch
                mask_positions = (masked_input_ids[i] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
                for pos in mask_positions:
                    predicted_token_id = torch.argmax(outputs.logits[i, pos], dim=-1).item()
                    predicted_tokens_batch[i, pos] = predicted_token_id

            # Decode predictions
            batch_predictions = tokenizer.batch_decode(predicted_tokens_batch.cpu(), skip_special_tokens=True)
            all_predictions.extend(batch_predictions)

    return all_predictions

#### mBERT

##### BPE

In [None]:
# Load tokenizers & models for mBERT with BPE
mbert_bpe_trained_path = "model-variants/models/mBERT_SPT-BPE"
mbert_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(mbert_bpe_trained_path)
mbert_bpe_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)
mbert_bpe_trained_model = torch.compile(mbert_bpe_trained_model)
mbert_bpe_trained_model.eval()

In [None]:
# load dataset
mbert_bpe_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mbert_bpe_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mbert_bpe_trained_predictions_texts = mbert_bpe_trained_predictions["target"].tolist()
mbert_bpe_trained_predictions_dataset = MaskedTextDataset(mbert_bpe_trained_predictions_texts, mbert_bpe_trained_tokenizer)
mbert_bpe_trained_predictions_dataloader = DataLoader(
    mbert_bpe_trained_predictions_dataset, 
    batch_size=mbert_bpe_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mbert_bpe_trained_predictions["generated"] = generate_masked_predictions_batch(
    mbert_bpe_trained_predictions_dataloader, 
    mbert_bpe_trained_model, 
    mbert_bpe_trained_tokenizer
)

In [None]:
# display
display(mbert_bpe_trained_predictions.head())

In [None]:
# save trained mbert predictions
save_model_variants_df(mbert_bpe_trained_predictions, "mbert_bpe_trained_predictions")

##### Unigram

In [None]:
# Load tokenizers & models for mBERT with BPE
mbert_unigram_trained_path = "model-variants/models/mBERT_SPT-UNIGRAM"
mbert_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(mbert_unigram_trained_path)
mbert_unigram_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)
mbert_unigram_trained_model = torch.compile(mbert_unigram_trained_model)
mbert_unigram_trained_model.eval()

In [None]:
# load dataset
mbert_unigram_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mbert_unigram_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mbert_unigram_trained_predictions_texts = mbert_unigram_trained_predictions["target"].tolist()
mbert_unigram_trained_predictions_dataset = MaskedTextDataset(mbert_unigram_trained_predictions_texts, mbert_unigram_trained_tokenizer)
mbert_unigram_trained_predictions_dataloader = DataLoader(
    mbert_unigram_trained_predictions_dataset, 
    batch_size=mbert_unigram_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mbert_unigram_trained_predictions["generated"] = generate_masked_predictions_batch(
    mbert_unigram_trained_predictions_dataloader, 
    mbert_unigram_trained_model, 
    mbert_unigram_trained_tokenizer
)

In [None]:
# display
display(mbert_unigram_trained_predictions.head())

In [None]:
# save trained mbert predictions
save_model_variants_df(mbert_unigram_trained_predictions, "mbert_unigram_trained_predictions")

#### XLM-R

##### BPE

In [None]:
# Load tokenizers & models for mBERT with BPE
xlmr_bpe_trained_path = "model-variants/models/XLM-R_SPT-BPE"
xlmr_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(xlmr_bpe_trained_path)
xlmr_bpe_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)
xlmr_bpe_trained_model = torch.compile(xlmr_bpe_trained_model)
xlmr_bpe_trained_model.eval()

In [None]:
# load dataset
xlmr_bpe_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
xlmr_bpe_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
xlmr_bpe_trained_predictions_texts = xlmr_bpe_trained_predictions["target"].tolist()
xlmr_bpe_trained_predictions_dataset = MaskedTextDataset(xlmr_bpe_trained_predictions_texts, xlmr_bpe_trained_tokenizer)
xlmr_bpe_trained_predictions_dataloader = DataLoader(
    xlmr_bpe_trained_predictions_dataset, 
    batch_size=xlmr_bpe_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
xlmr_bpe_trained_predictions["generated"] = generate_masked_predictions_batch(
    xlmr_bpe_trained_predictions_dataloader, 
    xlmr_bpe_trained_model, 
    xlmr_bpe_trained_tokenizer
)

In [None]:
# display
display(xlmr_bpe_trained_predictions.head())

In [None]:
# save trained xlmr predictions
save_model_variants_df(xlmr_bpe_trained_predictions, "xlmr_bpe_trained_predictions")

##### Unigram

In [None]:
# Load tokenizers & models for mBERT with BPE
xlmr_unigram_trained_path = "model-variants/models/XLM-R_SPT-UNIGRAM"
xlmr_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(xlmr_unigram_trained_path)
xlmr_unigram_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)
xlmr_unigram_trained_model = torch.compile(xlmr_unigram_trained_model)
xlmr_unigram_trained_model.eval()

In [None]:
# load dataset
xlmr_unigram_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
xlmr_unigram_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
xlmr_unigram_trained_predictions_texts = xlmr_unigram_trained_predictions["target"].tolist()
xlmr_unigram_trained_predictions_dataset = MaskedTextDataset(xlmr_unigram_trained_predictions_texts, xlmr_unigram_trained_tokenizer)
xlmr_unigram_trained_predictions_dataloader = DataLoader(
    xlmr_unigram_trained_predictions_dataset, 
    batch_size=xlmr_unigram_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
xlmr_unigram_trained_predictions["generated"] = generate_masked_predictions_batch(
    xlmr_unigram_trained_predictions_dataloader, 
    xlmr_unigram_trained_model, 
    xlmr_unigram_trained_tokenizer
)

In [None]:
# display
display(xlmr_unigram_trained_predictions.head())

In [None]:
# save trained xlmr predictions
save_model_variants_df(xlmr_unigram_trained_predictions, "xlmr_unigram_trained_predictions")

### mT5

In [None]:
# function to generate predictions
def generate_predictions_batch(dataloader, model, tokenizer, spt_processor):
    predictions = []

    for batch in tqdm(dataloader, desc=f"Generating Predictions", unit="batch"):
        # Apply spt
        spt_encoded_batch = [" ".join(spt_processor.encode_as_pieces(text)) for text in batch]
        
        # Tokenize input
        inputs = tokenizer(spt_encoded_batch, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            # Generate output sequence
            output_tokens = model.generate(**inputs, max_length=128)

        # Decode generated sequences
        decoded_output = [tokenizer.decode(tokens, skip_special_tokens=True) for tokens in output_tokens]
        predictions.extend(decoded_output)

    return predictions

#### BPE

In [None]:
# Load tokenizers & models for mT5 with BPE
mt5_bpe_trained_path = "model-variants/models/mT5_SPT-BPE"
mt5_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(mt5_bpe_trained_path)
mt5_bpe_trained_model = AutoModelForSeq2SeqLM.from_pretrained().to(device)
mt5_bpe_trained_model = torch.compile(mt5_bpe_trained_model)
mt5_bpe_trained_model.eval()

In [None]:
# load dataset
mt5_bpe_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mt5_bpe_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mt5_bpe_trained_predictions_texts = mt5_bpe_trained_predictions["target"].tolist()
mt5_bpe_trained_predictions_dataset = TextDataset(mt5_bpe_trained_predictions_texts, mt5_bpe_trained_tokenizer)
mt5_bpe_trained_predictions_dataloader = DataLoader(
    mt5_bpe_trained_predictions_dataset, 
    batch_size=mt5_bpe_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mt5_bpe_trained_predictions["generated"] = generate_predictions_batch(
    mt5_bpe_trained_predictions_dataloader, 
    mt5_bpe_trained_model, 
    mt5_bpe_trained_tokenizer,
    spt_models["bpe"]
)

In [None]:
# display
display(mt5_bpe_trained_predictions.head())

In [None]:
# save trained mt5 predictions
save_model_variants_df(mt5_bpe_trained_predictions, "mt5_bpe_trained_predictions")

#### Unigram

In [None]:
# Load tokenizers & models for mT5 with Unigram
mt5_unigram_trained_path = "model-variants/models/mT5_SPT-BPE"
mt5_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(mt5_unigram_trained_path)
mt5_unigram_trained_model = AutoModelForSeq2SeqLM.from_pretrained().to(device)
mt5_unigram_trained_model = torch.compile(mt5_unigram_trained_model)
mt5_unigram_trained_model.eval()

In [None]:
# load dataset
mt5_unigram_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mt5_unigram_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mt5_unigram_trained_predictions_texts = mt5_unigram_trained_predictions["target"].tolist()
mt5_unigram_trained_predictions_dataset = TextDataset(mt5_unigram_trained_predictions_texts, mt5_unigram_trained_tokenizer)
mt5_unigram_trained_predictions_dataloader = DataLoader(
    mt5_unigram_trained_predictions_dataset, 
    batch_size=mt5_unigram_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mt5_unigram_trained_predictions["generated"] = generate_predictions_batch(
    mt5_unigram_trained_predictions_dataloader, 
    mt5_unigram_trained_model, 
    mt5_unigram_trained_tokenizer,
    spt_models["unigram"]
)

In [None]:
# display
display(mt5_unigram_trained_predictions.head())

In [None]:
# save trained mt5 predictions
save_model_variants_df(mt5_unigram_trained_predictions, "mt5_unigram_trained_predictions")