# S3

In [12]:
import boto3
import zipfile
import os

In [13]:
# S3 Setup
s3 = boto3.client("s3")
bucket_name = "ms-thesis-sagemaker"  # Replace with your S3 bucket
s3_file_path = "mbert_bpe_hf_dataset.zip"  # Replace with the file name in S3
local_zip_path = "/home/ec2-user/SageMaker/ms-thesis/model-variants/mbert_bpe_hf_dataset.zip"  # Where to save in SageMaker

# Download the ZIP file from S3
s3.download_file(bucket_name, s3_file_path, local_zip_path)
print("ZIP file downloaded from S3 successfully!")

ZIP file downloaded from S3 successfully!


In [14]:
extract_path = "/home/ec2-user/SageMaker/ms-thesis/model-variants/"  # Where to extract

# Unzip the file
with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
    zip_ref.extractall(extract_path)

print("ZIP file extracted successfully!")

ZIP file extracted successfully!


In [15]:
os.remove(local_zip_path)
print("ZIP file deleted to free space.")

ZIP file deleted to free space.


# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece evaluate sacrebleu bert-score peft

In [None]:
!conda update -n base -c conda-forge conda -y

In [None]:
!conda install conda-forge::rouge-score -y

In [1]:
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
import evaluate
import torch
import torch.nn as nn
from copy import deepcopy
from utils.dataframe import (
    load_gen_df, save_tmp_df, load_tmp_df,
    save_model_variants_df, load_model_variants_df,
    save_model_variants_arrow, load_model_variants_arrow
)
from utils.gpu import get_device
from utils.custom_class import MaskedTextDataset, EvaluationDataset, TextDataset
from utils.common import (
    generate_masked_predictions_batch,
    generate_mt5_predictions_batch,
    compute_metrics_batch,
    compute_multilingual_masked_perplexity_batch,
    compute_multilingual_mt5_perplexity_batch,
    convert_to_mean_scores_df
)
from IPython.display import display
from tqdm.notebook import tqdm
from datasets import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils import prune
from transformers import (
    logging,
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM,
    Trainer, TrainingArguments, LongformerConfig, LongformerModel,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model
# from optimum.intel.openvino import OVModelForMaskedLM, OVModelForSeq2SeqLM

2025-02-12 17:27:03.218709: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-12 17:27:03.242445: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-12 17:27:03.242480: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-12 17:27:03.257184: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Transformers is only compatible with Keras 2, but yo

# Set settings

In [2]:
tqdm.pandas()

In [3]:
# Suppress specific warnings from the transformers library
logging.set_verbosity_error()

# Common

In [4]:
# gpu device 
device = get_device()

2025-02-12 17:27:09.082541: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


Tensorflow GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using PyTorch device: cuda
GPU Name: NVIDIA A10G


2025-02-12 17:27:09.121856: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-12 17:27:09.123844: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355


In [5]:
# spt models
spt_models = {
    "bpe": spm.SentencePieceProcessor("spt/spt_bpe.model"),
    #"unigram": spm.SentencePieceProcessor("spt/spt_unigram.model"),
}

In [6]:
# model names
train_model_names = {
    "mBERT": "bert-base-multilingual-cased",
    #"mT5": "google/mt5-small",
    #"XLM-R": "xlm-roberta-base"
}

In [7]:
# train tokenizers
train_tokenizers = {
    "mBERT": AutoTokenizer.from_pretrained(train_model_names["mBERT"]),
    #"mT5": AutoTokenizer.from_pretrained(train_model_names["mT5"], use_fast=False, legacy=True),
    #"XLM-R": AutoTokenizer.from_pretrained(train_model_names["XLM-R"])
}

# Function

In [8]:
def apply_lora(model, model_name):
    """
    Applies LoRA for efficient fine-tuning.
    """

    # Select correct LoRA target layers
    if "t5" in model_name.lower():
        target_modules = ["q", "v"]  # LoRA for T5/mT5
    else:
        target_modules = ["query", "value"]  # LoRA for BERT

    # Define LoRA Configuration
    lora_config = LoraConfig(
        r=8,                    # Rank of LoRA matrices
        lora_alpha=16,          # Scaling factor
        target_modules=target_modules,  
        lora_dropout=0.1,       # Prevents overfitting
        bias="none"
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)

    # Move model to GPU
    model.to(device)

    print(f"LoRA applied to {model_name} (Target Modules: {target_modules})")
    
    return model

# Data Preprocessing
Datasets used for training:
- myXNLI & ALT Corpus (normalized)
- Back-translated datasets (NLLB, Seamless M4T)
- Pseudo-parallel datasets (MiniLM, LaBSE)

## Data Preparation

In [14]:
# Load and process dataset
def load_and_rename_columns_multilingual(file_name):
    df = load_gen_df(f"{file_name}")

    column_mapping = {
        "english": "source",
        "burmese": "target",
        "english_back_translated": "source",
        "burmese_translated": "target",
    }
    
    df = df.rename(columns=column_mapping)
    
    # Ensure only required columns exist
    df = df[["source", "target"]]

    return df

In [15]:
# datasets
datasets = {
    "normal": [
        "myxnli_normalized_1", 
        "myxnli_normalized_2", 
        "alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "myxnli_nllb_back_translated_final_1", 
        "myxnli_nllb_back_translated_final_2", 
        "alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "myxnli_seamless_m4t_back_translated_final_1", 
        "myxnli_seamless_m4t_back_translated_final_2", 
        "alt_combined_seamless_m4t_back_translated_final"
    ]
}

In [16]:
# Load and process datasets
loaded_datasets = {}
for key, file_list in datasets.items():
    loaded_datasets[key] = [load_and_rename_columns_multilingual(file) for file in file_list]

In [17]:
# combine all datasets
combined = pd.concat(
    loaded_datasets["normal"] + 
    loaded_datasets["nllb_back_translated"] + 
    loaded_datasets["seamless_m4t_back_translated"],
    ignore_index=True
)

In [18]:
# Shuffle the data to prevent order bias
combined = combined.sample(frac=1).reset_index(drop=True)

In [19]:
# display combined dataset
display(combined.head())

Unnamed: 0,source,target
0,archaeologists think that a fire broke out in ...,ရှေးဟောင်းသုတေသီတွေက Knossos မှာ မီးလောင်တာ BC...
1,there are political meetings in every neighbor...,ရပ်ကွက်တိုင်းမှာ နိုင်ငံရေး အစည်းအဝေးတွေရှိတယ်။
2,the lawyer said that in article 712 (1) gao wa...,ရှေ့နေက ပုဒ်မ ၇၁၂ (၁) မှာ Gao ကို ငွေကြေးဆိုင်...
3,things can get confusing when talking about do...,Dordogne အကြောင်းပြောသောအခါ၊ ဝေးကွာသောနေရာများ...
4,making financial management a top priority acr...,ဘဏ္ဍာရေး စီမံခန့်ခွဲမှုကို ပြည်ထောင်စု အစိုးရတ...


In [23]:
# print length
print(f"Combined dataset length: {len(combined)}")

Combined dataset length: 1627576


In [20]:
# save data
save_model_variants_df(combined, "combined")

## Tokenize

In [24]:
def tokenize(examples, tokenizer, spt_tokenizer, model_name):
    """
    Tokenizes Burmese text using the selected SentencePiece model before applying Transformer tokenization.
    """
    # Apply SentencePiece Tokenization for Burmese target text
    spt_burmese = [" ".join(spt_tokenizer.encode_as_pieces(text)) for text in examples["target"]]
    examples["target"] = spt_burmese  # Overwrite with tokenized text

    if "t5" in model_name.lower():
        # mT5/T5 (Text-to-Text) - Tokenize source & target separately
        model_inputs = tokenizer(
            examples["source"], 
            padding="max_length", 
            truncation=True, 
            max_length=512
        )

        # Tokenize target`
        labels = tokenizer(
            examples["target"],  
            padding="max_length",  
            truncation=True,  
            max_length=512,
            return_special_tokens_mask=True  # Helps handle special tokens
        )["input_ids"]

        model_inputs["labels"] = labels
        model_inputs["decoder_input_ids"] = labels
        return model_inputs

    # BERT-based models (Masked/Causal LM)
    inputs = tokenizer(
        examples["source"],
        examples["target"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

    # Assign labels for causal LM (BERT-like models)
    inputs["labels"] = deepcopy(inputs["input_ids"])

    return inputs

In [None]:
# tokenize for each model and spt
for model_name, tokenizer in train_tokenizers.items():
    for spt_name, spt_tokenizer in spt_models.items():
        dataset = load_model_variants_df("combined")

        # Convert to Hugging Face Dataset
        dataset = Dataset.from_pandas(dataset)

        # apply tokenize
        dataset = dataset.map(
            lambda x, _: tokenize(x, tokenizer, spt_tokenizer, model_name),
            batched=True,
            desc=f"Tokenizing dataset for {model_name} with {spt_name}",
            with_indices=True,  # Passing index as a second argument
            num_proc=10
        )

        # save
        save_model_variants_arrow(dataset, f"{model_name.lower()}_{spt_name}")

# 1. Fine-Tuning Transformer Models for Burmese
This notebook fine-tunes three transformer models:
- mBERT (best perplexity, but weak BLEU/ROUGE)
- mT5 (best for generation, but requires more data)
- XLM-R (good BLEU/ROUGE, but poor perplexity)

Apply:
- Sentence-Piece Tokenization for Burmese segmentation
- LoRA for efficient fine-tuning
- Prefix-Tuning for lightweight adaptations
- Mixed Precision Training for speed improvements

## Fine-Tuning

In [9]:
# train models
train_models = {
    "mBERT": AutoModelForMaskedLM.from_pretrained(train_model_names["mBERT"], num_labels=1).to(device),
    #"mT5": AutoModelForSeq2SeqLM.from_pretrained(train_model_names["mT5"]).to(device),
    #"XLM-R": AutoModelForMaskedLM.from_pretrained(train_model_names["XLM-R"], num_labels=1).to(device)
}

In [10]:
# tokenized dataset
tokenized_datasets = {
    model_name: {
        spt_name: load_model_variants_arrow(f"{model_name.lower()}_{spt_name}")
        for spt_name in spt_models.keys()
    }
    for model_name in train_tokenizers.keys()
}

Loading dataset from disk:   0%|          | 0/25 [00:00<?, ?it/s]

In [11]:
def fine_tune_model(model_name, spt_name):
    """
    Fine-tunes the model with LoRA on the specified SentencePiece tokenization (SPT).
    """
    print(f"Fine-tuning {model_name} using SPT-{spt_name.upper()}...")

    # Load tokenizer & model
    tokenizer = train_tokenizers[model_name]
    model = train_models[model_name]

    # Move model to GPU before applying LoRA
    model.to(device)

    # Apply LoRA for efficient parameter tuning
    model = apply_lora(model, model_name)

    # Tokenize dataset & split into training and validation sets
    tokenized_dataset = tokenized_datasets[model_name][spt_name]
    split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
    
    train_data = split_dataset["train"]
    val_data = split_dataset["test"]

    # for debug, remove comment
    #train_data = train_data.select(range(100))
    #val_data = val_data.select(range(100))

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"model-variants/results/{model_name}_{spt_name.upper()}",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        learning_rate=3e-5,
        warmup_steps=500,
        weight_decay=0.01,
        save_strategy="epoch",
        save_total_limit=2,
        fp16=False,
        bf16=True,
        eval_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        logging_dir="./logs",
        logging_steps=1000,
        optim="adamw_torch_fused",
        auto_find_batch_size=True,
        disable_tqdm=False,
        label_names=["labels"],
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model
    trainer.train()

    # Save trained model and tokenizer
    save_path = f"model-variants/models/{model_name}_{spt_name.upper()}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model `{model_name}` fine-tuned and saved at `{save_path}`.")

### mBert

In [None]:
# fine tune with SPT-BPE
fine_tune_model("mBERT", "bpe")

Fine-tuning mBERT using SPT-BPE...
LoRA applied to mBERT (Target Modules: ['query', 'value'])


Epoch,Training Loss,Validation Loss
1,0.0002,3.3e-05


In [None]:
# fine tune with SPT-Unigram
fine_tune_model("mBERT", "unigram")

### mT5

In [None]:
# fine tune with SPT-BPE
fine_tune_model("mT5", "bpe")

Fine-tuning mT5 using SPT-BPE...
LoRA applied to mT5 (Target Modules: ['q', 'v'])


Epoch,Training Loss,Validation Loss


In [None]:
# fine tune with SPT-Unigram
fine_tune_model("mT5", "unigram")

### XLM-R

In [None]:
# fine tune with SPT-BPE
fine_tune_model("XLM-R", "bpe")

In [None]:
# fine tune with SPT-Unigram
fine_tune_model("XLM-R", "unigram")

## Generate Predictions

### mBERT

#### BPE

In [None]:
# Load tokenizers & models for mBERT with BPE
mbert_bpe_trained_path = "model-variants/models/mBERT_SPT-BPE"
mbert_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(mbert_bpe_trained_path)
mbert_bpe_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# load dataset
mbert_bpe_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mbert_bpe_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mbert_bpe_trained_predictions_texts = mbert_bpe_trained_predictions["target"].tolist()
mbert_bpe_trained_predictions_dataset = MaskedTextDataset(mbert_bpe_trained_predictions_texts, mbert_bpe_trained_tokenizer)
mbert_bpe_trained_predictions_dataloader = DataLoader(
    mbert_bpe_trained_predictions_dataset, 
    batch_size=mbert_bpe_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mbert_bpe_trained_predictions["generated"] = generate_masked_predictions_batch(
    mbert_bpe_trained_predictions_dataloader, 
    mbert_bpe_trained_model, 
    mbert_bpe_trained_tokenizer,
    device
)

In [None]:
# display
display(mbert_bpe_trained_predictions.head())

In [None]:
# save trained mbert predictions
save_model_variants_df(mbert_bpe_trained_predictions, "mBERT_bpe_trained_predictions")

#### Unigram

In [None]:
# Load tokenizers & models for mBERT with BPE
mbert_unigram_trained_path = "model-variants/models/mBERT_SPT-UNIGRAM"
mbert_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(mbert_unigram_trained_path)
mbert_unigram_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# load dataset
mbert_unigram_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mbert_unigram_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mbert_unigram_trained_predictions_texts = mbert_unigram_trained_predictions["target"].tolist()
mbert_unigram_trained_predictions_dataset = MaskedTextDataset(mbert_unigram_trained_predictions_texts, mbert_unigram_trained_tokenizer)
mbert_unigram_trained_predictions_dataloader = DataLoader(
    mbert_unigram_trained_predictions_dataset, 
    batch_size=mbert_unigram_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mbert_unigram_trained_predictions["generated"] = generate_masked_predictions_batch(
    mbert_unigram_trained_predictions_dataloader, 
    mbert_unigram_trained_model, 
    mbert_unigram_trained_tokenizer,
    device
)

In [None]:
# display
display(mbert_unigram_trained_predictions.head())

In [None]:
# save trained mbert predictions
save_model_variants_df(mbert_unigram_trained_predictions, "mBERT_unigram_trained_predictions")

### XLM-R

#### BPE

In [None]:
# Load tokenizers & models for mBERT with BPE
xlmr_bpe_trained_path = "model-variants/models/XLM-R_SPT-BPE"
xlmr_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(xlmr_bpe_trained_path)
xlmr_bpe_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# load dataset
xlmr_bpe_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
xlmr_bpe_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
xlmr_bpe_trained_predictions_texts = xlmr_bpe_trained_predictions["target"].tolist()
xlmr_bpe_trained_predictions_dataset = MaskedTextDataset(xlmr_bpe_trained_predictions_texts, xlmr_bpe_trained_tokenizer)
xlmr_bpe_trained_predictions_dataloader = DataLoader(
    xlmr_bpe_trained_predictions_dataset, 
    batch_size=xlmr_bpe_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
xlmr_bpe_trained_predictions["generated"] = generate_masked_predictions_batch(
    xlmr_bpe_trained_predictions_dataloader, 
    xlmr_bpe_trained_model, 
    xlmr_bpe_trained_tokenizer,
    device
)

In [None]:
# display
display(xlmr_bpe_trained_predictions.head())

In [None]:
# save trained xlmr predictions
save_model_variants_df(xlmr_bpe_trained_predictions, "XLM-R_bpe_trained_predictions")

#### Unigram

In [None]:
# Load tokenizers & models for mBERT with BPE
xlmr_unigram_trained_path = "model-variants/models/XLM-R_SPT-UNIGRAM"
xlmr_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(xlmr_unigram_trained_path)
xlmr_unigram_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# load dataset
xlmr_unigram_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
xlmr_unigram_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
xlmr_unigram_trained_predictions_texts = xlmr_unigram_trained_predictions["target"].tolist()
xlmr_unigram_trained_predictions_dataset = MaskedTextDataset(xlmr_unigram_trained_predictions_texts, xlmr_unigram_trained_tokenizer)
xlmr_unigram_trained_predictions_dataloader = DataLoader(
    xlmr_unigram_trained_predictions_dataset, 
    batch_size=xlmr_unigram_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
xlmr_unigram_trained_predictions["generated"] = generate_masked_predictions_batch(
    xlmr_unigram_trained_predictions_dataloader, 
    xlmr_unigram_trained_model, 
    xlmr_unigram_trained_tokenizer,
    device
)

In [None]:
# display
display(xlmr_unigram_trained_predictions.head())

In [None]:
# save trained xlmr predictions
save_model_variants_df(xlmr_unigram_trained_predictions, "XLM-R_unigram_trained_predictions")

### mT5

#### BPE

In [None]:
# Load tokenizers & models for mT5 with BPE
mt5_bpe_trained_path = "model-variants/models/mT5_SPT-BPE"
mt5_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(mt5_bpe_trained_path)
mt5_bpe_trained_model = AutoModelForSeq2SeqLM.from_pretrained().to(device)

In [None]:
# load dataset
mt5_bpe_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mt5_bpe_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mt5_bpe_trained_predictions_texts = mt5_bpe_trained_predictions["target"].tolist()
mt5_bpe_trained_predictions_dataset = TextDataset(mt5_bpe_trained_predictions_texts)
mt5_bpe_trained_predictions_dataloader = DataLoader(
    mt5_bpe_trained_predictions_dataset, 
    batch_size=mt5_bpe_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mt5_bpe_trained_predictions["generated"] = generate_mt5_predictions_batch(
    mt5_bpe_trained_predictions_dataloader, 
    mt5_bpe_trained_model, 
    mt5_bpe_trained_tokenizer,
    spt_models["bpe"],
    device
)

In [None]:
# display
display(mt5_bpe_trained_predictions.head())

In [None]:
# save trained mt5 predictions
save_model_variants_df(mt5_bpe_trained_predictions, "mT5_bpe_trained_predictions")

#### Unigram

In [None]:
# Load tokenizers & models for mT5 with Unigram
mt5_unigram_trained_path = "model-variants/models/mT5_SPT-BPE"
mt5_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(mt5_unigram_trained_path)
mt5_unigram_trained_model = AutoModelForSeq2SeqLM.from_pretrained().to(device)

In [None]:
# load dataset
mt5_unigram_trained_predictions = load_model_variants_df("combined")

In [None]:
# batch size
mt5_unigram_trained_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
mt5_unigram_trained_predictions_texts = mt5_unigram_trained_predictions["target"].tolist()
mt5_unigram_trained_predictions_dataset = TextDataset(mt5_unigram_trained_predictions_texts)
mt5_unigram_trained_predictions_dataloader = DataLoader(
    mt5_unigram_trained_predictions_dataset, 
    batch_size=mt5_unigram_trained_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
mt5_unigram_trained_predictions["generated"] = generate_mt5_predictions_batch(
    mt5_unigram_trained_predictions_dataloader, 
    mt5_unigram_trained_model, 
    mt5_unigram_trained_tokenizer,
    spt_models["unigram"],
    device
)

In [None]:
# display
display(mt5_unigram_trained_predictions.head())

In [None]:
# save trained mt5 predictions
save_model_variants_df(mt5_unigram_trained_predictions, "mT5_unigram_trained_predictions")

## Evaluate Model Performance
Compute BLEU, ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-L, chrF-S, BERTScore and Perplexity scores.

### Metrics

#### mBERT

##### BPE

In [None]:
# load dataset
mbert_bpe_trained_metrics = load_model_variants_df(f"mBERT_bpe_trained_predictions")

In [None]:
# compute metrics
print(f"Processing Data for mBERT with BPE..."),
compute_metrics_batch(mbert_bpe_trained_metrics)

In [None]:
# display
print(f"Metrics scores for mBERT with BPE:")
print(f"BLEU Score: {mbert_bpe_trained_metrics['bleu'].mean()}")
print(f"ROUGE-1 Score: {mbert_bpe_trained_metrics['rouge-1'].mean()}")
print(f"ROUGE-2 Score: {mbert_bpe_trained_metrics['rouge-2'].mean()}")
print(f"ROUGE-L Score: {mbert_bpe_trained_metrics['rouge-l'].mean()}")
print(f"chrF-S Score: {mbert_bpe_trained_metrics['chrf-s'].mean()}")
print(f"BERT Score: {mbert_bpe_trained_metrics['bert_score'].mean()}")

In [None]:
# save results
save_tmp_df(mbert_bpe_trained_metrics, f"mBERT_bpe_trained_metrics")

##### Unigram

In [None]:
# load dataset
mbert_unigram_trained_metrics = load_model_variants_df(f"mBERT_unigram_trained_predictions")

In [None]:
# compute metrics
print(f"Processing Data for mBERT with Unigram..."),
compute_metrics_batch(mbert_unigram_trained_metrics)

In [None]:
# display
print(f"Metrics scores for mBERT with BPE:")
print(f"BLEU Score: {mbert_unigram_trained_metrics['bleu'].mean()}")
print(f"ROUGE-1 Score: {mbert_unigram_trained_metrics['rouge-1'].mean()}")
print(f"ROUGE-2 Score: {mbert_unigram_trained_metrics['rouge-2'].mean()}")
print(f"ROUGE-L Score: {mbert_unigram_trained_metrics['rouge-l'].mean()}")
print(f"chrF-S Score: {mbert_unigram_trained_metrics['chrf-s'].mean()}")
print(f"BERT Score: {mbert_unigram_trained_metrics['bert_score'].mean()}")

In [None]:
# save results
save_tmp_df(mbert_unigram_trained_metrics, f"mBERT_unigram_trained_metrics")

#### XLM-R

##### BPE

In [None]:
# load dataset
xlmr_bpe_trained_metrics = load_model_variants_df(f"XLM-R_bpe_trained_predictions")

In [None]:
# compute metrics
print(f"Processing Data for XLM-R with BPE..."),
compute_metrics_batch(xlmr_bpe_trained_metrics)

In [None]:
# display
print(f"Metrics scores for XLM-R with BPE:")
print(f"BLEU Score: {xlmr_bpe_trained_metrics['bleu'].mean()}")
print(f"ROUGE-1 Score: {xlmr_bpe_trained_metrics['rouge-1'].mean()}")
print(f"ROUGE-2 Score: {xlmr_bpe_trained_metrics['rouge-2'].mean()}")
print(f"ROUGE-L Score: {xlmr_bpe_trained_metrics['rouge-l'].mean()}")
print(f"chrF-S Score: {xlmr_bpe_trained_metrics['chrf-s'].mean()}")
print(f"BERT Score: {xlmr_bpe_trained_metrics['bert_score'].mean()}")

In [None]:
# save results
save_tmp_df(xlmr_bpe_trained_metrics, f"XLM-R_bpe_trained_metrics")

##### Unigram

In [None]:
# load dataset
xlmr_unigram_trained_metrics = load_model_variants_df(f"XLM-R_unigram_trained_predictions")

In [None]:
# compute metrics
print(f"Processing Data for XLM-R with Unigram..."),
compute_metrics_batch(xlmr_unigram_trained_metrics)

In [None]:
# display
print(f"Metrics scores for mBERT with BPE:")
print(f"BLEU Score: {xlmr_unigram_trained_metrics['bleu'].mean()}")
print(f"ROUGE-1 Score: {xlmr_unigram_trained_metrics['rouge-1'].mean()}")
print(f"ROUGE-2 Score: {xlmr_unigram_trained_metrics['rouge-2'].mean()}")
print(f"ROUGE-L Score: {xlmr_unigram_trained_metrics['rouge-l'].mean()}")
print(f"chrF-S Score: {xlmr_unigram_trained_metrics['chrf-s'].mean()}")
print(f"BERT Score: {xlmr_unigram_trained_metrics['bert_score'].mean()}")

In [None]:
# save results
save_tmp_df(xlmr_unigram_trained_metrics, f"XLM-R_unigram_trained_metrics")

#### mT5

##### BPE

In [None]:
# load dataset
mt5_bpe_trained_metrics = load_model_variants_df(f"mT5_bpe_trained_predictions")

In [None]:
# compute metrics
print(f"Processing Data for mT5 with BPE..."),
compute_metrics_batch(mt5_bpe_trained_metrics)

In [None]:
# display
print(f"Metrics scores for mT5 with BPE:")
print(f"BLEU Score: {mt5_bpe_trained_metrics['bleu'].mean()}")
print(f"ROUGE-1 Score: {mt5_bpe_trained_metrics['rouge-1'].mean()}")
print(f"ROUGE-2 Score: {mt5_bpe_trained_metrics['rouge-2'].mean()}")
print(f"ROUGE-L Score: {mt5_bpe_trained_metrics['rouge-l'].mean()}")
print(f"chrF-S Score: {mt5_bpe_trained_metrics['chrf-s'].mean()}")
print(f"BERT Score: {mt5_bpe_trained_metrics['bert_score'].mean()}")

In [None]:
# save results
save_tmp_df(mt5_bpe_trained_metrics, f"mT5_bpe_trained_metrics")

##### Unigram

In [None]:
# load dataset
mt5_unigram_trained_metrics = load_model_variants_df(f"mT5_unigram_trained_predictions")

In [None]:
# compute metrics
print(f"Processing Data for mT5 with Unigram..."),
compute_metrics_batch(mt5_unigram_trained_metrics)

In [None]:
# display
print(f"Metrics scores for mBERT with BPE:")
print(f"BLEU Score: {mt5_unigram_trained_metrics['bleu'].mean()}")
print(f"ROUGE-1 Score: {mt5_unigram_trained_metrics['rouge-1'].mean()}")
print(f"ROUGE-2 Score: {mt5_unigram_trained_metrics['rouge-2'].mean()}")
print(f"ROUGE-L Score: {mt5_unigram_trained_metrics['rouge-l'].mean()}")
print(f"chrF-S Score: {mt5_unigram_trained_metrics['chrf-s'].mean()}")
print(f"BERT Score: {mt5_unigram_trained_metrics['bert_score'].mean()}")

In [None]:
# save results
save_tmp_df(mt5_unigram_trained_metrics, f"mT5_unigram_trained_metrics")

### Perplexity

#### mBERT

##### BPE

In [None]:
# load dataset
mbert_bpe_trained_perplexity = load_model_variants_df(f"mBERT_bpe_trained_predictions")

In [None]:
# batch size
mbert_bpe_trained_perplexity_batch_size = 8

In [None]:
# Load tokenizers & models for mBERT with BPE
mbert_bpe_trained_path = "model-variants/models/mBERT_SPT-BPE"
mbert_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(mbert_bpe_trained_path)
mbert_bpe_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# Prepare dataset and DataLoader
mbert_bpe_trained_generated_texts = mbert_bpe_trained_perplexity["generated"].tolist()
mbert_bpe_trained_text_dataset = TextDataset(mbert_bpe_trained_generated_texts)
mbert_bpe_trained_dataloader = DataLoader(
    mbert_bpe_trained_text_dataset, 
    batch_size=mbert_bpe_trained_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
mbert_bpe_trained_perplexity["perplexity"] = compute_multilingual_masked_perplexity_batch(
    mbert_bpe_trained_dataloader,
    mbert_bpe_trained_model,
    mbert_bpe_trained_tokenizer,
    device
)

In [None]:
# display perplexity
print(f"Perplexity Score: {mbert_bpe_trained_perplexity['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(mbert_bpe_trained_perplexity, f"mBERT_bpe_trained_perplexity")

##### Unigram

In [None]:
# load dataset
mbert_unigram_trained_perplexity = load_model_variants_df(f"mBERT_unigram_trained_predictions")

In [None]:
# batch size
mbert_unigram_trained_perplexity_batch_size = 8

In [None]:
# Load tokenizers & models for mBERT with Unigram
mbert_unigram_trained_path = "model-variants/models/mBERT_SPT-UNIGRAM"
mbert_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(mbert_unigram_trained_path)
mbert_unigram_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# Prepare dataset and DataLoader
mbert_unigram_trained_generated_texts = mbert_unigram_trained_perplexity["generated"].tolist()
mbert_unigram_trained_text_dataset = TextDataset(mbert_unigram_trained_generated_texts)
mbert_unigram_trained_dataloader = DataLoader(
    mbert_unigram_trained_text_dataset, 
    batch_size=mbert_unigram_trained_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
mbert_unigram_trained_perplexity["perplexity"] = compute_multilingual_masked_perplexity_batch(
    mbert_unigram_trained_dataloader,
    mbert_unigram_trained_model,
    mbert_unigram_trained_tokenizer,
    device
)

In [None]:
# display perplexity
print(f"Perplexity Score: {mbert_bpe_trained_perplexity['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(mbert_bpe_trained_perplexity, f"mBERT_bpe_trained_perplexity")

#### XLM-R

##### BPE

In [None]:
# load dataset
xlmr_bpe_trained_perplexity = load_model_variants_df(f"XLM-R_bpe_trained_predictions")

In [None]:
# batch size
xlmr_bpe_trained_perplexity_batch_size = 8

In [None]:
# Load tokenizers & models for XLMR-R with BPE
xlmr_bpe_trained_path = "model-variants/models/XLM-R_SPT-BPE"
xlmr_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(xlmr_bpe_trained_path)
xlmr_bpe_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# Prepare dataset and DataLoader
xlmr_bpe_trained_generated_texts = xlmr_bpe_trained_perplexity["generated"].tolist()
xlmr_bpe_trained_text_dataset = TextDataset(xlmr_bpe_trained_generated_texts)
xlmr_bpe_trained_dataloader = DataLoader(
    xlmr_bpe_trained_text_dataset, 
    batch_size=xlmr_bpe_trained_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
xlmr_bpe_trained_perplexity["perplexity"] = compute_multilingual_masked_perplexity_batch(
    xlmr_bpe_trained_dataloader,
    xlmr_bpe_trained_model,
    xlmr_bpe_trained_tokenizer,
    device
)

In [None]:
# display perplexity
print(f"Perplexity Score: {xlmr_bpe_trained_perplexity['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(xlmr_bpe_trained_perplexity, f"XLM-R_bpe_trained_perplexity")

##### Unigram

In [None]:
# load dataset
xlmr_unigram_trained_perplexity = load_model_variants_df(f"XLM-R_unigram_trained_predictions")

In [None]:
# batch size
xlmr_unigram_trained_perplexity_batch_size = 8

In [None]:
# Load tokenizers & models for mBERT with BPE
xlmr_unigram_trained_path = "model-variants/models/XLM-R_SPT-UNIGRAM"
xlmr_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(xlmr_unigram_trained_path)
xlmr_unigram_trained_model = AutoModelForMaskedLM.from_pretrained().to(device)

In [None]:
# Prepare dataset and DataLoader
xlmr_unigram_trained_generated_texts = xlmr_unigram_trained_perplexity["generated"].tolist()
xlmr_unigram_trained_text_dataset = TextDataset(xlmr_unigram_trained_generated_texts)
xlmr_unigram_trained_dataloader = DataLoader(
    xlmr_unigram_trained_text_dataset, 
    batch_size=xlmr_unigram_trained_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
xlmr_unigram_trained_perplexity["perplexity"] = compute_multilingual_masked_perplexity_batch(
    xlmr_unigram_trained_dataloader,
    xlmr_unigram_trained_model,
    xlmr_unigram_trained_tokenizer,
    device
)

In [None]:
# display perplexity
print(f"Perplexity Score: {xlmr_unigram_trained_perplexity['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(xlmr_unigram_trained_perplexity, f"XLM-R_unigram_trained_perplexity")

#### mt5

##### BPE

In [None]:
# load dataset
mt5_bpe_trained_perplexity = load_model_variants_df(f"mT5_bpe_trained_predictions")

In [None]:
# batch size
mt5_bpe_trained_perplexity_batch_size = 8

In [None]:
# Load tokenizers & models for mt5 with BPE
mt5_bpe_trained_path = "model-variants/models/mT5_SPT-BPE"
mt5_bpe_trained_tokenizer = AutoTokenizer.from_pretrained(mt5_bpe_trained_path)
mt5_bpe_trained_model = AutoModelForSeq2SeqLM.from_pretrained().to(device)

In [None]:
# Prepare dataset and DataLoader
mt5_bpe_trained_generated_texts = mt5_bpe_trained_perplexity["generated"].tolist()
mt5_bpe_trained_text_dataset = TextDataset(mt5_bpe_trained_generated_texts)
mt5_bpe_trained_dataloader = DataLoader(
    mt5_bpe_trained_text_dataset, 
    batch_size=mt5_bpe_trained_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
mt5_bpe_trained_perplexity["perplexity"] = compute_multilingual_mt5_perplexity_batch(
    mt5_bpe_trained_dataloader,
    mt5_bpe_trained_model,
    mt5_bpe_trained_tokenizer,
    device
)

In [None]:
# display perplexity
print(f"Perplexity Score: {mt5_bpe_trained_perplexity['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(mt5_bpe_trained_perplexity, f"mT5_bpe_trained_perplexity")

##### Unigram

In [None]:
# load dataset
mt5_unigram_trained_perplexity = load_model_variants_df(f"mT5_unigram_trained_predictions")

In [None]:
# batch size
mt5_unigram_trained_perplexity_batch_size = 8

In [None]:
# Load tokenizers & models for mBERT with BPE
mt5_unigram_trained_path = "model-variants/models/mBERT_SPT-UNIGRAM"
mt5_unigram_trained_tokenizer = AutoTokenizer.from_pretrained(mt5_unigram_trained_path)
mt5_unigram_trained_model = AutoModelForSeq2SeqLM.from_pretrained().to(device)

In [None]:
# Prepare dataset and DataLoader
mt5_unigram_trained_generated_texts = mt5_unigram_trained_perplexity["generated"].tolist()
mt5_unigram_trained_text_dataset = TextDataset(mt5_unigram_trained_generated_texts)
mt5_unigram_trained_dataloader = DataLoader(
    mt5_unigram_trained_text_dataset, 
    batch_size=mt5_unigram_trained_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
mt5_unigram_trained_perplexity["perplexity"] = compute_multilingual_mt5_perplexity_batch(
    mt5_unigram_trained_dataloader,
    mt5_unigram_trained_model,
    mt5_unigram_trained_tokenizer,
    device
)

In [None]:
# display perplexity
print(f"Perplexity Score: {mt5_unigram_trained_perplexity['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(mt5_unigram_trained_perplexity, f"mT5_unigram_trained_perplexity")

### Save Evaluation Results

In [None]:
# combine evaluation results
for model_name in train_model_names.keys():
    for spt_name in spt_models.keys():
        print(f"Processing {model_name}...")

        trained_evaluation_results = load_model_variants_df(f"{model_name}_{spt_name}_trained_predictions")

        # load metrics and set
        metrics = load_tmp_df(f"{model_name}_{spt_name}_metrics")
        trained_evaluation_results["bleu"] = metrics["bleu"]
        trained_evaluation_results["rouge-1"] = metrics["rouge-1"]
        trained_evaluation_results["rouge-2"] = metrics["rouge-2"]
        trained_evaluation_results["rouge-l"] = metrics["rouge-l"]
        trained_evaluation_results["chrf-s"] = metrics["chrf-s"]
        trained_evaluation_results["bert_score"] = metrics["bert_score"]

        # load perplexity and set
        perplexity = load_tmp_df(f"{model_name}_{spt_name}_perplexity")
        trained_evaluation_results["perplexity"] = perplexity["perplexity"]

        save_model_variants_df(trained_evaluation_results, f"{model_name}_{spt_name}_evaluation_results")

## Benchmarking and Analysis
Compare the performance of LSTM BPE, LSTM Unigram, mBERT, and XLM-R using BLEU, ROUGE, chrF-S, BERT Score and Perplexity.

In [None]:
# load data
trained_benchmarking_datasets = {}
for model_name in train_model_names.keys():
    for spt_name in spt_models.keys():
        df = load_model_variants_df(trained_evaluation_results, f"{model_name}_{spt_name}_evaluation_results")
        trained_benchmarking_datasets[f"{model_name} {spt_name.upper()}"] = df

In [None]:
# convert to mean score df
trained_benchmarking_mean_scores = convert_to_mean_scores_df(trained_benchmarking_datasets)

In [None]:
# Display mean scores
display(trained_benchmarking_mean_scores)

In [None]:
# save benchmarking results
save_model_variants_df(trained_benchmarking_mean_scores, "trained_evaluation_results")

# 2. Optimize Model Efficiency with Lightweight Transformers
- Optimizes mBERT, XLM-R, mT5-Small (BPE & Unigram).
- Trains TinyBERT, DistilBERT with Knowledge Distillation.
- Evaluates BLEU, ROUGE, chrF++ after optimization.

## Train

In [None]:
# Define Fine-Tuned Teacher Model Paths
teacher_models = {
    "mBERT_BPE": "model-variants/models/mBERT_SPT-BPE",
    "mBERT_Unigram": "model-variants/models/mBERT_SPT-UNIGRAM",
    "mT5_BPE": "model-variants/models/mT5_SPT-BPE",
    "mT5_Unigram": "model-variants/models/mT5_SPT-UNIGRAM",
    "XLM-R_BPE": "model-variants/models/XLM-R_SPT_BPE",
    "XLM-R_Unigram": "model-variants/models/XLM-R_SPT-UNIGRAM",
}

In [None]:
# Define Student Models (TinyBERT & DistilBERT)
student_models = {
    "TinyBERT": "huawei-noah/TinyBERT_General_6L_768D",
    "DistilBERT": "distilbert-base-uncased"
}

In [None]:
# Train Student Models with Knowledge Distillation
class DistillationTrainer(Trainer):
    def __init__(self, teacher_model, alpha=0.5, temperature=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.teacher_model.eval()
        self.alpha = alpha
        self.temperature = temperature

    def compute_loss(self, model, inputs, return_outputs=False):
        student_outputs = model(**inputs)
        student_logits = student_outputs.logits
        with torch.no_grad():
            teacher_outputs = self.teacher_model(**inputs)
            teacher_logits = teacher_outputs.logits.detach()

        loss_ce = nn.CrossEntropyLoss()(student_logits.view(-1, student_logits.size(-1)), inputs["labels"].view(-1))
        loss_kl = nn.KLDivLoss(reduction="batchmean")(
            torch.nn.functional.log_softmax(student_logits / self.temperature, dim=-1),
            torch.nn.functional.softmax(teacher_logits / self.temperature, dim=-1),
        ) * (self.temperature ** 2)

        loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kl
        return (loss, student_outputs) if return_outputs else loss

In [None]:
# train fine tuned model with knowledge distillation
def train_distilled_model(student_model_name, teacher_model_name, teacher_path):
    print(f"🚀 Training {student_model_name} using {teacher_model_name} as a teacher...")
    tokenizer = AutoTokenizer.from_pretrained(teacher_path)

    # ✅ Select Correct Model Type
    if "t5" in teacher_model_name.lower():
        teacher_model = AutoModelForSeq2SeqLM.from_pretrained(teacher_path).to(device)
        student_model = AutoModelForSeq2SeqLM.from_pretrained(STUDENT_MODELS[student_model_name]).to(device)
    else:
        teacher_model = AutoModelForMaskedLM.from_pretrained(teacher_path).to(device)
        student_model = AutoModelForMaskedLM.from_pretrained(STUDENT_MODELS[student_model_name]).to(device)

    dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")
    dataset = dataset.map(lambda x: tokenizer(x["article"], padding="max_length", truncation=True), batched=True)

    training_args = TrainingArguments(
        output_dir=f"./models/{student_model_name}_distilled",
        per_device_train_batch_size=8,
        num_train_epochs=3,
        learning_rate=5e-5,
        save_strategy="epoch",
        fp16=True
    )

    trainer = DistillationTrainer(teacher_model=teacher_model, model=student_model, args=training_args, tokenizer=tokenizer)
    trainer.train()
    student_model.save_pretrained(f"./models/{student_model_name}_distilled")