# S3

In [12]:
import boto3
import zipfile
import os

In [13]:
# Where the archive lives in S3 and where it is written locally
bucket_name = "ms-thesis-sagemaker"  # Replace with your S3 bucket
s3_file_path = "mbert_bpe_hf_dataset.zip"  # Replace with the file name in S3
local_zip_path = "/home/ec2-user/SageMaker/ms-thesis/model-variants/mbert_bpe_hf_dataset.zip"  # Where to save in SageMaker

# Create the client and fetch the archive
s3 = boto3.client("s3")
s3.download_file(Bucket=bucket_name, Key=s3_file_path, Filename=local_zip_path)
print("ZIP file downloaded from S3 successfully!")

ZIP file downloaded from S3 successfully!


In [14]:
# Directory that receives the archive contents
extract_path = "/home/ec2-user/SageMaker/ms-thesis/model-variants/"

# Unpack every member of the downloaded archive
with zipfile.ZipFile(local_zip_path, mode="r") as zip_ref:
    zip_ref.extractall(path=extract_path)

print("ZIP file extracted successfully!")

ZIP file extracted successfully!


In [15]:
# The archive is no longer needed once extracted; os.unlink is the same call as os.remove
os.unlink(local_zip_path)
print("ZIP file deleted to free space.")

ZIP file deleted to free space.


# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece evaluate sacrebleu bert-score peft

In [None]:
!conda update -n base -c conda-forge conda -y

In [None]:
!conda install conda-forge::rouge-score -y

In [1]:
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
import torch
import torch.nn as nn
from copy import deepcopy
from utils.dataframe import (
    load_gen_df, save_tmp_df, load_tmp_df,
    save_model_variants_df, load_model_variants_df,
    save_model_variants_arrow, load_model_variants_arrow
)
from utils.gpu import get_device
from utils.custom_class import MaskedTextDataset, EvaluationDataset, TextDataset
from utils.common import (
    generate_masked_predictions_batch,
    generate_mt5_predictions_batch,
    compute_metrics_batch,
    compute_multilingual_masked_perplexity_batch,
    compute_multilingual_mt5_perplexity_batch,
    convert_to_mean_scores_df
)
from IPython.display import display
from tqdm.notebook import tqdm
from datasets import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils import prune
from transformers import (
    logging,
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM,
    Trainer, TrainingArguments, LongformerConfig, LongformerModel,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model
# from optimum.intel.openvino import OVModelForMaskedLM, OVModelForSeq2SeqLM

# Set settings

In [2]:
tqdm.pandas()

In [3]:
# Suppress specific warnings from the transformers library
logging.set_verbosity_error()

# Common

In [4]:
# gpu device 
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


In [None]:
# SentencePiece processors, keyed by segmentation algorithm
spt_models = dict(
    bpe=spm.SentencePieceProcessor("spt/spt_bpe.model"),
    unigram=spm.SentencePieceProcessor("spt/spt_unigram.model"),
)

In [None]:
# Short label -> Hugging Face Hub checkpoint of each base model
train_model_names = dict([
    ("mBERT", "bert-base-multilingual-cased"),
    ("mT5", "google/mt5-small"),
    ("XLM-R", "xlm-roberta-base"),
])

In [None]:
# One tokenizer per base checkpoint, keyed like train_model_names
train_tokenizers = {}
train_tokenizers["mBERT"] = AutoTokenizer.from_pretrained(train_model_names["mBERT"])
# mT5 is the only entry loaded with the slow tokenizer in legacy mode
train_tokenizers["mT5"] = AutoTokenizer.from_pretrained(
    train_model_names["mT5"],
    use_fast=False,
    legacy=True,
)
train_tokenizers["XLM-R"] = AutoTokenizer.from_pretrained(train_model_names["XLM-R"])

In [None]:
# Keyword arguments shared by every TrainingArguments instance in this notebook.
# (The name keeps its original spelling because later cells unpack `**train_agrs`.)
train_agrs = dict(
    # optimisation
    warmup_steps=500,
    weight_decay=0.01,
    optim="adamw_torch_fused",
    auto_find_batch_size=True,
    # precision
    fp16=False,
    bf16=True,
    # checkpointing / evaluation: one of each per epoch, best checkpoint = lowest loss
    save_strategy="epoch",
    save_total_limit=2,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    # logging
    logging_dir="./logs",
    logging_steps=1000,
    disable_tqdm=False,
    # dataset column that holds the labels
    label_names=["labels"],
)

# Function

In [None]:
def apply_lora(model, model_name, is_student):
    """
    Wrap ``model`` with LoRA adapters and move it to the global ``device``.

    The attention projections that receive adapters are chosen from the
    architecture of ``model`` itself (``model.config.model_type``). During
    distillation the caller passes the *teacher's* name as ``model_name``, which
    says nothing about the student's layer names, so the name is only used as a
    fallback when the model exposes no ``model_type``.

    Args:
        model: Transformers model to adapt.
        model_name: Label used in the log line and as the architecture fallback.
        is_student: True selects the smaller student configuration
            (r=4, alpha=8, dropout=0.05); False selects r=8, alpha=16, dropout=0.1.

    Returns:
        The PEFT-wrapped model.
    """
    # Architecture hint: the model's own config first, the label second.
    model_type = getattr(getattr(model, "config", None), "model_type", None)
    architecture = str(model_type or model_name).lower()

    # Select correct LoRA target layers
    if "t5" in architecture:
        target_modules = ["q", "v"]  # T5 / mT5 attention projections
    elif "distilbert" in architecture:
        target_modules = ["q_lin", "v_lin"]  # DistilBERT names its projections *_lin
    else:
        target_modules = ["query", "value"]  # BERT / RoBERTa style attention

    # Student adapters are half the rank of the teacher-side adapters.
    if is_student:
        rank, alpha, dropout = 4, 8, 0.05
    else:
        rank, alpha, dropout = 8, 16, 0.1

    lora_config = LoraConfig(
        r=rank,                 # Rank of LoRA matrices
        lora_alpha=alpha,       # Scaling factor
        target_modules=target_modules,
        lora_dropout=dropout,   # Prevents overfitting
        bias="none"
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)

    # Move model to GPU
    model.to(device)

    print(f"LoRA applied to {model_name} (Target Modules: {target_modules})")

    return model

# Data Preprocessing
Datasets used for training:
- myXNLI & ALT Corpus (normalized)
- Back-translated datasets (NLLB, Seamless M4T)
- Pseudo-parallel datasets (MiniLM, LaBSE)

## Data Preparation

In [14]:
# Load and process dataset
def load_and_rename_columns_multilingual(file_name):
    """
    Load a generated dataset and reduce it to ``source`` / ``target`` columns.

    English columns ("english", "english_back_translated") are renamed to
    "source" and Burmese columns ("burmese", "burmese_translated") to "target";
    every other column is dropped.

    Args:
        file_name: Dataset name handed to ``load_gen_df``.

    Returns:
        DataFrame containing only the "source" and "target" columns.
    """
    renames = dict.fromkeys(("english", "english_back_translated"), "source")
    renames.update(dict.fromkeys(("burmese", "burmese_translated"), "target"))

    frame = load_gen_df(f"{file_name}")
    return frame.rename(columns=renames)[["source", "target"]]

In [15]:
# File names of every training corpus, grouped by how the pairs were produced
datasets = dict(
    normal=[
        "myxnli_normalized_1",
        "myxnli_normalized_2",
        "alt_combined_normalized",
    ],
    nllb_back_translated=[
        "myxnli_nllb_back_translated_final_1",
        "myxnli_nllb_back_translated_final_2",
        "alt_combined_nllb_back_translated_final",
    ],
    seamless_m4t_back_translated=[
        "myxnli_seamless_m4t_back_translated_final_1",
        "myxnli_seamless_m4t_back_translated_final_2",
        "alt_combined_seamless_m4t_back_translated_final",
    ],
)

In [16]:
# Load every file of every group as a (source, target) DataFrame
loaded_datasets = {
    group: list(map(load_and_rename_columns_multilingual, file_names))
    for group, file_names in datasets.items()
}

In [17]:
# Stack all frames (original, NLLB and Seamless M4T back-translations) into one table
combined = pd.concat(
    [
        frame
        for group in ("normal", "nllb_back_translated", "seamless_m4t_back_translated")
        for frame in loaded_datasets[group]
    ],
    ignore_index=True,
)

In [18]:
# Shuffle the data to prevent order bias.
# A fixed seed keeps the row order (and therefore the saved "combined" table)
# identical across kernel restarts.
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
# display combined dataset
display(combined.head())

Unnamed: 0,source,target
0,archaeologists think that a fire broke out in ...,ရှေးဟောင်းသုတေသီတွေက Knossos မှာ မီးလောင်တာ BC...
1,there are political meetings in every neighbor...,ရပ်ကွက်တိုင်းမှာ နိုင်ငံရေး အစည်းအဝေးတွေရှိတယ်။
2,the lawyer said that in article 712 (1) gao wa...,ရှေ့နေက ပုဒ်မ ၇၁၂ (၁) မှာ Gao ကို ငွေကြေးဆိုင်...
3,things can get confusing when talking about do...,Dordogne အကြောင်းပြောသောအခါ၊ ဝေးကွာသောနေရာများ...
4,making financial management a top priority acr...,ဘဏ္ဍာရေး စီမံခန့်ခွဲမှုကို ပြည်ထောင်စု အစိုးရတ...


In [23]:
# print length
print(f"Combined dataset length: {len(combined)}")

Combined dataset length: 1627576


In [20]:
# save data
save_model_variants_df(combined, "combined")

## Tokenize

In [8]:
def tokenize(examples, tokenizer, spt_tokenizer, model_name, max_length=512):
    """
    Pre-segment Burmese targets with SentencePiece, then tokenize a batch for training.

    Args:
        examples: Batch mapping with "source" and "target" lists of strings.
            The mapping is not modified.
        tokenizer: Hugging Face tokenizer of the model being trained.
        spt_tokenizer: SentencePiece processor; its pieces are joined with single
            spaces before the text is handed to ``tokenizer``.
        model_name: Model label; a name containing "t5" selects the seq2seq layout
            (source as input, target as labels), anything else encodes source and
            target together as a sentence pair.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        The tokenizer output with an added "labels" entry. Padding positions in
        the labels are set to -100 so the loss skips them.
    """
    ignore_index = -100  # label value ignored by CrossEntropyLoss
    pad_id = getattr(tokenizer, "pad_token_id", None)

    def mask_padding(batch_ids):
        # Builds fresh lists, so labels never alias the input_ids they came from.
        return [
            [ignore_index if token_id == pad_id else token_id for token_id in ids]
            for ids in batch_ids
        ]

    # Apply SentencePiece segmentation to the Burmese target text.
    # Kept in a local list: the caller's batch is left untouched.
    segmented_targets = [
        " ".join(spt_tokenizer.encode_as_pieces(text)) for text in examples["target"]
    ]

    if "t5" in model_name.lower():
        # mT5/T5 (text-to-text): tokenize source and target separately
        model_inputs = tokenizer(
            examples["source"],
            padding="max_length",
            truncation=True,
            max_length=max_length
        )

        # Tokenize target
        target_ids = tokenizer(
            segmented_targets,
            padding="max_length",
            truncation=True,
            max_length=max_length
        )["input_ids"]

        # Only labels are stored. T5-style models build decoder_input_ids by
        # shifting the labels right; supplying the labels themselves as decoder
        # input would show each position the token it is supposed to predict.
        model_inputs["labels"] = mask_padding(target_ids)
        return model_inputs

    # BERT-style models: encode source and target together as a sentence pair
    inputs = tokenizer(
        examples["source"],
        segmented_targets,
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

    # NOTE(review): labels mirror the unmasked input_ids, so a masked-LM head can
    # reach near-zero loss by copying its input — confirm whether token masking
    # is meant to happen elsewhere (e.g. in a data collator).
    inputs["labels"] = mask_padding(inputs["input_ids"])

    return inputs

In [None]:
# tokenize for each model and spt
# The source table is the same for every (model, SPT) pair, so it is loaded and
# converted to a Hugging Face Dataset once instead of once per combination.
combined_dataset = Dataset.from_pandas(load_model_variants_df("combined"))

for model_name, tokenizer in train_tokenizers.items():
    for spt_name, spt_tokenizer in spt_models.items():
        # map() returns a new dataset; combined_dataset itself stays unchanged
        dataset = combined_dataset.map(
            lambda batch: tokenize(batch, tokenizer, spt_tokenizer, model_name),
            batched=True,
            desc=f"Tokenizing dataset for {model_name} with {spt_name}",
            num_proc=10
        )

        # save
        save_model_variants_arrow(dataset, f"{model_name.lower()}_{spt_name}")

# 1. Fine-Tuning Transformer Models for Burmese
This notebook fine-tunes three transformer models:
- mBERT (best perplexity, but weak BLEU/ROUGE)
- mT5 (best for generation, but requires more data)
- XLM-R (good BLEU/ROUGE, but poor perplexity)

Apply:
- SentencePiece tokenization for Burmese segmentation
- LoRA for efficient fine-tuning
- Prefix-Tuning for lightweight adaptations
- Mixed Precision Training for speed improvements

In [None]:
# function to get fine tuned model
def get_fine_tuned_model(model_name, spt_name):
    """
    Load a fine-tuned model and its tokenizer from ``model-variants/models``.

    Args:
        model_name: Model label ("mBERT", "mT5", "XLM-R"); a name containing "t5"
            is loaded as a seq2seq model, anything else as a masked LM.
        spt_name: SentencePiece variant ("bpe" / "unigram"); upper-cased in the path.

    Returns:
        Tuple ``(model, tokenizer)`` with the model moved to the global ``device``.
    """
    model_path = f"model-variants/models/{model_name}_{spt_name.upper()}"

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    if "t5" in model_name.lower():
        model_class = AutoModelForSeq2SeqLM
    else:
        model_class = AutoModelForMaskedLM

    # from_pretrained needs the checkpoint location; calling it without arguments
    # raises a TypeError.
    # NOTE(review): fine_tune_model saves a PEFT-wrapped model at this path —
    # confirm that the installed transformers/peft versions load such adapter
    # checkpoints through the Auto classes.
    model = model_class.from_pretrained(model_path).to(device)

    return model, tokenizer

## Fine-Tuning

In [None]:
# Base checkpoints, loaded once and moved to the selected device.
# The two masked-LM checkpoints are loaded with num_labels=1; mT5 uses defaults.
train_models = dict([
    (
        "mBERT",
        AutoModelForMaskedLM.from_pretrained(train_model_names["mBERT"], num_labels=1).to(device),
    ),
    (
        "mT5",
        AutoModelForSeq2SeqLM.from_pretrained(train_model_names["mT5"]).to(device),
    ),
    (
        "XLM-R",
        AutoModelForMaskedLM.from_pretrained(train_model_names["XLM-R"], num_labels=1).to(device),
    ),
])

In [None]:
# Tokenized datasets, addressed as tokenized_datasets[model label][spt variant]
tokenized_datasets = {}
for model_name in train_tokenizers:
    per_spt = {}
    for spt_name in spt_models:
        per_spt[spt_name] = load_model_variants_arrow(f"{model_name.lower()}_{spt_name}")
    tokenized_datasets[model_name] = per_spt

In [None]:
def fine_tune_model(model_name, spt_name, batch_size):
    """
    Fine-tunes the model with LoRA on the specified SentencePiece tokenization (SPT).

    The tokenized dataset is split 80/20 (seed 42) into train and validation
    sets, training runs for up to 5 epochs with early stopping (patience 3), and
    the resulting model and tokenizer are saved under
    ``model-variants/models/<model_name>_<SPT>``.

    Args:
        model_name: Key into ``train_tokenizers`` / ``train_models`` /
            ``tokenized_datasets`` ("mBERT", "mT5", "XLM-R").
        spt_name: SentencePiece variant ("bpe" / "unigram").
        batch_size: Per-device batch size for both training and evaluation.
    """
    print(f"Fine-tuning {model_name} using SPT-{spt_name.upper()}...")

    # Load tokenizer & model
    tokenizer = train_tokenizers[model_name]

    # Work on a private copy of the base model. The same `train_models` entry is
    # requested for both the BPE and the Unigram run, and LoRA wrapping alters the
    # model it is applied to, so reusing the shared instance would start the
    # second run from the first run's adapted weights. The copy costs one extra
    # set of base weights in memory.
    model = deepcopy(train_models[model_name])

    # Move model to GPU before applying LoRA
    model.to(device)

    # Apply LoRA for efficient parameter tuning
    model = apply_lora(model, model_name, False)

    # Split the pre-tokenized dataset into training and validation sets
    tokenized_dataset = tokenized_datasets[model_name][spt_name]
    split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

    train_data = split_dataset["train"]
    val_data = split_dataset["test"]

    # for debug, remove comment
    #train_data = train_data.select(range(100))
    #val_data = val_data.select(range(100))

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"model-variants/results/{model_name}_{spt_name.upper()}",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=5,
        learning_rate=3e-5,
        **train_agrs
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        processing_class=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train the model
    trainer.train()

    # Save trained model and tokenizer
    save_path = f"model-variants/models/{model_name}_{spt_name.upper()}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Model `{model_name}` fine-tuned and saved at `{save_path}`.")

### mBERT

In [None]:
# fine tune with SPT-BPE
fine_tune_model("mBERT", "bpe", 16)

Fine-tuning mBERT using SPT-BPE...
LoRA applied to mBERT (Target Modules: ['query', 'value'])


Epoch,Training Loss,Validation Loss
1,0.0002,3.3e-05


In [None]:
# fine tune with SPT-Unigram
fine_tune_model("mBERT", "unigram", 16)

### mT5

In [None]:
# fine tune with SPT-BPE
fine_tune_model("mT5", "bpe", 8)

Fine-tuning mT5 using SPT-BPE...
LoRA applied to mT5 (Target Modules: ['q', 'v'])


Epoch,Training Loss,Validation Loss


In [None]:
# fine tune with SPT-Unigram
fine_tune_model("mT5", "unigram", 8)

### XLM-R

In [None]:
# fine tune with SPT-BPE
fine_tune_model("XLM-R", "bpe", 8)

In [None]:
# fine tune with SPT-Unigram
fine_tune_model("XLM-R", "unigram", 8)

## Generate Predictions

In [None]:
# function to generate predictions of fine tuned model
def generate_predictions_fine_tuned_model(model_name, spt_name, batch_size):
    """
    Generate text with a fine-tuned model for every row of the "combined" dataset.

    The "target" column is fed to the model in batches, the outputs are stored
    in a new "generated" column, and the frame is saved as
    ``<model_name>_<spt_name>_trained_predictions``.

    Args:
        model_name: Model label; a name containing "t5" uses the mT5 generator,
            anything else the masked-LM generator.
        spt_name: SentencePiece variant of the fine-tuned checkpoint.
        batch_size: DataLoader batch size.
    """
    model, tokenizer = get_fine_tuned_model(model_name, spt_name)

    frame = load_model_variants_df("combined")

    # Batches are served in row order so outputs line up with the frame
    loader = DataLoader(
        MaskedTextDataset(frame["target"].tolist(), tokenizer),
        batch_size=batch_size,
        shuffle=False
    )

    # Pick the generation routine that matches the model family
    if "t5" in model_name.lower():
        generate = generate_mt5_predictions_batch
    else:
        generate = generate_masked_predictions_batch
    frame["generated"] = generate(loader, model, tokenizer, device)

    display(frame.head())

    # Persist the predictions of this fine-tuned model
    save_model_variants_df(frame, f"{model_name}_{spt_name}_trained_predictions")

### mBERT

In [None]:
# generate predictions for mbert with BPE
generate_predictions_fine_tuned_model("mBERT", "bpe", 8)

In [None]:
# generate predictions for mbert with Unigram
generate_predictions_fine_tuned_model("mBERT", "unigram", 8)

### XLM-R

In [None]:
# generate predictions for XLM-R with BPE
generate_predictions_fine_tuned_model("XLM-R", "bpe", 8)

In [None]:
# generate predictions for XLM-R with Unigram
generate_predictions_fine_tuned_model("XLM-R", "unigram", 8)

### mT5

In [None]:
# generate predictions for mT5 with BPE
generate_predictions_fine_tuned_model("mT5", "bpe", 8)

In [None]:
# generate predictions for mT5 with Unigram
generate_predictions_fine_tuned_model("mT5", "unigram", 8)

## Evaluate Model Performance
Compute BLEU, ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-L, chrF-S, BERTScore and Perplexity scores.

### Metrics

In [None]:
# function to compute metric for fine tuned model
def compute_metric_fine_tuned_model(model_name, spt_name, batch_size):
    """
    Score the saved predictions of one fine-tuned model and store the per-row metrics.

    Loads ``<model_name>_<spt_name>_trained_predictions``, runs
    ``compute_metrics_batch`` on it, prints the mean of each metric and saves the
    frame as ``<model_name>_<spt_name>_trained_metrics``.

    Args:
        model_name: Model label used in the file names and log lines.
        spt_name: SentencePiece variant used in the file names and log lines.
        batch_size: Batch size forwarded to ``compute_metrics_batch``.
    """
    # load dataset
    metrics = load_model_variants_df(f"{model_name}_{spt_name}_trained_predictions")

    # compute metrics
    # compute_metrics_batch is expected to add the score columns to `metrics` in
    # place (they are read right below) — TODO confirm against utils.common.
    print(f"Processing Data for {model_name} with {spt_name.upper()}...")
    compute_metrics_batch(metrics, batch_size)

    # display — the header names the model actually being scored
    print(f"Metrics scores for {model_name} with {spt_name.upper()}:")
    for label, column in (
        ("BLEU", "bleu"),
        ("ROUGE-1", "rouge-1"),
        ("ROUGE-2", "rouge-2"),
        ("ROUGE-L", "rouge-l"),
        ("chrF-S", "chrf-s"),
        ("BERT", "bert_score"),
    ):
        print(f"{label} Score: {metrics[column].mean()}")

    # save results
    save_tmp_df(metrics, f"{model_name}_{spt_name}_trained_metrics")

#### mBERT

In [None]:
# compute metric for fine tuned mBERT with BPE
compute_metric_fine_tuned_model("mBERT", "bpe", 64)

In [None]:
# compute metric for fine tuned mBERT with Unigram
compute_metric_fine_tuned_model("mBERT", "unigram", 64)

#### XLM-R

In [None]:
# compute metric for fine tuned XLM-R with BPE
compute_metric_fine_tuned_model("XLM-R", "bpe", 64)

In [None]:
# compute metric for fine tuned XLM-R with Unigram
compute_metric_fine_tuned_model("XLM-R", "unigram", 64)

#### mT5

In [None]:
# compute metric for fine tuned mT5 with BPE
compute_metric_fine_tuned_model("mT5", "bpe", 64)

In [None]:
# compute metric for fine tuned mT5 with Unigram
compute_metric_fine_tuned_model("mT5", "unigram", 64)

### Perplexity

In [None]:
# function to compute perplexity for fine tuned model
def compute_perplexity_fine_tuned_model(model_name, spt_name, batch_size):
    """
    Compute per-row perplexity of a fine-tuned model's generated text.

    Reads the "generated" column of ``<model_name>_<spt_name>_trained_predictions``,
    stores the scores in a new "perplexity" column, prints their mean and saves
    the frame as ``<model_name>_<spt_name>_trained_perplexity``.

    Args:
        model_name: Model label; a name containing "t5" uses the mT5 scorer,
            anything else the masked-LM scorer.
        spt_name: SentencePiece variant of the fine-tuned checkpoint.
        batch_size: DataLoader batch size.
    """
    frame = load_model_variants_df(f"{model_name}_{spt_name}_trained_predictions")

    model, tokenizer = get_fine_tuned_model(model_name, spt_name)

    # Batches are served in row order so scores line up with the frame
    loader = DataLoader(
        TextDataset(frame["generated"].tolist()),
        batch_size=batch_size,
        shuffle=False
    )

    # Pick the scorer that matches the model family
    if "t5" in model_name.lower():
        score_batches = compute_multilingual_mt5_perplexity_batch
    else:
        score_batches = compute_multilingual_masked_perplexity_batch
    frame["perplexity"] = score_batches(loader, model, tokenizer, device)

    print(f"Perplexity Score: {frame['perplexity'].mean()}")

    save_tmp_df(frame, f"{model_name}_{spt_name}_trained_perplexity")

#### mBERT

In [None]:
# compute perplexity with BPE
compute_perplexity_fine_tuned_model("mBERT", "bpe", 64)

In [None]:
# compute perplexity with Unigram
compute_perplexity_fine_tuned_model("mBERT", "unigram", 64)

#### XLM-R

In [None]:
# compute perplexity with BPE
compute_perplexity_fine_tuned_model("XLM-R", "bpe", 64)

In [None]:
# compute perplexity with Unigram
compute_perplexity_fine_tuned_model("XLM-R", "unigram", 64)

#### mT5

In [None]:
# compute perplexity with BPE
compute_perplexity_fine_tuned_model("mT5", "bpe", 64)

In [None]:
# compute perplexity with Unigram
compute_perplexity_fine_tuned_model("mT5", "unigram", 64)

### Save Evaluation Results

In [None]:
# combine evaluation results
# For every fine-tuned model, merge the metric and perplexity columns into the
# prediction table and save the result as "<model>_<spt>_evaluation_results".
metric_columns = ["bleu", "rouge-1", "rouge-2", "rouge-l", "chrf-s", "bert_score"]

for model_name in train_model_names.keys():
    for spt_name in spt_models.keys():
        print(f"Processing {model_name}...")

        evaluation_results = load_model_variants_df(f"{model_name}_{spt_name}_trained_predictions")

        # load metrics and set
        # The file names must match what compute_metric_fine_tuned_model and
        # compute_perplexity_fine_tuned_model write: "..._trained_metrics" and
        # "..._trained_perplexity".
        metrics = load_tmp_df(f"{model_name}_{spt_name}_trained_metrics")
        for column in metric_columns:
            evaluation_results[column] = metrics[column]

        # load perplexity and set
        perplexity = load_tmp_df(f"{model_name}_{spt_name}_trained_perplexity")
        evaluation_results["perplexity"] = perplexity["perplexity"]

        save_model_variants_df(evaluation_results, f"{model_name}_{spt_name}_evaluation_results")

## Benchmarking and Analysis
Compare the performance of the fine-tuned mBERT, mT5, and XLM-R models (each with BPE and Unigram SentencePiece tokenization) using BLEU, ROUGE, chrF-S, BERT Score and Perplexity.

In [None]:
# Evaluation tables keyed by a display label such as "mBERT BPE"
trained_benchmarking_datasets = {
    f"{model_name} {spt_name.upper()}": load_model_variants_df(
        f"{model_name}_{spt_name}_evaluation_results"
    )
    for model_name in train_model_names
    for spt_name in spt_models
}

In [None]:
# convert to mean score df
trained_benchmarking_mean_scores = convert_to_mean_scores_df(trained_benchmarking_datasets)

In [None]:
# Display mean scores
display(trained_benchmarking_mean_scores)

In [None]:
# save benchmarking results
save_model_variants_df(trained_benchmarking_mean_scores, "trained_evaluation_results")

# 2. Optimize Model Efficiency with Lightweight Transformers
- Optimizes mBERT, XLM-R, mT5-Small (BPE & Unigram).
- Trains TinyBERT, DistilBERT with Knowledge Distillation.
- Evaluates BLEU, ROUGE, chrF-S and Perplexity after optimization.

In [None]:
# Student checkpoints used for knowledge distillation, keyed by display label
distill_model_names = dict(
    TinyBERT="huawei-noah/TinyBERT_General_6L_768D",
    DistilBERT="distilbert-base-uncased",
)

In [None]:
# function to get distilled model
def get_distilled_model(model_name, spt_name, distill_model_name):
    """
    Load a distilled student model and its tokenizer from ``model-variants/models``.

    Args:
        model_name: Teacher label; a name containing "t5" loads the checkpoint as
            a seq2seq model, anything else as a masked LM.
        spt_name: SentencePiece variant of the teacher; upper-cased in the path.
        distill_model_name: Student label as used when the checkpoint was saved
            (a key of ``distill_model_names``).

    Returns:
        Tuple ``(model, tokenizer)`` with the model moved to the global ``device``.
    """
    model_path = f"model-variants/models/{model_name}_{spt_name.upper()}_{distill_model_name}"

    tokenizer = AutoTokenizer.from_pretrained(model_path)

    if "t5" in model_name.lower():
        model_class = AutoModelForSeq2SeqLM
    else:
        model_class = AutoModelForMaskedLM

    # from_pretrained needs the checkpoint location; calling it without arguments
    # raises a TypeError.
    model = model_class.from_pretrained(model_path).to(device)

    return model, tokenizer

## Train

In [None]:
# Train Student Models with Knowledge Distillation
class DistillationTrainer(Trainer):
    """
    Trainer that mixes hard-label cross-entropy with a soft-target KL term.

    loss = alpha * CE(student, labels)
           + (1 - alpha) * T^2 * KL(softmax(teacher / T) || softmax(student / T))

    Args:
        teacher_model: Model that provides the soft targets; switched to eval
            mode here and only ever run under ``torch.no_grad()``.
        alpha: Weight of the cross-entropy term; the KL term gets ``1 - alpha``.
        temperature: Softmax temperature ``T`` applied to both sets of logits.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, teacher_model, alpha=0.5, temperature=2.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        self.teacher_model.eval()
        self.alpha = alpha
        self.temperature = temperature

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the combined distillation loss for one batch.

        ``num_items_in_batch`` is accepted because recent ``Trainer`` versions
        pass it as a keyword argument; it is not used here.

        Returns:
            ``loss``, or ``(loss, student_outputs)`` when ``return_outputs`` is True.

        Raises:
            ValueError: If the batch has no "labels", or if teacher and student
                logits differ in shape.
        """
        # Fail before any forward pass if the batch carries no labels.
        if "labels" not in inputs:
            raise ValueError("Missing 'labels' in input dictionary.")

        labels = inputs["labels"].view(-1)

        student_outputs = model(**inputs)
        student_logits = student_outputs.logits

        # Compute teacher logits without gradient computation
        with torch.no_grad():
            teacher_outputs = self.teacher_model(**inputs)
            teacher_logits = teacher_outputs.logits

        # The KL term compares the two distributions element-wise, so both models
        # must score the same positions over the same vocabulary.
        if student_logits.shape != teacher_logits.shape:
            raise ValueError(
                "Teacher and student logits have different shapes "
                f"({tuple(teacher_logits.shape)} vs {tuple(student_logits.shape)}); "
                "distillation requires both models to share one vocabulary."
            )

        # Flatten to (positions, vocab) so both terms are averaged per position.
        vocab_size = student_logits.size(-1)
        flat_student = student_logits.reshape(-1, vocab_size)
        flat_teacher = teacher_logits.reshape(-1, vocab_size)

        # Cross-entropy against the hard labels (positions labelled -100 are skipped)
        loss_ce = nn.CrossEntropyLoss(ignore_index=-100)(flat_student, labels)

        # KL divergence against the teacher's softened distribution.
        # "batchmean" divides by the size of the first dimension; on the flattened
        # tensors that is the number of positions, whereas on the raw 3-D logits it
        # would be the batch size only and the term would grow with sequence length.
        loss_kl = nn.KLDivLoss(reduction="batchmean")(
            torch.nn.functional.log_softmax(flat_student / self.temperature, dim=-1),
            torch.nn.functional.softmax(flat_teacher / self.temperature, dim=-1),
        ) * (self.temperature ** 2)

        # Final loss: Combination of CE loss and KL loss
        loss = self.alpha * loss_ce + (1 - self.alpha) * loss_kl

        return (loss, student_outputs) if return_outputs else loss

In [None]:
# train fine tuned model with knowledge distillation
def train_distilled_model(teacher_model_name, teacher_spt_name, student_model_name, batch_size):
    """
    Distil a fine-tuned teacher into a LoRA-adapted student and save the student.

    The teacher and its tokenizer are loaded with ``get_fine_tuned_model``; the
    student checkpoint comes from ``distill_model_names``. Training uses the
    teacher's tokenized dataset (80/20 split, seed 42), up to 3 epochs and early
    stopping with patience 2. The student and the tokenizer are saved under
    ``model-variants/models/<teacher>_<SPT>_<student>``.

    Args:
        teacher_model_name: Teacher label; a name containing "t5" loads the
            student as a seq2seq model, anything else as a masked LM.
        teacher_spt_name: SentencePiece variant of the teacher.
        student_model_name: Key into ``distill_model_names``.
        batch_size: Per-device batch size for training and evaluation.
    """
    print(f"Training {student_model_name} using {teacher_model_name}_{teacher_spt_name.upper()} as a teacher...")

    teacher_model, tokenizer = get_fine_tuned_model(teacher_model_name, teacher_spt_name)

    # The student is loaded with the same head type as the teacher
    if "t5" in teacher_model_name.lower():
        student_class = AutoModelForSeq2SeqLM
    else:
        student_class = AutoModelForMaskedLM
    student_model = student_class.from_pretrained(distill_model_names[student_model_name]).to(device)

    # Student-sized LoRA adapters
    student_model = apply_lora(student_model, teacher_model_name, True)

    # Train / validation split of the teacher's pre-tokenized data.
    # For a quick smoke test, shrink both splits with .select(range(100)).
    splits = tokenized_datasets[teacher_model_name][teacher_spt_name].train_test_split(
        test_size=0.2,
        seed=42
    )
    train_data, val_data = splits["train"], splits["test"]

    trainer = DistillationTrainer(
        teacher_model=teacher_model,
        model=student_model,
        args=TrainingArguments(
            output_dir=f"model-variants/results/{teacher_model_name}_{teacher_spt_name.upper()}_{student_model_name}",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=3,
            learning_rate=1e-5,
            **train_agrs
        ),
        train_dataset=train_data,
        eval_dataset=val_data,
        processing_class=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )
    trainer.train()

    # Persist the student together with the tokenizer it was trained with
    save_path = f"model-variants/models/{teacher_model_name}_{teacher_spt_name.upper()}_{student_model_name}"
    student_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    print(f"Training {student_model_name} using {teacher_model_name}_{teacher_spt_name.upper()} as a teacher is finished and saved at `{save_path}`.")

### mBERT

#### TinyBERT

In [None]:
# train Knowledge Distillation TinyBERT with BPE
train_distilled_model("mBERT", "bpe", "TinyBERT", 32)

In [None]:
# train Knowledge Distillation TinyBERT with Unigram
train_distilled_model("mBERT", "unigram", "TinyBERT", 32)

#### DistilBERT

In [None]:
# train Knowledge Distillation DistilBERT with BPE
train_distilled_model("mBERT", "bpe", "DistilBERT", 32)

In [None]:
# train Knowledge Distillation DistilBERT with Unigram
train_distilled_model("mBERT", "unigram", "DistilBERT", 32)

### XLM-R

#### TinyBERT

In [None]:
# train Knowledge Distillation TinyBERT with BPE
train_distilled_model("XLM-R", "bpe", "TinyBERT", 16)

In [None]:
# train Knowledge Distillation TinyBERT with Unigram
train_distilled_model("XLM-R", "unigram", "TinyBERT", 16)

#### DistilBERT

In [None]:
# train Knowledge Distillation DistilBERT with BPE
train_distilled_model("XLM-R", "bpe", "DistilBERT", 16)

In [None]:
# train Knowledge Distillation DistilBERT with Unigram
train_distilled_model("XLM-R", "unigram", "DistilBERT", 16)

### mT5

#### TinyBERT

In [None]:
# train Knowledge Distillation TinyBERT with BPE
train_distilled_model("mT5", "bpe", "TinyBERT", 16)

In [None]:
# train Knowledge Distillation TinyBERT with Unigram
train_distilled_model("mT5", "unigram", "TinyBERT", 16)

#### DistilBERT

In [None]:
# train Knowledge Distillation DistilBERT with BPE
train_distilled_model("mT5", "bpe", "DistilBERT", 16)

In [None]:
# train Knowledge Distillation DistilBERT with Unigram
train_distilled_model("mT5", "unigram", "DistilBERT", 16)

## Generate Predictions

In [None]:
# function to generate predictions of distilled model
def generate_predictions_distilled_model(model_name, spt_name, distilled_model_name, batch_size=8):
    """
    Generate text with a distilled student for every row of the "combined" dataset.

    The "target" column is fed to the model in batches, the outputs are stored
    in a new "generated" column, and the frame is saved as
    ``<model_name>_<spt_name>_<distilled_model_name>_predictions``.

    Args:
        model_name: Teacher label; a name containing "t5" uses the mT5 generator,
            anything else the masked-LM generator.
        spt_name: SentencePiece variant of the teacher.
        distilled_model_name: Student label as used when the checkpoint was saved.
        batch_size: DataLoader batch size. Defaults to 8 (the size used for the
            fine-tuned models) so that calls passing only the three names work.
    """
    # Load tokenizers & models
    model, tokenizer = get_distilled_model(model_name, spt_name, distilled_model_name)

    # load dataset
    predictions = load_model_variants_df("combined")

    # Convert to DataLoader (row order is kept so outputs line up with the frame)
    predictions_texts = predictions["target"].tolist()
    predictions_dataset = MaskedTextDataset(predictions_texts, tokenizer)
    predictions_dataloader = DataLoader(
        predictions_dataset,
        batch_size=batch_size,
        shuffle=False
    )

    # Run text generation with the routine that matches the model family
    if "t5" in model_name.lower():
        generate = generate_mt5_predictions_batch
    else:
        generate = generate_masked_predictions_batch
    predictions["generated"] = generate(
        predictions_dataloader,
        model,
        tokenizer,
        device
    )

    # display
    display(predictions.head())

    # save the distilled model's predictions
    save_model_variants_df(predictions, f"{model_name}_{spt_name}_{distilled_model_name}_predictions")

### mBERT

#### TinyBERT

In [None]:
# generate predictions for mbert with BPE and TinyBERT
generate_predictions_distilled_model("mBERT", "bpe", "TinyBERT")

In [None]:
# generate predictions for mbert with Unigram and TinyBERT
generate_predictions_distilled_model("mBERT", "unigram", "TinyBERT")

#### DistilBERT

In [None]:
# generate predictions for mBERT with BPE and DistilBERT
# (the student label must match the "DistilBERT" used when the model was saved)
generate_predictions_distilled_model("mBERT", "bpe", "DistilBERT", 8)

In [None]:
# generate predictions for mBERT with Unigram and DistilBERT
# (the student label must match the "DistilBERT" used when the model was saved)
generate_predictions_distilled_model("mBERT", "unigram", "DistilBERT", 8)

### XLM-R

#### TinyBERT

In [None]:
# generate predictions for XLM-R with BPE and TinyBERT
generate_predictions_distilled_model("XLM-R", "bpe", "TinyBERT")

In [None]:
# generate predictions for XLM-R with Unigram and TinyBERT
generate_predictions_distilled_model("XLM-R", "unigram", "TinyBERT")

#### DistilBERT

In [None]:
# generate predictions for XLM-R with BPE and DistilBERT
# (the student label must match the "DistilBERT" used when the model was saved)
generate_predictions_distilled_model("XLM-R", "bpe", "DistilBERT", 8)

In [None]:
# generate predictions for XLM-R with Unigram and DistilBERT
# (this cell previously repeated the BPE run, leaving the Unigram variant without predictions)
generate_predictions_distilled_model("XLM-R", "unigram", "DistilBERT", 8)

### mT5

#### TinyBERT

In [None]:
# generate predictions for mT5 with BPE and TinyBERT
generate_predictions_distilled_model("mT5", "bpe", "TinyBERT")

In [None]:
# generate predictions for mT5 with Unigram and TinyBERT
generate_predictions_distilled_model("mT5", "unigram", "TinyBERT")

#### DistilBERT

In [None]:
# generate predictions for mT5 with BPE and DistilBERT
# (the student label must match the "DistilBERT" used when the model was saved)
generate_predictions_distilled_model("mT5", "bpe", "DistilBERT", 8)

In [None]:
# generate predictions for mT5 with Unigram and DistilBERT
# (the student label must match the "DistilBERT" used when the model was saved)
generate_predictions_distilled_model("mT5", "unigram", "DistilBERT", 8)

## Evaluate Model Performance
Compute BLEU, ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-L, chrF-S, BERTScore and Perplexity scores.

### Metrics

In [None]:
# function to compute metric for distilled model
def compute_metric_distilled_model(model_name, spt_name, distill_model_name, batch_size):
    """Compute, report and save text-generation metrics for one distilled variant.

    Loads the ``{model_name}_{spt_name}_{distill_model_name}_predictions`` frame,
    scores it with ``compute_metrics_batch``, prints the mean of each metric
    column and stores the scored frame as the temporary dataset
    ``{model_name}_{spt_name}_{distill_model_name}_metrics``.

    Args:
        model_name: Base model identifier, e.g. "mBERT", "XLM-R" or "mT5".
        spt_name: Tokenizer variant, e.g. "bpe" or "unigram".
        distill_model_name: Distillation variant, e.g. "TinyBERT" or "DistillBERT".
        batch_size: Batch size forwarded to ``compute_metrics_batch``.

    Returns:
        None. Results are printed and written through ``save_tmp_df``.
    """
    # every artifact of a variant shares this file-name prefix
    variant = f"{model_name}_{spt_name}_{distill_model_name}"

    # load dataset
    metrics = load_model_variants_df(f"{variant}_predictions")

    # compute metrics; the return value is not used, so compute_metrics_batch is
    # expected to add the score columns to `metrics` in place
    print(f"Processing Data for {model_name} with {spt_name.upper()} and {distill_model_name}...")
    compute_metrics_batch(metrics, batch_size)

    # display the mean of every metric, labelled with the variant actually processed
    print(f"Metrics scores for {model_name} with {spt_name.upper()} and {distill_model_name}:")
    reported_columns = (
        ("BLEU", "bleu"),
        ("ROUGE-1", "rouge-1"),
        ("ROUGE-2", "rouge-2"),
        ("ROUGE-L", "rouge-l"),
        ("chrF-S", "chrf-s"),
        ("BERT", "bert_score"),
    )
    for label, column in reported_columns:
        print(f"{label} Score: {metrics[column].mean()}")

    # save results
    save_tmp_df(metrics, f"{variant}_metrics")

#### mBERT

##### TinyBERT

In [None]:
# Score the mBERT / BPE / TinyBERT predictions
compute_metric_distilled_model(
    model_name="mBERT",
    spt_name="bpe",
    distill_model_name="TinyBERT",
    batch_size=64,
)

In [None]:
# Score the mBERT / Unigram / TinyBERT predictions
compute_metric_distilled_model(
    model_name="mBERT",
    spt_name="unigram",
    distill_model_name="TinyBERT",
    batch_size=64,
)

##### DistillBERT

In [None]:
# Score the mBERT / BPE / DistillBERT predictions
compute_metric_distilled_model(
    model_name="mBERT",
    spt_name="bpe",
    distill_model_name="DistillBERT",
    batch_size=64,
)

In [None]:
# Score the mBERT / Unigram / DistillBERT predictions
compute_metric_distilled_model(
    model_name="mBERT",
    spt_name="unigram",
    distill_model_name="DistillBERT",
    batch_size=64,
)

#### XLM-R

##### TinyBERT

In [None]:
# Score the XLM-R / BPE / TinyBERT predictions
compute_metric_distilled_model(
    model_name="XLM-R",
    spt_name="bpe",
    distill_model_name="TinyBERT",
    batch_size=64,
)

In [None]:
# Score the XLM-R / Unigram / TinyBERT predictions
compute_metric_distilled_model(
    model_name="XLM-R",
    spt_name="unigram",
    distill_model_name="TinyBERT",
    batch_size=64,
)

##### DistillBERT

In [None]:
# Score the XLM-R / BPE / DistillBERT predictions
compute_metric_distilled_model(
    model_name="XLM-R",
    spt_name="bpe",
    distill_model_name="DistillBERT",
    batch_size=64,
)

In [None]:
# Score the XLM-R / Unigram / DistillBERT predictions
compute_metric_distilled_model(
    model_name="XLM-R",
    spt_name="unigram",
    distill_model_name="DistillBERT",
    batch_size=64,
)

#### mT5

##### TinyBERT

In [None]:
# Score the mT5 / BPE / TinyBERT predictions
compute_metric_distilled_model(
    model_name="mT5",
    spt_name="bpe",
    distill_model_name="TinyBERT",
    batch_size=64,
)

In [None]:
# Score the mT5 / Unigram / TinyBERT predictions
compute_metric_distilled_model(
    model_name="mT5",
    spt_name="unigram",
    distill_model_name="TinyBERT",
    batch_size=64,
)

##### DistillBERT

In [None]:
# Score the mT5 / BPE / DistillBERT predictions
compute_metric_distilled_model(
    model_name="mT5",
    spt_name="bpe",
    distill_model_name="DistillBERT",
    batch_size=64,
)

In [None]:
# Score the mT5 / Unigram / DistillBERT predictions
compute_metric_distilled_model(
    model_name="mT5",
    spt_name="unigram",
    distill_model_name="DistillBERT",
    batch_size=64,
)

### Perplexity

In [None]:
# function to compute perplexity for distilled model
def compute_perplexity_distilled_model(model_name, spt_name, distill_model_name, batch_size):
    """Score the generated texts of one distilled variant with a perplexity metric.

    Reads the ``..._predictions`` frame of the variant, feeds its "generated"
    column through an unshuffled DataLoader, writes the resulting scores to a
    "perplexity" column, prints their mean and saves the frame as the temporary
    ``..._perplexity`` dataset. Relies on the notebook-level ``device``.

    Args:
        model_name: Base model identifier; a name containing "t5"
            (case-insensitive) selects the mT5 scorer, anything else the
            masked-model scorer.
        spt_name: Tokenizer variant, passed on to ``get_distilled_model``.
        distill_model_name: Distillation variant, passed on to ``get_distilled_model``.
        batch_size: Batch size handed to the DataLoader.
    """
    variant = f"{model_name}_{spt_name}_{distill_model_name}"

    # predictions to score, plus the model/tokenizer pair used for scoring
    scored = load_model_variants_df(f"{variant}_predictions")
    model, tokenizer = get_distilled_model(model_name, spt_name, distill_model_name)

    # keep row order: the scores are assigned back to the frame as a column
    loader = DataLoader(TextDataset(scored["generated"].tolist()), batch_size, shuffle=False)

    # choose the scorer matching the model family, then call it once
    if "t5" in model_name.lower():
        score_batches = compute_multilingual_mt5_perplexity_batch
    else:
        score_batches = compute_multilingual_masked_perplexity_batch
    scored["perplexity"] = score_batches(loader, model, tokenizer, device)

    # report and persist
    print(f"Perplexity Score: {scored['perplexity'].mean()}")
    save_tmp_df(scored, f"{variant}_perplexity")

#### mBERT

##### TinyBERT

In [None]:
# Perplexity for the mBERT / BPE / TinyBERT predictions
compute_perplexity_distilled_model(
    model_name="mBERT",
    spt_name="bpe",
    distill_model_name="TinyBERT",
    batch_size=64,
)

In [None]:
# Perplexity for the mBERT / Unigram / TinyBERT predictions
compute_perplexity_distilled_model(
    model_name="mBERT",
    spt_name="unigram",
    distill_model_name="TinyBERT",
    batch_size=64,
)

##### DistillBERT

In [None]:
# Perplexity for the mBERT / BPE / DistillBERT predictions
compute_perplexity_distilled_model(
    model_name="mBERT",
    spt_name="bpe",
    distill_model_name="DistillBERT",
    batch_size=64,
)

In [None]:
# Perplexity for the mBERT / Unigram / DistillBERT predictions
compute_perplexity_distilled_model(
    model_name="mBERT",
    spt_name="unigram",
    distill_model_name="DistillBERT",
    batch_size=64,
)

#### XLM-R

##### TinyBERT

In [None]:
# Perplexity for the XLM-R / BPE / TinyBERT predictions
compute_perplexity_distilled_model(
    model_name="XLM-R",
    spt_name="bpe",
    distill_model_name="TinyBERT",
    batch_size=64,
)

In [None]:
# Perplexity for the XLM-R / Unigram / TinyBERT predictions
compute_perplexity_distilled_model(
    model_name="XLM-R",
    spt_name="unigram",
    distill_model_name="TinyBERT",
    batch_size=64,
)

##### DistillBERT

In [None]:
# Perplexity for the XLM-R / BPE / DistillBERT predictions
compute_perplexity_distilled_model(
    model_name="XLM-R",
    spt_name="bpe",
    distill_model_name="DistillBERT",
    batch_size=64,
)

In [None]:
# Perplexity for the XLM-R / Unigram / DistillBERT predictions
compute_perplexity_distilled_model(
    model_name="XLM-R",
    spt_name="unigram",
    distill_model_name="DistillBERT",
    batch_size=64,
)

#### mT5

##### TinyBERT

In [None]:
# Perplexity for the mT5 / BPE / TinyBERT predictions
compute_perplexity_distilled_model(
    model_name="mT5",
    spt_name="bpe",
    distill_model_name="TinyBERT",
    batch_size=64,
)

In [None]:
# Perplexity for the mT5 / Unigram / TinyBERT predictions
compute_perplexity_distilled_model(
    model_name="mT5",
    spt_name="unigram",
    distill_model_name="TinyBERT",
    batch_size=64,
)

##### DistillBERT

In [None]:
# Perplexity for the mT5 / BPE / DistillBERT predictions
compute_perplexity_distilled_model(
    model_name="mT5",
    spt_name="bpe",
    distill_model_name="DistillBERT",
    batch_size=64,
)

In [None]:
# Perplexity for the mT5 / Unigram / DistillBERT predictions
compute_perplexity_distilled_model(
    model_name="mT5",
    spt_name="unigram",
    distill_model_name="DistillBERT",
    batch_size=64,
)

### Save Evaluation Results

In [None]:
# combine evaluation results
# For every (model, tokenizer, distillation) variant, attach the metric and
# perplexity columns to the predictions frame and save the combined result.
metric_columns = ["bleu", "rouge-1", "rouge-2", "rouge-l", "chrf-s", "bert_score"]

for model_name in train_model_names.keys():
    for spt_name in spt_models.keys():
        for distill_model_name in distill_model_names.keys():
            print(f"Processing {model_name} with {spt_name} and {distill_model_name}...")

            # every artifact of a variant shares this file-name prefix
            variant = f"{model_name}_{spt_name}_{distill_model_name}"
            distilled_evaluation_results = load_model_variants_df(f"{variant}_predictions")

            # load metrics and set; the name must include the distillation
            # variant, matching what compute_metric_distilled_model saves
            metrics = load_tmp_df(f"{variant}_metrics")
            for column in metric_columns:
                distilled_evaluation_results[column] = metrics[column]

            # load perplexity and set; same naming as compute_perplexity_distilled_model
            perplexity = load_tmp_df(f"{variant}_perplexity")
            distilled_evaluation_results["perplexity"] = perplexity["perplexity"]

            save_model_variants_df(distilled_evaluation_results, f"{variant}_evaluation_results")

## Benchmarking and Analysis
Compare the performance of the distilled mBERT, XLM-R, and mT5 variants (BPE and Unigram tokenizers, TinyBERT and DistillBERT) using BLEU, ROUGE, chrF-S, BERT Score and Perplexity.

In [None]:
# Gather the saved evaluation results of every variant, keyed by a display
# label of the form "<model> <TOKENIZER> <distillation>"
distilled_benchmarking_datasets = {}
for model_name in train_model_names:
    for spt_name in spt_models:
        for distill_model_name in distill_model_names:
            variant_file = f"{model_name}_{spt_name}_{distill_model_name}_evaluation_results"
            variant_label = f"{model_name} {spt_name.upper()} {distill_model_name}"
            df = load_model_variants_df(variant_file)
            distilled_benchmarking_datasets[variant_label] = df

In [None]:
# Reduce the per-variant result frames to a single frame of mean scores
# (presumably one entry per variant label; confirm against convert_to_mean_scores_df)
distilled_benchmarking_mean_scores = convert_to_mean_scores_df(distilled_benchmarking_datasets)

In [None]:
# Show the mean-score table as rich notebook output
display(distilled_benchmarking_mean_scores)

In [None]:
# Save the mean-score table under the name "distilled_evaluation_results"
save_model_variants_df(distilled_benchmarking_mean_scores, "distilled_evaluation_results")