# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece evaluate sacrebleu bert-score peft

In [None]:
# Updates conda itself in the `base` environment from the conda-forge channel; `-y` skips the prompt.
!conda update -n base -c conda-forge conda -y

In [None]:
!conda install conda-forge::rouge-score -y

In [1]:
import torch
import pandas as pd
import tensorflow as tf
import sentencepiece as spm
from IPython.display import display
from tqdm.notebook import tqdm
from datasets import load_dataset, Dataset
from transformers import (
    logging,
    AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM,
    Trainer, TrainingArguments,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model
from evaluate import load

# Set GPU

## Mac

In [2]:
# for mac
# Report every device TensorFlow can see, then describe each GPU.
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found. Using CPU.")
for gpu in gpus:
    # Enable memory growth on this GPU before querying its details.
    tf.config.experimental.set_memory_growth(gpu, True)
    details = tf.config.experimental.get_device_details(gpu)
    print("GPU details: ", details)

# set GPU device
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


## Windows / Linux

In [None]:
# for windows / linux
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

# Prefer CUDA, but keep Apple's MPS backend as a fallback: when every cell is
# run top to bottom on a Mac, this cell would otherwise overwrite the `device`
# chosen by the Mac cell with the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

# Functions

In [3]:
# function to save model-variants df
def save_model_variants_df(df, df_name):
    """Write `df` to `model-variants/<df_name>.csv` as UTF-8 CSV without the index column."""
    csv_path = "model-variants/{}.csv".format(df_name)
    df.to_csv(csv_path, index=False, encoding="utf-8")

In [4]:
def save_model_variants_df_arrow(df, df_name):
    """Persist `df` under `model-variants/<df_name>_hf_dataset` by calling its `save_to_disk`."""
    target_dir = "model-variants/{}_hf_dataset".format(df_name)
    df.save_to_disk(target_dir)

In [5]:
# function to load gen df
def load_gen_df(df_name):
    """Read `gen/<df_name>.csv` (UTF-8, first row as header) and return the resulting frame."""
    csv_path = "gen/{}.csv".format(df_name)
    return pd.read_csv(csv_path, header=0, encoding="utf-8")

In [6]:
# function to load model-variants df
def load_model_variants_df(df_name):
    """Read `model-variants/<df_name>.csv` (UTF-8, first row as header) and return the frame."""
    csv_path = "model-variants/{}.csv".format(df_name)
    return pd.read_csv(csv_path, header=0, encoding="utf-8")

In [7]:
# function to load model-variants df with arrow
def load_model_variants_df_arrow(df_name):
    """Reload a dataset written by `save_model_variants_df_arrow`.

    The directory `model-variants/<df_name>_hf_dataset` is produced by
    `save_to_disk`, so it has to be read back with `load_from_disk`;
    `load_dataset` is meant for dataset scripts / raw data files and does not
    restore a `save_to_disk` directory.

    Returns:
        The `Dataset` stored in that directory.
    """
    return Dataset.load_from_disk(f"model-variants/{df_name}_hf_dataset")

# Set settings

In [8]:
# Register tqdm's pandas integration (adds progress-bar variants such as `progress_apply`).
tqdm.pandas()

In [9]:
# Set the transformers log level to ERROR, which hides its info and warning messages
logging.set_verbosity_error()

# Fine-Tuning Transformer Models for Burmese
This notebook fine-tunes three transformer models:
- mBERT (Multilingual BERT)
- mT5 (Multilingual T5)
- XLM-RoBERTa

Apply:
- SentencePiece tokenization for Burmese segmentation
- LoRA for efficient fine-tuning
- Prefix-Tuning for lightweight adaptations
- Mixed Precision Training for speed improvements

In [10]:
# spt models
# One SentencePiece processor per trained model file, keyed by the variant name.
spt_models = {
    variant: spm.SentencePieceProcessor(f"spt/spt_{variant}.model")
    for variant in ("bpe", "unigram")
}

In [11]:
# model names
# Short alias used throughout the notebook -> pretrained checkpoint identifier.
train_model_names = dict(
    [
        ("mBERT", "bert-base-multilingual-cased"),
        ("mT5", "google/mt5-small"),
        ("XLM-R", "xlm-roberta-base"),
    ]
)

In [12]:
# train tokenizers
def _load_train_tokenizer(model_key):
    """Return the pretrained tokenizer for `model_key`; mT5 is loaded with `use_fast=False, legacy=True`."""
    checkpoint = train_model_names[model_key]
    if model_key == "mT5":
        return AutoTokenizer.from_pretrained(checkpoint, use_fast=False, legacy=True)
    return AutoTokenizer.from_pretrained(checkpoint)


train_tokenizers = {
    model_key: _load_train_tokenizer(model_key)
    for model_key in ("mBERT", "mT5", "XLM-R")
}

In [13]:
# train models
def _load_train_model(model_key):
    """Load the pretrained model for `model_key` and move it to the global `device`.

    mT5 is loaded with a sequence-to-sequence LM head; the other models are
    loaded with a single-output sequence-classification head.
    """
    # NOTE(review): the prediction cells further down read per-token vocabulary
    # logits for mBERT / XLM-R, which a `num_labels=1` classification head does
    # not produce — confirm which head is intended here.
    checkpoint = train_model_names[model_key]
    if model_key == "mT5":
        loaded = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    else:
        loaded = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
    return loaded.to(device)


train_models = {
    model_key: _load_train_model(model_key)
    for model_key in ("mBERT", "mT5", "XLM-R")
}

## Data Preprocessing
Datasets used for training:
- myXNLI & ALT Corpus (normalized)
- Back-translated datasets (NLLB, Seamless M4T)
- Pseudo-parallel datasets (MiniLM, LaBSE)

### Data Preparation

In [14]:
# Load and process dataset
def load_and_rename_columns_multilingual(file_name):
    """Load a `gen/` CSV and return only its `source` and `target` columns.

    `english` / `english_back_translated` are renamed to `source`, and
    `burmese` / `burmese_translated` are renamed to `target`. Every other
    column is dropped by the final selection.
    """
    to_source = dict.fromkeys(("english", "english_back_translated"), "source")
    to_target = dict.fromkeys(("burmese", "burmese_translated"), "target")

    frame = load_gen_df(f"{file_name}")
    renamed = frame.rename(columns={**to_source, **to_target})

    # Ensure only required columns exist
    return renamed[["source", "target"]]

In [15]:
# datasets
def _variant_files(tag):
    """Return the three CSV base names (two myXNLI parts and ALT) that share the suffix `tag`."""
    return [f"myxnli_{tag}_1", f"myxnli_{tag}_2", f"alt_combined_{tag}"]


# Data variant -> base names of the CSV files under `gen/` that make it up.
datasets = {
    "normal": _variant_files("normalized"),
    "nllb_back_translated": _variant_files("nllb_back_translated_final"),
    "seamless_m4t_back_translated": _variant_files("seamless_m4t_back_translated_final"),
}

In [16]:
# Load and process datasets
# Variant name -> list of (source, target) frames, one per listed file.
loaded_datasets = {
    variant: list(map(load_and_rename_columns_multilingual, file_names))
    for variant, file_names in datasets.items()
}

In [17]:
# combine all datasets
# Stack the frames of the three variants, in this order, under a fresh 0..n-1 index.
combined = pd.concat(
    [
        frame
        for variant in ("normal", "nllb_back_translated", "seamless_m4t_back_translated")
        for frame in loaded_datasets[variant]
    ],
    ignore_index=True,
)

In [18]:
# Shuffle the data to prevent order bias.
# A fixed `random_state` makes the row order (and everything derived from it)
# reproducible across kernel restarts.
combined = combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [19]:
# Preview the first rows of the shuffled, combined dataset
display(combined.head())

Unnamed: 0,source,target
0,"economic literature is still being developed, ...",အချိန်မမီ သေဆုံးမှု အန္တရာယ် လျှော့ချမှုကို တန...
1,they also have other group excursions not listed.,၎င်းတို့တွင် စာရင်းမသွင်းထားသော အခြားအဖွဲ့လိုက...
2,"i think it's yours, but it's actually mine.",ဒါက သင့်ရဲ့လို့ ထင်ပေမဲ့ တကယ်က ကျွန်မပါ။
3,all counties benefit from the state income tax...,ခရိုင်အားလုံးဟာ အခွန်ထမ်းတွေရဲ့ ပြည်နယ် ဝင်ငွေ...
4,the deep breathing barik did helped him to cal...,Barik အသက်ပြင်းပြင်းရှူခြင်းက သူ့ကို စိတ်တည်ငြ...


In [20]:
# cleaning combined dataset
print(f"Original Records: {len(combined)}.")
# Exact duplicate rows go first, then any row that still has a missing value.
combined = combined.drop_duplicates().dropna()
print(f"Remaining records: {len(combined)}.")

Original Records: 1627576.
Remaining records: 1614484.


In [21]:
# Persist the cleaned dataset as model-variants/combined.csv for the tokenization step
save_model_variants_df(combined, "combined")

### Tokenize

In [22]:
# function to tokenize Burmese text using the selected SentencePiece model before applying Transformer tokenization.
def tokenize(examples, tokenizer, spt_tokenizer, model_name, max_length=512):
    """Pre-segment the Burmese targets with SentencePiece, then run the model tokenizer.

    Args:
        examples: Batch mapping with `source` and `target` lists of strings.
        tokenizer: Transformer tokenizer; called on the batch of texts.
        spt_tokenizer: Object exposing `encode_as_pieces(text)`.
        model_name: Model alias or checkpoint name. Any name containing "t5"
            (case-insensitive, so the notebook key "mT5" matches) is treated as
            a text-to-text model.
        max_length: Padding / truncation length passed to the tokenizer.

    Returns:
        The tokenizer output. For T5-style models it also holds `labels` built
        from the segmented targets, with padding positions set to -100 so the
        loss ignores them. For other models source and target are encoded
        together as a text pair.
    """
    segmented_targets = [
        " ".join(spt_tokenizer.encode_as_pieces(text)) for text in examples["target"]
    ]
    # Kept from the original implementation: the batch's `target` entry is
    # overwritten with the space-joined SentencePiece pieces.
    examples["target"] = segmented_targets

    if "t5" in model_name.lower():  # mT5: text-to-text format
        encoded = tokenizer(
            examples["source"],
            text_target=segmented_targets,
            padding="max_length",
            truncation=True,
            max_length=max_length,
        )
        # With fixed-length padding the labels contain pad ids; -100 is the
        # index the Hugging Face loss functions skip.
        pad_id = tokenizer.pad_token_id
        encoded["labels"] = [
            [token_id if token_id != pad_id else -100 for token_id in label_ids]
            for label_ids in encoded["labels"]
        ]
        return encoded

    # Encoder models (mBERT, XLM-R): encode source and target as a text pair
    return tokenizer(
        examples["source"],
        segmented_targets,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

In [23]:
# tokenize for each model and spt
# The cleaned CSV is the same for every (model, SentencePiece) pair, so it is
# read once instead of being re-parsed on each of the six iterations.
combined_frame = load_model_variants_df("combined")

for model_name, tokenizer in train_tokenizers.items():
    for spt_name, spt_tokenizer in spt_models.items():
        # Convert to Hugging Face Dataset
        dataset = Dataset.from_pandas(combined_frame)

        # apply tokenize; the extra arguments go through `fn_kwargs`, so no
        # lambda (and no dummy index argument) is needed
        dataset = dataset.map(
            tokenize,
            fn_kwargs={
                "tokenizer": tokenizer,
                "spt_tokenizer": spt_tokenizer,
                "model_name": model_name,
            },
            batched=True,
            desc=f"Tokenizing dataset for {model_name} with {spt_name}",
            num_proc=10
        )

        # save
        save_model_variants_df_arrow(dataset, f"{model_name.lower()}_{spt_name}")

Tokenizing dataset for mBERT with bpe (num_proc=10):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Tokenizing dataset for mBERT with unigram (num_proc=10):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Tokenizing dataset for mT5 with bpe (num_proc=10):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Tokenizing dataset for mT5 with unigram (num_proc=10):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Tokenizing dataset for XLM-R with bpe (num_proc=10):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Tokenizing dataset for XLM-R with unigram (num_proc=10):   0%|          | 0/1614484 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/1614484 [00:00<?, ? examples/s]

## Fine Tuning
Fine-tuning for:
- mBERT (best perplexity, but weak BLEU/ROUGE)
- mT5 (best for generation, but requires more data)
- XLM-R (good BLEU/ROUGE, but poor perplexity)

In [None]:
# Reload every tokenized dataset, keyed first by model alias and then by
# SentencePiece variant. The directories were written with the lower-cased
# model alias, so the same lower-casing is applied here; otherwise the lookup
# fails on case-sensitive filesystems.
tokenized_datasets = {
    model_name: {
        spt_name: load_model_variants_df_arrow(f"{model_name.lower()}_{spt_name}")
        for spt_name in spt_models
    }
    for model_name in train_tokenizers
}

In [None]:
# Training Hyperparameters
# fp16 mixed precision and the fused AdamW kernel are only requested when CUDA
# is present; without CUDA the run is forced onto the CPU (`use_cpu`), where
# asking for fp16 would be rejected.
_cuda_available = torch.cuda.is_available()

train_args = {
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 2,
    "learning_rate": 3e-5,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "save_strategy": "epoch",
    "save_total_limit": 3,
    "fp16": _cuda_available,  # Mixed Precision Training (CUDA only)
    "evaluation_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,  # Lower loss is better
    "logging_dir": "./logs",
    "logging_steps": 100,
    "optim": "adamw_torch_fused" if _cuda_available else "adamw_torch",  # Fused kernel on GPU only
    "use_cpu": not _cuda_available
}

In [None]:
def _default_lora_targets(model):
    """Pick the attention projection names that exist in `model`."""
    leaf_names = {name.rsplit(".", 1)[-1] for name, _ in model.named_modules()}
    if {"query", "value"} <= leaf_names:  # BERT / XLM-R naming
        return ["query", "value"]
    if {"q", "v"} <= leaf_names:  # T5 / mT5 naming
        return ["q", "v"]
    # Unknown architecture: keep the historical default and let PEFT report it.
    return ["query", "value"]


def apply_lora(model, target_modules=None):
    """
    LoRA reduces memory and computational costs.
    It fine-tunes only attention layers instead of the whole model.

    Args:
        model: Model to wrap; must expose `named_modules()`.
        target_modules: Names of the modules to adapt. When omitted they are
            detected from the model: `query`/`value` if present, otherwise
            `q`/`v` (the names used by T5-style attention, which has no
            `query`/`value` modules).

    Returns:
        The result of `get_peft_model` for a rank-8 LoRA configuration.
    """
    if target_modules is None:
        target_modules = _default_lora_targets(model)

    lora_config = LoraConfig(
        r=8, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1
    )
    return get_peft_model(model, lora_config)

In [None]:
def fine_tune_model(model_name, spt_name, eval_fraction=0.05, seed=42):
    """Fine-tune one model with LoRA on one SentencePiece variant and save it.

    Args:
        model_name: Key into `train_tokenizers`, `train_models` and
            `tokenized_datasets` (e.g. "mBERT").
        spt_name: SentencePiece variant key (e.g. "bpe").
        eval_fraction: Share of the tokenized dataset held out for evaluation.
        seed: Seed for the train / evaluation split.

    Side effects:
        Writes checkpoints under `./model-variants/results/...` and the final
        model plus tokenizer under `./model-variants/models/...`.
    """
    import copy  # local import: `copy` is not part of the notebook's import cell

    print(f"Fine-tuning {model_name} on using SPT-{spt_name.upper()}...")

    # Load model and tokenizer
    tokenizer = train_tokenizers[model_name]

    # Apply LoRA for efficient fine-tuning. LoRA is applied to a private copy so
    # the shared base model stays untouched and the next run for the same
    # architecture does not start from this run's adapters.
    model = apply_lora(copy.deepcopy(train_models[model_name]))

    # tokenize dataset
    tokenized_dataset = tokenized_datasets[model_name][spt_name]
    if isinstance(tokenized_dataset, dict):
        # A dict of splits was loaded rather than a single dataset.
        tokenized_dataset = tokenized_dataset["train"]

    # Hold out a slice for evaluation: evaluating on the training rows would
    # make `eval_loss` (and therefore early stopping / best-model selection)
    # track the training loss.
    splits = tokenized_dataset.train_test_split(test_size=eval_fraction, seed=seed)

    # NOTE(review): the pair encodings produced for mBERT / XLM-R contain no
    # `labels` column as far as this notebook shows — confirm how the loss is
    # meant to be computed for those models.

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f"./model-variants/results/{model_name}_SPT-{spt_name.upper()}",
        **train_args
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=splits["train"],
        eval_dataset=splits["test"],
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stops training if no improvement for 3 evaluations
    )

    # Train the model
    trainer.train()

    # Save model
    name = f"./model-variants/models/{model_name}_SPT-{spt_name.upper()}"
    model.save_pretrained(name)
    tokenizer.save_pretrained(name)

    print(f"✅ Model {model_name} fine-tuned using SPT-{spt_name.upper()}.")

### mBERT

In [None]:
# Fine-tune mBERT on the dataset segmented with the SentencePiece BPE model
fine_tune_model("mBERT", "bpe")

In [None]:
# Fine-tune mBERT on the dataset segmented with the SentencePiece Unigram model
fine_tune_model("mBERT", "unigram")

### mT5

In [None]:
# Fine-tune mT5 on the dataset segmented with the SentencePiece BPE model
fine_tune_model("mT5", "bpe")

In [None]:
# Fine-tune mT5 on the dataset segmented with the SentencePiece Unigram model
fine_tune_model("mT5", "unigram")

### XLM-R

In [None]:
# Fine-tune XLM-R on the dataset segmented with the SentencePiece BPE model
fine_tune_model("XLM-R", "bpe")

In [None]:
# Fine-tune XLM-R on the dataset segmented with the SentencePiece Unigram model
fine_tune_model("XLM-R", "unigram")

## Generate Predictions

In [None]:
# Function to generate masked predictions
def generate_masked_predictions_batch(dataloader, model, tokenizer):
    """Fill every mask token in each batch with the model's top-scoring token.

    Args:
        dataloader: Iterable of batches; each batch unpacks into exactly three
            tensors, of which the first two are the masked input ids and the
            attention mask (the third is moved to the device but unused).
        model: Callable as `model(input_ids=..., attention_mask=...)` whose
            output has `.logits`.
        tokenizer: Provides `mask_token_id` and `batch_decode`.

    Returns:
        A flat list with one decoded string per input sequence.
    """
    # NOTE(review): this needs per-token vocabulary logits (batch, seq, vocab);
    # confirm the model passed in has a masked-LM head.
    all_predictions = []

    # Inference mode: without this, a model that was just trained still has
    # dropout active and its predictions are not deterministic.
    model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Masked Predictions"):
            # Move batch data to GPU
            masked_input_ids, attention_mask, _ = [x.to(device) for x in batch]

            # Run model inference on GPU
            outputs = model(input_ids=masked_input_ids, attention_mask=attention_mask)

            # Replace masked tokens with predicted tokens in one tensor
            # operation; unmasked positions keep their original ids.
            is_masked = masked_input_ids == tokenizer.mask_token_id
            best_token_ids = outputs.logits.argmax(dim=-1)
            predicted_tokens_batch = torch.where(is_masked, best_token_ids, masked_input_ids)

            # Decode predictions
            batch_predictions = tokenizer.batch_decode(predicted_tokens_batch.cpu(), skip_special_tokens=True)
            all_predictions.extend(batch_predictions)

    return all_predictions

In [None]:
# function to generate predictions
def generate_predictions_batch(dataloader, model, tokenizer, spt_processor, model_name):
    """Generate one decoded prediction per input text.

    Each text is first segmented with `spt_processor` and re-joined with
    spaces. Models whose name contains "t5" (case-insensitive) go through
    `model.generate`; every other model is run once and the arg-max token at
    each position is decoded.

    Args:
        dataloader: Iterable of batches, each an iterable of strings.
        model: Model exposing `generate` (T5-style) or returning `.logits`.
        tokenizer: Transformer tokenizer with `batch_decode` and `mask_token`.
        spt_processor: Object exposing `encode_as_pieces(text)`.
        model_name: Model alias or checkpoint name.

    Returns:
        A flat list of decoded strings, in input order.
    """
    predictions = []
    is_text_to_text = "t5" in model_name.lower()

    # Inference mode: disables dropout that is still active after training.
    model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Predictions", unit="batch"):
            spt_encoded_batch = [" ".join(spt_processor.encode_as_pieces(text)) for text in batch]

            if is_text_to_text:  # ✅ Use `generate()` for mT5
                inputs = tokenizer(spt_encoded_batch, return_tensors="pt", padding=True, truncation=True).to(device)
                output_tokens = model.generate(**inputs, max_length=128)
                decoded_output = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
            else:  # ✅ Use fill-mask approach for mBERT & XLM-R
                # NOTE(review): the placeholder is looked up after SentencePiece
                # segmentation; confirm "[MASK]" survives that step intact.
                mask_token = tokenizer.mask_token
                masked_inputs = [text.replace("[MASK]", mask_token) for text in spt_encoded_batch]
                # Tokenize only the masked text; the unmasked encoding was
                # never used on this branch.
                inputs = tokenizer(masked_inputs, return_tensors="pt", padding=True, truncation=True).to(device)
                outputs = model(**inputs)
                predictions_indices = torch.argmax(outputs.logits, dim=-1)
                decoded_output = tokenizer.batch_decode(predictions_indices, skip_special_tokens=True)

            predictions.extend(decoded_output)

    return predictions