# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece

In [None]:
!conda update -n base -c conda-forge conda -y

In [None]:
!conda install conda-forge::rouge-score -y

In [1]:
import ast
import os

import pandas as pd
import tensorflow as tf
import torch
import numpy as np
from tqdm.notebook import tqdm
import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer, AutoModelForMaskedLM
from rouge_score import rouge_scorer
import torch.nn.functional as F
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

2025-01-31 02:54:53.584486: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-31 02:54:53.600550: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-31 02:54:53.625137: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-31 02:54:53.625161: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-31 02:54:53.640192: I tensorflow/core/platform/cpu_feature_gua

# Set GPU

## Mac

In [2]:
# for mac
# Report every device TensorFlow can see and turn on memory growth for each GPU.
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found. Using CPU.")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
    details = tf.config.experimental.get_device_details(gpu)
    print("GPU details: ", details)

# set GPU device: use Apple's MPS backend for PyTorch when present, else CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


## Windows / Linux

In [2]:
# for windows / linux
# Pick the PyTorch device in priority order CUDA -> MPS -> CPU. Including the
# MPS fallback means that running this cell after the Mac cell (e.g. with
# "Run All") no longer downgrades an Apple-GPU session to CPU.
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

Tensorflow GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using PyTorch device: cuda
GPU Name: Tesla T4


2025-01-31 02:54:59.522330: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-31 02:54:59.571481: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-31 02:54:59.573859: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

# Functions

In [3]:
# function to save models df
def save_models_df(df, df_name):
    """Write `df` to ``models/<df_name>.csv`` (UTF-8, without the index column).

    The ``models`` directory is created when it does not exist yet, so the
    notebook also works from a fresh checkout.
    """
    os.makedirs("models", exist_ok=True)
    df.to_csv(f"models/{df_name}.csv", index=False, encoding="utf-8")

In [4]:
# function to save tmp df
def save_tmp_df(df, df_name):
    """Write `df` to ``tmp/<df_name>.csv`` (UTF-8, without the index column).

    The ``tmp`` directory is created when it does not exist yet.
    """
    os.makedirs("tmp", exist_ok=True)
    df.to_csv(f"tmp/{df_name}.csv", index=False, encoding="utf-8")

In [5]:
# function to load spt df
def load_spt_df(df_name):
    """Read ``spt/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = f"spt/{df_name}.csv"
    frame = pd.read_csv(csv_path, header=0, encoding="utf-8")
    return frame

In [6]:
# function to load models df
def load_models_df(df_name):
    """Read ``models/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = f"models/{df_name}.csv"
    frame = pd.read_csv(csv_path, header=0, encoding="utf-8")
    return frame

In [7]:
# function to load gen df
def load_gen_df(df_name):
    """Read ``gen/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = f"gen/{df_name}.csv"
    frame = pd.read_csv(csv_path, header=0, encoding="utf-8")
    return frame

In [8]:
# function to load tmp df
def load_tmp_df(df_name):
    """Read ``tmp/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = f"tmp/{df_name}.csv"
    frame = pd.read_csv(csv_path, header=0, encoding="utf-8")
    return frame

In [9]:
# function to compute bleu score
def compute_bleu(reference, prediction):
    """Return the sentence-level BLEU score of `prediction` against `reference`.

    Both texts are tokenized on whitespace. A value that is not a string (for
    example the NaN pandas produces for an empty CSV cell) is treated as empty
    text instead of raising AttributeError on ``.split()``. When either side
    has no tokens the score is 0.0 and NLTK is not called.
    """
    reference_tokens = reference.split() if isinstance(reference, str) else []
    prediction_tokens = prediction.split() if isinstance(prediction, str) else []
    if not reference_tokens or not prediction_tokens:
        return 0.0
    return sentence_bleu([reference_tokens], prediction_tokens)

In [10]:
def safe_eval(val):
    """Parse `val` with `ast.literal_eval` when it is a string; return anything else unchanged."""
    if not isinstance(val, str):
        return val
    return ast.literal_eval(val)

# Set settings

In [11]:
tqdm.pandas()

# 1. Implementing RNN/LSTM Baseline

In [12]:
spt_models = {
    "bpe": spm.SentencePieceProcessor("spt/spt_bpe.model"),
    "unigram": spm.SentencePieceProcessor("spt/spt_unigram.model"),
}

## Data Preprocessing
Load SPT-tokenized datasets, convert to sequences, and apply padding.

### Load Data

In [10]:
# function to load datasets
def get_lstm_datasets(model_name):
    """Return the tokenized dataset file names for `model_name`, grouped by source.

    Every name is ``tokenized_<model_name>_<suffix>``; the groups are
    "normal", "nllb_back_translated" and "seamless_m4t_back_translated",
    each holding three names.
    """
    prefix = f"tokenized_{model_name}_"
    suffixes_by_group = {
        "normal": (
            "myxnli_normalized_1",
            "myxnli_normalized_2",
            "alt_combined_normalized",
        ),
        "nllb_back_translated": (
            "myxnli_nllb_back_translated_final_1",
            "myxnli_nllb_back_translated_final_2",
            "alt_combined_nllb_back_translated_final",
        ),
        "seamless_m4t_back_translated": (
            "myxnli_seamless_m4t_back_translated_final_1",
            "myxnli_seamless_m4t_back_translated_final_2",
            "alt_combined_seamless_m4t_back_translated_final",
        ),
    }
    return {
        group: [prefix + suffix for suffix in suffixes]
        for group, suffixes in suffixes_by_group.items()
    }

In [11]:
# Load and process dataset
def load_and_rename_columns_lstm(file_name):
    """Load one tokenized CSV from ``spt/`` and normalize its column names.

    Back-translated files use ``english_back_translated`` /
    ``burmese_translated`` (and the matching ``*_tokens`` columns); these are
    renamed to the plain ``english`` / ``burmese`` names. Only the four
    columns english, burmese, english_tokens and burmese_tokens are returned.
    """
    required_columns = ["english", "burmese", "english_tokens", "burmese_tokens"]
    renames = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
        "english_back_translated_tokens": "english_tokens",
        "burmese_translated_tokens": "burmese_tokens",
    }

    loaded = load_spt_df(f"{file_name}")
    renamed = loaded.rename(columns=renames)

    # Selecting by name raises KeyError if a required column is missing
    return renamed[required_columns]

In [12]:
# Load all datasets
lstm_all_datasets = {}
for model_name in spt_models.keys():
    datasets = get_lstm_datasets(model_name)

    lstm_all_datasets[model_name] = {
        key: [load_and_rename_columns_lstm(file) for file in file_list] for key, file_list in datasets.items()
    }

In [13]:
# cobine all datasets
lstm_all_datasets_combined = {}
for model_name in lstm_all_datasets.keys():
    lstm_all_datasets_combined[model_name] = pd.concat(
        [pd.concat(datasets) for datasets in lstm_all_datasets[model_name].values()],
        ignore_index=True
    )

In [14]:
# Shuffle the data to prevent order bias.
# A fixed seed keeps the row order identical across kernel restarts, so the
# preprocessed CSVs and anything derived from their row order are reproducible.
for model_name in lstm_all_datasets_combined.keys():
    lstm_all_datasets_combined[model_name] = (
        lstm_all_datasets_combined[model_name]
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

In [15]:
# display of datasets
for model_name in lstm_all_datasets_combined.keys():
    print(f"{model_name} dataset length: {len(lstm_all_datasets_combined[model_name])}")

bpe dataset length: 1627576
unigram dataset length: 1627576


### Apply padding

In [16]:
# convert tokenized sequences to lists
# Each string cell is encoded to a list of SentencePiece ids with the model
# matching `model_name`; non-string cells (e.g. NaN) become an empty list.
# NOTE(review): the input is the `*_tokens` column, which is re-encoded as text.
# If that column holds a stringified token list rather than raw text, its
# brackets and quotes would be encoded as well — TODO confirm column contents.
for model_name in lstm_all_datasets_combined.keys():
    lstm_all_datasets_combined[model_name]["english_seq"] = lstm_all_datasets_combined[model_name]["english_tokens"].progress_apply(
        lambda x: spt_models[model_name].EncodeAsIds(str(x)) if isinstance(x, str) else []
    )
    lstm_all_datasets_combined[model_name]["burmese_seq"] = lstm_all_datasets_combined[model_name]["burmese_tokens"].progress_apply(
        lambda x:  spt_models[model_name].EncodeAsIds(str(x)) if isinstance(x, str) else []
    )

  0%|          | 0/1627576 [00:00<?, ?it/s]

  0%|          | 0/1627576 [00:00<?, ?it/s]

  0%|          | 0/1627576 [00:00<?, ?it/s]

  0%|          | 0/1627576 [00:00<?, ?it/s]

In [18]:
# Define maximum sequence length
lstm_max_seq_length = 128

In [19]:
# apply padding to sequences
# Every id sequence is padded at the end (padding="post") to
# `lstm_max_seq_length` entries and stored back as plain Python lists.
# NOTE(review): `truncating` is left at its default — check the Keras
# `pad_sequences` docs for which end of an over-long sequence is dropped and
# confirm that is the intended behavior.
for model_name in lstm_all_datasets_combined.keys():
    lstm_all_datasets_combined[model_name]["english_seq_padded"] = pad_sequences(
        lstm_all_datasets_combined[model_name]["english_seq"], maxlen=lstm_max_seq_length, padding="post"
    ).tolist()

    lstm_all_datasets_combined[model_name]["burmese_seq_padded"] = pad_sequences(
        lstm_all_datasets_combined[model_name]["burmese_seq"], maxlen=lstm_max_seq_length, padding="post"
    ).tolist()

In [20]:
# save lstm preprocess data
for model_name in lstm_all_datasets_combined.keys():
    save_models_df(lstm_all_datasets_combined[model_name], f"lstm_{model_name}_preprocessed")

## Define LSTM Model
Define an LSTM-based sequence-to-sequence (seq2seq) model with embedding layers.

In [13]:
# Hyperparameters
lstm_embedding_dim = 256
lstm_hidden_dim = 512

In [14]:
# Get vocabulary size from SentencePiece models
lstm_vocab_sizes = {model_name: sp.GetPieceSize() for model_name, sp in spt_models.items()}

In [15]:
# function to build lstm model
def build_lstm_model(vocab_size):
    """Build and compile the stacked LSTM baseline for a vocabulary of `vocab_size` ids.

    Layers, in order: Embedding (``mask_zero=True``), a bidirectional LSTM,
    a unidirectional LSTM, and a softmax Dense layer over `vocab_size`
    classes applied at every timestep. Compiled with Adam (lr 0.001),
    sparse categorical cross-entropy and accuracy.

    NOTE(review): per the Keras LSTM documentation the cuDNN kernel is only
    used when ``recurrent_dropout == 0``; the 0.3 used here may explain very
    long epoch times — confirm whether recurrent dropout is required.
    """
    recurrent_kwargs = dict(return_sequences=True, dropout=0.3, recurrent_dropout=0.3)

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=lstm_embedding_dim, mask_zero=True))
    model.add(Bidirectional(LSTM(lstm_hidden_dim, **recurrent_kwargs)))
    model.add(LSTM(lstm_hidden_dim, **recurrent_kwargs))
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [16]:
# build lstm bpe model
lstm_bpe_model = build_lstm_model(lstm_vocab_sizes["bpe"])
lstm_bpe_model.summary()



2025-01-31 02:55:07.965352: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-31 02:55:07.968045: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-31 02:55:07.970038: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         8192000   
                                                                 
 bidirectional (Bidirection  (None, None, 1024)        3149824   
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, None, 512)         3147776   
                                                                 
 dense (Dense)               (None, None, 32000)       16416000  
                                                                 
Total params: 30905600 (117.90 MB)
Trainable params: 30905600 (117.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# build lstm unigram model
lstm_unigram_model = build_lstm_model(lstm_vocab_sizes["unigram"])
lstm_unigram_model.summary()

## Train the Model
Train the model using sparse categorical cross-entropy loss and the Adam optimizer.

In [17]:
# lstm model train batch size
lstm_train_batch_size = 32
lstm_train_epochs = 5

### BPE

In [18]:
# model prefix
lstm_bpe_model_prefix = "models/lstm_bpe_model"

In [19]:
# load lstm bpe dataset
lstm_bpe_preprocessed = load_models_df("lstm_bpe_preprocessed")

In [20]:
lstm_bpe_preprocessed["burmese_seq_padded"] = lstm_bpe_preprocessed["burmese_seq_padded"].apply(safe_eval)
lstm_bpe_preprocessed["english_seq_padded"] = lstm_bpe_preprocessed["english_seq_padded"].apply(safe_eval)

In [21]:
# Convert to NumPy arrays
lstm_bpe_X_train = np.array(lstm_bpe_preprocessed["burmese_seq_padded"].tolist(), dtype=np.int32)
lstm_bpe_y_train = np.array(lstm_bpe_preprocessed["english_seq_padded"].tolist(), dtype=np.int32)

print(f"X_train shape: {lstm_bpe_X_train.shape}")
print(f"y_train shape: {lstm_bpe_y_train.shape}")

X_train shape: (1627576, 128)
y_train shape: (1627576, 128)


In [22]:
# Callbacks: Early Stopping + Model Checkpoint
lstm_bpe_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lstm_bpe_checkpoint = ModelCheckpoint(f"{lstm_bpe_model_prefix}.h5", save_best_only=True, monitor='val_loss', mode='min')

In [None]:
# Train model
lstm_bpe_model.fit(
    lstm_bpe_X_train, 
    lstm_bpe_y_train, 
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs, 
    validation_split=0.1, 
    callbacks=[lstm_bpe_early_stopping, lstm_bpe_checkpoint]
)

Epoch 1/5


I0000 00:00:1738293014.912576    5404 service.cc:145] XLA service 0x7f27530fb0f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738293014.912615    5404 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-01-31 03:10:14.952954: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-01-31 03:10:15.510909: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1738293016.068799    5404 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


   29/45776 [..............................] - ETA: 14:22:09 - loss: 5.2780 - accuracy: 0.2249      

In [None]:
# model summary
lstm_bpe_model.summary()

In [None]:
# save lstm bpe model
lstm_bpe_model.save(lstm_bpe_model_prefix)

### Unigram

In [None]:
# load lstm unigram dataset
lstm_unigram_preprocessed = load_models_df("lstm_unigram_preprocessed")

In [None]:
# convert sequences to numpy arrays
# After the CSV round-trip each padded sequence is a string such as
# "[12, 7, 0, ...]"; parse it back into a list first (same as the BPE path),
# otherwise np.array() would build an array of strings.
lstm_unigram_X_train = np.array(
    lstm_unigram_preprocessed["burmese_seq_padded"].apply(safe_eval).tolist(), dtype=np.int32
)
lstm_unigram_y_train = np.array(
    lstm_unigram_preprocessed["english_seq_padded"].apply(safe_eval).tolist(), dtype=np.int32
)

In [None]:
# train lstm unigram model
# Mirrors the BPE run so the two models are trained under the same regime:
# 10% of the data is held out for validation, training stops early when
# val_loss stops improving, and the best weights are checkpointed.
lstm_unigram_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lstm_unigram_checkpoint = ModelCheckpoint(
    "models/lstm_unigram_model.h5", save_best_only=True, monitor='val_loss', mode='min'
)

lstm_unigram_model.fit(
    lstm_unigram_X_train,
    lstm_unigram_y_train,
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs,
    validation_split=0.1,
    callbacks=[lstm_unigram_early_stopping, lstm_unigram_checkpoint]
)

In [None]:
# save lstm unigram model
lstm_unigram_model.save(f"models/lstm_unigram_model")

## Generate Predictions
Use trained LSTM models to generate translations for evaluation.

In [None]:
# batch size for lstm
lstm_predictions_batch_size = 32

In [None]:
# function to generate predictions
def lstm_generate_preditions_batch(model, tokenizer, input_seqs):
    """Run `model` over padded id sequences in batches and decode the output to text.

    Args:
        model: Keras model whose ``predict`` returns per-timestep class scores.
        tokenizer: SentencePiece processor used to decode predicted ids.
        input_seqs: iterable of padded id sequences. Items may be lists or
            their string form (as read back from CSV); strings are parsed
            with `safe_eval`.

    Returns:
        list[str]: one decoded text per input sequence, in input order.
    """
    # Sequences reloaded from CSV arrive as strings such as "[5, 17, 0, ...]".
    parsed_seqs = [safe_eval(seq) for seq in input_seqs]
    if len(parsed_seqs) == 0:
        return []
    input_seqs = np.array(parsed_seqs, dtype=np.int32)

    num_batches = int(np.ceil(len(input_seqs) / lstm_predictions_batch_size))
    translated_texts = []

    for i in tqdm(range(num_batches), desc="Translating in Batches"):
        batch_start = i * lstm_predictions_batch_size
        batch_end = min((i + 1) * lstm_predictions_batch_size, len(input_seqs))

        batch_input = input_seqs[batch_start:batch_end]  # Extract batch
        # verbose=0: tqdm already reports progress; a Keras bar per batch floods the output
        batch_scores = model.predict(batch_input, verbose=0)
        batch_ids = batch_scores.argmax(axis=-1)

        for source_ids, predicted_ids in zip(batch_input, batch_ids):
            # The model emits one token per input position, so positions past the
            # last non-zero (non-padding) input id carry no content; drop them
            # rather than decoding them into the text.
            non_pad_positions = np.flatnonzero(source_ids)
            length = int(non_pad_positions[-1]) + 1 if non_pad_positions.size else 0
            translated_texts.append(tokenizer.DecodeIds(predicted_ids[:length].tolist()))

    return translated_texts

### BPE

In [None]:
# load lstm bpe dataset
lstm_bpe_predictions = load_models_df("lstm_bpe_preprocessed")

In [None]:
# load lstm bpe model
lstm_bpe_model = tf.keras.models.load_model(f"models/lstm_bpe_model")

In [None]:
# make predictions
lstm_bpe_predictions["generated"] = lstm_generate_preditions_batch(
    lstm_bpe_model, spt_models["bpe"], lstm_bpe_predictions["burmese_seq_padded"]
)

In [None]:
# save lstm bpe predictions
save_models_df(lstm_bpe_predictions, "lstm_bpe_predictions")

### Unigram

In [None]:
# load lstm unigram dataset
lstm_unigram_predictions = load_models_df("lstm_unigram_preprocessed")

In [None]:
# load lstm unigram model
lstm_unigram_model = tf.keras.models.load_model(f"models/lstm_unigram_model")

In [None]:
# make predictions
lstm_unigram_predictions["generated"] = lstm_generate_preditions_batch(
    lstm_unigram_model, spt_models["unigram"], lstm_unigram_predictions["burmese_seq_padded"]
)

In [None]:
# save lstm unigram predictions
save_models_df(lstm_unigram_predictions, "lstm_unigram_predictions")

## Evaluate Model Performance
Compute BLEU, ROUGE, and Perplexity scores.

In [None]:
# load lstm predictions
lstm_evaluation_results_datasets = {
    model_name: load_models_df(f"lstm_{model_name}_predictions") for model_name in spt_models.keys()
}

### Compute BLEU Score

In [None]:
# compute bleu score
for model_name in lstm_evaluation_results_datasets.keys():
    lstm_evaluation_results_datasets[model_name]["bleu"] = lstm_evaluation_results_datasets[model_name].progress_apply(
        lambda x: compute_bleu(x["english"], x["generated"]), axis=1
    )

In [None]:
# display bleu score
for model_name in lstm_evaluation_results_datasets.keys():
    print(f"{model_name} BLEU Score: {lstm_evaluation_results_datasets[model_name]['bleu'].mean()}")

In [None]:
# save bleu score
for model_name in lstm_evaluation_results_datasets.keys():
    # Save the per-model DataFrame; the enclosing dict has no `to_csv`.
    save_tmp_df(lstm_evaluation_results_datasets[model_name], f"lstm_{model_name}_predictions_bleu")

### Compute ROUGE Score

In [None]:
# Compute ROUGE scores
lstm_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

In [None]:
# compute rouge score
for model_name in lstm_evaluation_results_datasets.keys():
    lstm_evaluation_results_datasets[model_name]["rouge"] = lstm_evaluation_results_datasets[model_name].progress_apply(
        lambda x: lstm_scorer.score(x["english"], x["generated"])["rougeL"].fmeasure, axis=1
    )

In [None]:
# display rouge score
for model_name in lstm_evaluation_results_datasets.keys():
    print(f"{model_name} ROUGE Score: {lstm_evaluation_results_datasets[model_name]['rouge'].mean()}")

In [None]:
# save rouge score
for model_name in lstm_evaluation_results_datasets.keys():
    # Save the per-model DataFrame; the enclosing dict has no `to_csv`.
    save_tmp_df(lstm_evaluation_results_datasets[model_name], f"lstm_{model_name}_predictions_rouge")

### Compute Perplexity Score

In [None]:
# load lstm perplexity model
lstm_perplexity_model_name = "bert-base-multilingual-cased"

# Load by the name defined above; `model_name` is only a leftover loop variable
# from earlier cells (e.g. "unigram") and is not a Hugging Face model id.
lstm_perplexity_tokenizer = AutoTokenizer.from_pretrained(lstm_perplexity_model_name)
lstm_perplexity_model = AutoModelForMaskedLM.from_pretrained(lstm_perplexity_model_name).to(device)
lstm_perplexity_model.eval()  # Set to evaluation mode

In [None]:
# batch size for compute perplexity
lstm_perplexity_batch_size = 64

In [None]:
# function to compute perplexity
def compute_perplexity_lstm_batch(texts):
    """Score each text with the masked-LM loaded in `lstm_perplexity_model`.

    Texts are processed in chunks of `lstm_perplexity_batch_size`. Items that
    are not strings (e.g. NaN from an empty CSV cell) are scored as the empty
    string instead of crashing the tokenizer.

    NOTE(review): the score is exp(-mean(log_softmax(logits))) averaged over
    every position and the whole vocabulary, not over the log-probability of
    the observed tokens, and padded positions are included — confirm this is
    the metric intended to be reported as "perplexity".

    Returns:
        list: one score per input text, in input order.
    """
    if len(texts) == 0:
        return []

    valid_texts = [text if isinstance(text, str) else "" for text in texts]
    num_batches = int(np.ceil(len(valid_texts) / lstm_perplexity_batch_size))
    perplexities = []

    for i in tqdm(range(num_batches), desc="Computing Perplexity in Batches"):
        batch_start = i * lstm_perplexity_batch_size
        batch_end = min((i + 1) * lstm_perplexity_batch_size, len(valid_texts))

        batch_texts = valid_texts[batch_start:batch_end]

        # Tokenize batch
        inputs = lstm_perplexity_tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True).to(device)

        # Compute the per-text score without tracking gradients
        with torch.no_grad():
            outputs = lstm_perplexity_model(**inputs)
            log_likelihood = F.log_softmax(outputs.logits, dim=-1)
            batch_perplexity = torch.exp(-log_likelihood.mean(dim=[1, 2])).cpu().numpy()  # Move to CPU for storage

        perplexities.extend(batch_perplexity)

    return perplexities

### BPE

In [None]:
# compute perplexity
lstm_evaluation_results_datasets["bpe"]["perplexity"] = compute_perplexity_lstm_batch(
    lstm_evaluation_results_datasets["bpe"]["generated"].tolist()
)

In [None]:
# display Perplexity
# Index by the explicit "bpe" key rather than a loop variable leaked from an earlier cell.
print(f"Perplexity Score: {lstm_evaluation_results_datasets['bpe']['perplexity'].mean()}")

In [None]:
# save perplexity
# Save the BPE DataFrame; the enclosing dict has no `to_csv`.
save_tmp_df(lstm_evaluation_results_datasets["bpe"], f"lstm_bpe_predictions_perplexity")

### Unigram

In [None]:
# compute perplexity
lstm_evaluation_results_datasets["unigram"]["perplexity"] = compute_perplexity_lstm_batch(
    lstm_evaluation_results_datasets["unigram"]["generated"].tolist()
)

In [None]:
# display Perplexity
# Index by the explicit "unigram" key rather than a loop variable leaked from an earlier cell.
print(f"Perplexity Score: {lstm_evaluation_results_datasets['unigram']['perplexity'].mean()}")

In [None]:
# save perplexity
# Save the unigram DataFrame; the enclosing dict has no `to_csv`.
save_tmp_df(lstm_evaluation_results_datasets["unigram"], f"lstm_unigram_predictions_perplexity")

### Save Evaluation Results

In [None]:
# combine evaluation results
# For each tokenizer variant, pull the three metric columns back from their
# tmp CSVs, attach them to the predictions frame and save the merged result.
for model_name in lstm_evaluation_results_datasets.keys():
    print(f"Processing {model_name}...")
    # load bleu and set
    bleu = load_tmp_df(f"lstm_{model_name}_predictions_bleu")
    lstm_evaluation_results_datasets[model_name]["bleu"] = bleu["bleu"]

    # load rouge and set
    rouge = load_tmp_df(f"lstm_{model_name}_predictions_rouge")
    lstm_evaluation_results_datasets[model_name]["rouge"] = rouge["rouge"]

    # load perplexity and set (read from the perplexity frame, not the rouge one)
    perplexity = load_tmp_df(f"lstm_{model_name}_predictions_perplexity")
    lstm_evaluation_results_datasets[model_name]["perplexity"] = perplexity["perplexity"]

    save_models_df(lstm_evaluation_results_datasets[model_name], f"lstm_{model_name}_evaluation_results")

    display(lstm_evaluation_results_datasets[model_name].head())

# 2. Implementing Multilingual Transformer Baseline

## Data Preprocessing

In [10]:
# datasets
multilingual_datasets = {
    "normal": [
        "myxnli_normalized_1", 
        "myxnli_normalized_2", 
        "alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "myxnli_nllb_back_translated_final_1", 
        "myxnli_nllb_back_translated_final_2", 
        "alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "myxnli_seamless_m4t_back_translated_final_1",
        "myxnli_seamless_m4t_back_translated_final_2",
        "alt_combined_seamless_m4t_back_translated_final"
    ],
}

In [11]:
# Load and process dataset
def load_and_rename_columns_multilingual(file_name):
    """Load one CSV from ``gen/`` and return only its english/burmese columns.

    Back-translated files name these columns ``english_back_translated`` and
    ``burmese_translated``; they are renamed to ``english`` / ``burmese``.
    """
    renames = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
    }

    loaded = load_gen_df(f"{file_name}")
    renamed = loaded.rename(columns=renames)

    # Selecting by name raises KeyError if a required column is missing
    return renamed[["english", "burmese"]]

In [12]:
# Load and process datasets
mutlilingual_loaded_datasets = {}
for key, file_list in multilingual_datasets.items():
    mutlilingual_loaded_datasets[key] = [load_and_rename_columns_multilingual(file) for file in file_list]

In [13]:
# combine all datasets
multilingual_combined = pd.concat(
    mutlilingual_loaded_datasets["normal"] + 
    mutlilingual_loaded_datasets["nllb_back_translated"] + 
    mutlilingual_loaded_datasets["seamless_m4t_back_translated"],
    ignore_index=True
)

In [14]:
# Shuffle the data to prevent order bias.
# A fixed seed keeps the saved row order identical across kernel restarts.
multilingual_combined = multilingual_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
# print length
print(f"Multilingual dataset length: {len(multilingual_combined)}")

Multilingual dataset length: 1627576


In [17]:
# save data
save_models_df(multilingual_combined, "multilingual_combined")

## Generate Predictions
Load ```mBERT``` and ```XLM-R``` for Masked Language Modeling (MLM).
MLM helps predict missing words in Burmese sequences.

In [9]:
# Define model names
multilingual_model_names = {
    "mbert": "bert-base-multilingual-cased",
    "xlmr": "xlm-roberta-base"
}

In [12]:
# Function to generate masked predictions
def generate_masked_predictions_batch(texts, tokenizer, model):
    """Mask one random token per text, let `model` fill it in, and return the texts.

    Args:
        texts: iterable of input texts; non-string items (NaN/None) are
            treated as the empty string.
        tokenizer: Hugging Face tokenizer matching `model`.
        model: masked-language model; it must already be on the global `device`.

    Returns:
        list[str]: one text per input. A text too short to mask (fewer than
        three tokens including the two boundary tokens) is returned unchanged;
        otherwise the token sequence with the model's prediction at the masked
        position is decoded back to text.

    NOTE(review): decoded text is the tokenizer's detokenized form, so spacing
    or characters the vocabulary cannot represent may differ from the raw
    input — confirm this is acceptable for the downstream scores.
    """
    # Ensure all inputs are strings, replace NaN/None with an empty string
    valid_texts = [text if isinstance(text, str) else "" for text in texts]
    if not valid_texts:
        return []

    # Tokenize batch
    inputs = tokenizer(valid_texts, return_tensors="pt", truncation=True, padding=True).to(device)
    input_ids = inputs["input_ids"]

    # Number of real (non-padding) tokens per row.
    # Assumes right-padding with one special token at each end of the real
    # tokens — TODO confirm for any tokenizer other than mBERT / XLM-R.
    real_lengths = inputs["attention_mask"].sum(dim=1).tolist()

    # Pick the mask position inside each row's own tokens (never on padding or
    # on the first/last token); rows with no maskable token get None.
    mask_indices = [
        torch.randint(1, length - 1, (1,)).item() if length > 2 else None
        for length in real_lengths
    ]

    # Apply masking
    for row, idx in enumerate(mask_indices):
        if idx is not None:
            input_ids[row, idx] = tokenizer.mask_token_id  # Replace token with [MASK]

    # Run model
    with torch.no_grad():
        logits = model(**inputs).logits

    # Write the predicted id into the masked slot and decode the row. The
    # raw text never contains the mask token, so a string replace on it
    # cannot insert the prediction.
    masked_replaced_texts = []
    for row, idx in enumerate(mask_indices):
        if idx is None:
            masked_replaced_texts.append(valid_texts[row])  # Return original text if no masking was possible
            continue
        filled_ids = input_ids[row, : real_lengths[row]].clone()
        filled_ids[idx] = torch.argmax(logits[row, idx], dim=-1)
        masked_replaced_texts.append(tokenizer.decode(filled_ids, skip_special_tokens=True))

    return masked_replaced_texts

In [None]:
# multinlingual batch size
multinlingual_predictions_batch_size = 32

### mBERT

In [None]:
# Load tokenizers & models for both mBERT
multilingual_mbert_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["mbert"])
multilingual_mbert_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["mbert"]).to(device)

In [None]:
# load multilingual dataset
multilingual_mbert_predictions = load_models_df("multilingual_combined")

In [None]:
# generate predictions masks
# Feed the Burmese column to mBERT in slices of
# `multinlingual_predictions_batch_size`, passing the tokenizer and model
# objects that `generate_masked_predictions_batch(texts, tokenizer, model)` expects.
multilingual_mbert_predictions["generated"] = [
    pred
    for batch_start in tqdm(
        range(0, len(multilingual_mbert_predictions), multinlingual_predictions_batch_size),
        desc="Generating mbert"
    )
    for pred in generate_masked_predictions_batch(
        multilingual_mbert_predictions["burmese"][batch_start : batch_start + multinlingual_predictions_batch_size],
        multilingual_mbert_tokenizer,
        multilingual_mbert_model,
    )
]

In [None]:
# display predictions
display(multilingual_mbert_predictions.head())

In [None]:
# save prediction
save_models_df(multilingual_mbert_predictions, "multilingual_mbert_predictions")

### XLM-R

In [None]:
# Load tokenizers & models for both xlmr
multilingual_xlmr_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["xlmr"])
multilingual_xlmr_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["xlmr"]).to(device)

In [None]:
# load multilingual dataset
multilingual_xlmr_predictions = load_models_df("multilingual_combined")

In [None]:
# generate predictions masks
# Feed the Burmese column to XLM-R in slices of
# `multinlingual_predictions_batch_size`, passing the tokenizer and model
# objects that `generate_masked_predictions_batch(texts, tokenizer, model)` expects.
multilingual_xlmr_predictions["generated"] = [
    pred
    for batch_start in tqdm(
        range(0, len(multilingual_xlmr_predictions), multinlingual_predictions_batch_size),
        desc="Generating xlmr"
    )
    for pred in generate_masked_predictions_batch(
        multilingual_xlmr_predictions["burmese"][batch_start : batch_start + multinlingual_predictions_batch_size],
        multilingual_xlmr_tokenizer,
        multilingual_xlmr_model,
    )
]

In [None]:
# display predictions
display(multilingual_xlmr_predictions.head())

In [None]:
# save prediction
save_models_df(multilingual_xlmr_predictions, "multilingual_xlmr_predictions")

## Evaluate Model Performance
Compare BLEU, ROUGE, and Perplexity scores between ```mBERT``` and ```XLM-R```.

In [None]:
# load multilingual predictions
multilingual_evaluation_results_datasets = {
    model_name: load_models_df(f"multilingual_{model_name}_predictions") for model_name in multilingual_model_names.keys()
}

# The metric cells below read one wide frame, `mBERT_XLMR_evaludation_results`,
# with a `<model>_generated` column per model. Build it here so those cells do
# not depend on a variable left over from an earlier session. Both prediction
# files were written from the same "multilingual_combined" rows, so the frames
# line up by position; the length check guards that assumption.
mBERT_XLMR_evaludation_results = multilingual_evaluation_results_datasets["mbert"][["english", "burmese"]].copy()
for model_name, model_predictions in multilingual_evaluation_results_datasets.items():
    assert len(model_predictions) == len(mBERT_XLMR_evaludation_results), f"row count mismatch for {model_name}"
    mBERT_XLMR_evaludation_results[f"{model_name}_generated"] = model_predictions["generated"]

### Compute BLEU Score

In [14]:
# Function to compute BLEU score
def compute_bleu_multilingual(reference, prediction):
    """Return the sentence-level BLEU score of `prediction` against `reference`.

    Delegates to `compute_bleu` so the LSTM and multilingual evaluations are
    scored by the same code. A non-string argument (e.g. NaN from an empty
    CSV cell) yields 0.0 instead of raising on ``.split()``.
    """
    if not isinstance(reference, str) or not isinstance(prediction, str):
        return 0.0
    return compute_bleu(reference, prediction)

In [None]:
# process all models in batch
lstm_bleu_batch_size = 32  # NOTE(review): not read by this cell; kept in case another cell uses it
for model_name in multilingual_model_names:
    print(f"Processing {model_name}...")

    # Walk the reference and prediction columns in parallel; this visits the
    # rows in the same order as iterrows() without building a Series per row.
    mBERT_XLMR_evaludation_results[f"{model_name}_bleu"] = [
        compute_bleu_multilingual(reference, prediction)
        for reference, prediction in tqdm(
            zip(
                mBERT_XLMR_evaludation_results["english"],
                mBERT_XLMR_evaludation_results[f"{model_name}_generated"],
            ),
            total=len(mBERT_XLMR_evaludation_results),
            desc=f"Computing BLEU {model_name}"
        )
    ]

In [None]:
# Display BLEU scores
for model_name in multilingual_model_names:
    print(f"{model_name} BLEU Score: {mBERT_XLMR_evaludation_results[f'{model_name}_bleu'].mean()}")

In [None]:
# save bleu scores
save_tmp_df(mBERT_XLMR_evaludation_results, "mBERT_XLMR_evaludation_results_bleu")

### Compute ROUGE Score

In [None]:
# Compute ROUGE Score
multilingual_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
for model_name in multilingual_model_names:
    mBERT_XLMR_evaludation_results[f"{model_name}_rouge"] = mBERT_XLMR_evaludation_results.progress_apply(
        lambda row: multilingual_rouge_scorer.score(row["english"], row[f"{model_name}_generated"])["rougeL"].fmeasure, axis=1
    )

In [None]:
# Display ROUGE scores
for model_name in multilingual_model_names:
    print(f"{model_name} ROUGE Score: {mBERT_XLMR_evaludation_results[f'{model_name}_rouge'].mean()}")

In [None]:
# save ROUGE scores
save_tmp_df(mBERT_XLMR_evaludation_results, "mBERT_XLMR_evaludation_results_rouge")

### Compute Perplexity Score

In [None]:
# function to compute perplexity
def compute_perplexity_multilingual_batch(texts, model_name):
    """Score one batch of texts with the masked-LM selected by `model_name`.

    Args:
        texts: list of texts; non-string items (NaN/None) are scored as "".
        model_name: "mbert" or "xlmr".

    Returns:
        list[float]: one score per input text, in input order.

    NOTE(review): the score is exp(-mean(log_softmax(logits))) averaged over
    every position and the whole vocabulary, padded positions included —
    confirm this is the metric intended to be reported as "perplexity".
    """
    if len(texts) == 0:
        return []

    # Resolve the tokenizer/model from the objects loaded in the prediction
    # section of this notebook.
    tokenizers_by_name = {
        "mbert": multilingual_mbert_tokenizer,
        "xlmr": multilingual_xlmr_tokenizer,
    }
    models_by_name = {
        "mbert": multilingual_mbert_model,
        "xlmr": multilingual_xlmr_model,
    }
    tokenizer = tokenizers_by_name[model_name]
    model = models_by_name[model_name].to(device)

    # Ensure all inputs are valid strings and replace NaN/None
    valid_texts = [text if isinstance(text, str) else "" for text in texts]

    # Tokenize batch
    inputs = tokenizer(valid_texts, return_tensors="pt", truncation=True, padding=True).to(device)

    # Run the model in batch
    with torch.no_grad():
        outputs = model(**inputs)

    # Log-probabilities over the vocabulary at every position
    log_likelihood = F.log_softmax(outputs.logits, dim=-1)

    # One score per sentence in the batch
    perplexities = torch.exp(-log_likelihood.mean(dim=(1, 2))).tolist()

    return perplexities

In [None]:
multilingual_perplexity_batch_size = 32

for model_name in multilingual_model_names:
    print(f"Computing Perplexity for {model_name}...")

    # Process in batches. Rows are NOT dropped here: the scorer already maps
    # non-string values to "", and dropping NaN per batch would leave fewer
    # scores than rows, making the column assignment below fail.
    perplexity_scores = []
    for batch_start in tqdm(
        range(0, len(mBERT_XLMR_evaludation_results), multilingual_perplexity_batch_size),
        desc=f"Perplexity {model_name}"
    ):
        batch = mBERT_XLMR_evaludation_results[f"{model_name}_generated"][
            batch_start : batch_start + multilingual_perplexity_batch_size
        ].tolist()
        perplexity_scores.extend(compute_perplexity_multilingual_batch(batch, model_name))

    # Store perplexity scores in DataFrame
    mBERT_XLMR_evaludation_results[f"{model_name}_perplexity"] = perplexity_scores

In [None]:
# Display Perplexity scores
for model_name in multilingual_model_names:
    # Label fixed: this prints the mean perplexity column, not BLEU.
    print(f"{model_name} Perplexity Score: {mBERT_XLMR_evaludation_results[f'{model_name}_perplexity'].mean()}")

In [None]:
# save perplexity scores
save_tmp_df(mBERT_XLMR_evaludation_results, "mBERT_XLMR_evaludation_results_perplexity")

### Save Model Results

In [None]:
# Save results
save_models_df(mBERT_XLMR_evaludation_results, "mBERT_XLMR_evaludation_results")
print("Results saved successfully!")

# 3. Benchmarking and Analysis
Compare the performance of LSTM, mBERT, and XLM-R using BLEU, ROUGE, and Perplexity.