# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece evaluate sacrebleu bert-score

In [None]:
!conda update -n base -c conda-forge conda -y

In [None]:
!conda install conda-forge::rouge-score -y

In [1]:
import pandas as pd
import tensorflow as tf
import torch
import ast
import numpy as np
import pprint
import sentencepiece as spm
import matplotlib.pyplot as plt
import torch.nn.functional as F
import bert_score
import evaluate
from tqdm.notebook import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer, AutoModelForMaskedLM, logging
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from torch.utils.data import DataLoader, Dataset
from rouge_score import rouge_scorer
from sacrebleu import corpus_chrf

<<<<<<< local
2025-02-03 08:36:18.426890: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-03 08:36:18.443135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-03 08:36:18.468917: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-03 08:36:18.468946: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-03 08:36:18.484364: I tensorflow/core/platform/c

# Set GPU

## Mac

In [2]:
# for mac
# List every device TensorFlow can see (CPU and GPU alike).
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

# Enable on-demand memory allocation for each TensorFlow GPU and report it.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found. Using CPU.")
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
    details = tf.config.experimental.get_device_details(gpu)
    print("GPU details: ", details)

# set GPU device
# PyTorch side: use Apple's MPS backend when it is available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


## Windows / Linux

In [2]:
# for Windows / Linux (CUDA), with a fallback to Apple MPS and then CPU
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

# Both this cell and the Mac cell assign `device`. Falling back to MPS here
# (instead of straight to CPU) keeps a "Run All" on a Mac from silently
# downgrading the device that the Mac cell selected.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

Tensorflow GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using PyTorch device: cuda
GPU Name: Tesla T4


<<<<<<< local
2025-02-03 08:36:25.177779: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:36:25.225423: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:36:25.227299: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs

# Common Variables

In [3]:
# Load BLEU and ROUGE metric objects
bleu_metric = evaluate.load("bleu")

# This cell rebinds the name `rouge_scorer` from the imported module to a
# RougeScorer instance (compute_metrics relies on that instance). Importing the
# module under a private alias first makes the cell safe to run more than once;
# without it a second run fails because the instance has no `RougeScorer`.
from rouge_score import rouge_scorer as _rouge_scorer_module
rouge_scorer = _rouge_scorer_module.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

# Class

In [4]:
# Custom dataset class for batching
class TextDataset(Dataset):
    """Expose a collection of texts as a map-style dataset of plain strings."""

    def __init__(self, texts):
        # None is stored as an empty string; every other value goes through str().
        normalized = []
        for item in texts:
            normalized.append("" if item is None else str(item))
        self.texts = normalized

    def __len__(self):
        """Number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the stored string at position ``idx``."""
        return self.texts[idx]

# Functions

In [5]:
# function to save models df
def save_models_df(df, df_name):
    """Write ``df`` to ``models/<df_name>.csv`` as UTF-8 without the index column.

    The target directory is created when it does not exist yet, so the first
    save on a fresh checkout does not fail.

    Args:
        df: DataFrame to persist.
        df_name: file name without the ``.csv`` extension.
    """
    import os  # local import keeps this cell self-contained

    target_path = f"models/{df_name}.csv"
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    df.to_csv(target_path, index=False, encoding="utf-8")

In [6]:
# function to save tmp df
def save_tmp_df(df, df_name):
    """Write ``df`` to ``tmp/<df_name>.csv`` as UTF-8 without the index column.

    The target directory is created when it does not exist yet, so the first
    save on a fresh checkout does not fail.

    Args:
        df: DataFrame to persist.
        df_name: file name without the ``.csv`` extension.
    """
    import os  # local import keeps this cell self-contained

    target_path = f"tmp/{df_name}.csv"
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    df.to_csv(target_path, index=False, encoding="utf-8")

In [7]:
# function to load spt df
def load_spt_df(df_name):
    """Read ``spt/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = "spt/{}.csv".format(df_name)
    return pd.read_csv(csv_path, encoding="utf-8", header=0)

In [8]:
# function to load models df
def load_models_df(df_name):
    """Read ``models/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = "models/{}.csv".format(df_name)
    return pd.read_csv(csv_path, encoding="utf-8", header=0)

In [9]:
# function to load gen df
def load_gen_df(df_name):
    """Read ``gen/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = "gen/{}.csv".format(df_name)
    return pd.read_csv(csv_path, encoding="utf-8", header=0)

In [10]:
# function to load tmp df
def load_tmp_df(df_name):
    """Read ``tmp/<df_name>.csv`` (UTF-8, first row as header) into a DataFrame."""
    csv_path = "tmp/{}.csv".format(df_name)
    return pd.read_csv(csv_path, encoding="utf-8", header=0)

In [11]:
# function to compute bleu score
def compute_bleu(reference, prediction):
    """Smoothed sentence-level BLEU between two whitespace-tokenized strings.

    Only unigram and bigram precision contribute (weights 0.5 each); the
    smoothing function is NLTK's ``method1``.
    """
    reference_tokens = reference.split()
    candidate_tokens = prediction.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        weights=(0.5, 0.5, 0, 0),
        smoothing_function=SmoothingFunction().method1,
    )

In [12]:
def safe_eval(val):
    """Parse ``val`` with ``ast.literal_eval`` when it is a string; return any other value unchanged."""
    if not isinstance(val, str):
        return val
    return ast.literal_eval(val)

In [13]:
# function to compute BLEU, ROUGE, chrF-S, and BERTScore.
def compute_metrics(dataset):
    """Score ``dataset["generated"]`` against ``dataset["burmese"]`` in place.

    Columns written to ``dataset``:
        bleu        per-row smoothed sentence BLEU over whitespace tokens
        rouge-1/2/l per-row F-measure from the notebook-level ``rouge_scorer``
        chrf-s      one corpus-level chrF score, repeated on every row
        bert_score  per-row value taken from index 2 of ``bert_score.score``'s
                    result (the F1 component per the bert-score docs)

    Args:
        dataset: DataFrame with ``generated`` and ``burmese`` columns; both are
            cast to ``str`` before scoring.

    Returns:
        None. The DataFrame is modified in place.
    """
    # Plain Python lists: bert_score concatenates its inputs with ``+``, which
    # on pandas Series would be an element-wise string concatenation.
    predictions = dataset["generated"].astype(str).tolist()
    references = dataset["burmese"].astype(str).tolist()

    print("Calculating BLEU...")
    smooth_fn = SmoothingFunction().method1
    dataset["bleu"] = [
        sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth_fn)
        for pred, ref in zip(predictions, references)
    ]

    print("Calculating ROUGE...")
    # RougeScorer.score expects (target, prediction) in that order.
    # NOTE(review): rouge_score's default tokenizer appears to keep only ASCII
    # alphanumerics, so Burmese text may tokenize to nothing — confirm.
    rouge_scores = [rouge_scorer.score(ref, pred) for pred, ref in zip(predictions, references)]
    dataset["rouge-1"] = [r["rouge1"].fmeasure for r in rouge_scores]
    dataset["rouge-2"] = [r["rouge2"].fmeasure for r in rouge_scores]
    dataset["rouge-l"] = [r["rougeL"].fmeasure for r in rouge_scores]

    print("Calculating chrF-S...")
    # sacrebleu takes a list of reference *streams*, each holding one reference
    # per hypothesis. With a single reference per sentence that is one stream.
    dataset["chrf-s"] = corpus_chrf(predictions, [references]).score

    print("Calculating BERTScore...")
    bert_scores = bert_score.score(predictions, references, lang="my")
    dataset["bert_score"] = bert_scores[2].tolist()

In [14]:
# Function to compute perplexity in batch
def compute_perplexity_batch(texts, tokenizer, model):
    """Return one perplexity-style score per text in ``texts``.

    Each text is tokenized, fed through ``model`` once, and scored by the mean
    log-probability the model assigns to the input ids themselves (padding
    excluded). The score is ``exp(-mean_log_prob)``.

    NOTE(review): the inputs are scored without masking any token, so for a
    masked-LM this is not the usual pseudo-perplexity — confirm this is the
    intended definition.

    Args:
        texts: iterable of texts; anything that is not a ``str`` is scored as "".
        tokenizer: callable accepting ``(texts, return_tensors="pt",
            truncation=True, padding=True)`` whose result supports ``.to(device)``
            and provides ``input_ids`` and ``attention_mask``.
        model: callable returning an object with ``.logits`` indexed as
            (batch, position, vocab).

    Returns:
        list[float]: one score per input text, in input order.
    """
    # Ensure all inputs are valid strings
    valid_texts = [str(text) if isinstance(text, str) else "" for text in texts]

    # Tokenize batch with padding & truncation; uses the notebook-level `device`
    inputs = tokenizer(valid_texts, return_tensors="pt", truncation=True, padding=True).to(device)

    # Forward pass without autograd bookkeeping
    with torch.inference_mode():
        outputs = model(**inputs)

    logits = outputs.logits  # (batch_size, seq_len, vocab_size)
    log_probs = F.log_softmax(logits, dim=-1)

    # Log-probability assigned to each actual input token
    target_ids = inputs["input_ids"]  # (batch_size, seq_len)
    log_likelihood = log_probs.gather(dim=-1, index=target_ids.unsqueeze(-1)).squeeze(-1)

    # Zero out padding positions
    mask = inputs["attention_mask"]  # (batch_size, seq_len)
    masked_log_likelihood = log_likelihood * mask

    # Mean over real tokens; the clamp avoids 0/0 if a row has no tokens at all
    token_counts = mask.sum(dim=1).clamp(min=1)
    sentence_log_likelihood = masked_log_likelihood.sum(dim=1) / token_counts

    # Clamp values to avoid numerical instability
    sentence_log_likelihood = torch.clamp(sentence_log_likelihood, min=-100, max=100)

    # Convert log-likelihood to perplexity. exp(100) overflows float32 (limit is
    # roughly exp(88.7)), so exponentiate in float64. The tensor is moved to the
    # CPU first because the MPS backend has no float64 support.
    log_perplexity = -sentence_log_likelihood
    perplexities = torch.exp(log_perplexity.cpu().double())

    # Print warning if perplexities still contain `inf`
    if torch.isinf(perplexities).any():
        print("\n [WARNING] Perplexity contains `inf` for some texts!")
        print(f"  Log-Likelihood Shape: {log_likelihood.shape}")
        print(f"  Log-Likelihood Mean: {sentence_log_likelihood.mean().item()}")
        print(f"  Computed Perplexity Values: {perplexities}")

    return perplexities.tolist()

# Set settings

In [15]:
tqdm.pandas()

In [16]:
# Suppress specific warnings from the transformers library
logging.set_verbosity_error()

# 1. Implementing RNN/LSTM Baseline

In [17]:
# One SentencePiece processor per tokenization scheme, keyed by scheme name.
spt_models = {
    scheme: spm.SentencePieceProcessor(f"spt/spt_{scheme}.model")
    for scheme in ("bpe", "unigram")
}

## Data Preprocessing
Load SPT-tokenized datasets, convert to sequences, and apply padding.

### Load Data

In [18]:
# function to load datasets
def get_lstm_datasets(model_name):
    """Return the tokenized dataset file names for ``model_name``, grouped by data source.

    Every name has the form ``tokenized_<model_name>_<dataset>``; the groups
    are ``normal``, ``nllb_back_translated`` and ``seamless_m4t_back_translated``.
    """
    prefix = f"tokenized_{model_name}"
    dataset_suffixes = {
        "normal": (
            "myxnli_normalized_1",
            "myxnli_normalized_2",
            "alt_combined_normalized",
        ),
        "nllb_back_translated": (
            "myxnli_nllb_back_translated_final_1",
            "myxnli_nllb_back_translated_final_2",
            "alt_combined_nllb_back_translated_final",
        ),
        "seamless_m4t_back_translated": (
            "myxnli_seamless_m4t_back_translated_final_1",
            "myxnli_seamless_m4t_back_translated_final_2",
            "alt_combined_seamless_m4t_back_translated_final",
        ),
    }
    return {
        group: [f"{prefix}_{suffix}" for suffix in suffixes]
        for group, suffixes in dataset_suffixes.items()
    }

In [19]:
# Load and process dataset
def load_and_rename_columns_lstm(file_name):
    """Load ``spt/<file_name>.csv`` and return it with a uniform four-column layout.

    Back-translated column names are mapped onto the plain ones, then only
    ``english``, ``burmese``, ``english_tokens`` and ``burmese_tokens`` are
    kept (a missing column raises ``KeyError``).
    """
    renames = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
        "english_back_translated_tokens": "english_tokens",
        "burmese_translated_tokens": "burmese_tokens",
    }
    required_columns = ["english", "burmese", "english_tokens", "burmese_tokens"]

    loaded = load_spt_df(f"{file_name}")
    return loaded.rename(columns=renames)[required_columns]

In [20]:
# Load all datasets
# Result layout: {tokenizer name: {data source: [DataFrame, ...]}}
lstm_all_datasets = {}
for model_name in spt_models.keys():
    datasets = get_lstm_datasets(model_name)

    loaded_by_source = {}
    for key, file_list in datasets.items():
        loaded_by_source[key] = [load_and_rename_columns_lstm(file) for file in file_list]
    lstm_all_datasets[model_name] = loaded_by_source

In [21]:
# combine all datasets
# Stack every frame of every data source into one frame per tokenizer,
# with a fresh 0..n-1 index.
lstm_all_datasets_combined = {}
for model_name in lstm_all_datasets.keys():
    frames_in_order = []
    for source_frames in lstm_all_datasets[model_name].values():
        frames_in_order.extend(source_frames)
    lstm_all_datasets_combined[model_name] = pd.concat(frames_in_order, ignore_index=True)

In [22]:
# Shuffle the data to prevent order bias.
# A fixed random_state makes the row order identical on every run. Rows with
# missing values are NOT removed here; the encoding step maps non-string token
# cells to an empty sequence instead.
for model_name in lstm_all_datasets_combined.keys():
    lstm_all_datasets_combined[model_name] = (
        lstm_all_datasets_combined[model_name]
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

In [23]:
# display of datasets
for model_name in lstm_all_datasets_combined.keys():
    print(f"{model_name} dataset length: {len(lstm_all_datasets_combined[model_name])}")

bpe dataset length: 1627576
unigram dataset length: 1627576


### Apply padding

In [16]:
# convert tokenized sequences to lists
# Each string cell is encoded to SentencePiece ids with the matching processor;
# non-string cells (e.g. missing values) become an empty list.
for model_name in lstm_all_datasets_combined.keys():
    combined_df = lstm_all_datasets_combined[model_name]

    def encode_cell(cell, processor=spt_models[model_name]):
        if isinstance(cell, str):
            return processor.EncodeAsIds(str(cell))
        return []

    for language in ("english", "burmese"):
        combined_df[f"{language}_seq"] = combined_df[f"{language}_tokens"].progress_apply(encode_cell)

  0%|          | 0/1627576 [00:00<?, ?it/s]

  0%|          | 0/1627576 [00:00<?, ?it/s]

  0%|          | 0/1627576 [00:00<?, ?it/s]

  0%|          | 0/1627576 [00:00<?, ?it/s]

In [18]:
# Define maximum sequence length
lstm_max_seq_length = 128

In [19]:
# apply padding to sequences
# Every id sequence is right-padded to lstm_max_seq_length.
# NOTE(review): pad_sequences truncates over-long sequences from the front by
# default (truncating="pre") — confirm that dropping the start is intended.
for model_name in lstm_all_datasets_combined.keys():
    combined_df = lstm_all_datasets_combined[model_name]
    for language in ("english", "burmese"):
        padded = pad_sequences(
            combined_df[f"{language}_seq"], maxlen=lstm_max_seq_length, padding="post"
        )
        combined_df[f"{language}_seq_padded"] = padded.tolist()

In [20]:
# save lstm preprocess data
for model_name in lstm_all_datasets_combined.keys():
    save_models_df(lstm_all_datasets_combined[model_name], f"lstm_{model_name}_preprocessed")

## Define LSTM Model
Define an LSTM-based sequence-to-sequence (seq2seq) model with embedding layers.

In [18]:
# Hyperparameters
lstm_embedding_dim = 256
lstm_hidden_dim = 512

In [19]:
# Get vocabulary size from SentencePiece models
lstm_vocab_sizes = {model_name: sp.GetPieceSize() for model_name, sp in spt_models.items()}

In [20]:
# function to build lstm model
def build_lstm_model(vocab_size):
    """Build and compile the stacked LSTM model for a vocabulary of ``vocab_size`` ids.

    Layers, in order: embedding (id 0 masked) -> bidirectional LSTM -> LSTM ->
    per-position softmax over ``vocab_size``. Embedding width and LSTM units
    come from the notebook-level ``lstm_embedding_dim`` / ``lstm_hidden_dim``.
    Compiled with Adam (lr 0.001), sparse categorical cross-entropy, accuracy.
    """
    def make_recurrent_layer():
        # Both recurrent layers share the same configuration.
        return LSTM(lstm_hidden_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)

    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=lstm_embedding_dim, mask_zero=True))
    model.add(Bidirectional(make_recurrent_layer()))
    model.add(make_recurrent_layer())
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [21]:
# build lstm bpe model
lstm_bpe_model = build_lstm_model(lstm_vocab_sizes["bpe"])
lstm_bpe_model.summary()



2025-02-03 08:15:44.588870: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:15:44.591696: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:15:44.593767: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-













Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         8192000   
                                                                 
 bidirectional (Bidirection  (None, None, 1024)        3149824   
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, None, 512)         3147776   
                                                                 
 dense (Dense)               (None, None, 32000)       16416000  
                                                                 
Total params: 30905600 (117.90 MB)
Trainable params: 30905600 (117.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
# build lstm unigram model
lstm_unigram_model = build_lstm_model(lstm_vocab_sizes["unigram"])
lstm_unigram_model.summary()



2025-02-03 08:36:43.437264: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:36:43.512506: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:36:43.515652: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-













Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         8192000   
                                                                 
 bidirectional (Bidirection  (None, None, 1024)        3149824   
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, None, 512)         3147776   
                                                                 
 dense (Dense)               (None, None, 32000)       16416000  
                                                                 
Total params: 30905600 (117.90 MB)
Trainable params: 30905600 (117.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Train the Model
Train the model using Sparse Categorical Cross-Entropy loss & Adam optimizer.

In [22]:
# lstm model train batch size
lstm_train_batch_size = 64
lstm_train_epochs = 5

### BPE

In [25]:
# model prefix
lstm_bpe_model_prefix = "models/lstm_bpe_model"

In [26]:
# load lstm bpe dataset
lstm_bpe_preprocessed = load_models_df("lstm_bpe_preprocessed")

In [27]:
# Convert from string to list
lstm_bpe_preprocessed["burmese_seq_padded"] = lstm_bpe_preprocessed["burmese_seq_padded"].apply(safe_eval)
lstm_bpe_preprocessed["english_seq_padded"] = lstm_bpe_preprocessed["english_seq_padded"].apply(safe_eval)

In [28]:
# Convert to NumPy arrays
lstm_bpe_X_train = np.array(lstm_bpe_preprocessed["burmese_seq_padded"].tolist(), dtype=np.int32)
lstm_bpe_y_train = np.array(lstm_bpe_preprocessed["english_seq_padded"].tolist(), dtype=np.int32)

print(f"X_train shape: {lstm_bpe_X_train.shape}")
print(f"y_train shape: {lstm_bpe_y_train.shape}")

X_train shape: (1627576, 128)
y_train shape: (1627576, 128)


In [29]:
# Callbacks: Early Stopping + Model Checkpoint
lstm_bpe_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lstm_bpe_checkpoint = ModelCheckpoint(
    f"{lstm_bpe_model_prefix}.keras", 
    save_best_only=True, 
    monitor='val_loss', 
    mode='min',
    save_weights_only=False
)

In [30]:
# Warm-start from previously saved weights when they are available. Nothing in
# this notebook writes the .h5 file, so on a fresh run it may not exist; an
# unconditional load_weights would then stop "Run All" before training starts.
lstm_bpe_weights_path = f"{lstm_bpe_model_prefix}.h5"
if tf.io.gfile.exists(lstm_bpe_weights_path):
    lstm_bpe_model.load_weights(lstm_bpe_weights_path)
else:
    print(f"No saved weights at {lstm_bpe_weights_path}; training from scratch.")

In [None]:
# Train model
# The last 10% of the training arrays is held out for validation; the early
# stopping and checkpoint callbacks both monitor val_loss.
lstm_bpe_model.fit(
    lstm_bpe_X_train, 
    lstm_bpe_y_train, 
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs, 
    validation_split=0.1, 
    callbacks=[lstm_bpe_early_stopping, lstm_bpe_checkpoint]
)

# save lstm bpe model
# NOTE(review): this writes to the same path as the ModelCheckpoint callback
# (`{lstm_bpe_model_prefix}.keras`), so the best-val_loss checkpoint is replaced
# by whatever weights the model holds after fit() — confirm that is intended.
lstm_bpe_model.save(f"{lstm_bpe_model_prefix}.keras", save_format="keras")

Epoch 1/5


I0000 00:00:1738571273.538957     644 service.cc:145] XLA service 0x7f13d31873b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738571273.538996     644 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-02-03 08:27:53.545684: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-03 08:27:53.565445: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1738571273.632815     644 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


    4/22888 [..............................] - ETA: 7:51:19 - loss: 1.5636 - accuracy: 0.6750  

In [None]:
# model summary
lstm_bpe_model.summary()

### Unigram

In [23]:
# model prefix
lstm_unigram_model_prefix = "models/lstm_unigram_model"

In [24]:
# load lstm unigram dataset
lstm_unigram_preprocessed = load_models_df("lstm_unigram_preprocessed")

In [25]:
# Convert from string to list
lstm_unigram_preprocessed["burmese_seq_padded"] = lstm_unigram_preprocessed["burmese_seq_padded"].apply(safe_eval)
lstm_unigram_preprocessed["english_seq_padded"] = lstm_unigram_preprocessed["english_seq_padded"].apply(safe_eval)

In [26]:
# Convert to NumPy arrays
lstm_unigram_X_train = np.array(lstm_unigram_preprocessed["burmese_seq_padded"].tolist(), dtype=np.int32)
lstm_unigram_y_train = np.array(lstm_unigram_preprocessed["english_seq_padded"].tolist(), dtype=np.int32)

print(f"X_train shape: {lstm_unigram_X_train.shape}")
print(f"y_train shape: {lstm_unigram_y_train.shape}")

X_train shape: (1627576, 128)
y_train shape: (1627576, 128)


In [27]:
# Callbacks: Early Stopping + Model Checkpoint
lstm_unigram_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lstm_unigram_checkpoint = ModelCheckpoint(
    f"{lstm_unigram_model_prefix}.keras", 
    save_best_only=True, 
    monitor='val_loss', 
    mode='min',
    save_weights_only=False
)

In [28]:
# Warm-start from previously saved weights when they are available. Nothing in
# this notebook writes the .h5 file, so on a fresh run it may not exist; an
# unconditional load_weights would then stop "Run All" before training starts.
lstm_unigram_weights_path = f"{lstm_unigram_model_prefix}.h5"
if tf.io.gfile.exists(lstm_unigram_weights_path):
    lstm_unigram_model.load_weights(lstm_unigram_weights_path)
else:
    print(f"No saved weights at {lstm_unigram_weights_path}; training from scratch.")

In [None]:
# Train model
# The last 10% of the training arrays is held out for validation; the early
# stopping and checkpoint callbacks both monitor val_loss.
lstm_unigram_model.fit(
    lstm_unigram_X_train,
    lstm_unigram_y_train,
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs,
    validation_split=0.1,
    callbacks=[lstm_unigram_early_stopping, lstm_unigram_checkpoint]
)

# save lstm unigram model
# The native Keras format requires a path ending in ".keras"; saving to the
# bare prefix would fail only after the multi-hour fit() has finished. This
# also matches the BPE cell and the checkpoint path.
lstm_unigram_model.save(f"{lstm_unigram_model_prefix}.keras", save_format="keras")

Epoch 1/5


I0000 00:00:1738572496.380013   32555 service.cc:145] XLA service 0x7fa8b8006b30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738572496.380050   32555 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-02-03 08:48:16.386695: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-03 08:48:16.406514: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1738572496.469539   32555 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


   45/22888 [..............................] - ETA: 7:53:16 - loss: 1.2307 - accuracy: 0.7582  

In [None]:
# model summary
lstm_unigram_model.summary()

## Generate Predictions
Use trained LSTM models to generate translations for evaluation.

In [None]:
# LSTM Text Dataset
class LstmTextDataset(Dataset):
    """Map-style dataset that encodes each text into a fixed-length tensor of token ids."""

    def __init__(self, texts, tokenizer, max_length=50):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Encode, then cut to at most max_length ids.
        token_ids = self.tokenizer.encode(self.texts[idx], out_type=int)[:self.max_length]

        # Right-pad with id 0 so every item has exactly max_length entries.
        missing = self.max_length - len(token_ids)
        token_ids = token_ids + [0] * missing

        return torch.tensor(token_ids, dtype=torch.long)

In [None]:
# function to generate text using LSTM in auto-regressive manner for batch processing.
def generate_text_lstm_batch(dataloader, model, tokenizer, max_length=50, pad_id=0):
    """Greedily extend every prompt produced by ``dataloader`` and decode it to text.

    Fixes over the previous version:
      * trailing padding is stripped from each prompt, so generation actually
        runs (prompts arrive already padded to a fixed length, which made the
        old ``max_length - len(seq)`` loop count zero) and padding ids are not
        decoded into the output;
      * the next token is read from the model output at the position of the
        last real token, as a plain ``int`` (the old code took an argmax over
        every position, giving an array that cannot be compared to the EOS id);
      * inputs are right-padded, matching how the training sequences were padded.

    Args:
        dataloader: iterable of batches; each batch must offer ``.numpy()`` and
            contain one row of token ids per prompt.
        model: object whose ``predict(inputs, verbose=0)`` returns an array
            indexed as (batch, position, vocab).
        tokenizer: object providing ``eos_id()`` and ``decode(list_of_ids)``.
        max_length: upper bound on prompt + generated tokens per sequence.
        pad_id: id that marks padding at the end of a prompt.

    Returns:
        list[str]: decoded texts, in dataloader order.
    """
    all_predictions = []
    eos_id = tokenizer.eos_id()

    for batch in tqdm(dataloader, desc="Generating Text with LSTM"):
        batch = batch.numpy()  # Convert to NumPy for TensorFlow processing
        batch_generated = []

        for seq in batch:
            generated_sequence = [int(token) for token in seq.tolist()]

            # Drop trailing padding only; an id equal to pad_id inside the
            # prompt is kept.
            while generated_sequence and generated_sequence[-1] == pad_id:
                generated_sequence.pop()
            generated_sequence = generated_sequence[:max_length]

            # An empty prompt gives the model nothing to condition on; it is
            # decoded as-is.
            while generated_sequence and len(generated_sequence) < max_length:
                input_padded = pad_sequences(
                    [generated_sequence], maxlen=max_length, padding='post'
                )
                input_tensor = tf.convert_to_tensor(input_padded, dtype=tf.int32)

                predictions = model.predict(input_tensor, verbose=0)
                # Distribution emitted at the last real token's position.
                last_position = len(generated_sequence) - 1
                next_token = int(np.argmax(predictions[0, last_position]))

                if next_token == eos_id:
                    break  # Stop at EOS token

                generated_sequence.append(next_token)

            batch_generated.append(tokenizer.decode(generated_sequence))

        all_predictions.extend(batch_generated)

    return all_predictions

### BPE

In [None]:
# load lstm bpe model and tokenizer
# The training section writes the model to `models/lstm_bpe_model.keras`
# (ModelCheckpoint and the final save); no cell writes an .h5 file, so loading
# the .h5 would pick up a stale or missing artifact.
lstm_bpe_model = load_model("models/lstm_bpe_model.keras")
lstm_bpe_tokenizer = spt_models["bpe"]

In [None]:
# load lstm bpe dataset
lstm_bpe_predictions = load_models_df("lstm_bpe_preprocessed")

In [None]:
# Convert from string to list and NumPy arrays
lstm_bpe_predictions["burmese_seq_padded"] = lstm_bpe_predictions["burmese_seq_padded"].apply(safe_eval)
lstm_bpe_predictions_sequences = np.array(lstm_bpe_predictions["burmese_seq_padded"].tolist(), dtype=np.int32)

In [None]:
# batch size
lstm_bpe_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
# LstmTextDataset encodes its items itself via tokenizer.encode, so it needs
# text, not the already-encoded id arrays. `burmese_tokens` is the column the
# training sequences were encoded from; missing cells become "".
lstm_bpe_predictions_texts = lstm_bpe_predictions["burmese_tokens"].fillna("").astype(str).tolist()
lstm_bpe_predictions_dataset = LstmTextDataset(lstm_bpe_predictions_texts, lstm_bpe_tokenizer)
lstm_bpe_predictions_dataloader = DataLoader(lstm_bpe_predictions_dataset, batch_size=lstm_bpe_predictions_batch_size, shuffle=False)

In [None]:
# Run text generation
lstm_bpe_predictions["generated"] = generate_text_lstm_batch(lstm_bpe_predictions_dataloader, lstm_bpe_model, lstm_bpe_tokenizer)

In [None]:
# remove some columns and display
# Selecting several columns needs a list; `df["a", "b", "c"]` looks up a single
# tuple-named column and raises KeyError.
lstm_bpe_predictions = lstm_bpe_predictions[["english", "burmese", "generated"]]
display(lstm_bpe_predictions)

In [None]:
# save lstm bpe predictions
save_models_df(lstm_bpe_predictions, "lstm_bpe_predictions")

### Unigram

In [None]:
# load lstm unigram model and tokenizer
# The ModelCheckpoint callback in the training section writes the model to
# `models/lstm_unigram_model.keras`; no cell writes an .h5 file, so loading the
# .h5 would pick up a stale or missing artifact.
lstm_unigram_model = load_model("models/lstm_unigram_model.keras")
lstm_unigram_tokenizer = spt_models["unigram"]

In [None]:
# load lstm unigram dataset
lstm_unigram_predictions = load_models_df("lstm_unigram_preprocessed")

In [None]:
# Convert from string to list and NumPy arrays
lstm_unigram_predictions["burmese_seq_padded"] = lstm_unigram_predictions["burmese_seq_padded"].apply(safe_eval)
lstm_unigram_predictions_sequences = np.array(lstm_unigram_predictions["burmese_seq_padded"].tolist(), dtype=np.int32)

In [None]:
# batch size
lstm_unigram_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
# LstmTextDataset encodes its items itself via tokenizer.encode, so it needs
# text, not the already-encoded id arrays. `burmese_tokens` is the column the
# training sequences were encoded from; missing cells become "".
lstm_unigram_predictions_texts = lstm_unigram_predictions["burmese_tokens"].fillna("").astype(str).tolist()
lstm_unigram_predictions_dataset = LstmTextDataset(lstm_unigram_predictions_texts, lstm_unigram_tokenizer)
lstm_unigram_predictions_dataloader = DataLoader(lstm_unigram_predictions_dataset, batch_size=lstm_unigram_predictions_batch_size, shuffle=False)

In [None]:
# Run text generation
lstm_unigram_predictions["generated"] = generate_text_lstm_batch(lstm_unigram_predictions_dataloader, lstm_unigram_model, lstm_unigram_tokenizer)

In [None]:
# remove some columns and display
# Selecting several columns needs a list; `df["a", "b", "c"]` looks up a single
# tuple-named column and raises KeyError.
lstm_unigram_predictions = lstm_unigram_predictions[["english", "burmese", "generated"]]
display(lstm_unigram_predictions)

In [None]:
# save lstm unigram predictions
save_models_df(lstm_unigram_predictions, "lstm_unigram_predictions")

## Evaluate Model Performance
Compute BLEU, ROUGE-1, ROUGE-2, ROUGE-L, chrF-S, BERTScore and Perplexity scores.

In [None]:
# load lstm predictions
lstm_evaluation_results_datasets = {
    model_name: load_models_df(f"lstm_{model_name}_predictions") for model_name in spt_models.keys()
}

### Metrics

In [None]:
# compute metrics
# compute_metrics adds the score columns to each DataFrame in place; the
# BERTScore column it writes is named "bert_score".
for model_name, dataset in lstm_evaluation_results_datasets.items():
    compute_metrics(dataset)
    print(f"Metrics scores for {model_name}:")
    print(f" BLEU Score: {dataset['bleu'].mean()}")
    print(f" ROUGE-1 Score: {dataset['rouge-1'].mean()}")
    print(f" ROUGE-2 Score: {dataset['rouge-2'].mean()}")
    print(f" ROUGE-L Score: {dataset['rouge-l'].mean()}")
    print(f" chrF-S Score: {dataset['chrf-s'].mean()}")
    print(f" BERT Score: {dataset['bert_score'].mean()}")

In [None]:
# display metrics
# Column names must match what compute_metrics writes ("bert_score", not "bert").
for model_name, dataset in lstm_evaluation_results_datasets.items():
    print(f"Metrics scores for {model_name}:")
    print(f" BLEU Score: {dataset['bleu'].mean()}")
    print(f" ROUGE-1 Score: {dataset['rouge-1'].mean()}")
    print(f" ROUGE-2 Score: {dataset['rouge-2'].mean()}")
    print(f" ROUGE-L Score: {dataset['rouge-l'].mean()}")
    print(f" chrF-S Score: {dataset['chrf-s'].mean()}")
    print(f" BERT Score: {dataset['bert_score'].mean()}")

In [None]:
# save results
for model_name, dataset in lstm_evaluation_results_datasets.items():
    save_tmp_df(dataset, f"lstm_{model_name}_metrics")

### Perplexity

#### BPE

In [None]:
# batch size
lstm_bpe_perplexity_batch_size = 8

In [None]:
# load lstm bpe model and tokenizer
# The training section writes the model to `models/lstm_bpe_model.keras`
# (ModelCheckpoint and the final save); no cell writes an .h5 file, so loading
# the .h5 would pick up a stale or missing artifact.
lstm_bpe_model = load_model("models/lstm_bpe_model.keras")
lstm_bpe_tokenizer = spt_models["bpe"]

In [None]:
# Prepare dataset and DataLoader
lstm_bpe_generated_texts = lstm_evaluation_results_datasets["bpe"]["generated"].tolist()
lstm_bpe_text_dataset = TextDataset(lstm_bpe_generated_texts)
lstm_bpe_dataloader = DataLoader(
    lstm_bpe_text_dataset, 
    batch_size=lstm_bpe_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# Compute perplexity in batches
# NOTE(review): compute_perplexity_batch calls `tokenizer(texts, return_tensors="pt", ...)`
# and reads `model(**inputs).logits`, but here it is given the SentencePiece
# processor and the Keras model loaded in this section. Confirm these objects
# support that interface, or use a scoring routine written for the LSTM.
lstm_bpe_perplexity_scores = []
for batch in tqdm(lstm_bpe_dataloader, desc="Computing Perplexity for LSTM BPE"):
    batch_perplexities = compute_perplexity_batch(batch, lstm_bpe_tokenizer, lstm_bpe_model)
    lstm_bpe_perplexity_scores.extend(batch_perplexities)

In [None]:
# Store perplexity scores in DataFrame and display
lstm_evaluation_results_datasets["bpe"]["perplexity"] = lstm_bpe_perplexity_scores
print(f"Perplexity Score: {lstm_evaluation_results_datasets['bpe']['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(lstm_evaluation_results_datasets["bpe"], f"lstm_bpe_perplexity")

#### Unigram

In [None]:
# batch size
lstm_unigram_perplexity_batch_size = 8

In [None]:
# load lstm unigram model and tokenizer
# The ModelCheckpoint callback in the training section writes the model to
# `models/lstm_unigram_model.keras`; no cell writes an .h5 file, so loading the
# .h5 would pick up a stale or missing artifact.
lstm_unigram_model = load_model("models/lstm_unigram_model.keras")
lstm_unigram_tokenizer = spt_models["unigram"]

In [None]:
# Prepare dataset and DataLoader
lstm_unigram_generated_texts = lstm_evaluation_results_datasets["unigram"]["generated"].tolist()
lstm_unigram_text_dataset = TextDataset(lstm_unigram_generated_texts)
lstm_unigram_dataloader = DataLoader(
    lstm_unigram_text_dataset, 
    batch_size=lstm_unigram_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# Compute perplexity in batches
# NOTE(review): compute_perplexity_batch calls `tokenizer(texts, return_tensors="pt", ...)`
# and reads `model(**inputs).logits`, but here it is given the SentencePiece
# processor and the Keras model loaded in this section. Confirm these objects
# support that interface, or use a scoring routine written for the LSTM.
lstm_unigram_perplexity_scores = []
for batch in tqdm(lstm_unigram_dataloader, desc="Computing Perplexity for LSTM Unigram"):
    batch_perplexities = compute_perplexity_batch(batch, lstm_unigram_tokenizer, lstm_unigram_model)
    lstm_unigram_perplexity_scores.extend(batch_perplexities)

In [None]:
# Store perplexity scores in DataFrame and display
lstm_evaluation_results_datasets["unigram"]["perplexity"] = lstm_unigram_perplexity_scores
print(f"Perplexity Score: {lstm_evaluation_results_datasets['unigram']['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(lstm_evaluation_results_datasets["unigram"], f"lstm_unigram_perplexity")

### Save Evaluation Results

In [None]:
# combine evaluation results
# For each tokenizer: copy the saved metric and perplexity columns back onto
# the in-memory results frame, write the merged frame to models/, show a preview.
lstm_metric_columns = ["bleu", "rouge-1", "rouge-2", "rouge-l", "chrf-s", "bert_score"]
for model_name in lstm_evaluation_results_datasets.keys():
    print(f"Processing {model_name}...")
    results_df = lstm_evaluation_results_datasets[model_name]

    # load metrics and set
    metrics = load_tmp_df(f"lstm_{model_name}_metrics")
    for column_name in lstm_metric_columns:
        results_df[column_name] = metrics[column_name]

    # load perplexity and set
    perplexity = load_tmp_df(f"lstm_{model_name}_perplexity")
    results_df["perplexity"] = perplexity["perplexity"]

    save_models_df(results_df, f"lstm_{model_name}_evaluation_results")

    display(results_df.head())

# 2. Implementing Multilingual Transformer Baseline

In [17]:
# Define model names
multilingual_model_names = {
    "mbert": "bert-base-multilingual-cased",
    "xlmr": "xlm-roberta-base"
}

## Data Preprocessing

In [18]:
# datasets
multilingual_datasets = {
    "normal": [
        "myxnli_normalized_1", 
        "myxnli_normalized_2", 
        "alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "myxnli_nllb_back_translated_final_1", 
        "myxnli_nllb_back_translated_final_2", 
        "alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "myxnli_seamless_m4t_back_translated_final_1",
        "myxnli_seamless_m4t_back_translated_final_2",
        "alt_combined_seamless_m4t_back_translated_final"
    ],
}

In [11]:
# Load and process dataset
def load_and_rename_columns_multilingual(file_name):
    """Load ``gen/<file_name>.csv`` and return only its ``english`` and ``burmese`` columns.

    Back-translated column names are first mapped onto the plain ones; a
    missing column raises ``KeyError``.
    """
    renames = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
    }
    required_columns = ["english", "burmese"]

    loaded = load_gen_df(f"{file_name}")
    return loaded.rename(columns=renames)[required_columns]

In [12]:
# Load every configured file, keeping the frames grouped by dataset variant
mutlilingual_loaded_datasets = {
    key: [load_and_rename_columns_multilingual(file) for file in file_list]
    for key, file_list in multilingual_datasets.items()
}

In [13]:
# Stack the normal and both back-translated variants into one frame
multilingual_combined = pd.concat(
    [
        frame
        for variant in ("normal", "nllb_back_translated", "seamless_m4t_back_translated")
        for frame in mutlilingual_loaded_datasets[variant]
    ],
    ignore_index=True,
)

In [14]:
# Shuffle the data to prevent order bias.
# A fixed random_state keeps the row order identical across kernel restarts, so the
# saved predictions and metric files always refer to the same rows.
multilingual_combined = multilingual_combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
# print length
print(f"Multilingual dataset length: {len(multilingual_combined)}")

Multilingual dataset length: 1627576


In [17]:
# save data
save_models_df(multilingual_combined, "multilingual_combined")

## Generate Predictions
Load `mBERT` and `XLM-R` for Masked Language Modeling (MLM).
Each model predicts the tokens that were randomly masked in the Burmese sentences.

In [18]:
# dataset class for masked
class MaskedTextDataset(Dataset):
    """Tokenize raw texts and randomly replace some real tokens with the mask token.

    Each item is a tuple ``(masked_input_ids, attention_mask, input_ids)`` of 1-D
    tensors of length ``max_length``. Tensors are returned as produced by the
    tokenizer (no device transfer); the consumer moves whole batches to its device.

    Args:
        texts: sequence of strings; non-string entries (e.g. NaN) are treated as "".
        tokenizer: callable tokenizer exposing ``mask_token_id``.
        mask_ratio: fraction of maskable tokens to replace with the mask token.
        max_length: length every sequence is padded/truncated to.
    """

    def __init__(self, texts, tokenizer, mask_ratio=0.15, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.mask_ratio = mask_ratio
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = self.texts[idx]
        text = raw_text if isinstance(raw_text, str) else ""

        inputs = self.tokenizer(
            text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length
        )
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        masked_input_ids = input_ids.clone()

        # Only attended (non-padding) positions are candidates, and the first and
        # last attended tokens are skipped because they are the sequence delimiters.
        # Sampling over the padded length would waste masks on padding and could
        # hit the real closing token.
        attended_positions = attention_mask.nonzero(as_tuple=True)[0]
        candidate_positions = attended_positions[1:-1]
        num_candidates = candidate_positions.numel()

        # Empty texts have no maskable token; they are returned unmasked.
        if num_candidates > 0:
            num_to_mask = max(1, int(self.mask_ratio * num_candidates))
            chosen = candidate_positions[torch.randperm(num_candidates)[:num_to_mask]]
            masked_input_ids[chosen] = self.tokenizer.mask_token_id

        return masked_input_ids, attention_mask, input_ids

In [19]:
# Function to generate masked predictions
def generate_masked_predictions_batch(dataloader, model, tokenizer):
    """Fill the masked positions of every batch with the model's top-1 prediction.

    Args:
        dataloader: iterable yielding ``(masked_input_ids, attention_mask,
            original_input_ids)`` batches of tensors.
        model: masked-language model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with ``.logits``.
        tokenizer: provides ``mask_token_id`` and ``batch_decode``.

    Returns:
        List of decoded strings, one per input sequence, in dataloader order.
    """
    all_predictions = []

    # Use the device the model lives on rather than a notebook-level global.
    model_device = next(model.parameters()).device

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Masked Predictions"):
            masked_input_ids, attention_mask, _ = [x.to(model_device) for x in batch]

            outputs = model(input_ids=masked_input_ids, attention_mask=attention_mask)

            # Fill all masks of the batch in one vectorised step. Masks on
            # non-attended (padding) positions are left as mask tokens, which the
            # decode below drops via skip_special_tokens.
            fill_positions = (masked_input_ids == tokenizer.mask_token_id) & attention_mask.bool()
            predicted_tokens_batch = masked_input_ids.clone()
            predicted_tokens_batch[fill_positions] = outputs.logits[fill_positions].argmax(dim=-1)

            # Decode predictions
            batch_predictions = tokenizer.batch_decode(predicted_tokens_batch.cpu(), skip_special_tokens=True)
            all_predictions.extend(batch_predictions)

    return all_predictions

### mBERT

In [20]:
# Load the tokenizer and masked-LM model for mBERT
multilingual_mbert_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["mbert"])
multilingual_mbert_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["mbert"]).to(device)
# wrap the model with torch.compile, then switch to evaluation mode
multilingual_mbert_model = torch.compile(multilingual_mbert_model)
multilingual_mbert_model.eval()

OptimizedModule(
  (_orig_mod): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bia

In [21]:
# load multilingual dataset
multilingual_mbert_predictions = load_models_df("multilingual_combined")

In [22]:
# batch size
multilingual_mbert_predictions_batch_size = 8

In [23]:
# Convert to DataLoader
multilingual_mbert_predictions_texts = multilingual_mbert_predictions["burmese"].tolist()
multilingual_mbert_predictions_dataset = MaskedTextDataset(multilingual_mbert_predictions_texts, multilingual_mbert_tokenizer)
multilingual_mbert_predictions_dataloader = DataLoader(
    multilingual_mbert_predictions_dataset, 
    batch_size=multilingual_mbert_predictions_batch_size, 
    shuffle=False
)

In [24]:
# Run text generation
multilingual_mbert_predictions["generated"] = generate_masked_predictions_batch(
    multilingual_mbert_predictions_dataloader, 
    multilingual_mbert_model, 
    multilingual_mbert_tokenizer
)

Generating Masked Predictions:   0%|          | 0/203447 [00:00<?, ?it/s]

In [25]:
# display
display(multilingual_mbert_predictions.head())

Unnamed: 0,english,burmese,generated
0,it's not worth seeing the nubian floor exhibit...,အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့...,အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့...
1,there are remote whitewashed villages that adv...,စွန့်စားချင်သူတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံခေါင်သီ...,စွန့်စားချင်သီတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံခေါင်သီ...
2,"she makes these little tricks, very good, and ...",သူမက ဒီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတယ် အရမ်းကောင်း...,သူမက စီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတဲ့ ၊ိမ်းကောင်း...
3,the pair regained zimbabwe's times and finishe...,ထိုစုံတွဲသည် ဇင်ဘာဘွေ၏ အကြိမ်များကို ပြန်လည်ရရ...,ထိုစုံပွဲသည် ဇင်ဘာတွေ ၏ အကြိမ်များကို ပြန်လည်ရ...
4,potential of clarifying its notices to taxpaye...,အခွန်ထမ်းများထံ ၎င်း၏သတိပေးချက်များကို ရှင်းလင...,အခွက်ထမ်းများထံ ၎ င်း ၏ သတိပြချက်များကို ရှင်း...


In [26]:
# save multilingual mbert predictions
save_models_df(multilingual_mbert_predictions, "multilingual_mbert_predictions")

### XLM-R

In [19]:
# Load the tokenizer and masked-LM model for XLM-R
multilingual_xlmr_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["xlmr"])
multilingual_xlmr_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["xlmr"]).to(device)
# wrap the model with torch.compile, then switch to evaluation mode
multilingual_xlmr_model = torch.compile(multilingual_xlmr_model)
multilingual_xlmr_model.eval()

OptimizedModule(
  (_orig_mod): XLMRobertaForMaskedLM(
    (roberta): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRobertaSelfOutput(
    

In [20]:
# load multilingual dataset
multilingual_xlmr_predictions = load_models_df("multilingual_combined")

In [21]:
# batch size used when generating XLM-R predictions
multilingual_xlmr_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
multilingual_xlmr_predictions_texts = multilingual_xlmr_predictions["burmese"].tolist()
multilingual_xlmr_predictions_dataset = MaskedTextDataset(multilingual_xlmr_predictions_texts, multilingual_xlmr_tokenizer)
multilingual_xlmr_predictions_dataloader = DataLoader(
    multilingual_xlmr_predictions_dataset, 
    batch_size=multilingual_xlmr_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
multilingual_xlmr_predictions["generated"] = generate_masked_predictions_batch(
    multilingual_xlmr_predictions_dataloader, 
    multilingual_xlmr_model, 
    multilingual_xlmr_tokenizer
)

In [None]:
# display
display(multilingual_xlmr_predictions.head())

In [19]:
# save multilingual xlmr predictions
save_models_df(multilingual_xlmr_predictions, "multilingual_xlmr_predictions")

## Evaluate Model Performance
Compute BLEU, ROUGE-1, ROUGE-2, ROUGE-L, chrF-S, BERTScore and Perplexity scores.

In [18]:
# load multilingual predictions
multilingual_evaluation_results_datasets = {
    model_name: load_models_df(f"multilingual_{model_name}_predictions") for model_name in multilingual_model_names.keys()
}

### Metrics

In [21]:
# Inspect the XLM-R prediction rows that contain at least one missing value
tmp = multilingual_evaluation_results_datasets["xlmr"]
# keep only the rows where any column is null
null_rows = tmp[tmp.isnull().any(axis=1)]
print(null_rows)

                                                   english  \
69601    mr qarase drove to the estate, but was told by...   
126205   program coordinators estimate that tutors volu...   
432052   "i maintain my demands and the deadline still ...   
517986                                        - i 'm not .   
572578                                        - i 'm not .   
712910                                          they're...   
824587   expedition 10 is scheduled to conduct its seco...   
1080759  fiji's great council of chiefs called for calm...   
1599299  and studs and leather is basically heaven's on...   

                                                   burmese generated  bleu  \
69601                                                   \n       NaN   0.0   
126205                                                  \n       NaN   0.0   
432052                                                  \n       NaN   0.0   
517986                                                 ...       Na

In [None]:
# compute metrics
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    compute_metrics(dataset)

Calculating BLEU...


In [None]:
# display metrics
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    print(f"Metrics scores for {model_name}:")
    for label, column in (
        ("BLEU", "bleu"),
        ("ROUGE-1", "rouge-1"),
        ("ROUGE-2", "rouge-2"),
        ("ROUGE-L", "rouge-l"),
        ("chrF-S", "chrf-s"),
        # the BERTScore column is named "bert_score" (the name the save step reads),
        # not "bert"
        ("BERT", "bert_score"),
    ):
        print(f" {label} Score: {dataset[column].mean()}")

In [None]:
# save results
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    save_tmp_df(dataset, f"multilingual_{model_name}_metrics")

### Perplexity

#### mBERT

In [19]:
# batch size
multilingual_mbert_perplexity_batch_size = 8

In [20]:
# Load the tokenizer and masked-LM model for mBERT
multilingual_mbert_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["mbert"])
multilingual_mbert_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["mbert"]).to(device)
# wrap the model with torch.compile, then switch to evaluation mode
multilingual_mbert_model = torch.compile(multilingual_mbert_model)
multilingual_mbert_model.eval()

OptimizedModule(
  (_orig_mod): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bia

In [21]:
# Prepare dataset and DataLoader
multilingual_mbert_generated_texts = multilingual_evaluation_results_datasets["mbert"]["generated"].tolist()
multilingual_mbert_text_dataset = TextDataset(multilingual_mbert_generated_texts)
multilingual_mbert_dataloader = DataLoader(
    multilingual_mbert_text_dataset, 
    batch_size=multilingual_mbert_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# Compute perplexity in batches
multilingual_mbert_perplexity_scores = []
for batch in tqdm(multilingual_mbert_dataloader, desc="Computing Perplexity for mBert"):
    batch_perplexities = compute_perplexity_batch(batch, multilingual_mbert_tokenizer, multilingual_mbert_model)
    multilingual_mbert_perplexity_scores.extend(batch_perplexities)

Computing Perplexity for mBert:   0%|          | 0/203447 [00:00<?, ?it/s]

W0203 15:42:02.372000 13899 site-packages/torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


In [None]:
# Store perplexity scores in DataFrame and display
multilingual_evaluation_results_datasets["mbert"]["perplexity"] = multilingual_mbert_perplexity_scores
print(f"Perplexity Score: {multilingual_evaluation_results_datasets['mbert']['perplexity'].mean()}")

Perplexity Score: 2.2175958210491413


In [None]:
# save perplexity
save_tmp_df(multilingual_evaluation_results_datasets["mbert"], f"multilingual_mbert_perplexity")

#### XLM-R

In [None]:
# batch size
multilingual_xlmr_perplexity_batch_size = 8

In [None]:
# Load the tokenizer and masked-LM model for XLM-R
multilingual_xlmr_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["xlmr"])
multilingual_xlmr_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["xlmr"]).to(device)
# wrap the model with torch.compile, then switch to evaluation mode
multilingual_xlmr_model = torch.compile(multilingual_xlmr_model)
multilingual_xlmr_model.eval()

In [None]:
# Prepare dataset and DataLoader
multilingual_xlmr_generated_texts = multilingual_evaluation_results_datasets["xlmr"]["generated"].tolist()
multilingual_xlmr_text_dataset = TextDataset(multilingual_xlmr_generated_texts)
multilingual_xlmr_dataloader = DataLoader(
    multilingual_xlmr_text_dataset, 
    batch_size=multilingual_xlmr_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# Compute perplexity in batches
multilingual_xlmr_perplexity_scores = []
for batch in tqdm(multilingual_xlmr_dataloader, desc="Computing Perplexity for XLM-R"):
    batch_perplexities = compute_perplexity_batch(batch, multilingual_xlmr_tokenizer, multilingual_xlmr_model)
    multilingual_xlmr_perplexity_scores.extend(batch_perplexities)

In [None]:
# Store perplexity scores in DataFrame and display
multilingual_evaluation_results_datasets["xlmr"]["perplexity"] = multilingual_xlmr_perplexity_scores
print(f"Perplexity Score: {multilingual_evaluation_results_datasets['xlmr']['perplexity'].mean()}")

In [None]:
# save perplexity
save_tmp_df(multilingual_evaluation_results_datasets["xlmr"], f"multilingual_xlmr_perplexity")

### Save Evaluation Results

In [None]:
# Fold the stored metric and perplexity columns back into every multilingual results frame
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    print(f"Processing {model_name}...")

    # copy each metric column from the stored metrics frame
    metrics = load_tmp_df(f"multilingual_{model_name}_metrics")
    for metric_column in ("bleu", "rouge-1", "rouge-2", "rouge-l", "chrf-s", "bert_score"):
        dataset[metric_column] = metrics[metric_column]

    # perplexity is kept in its own stored frame
    perplexity = load_tmp_df(f"multilingual_{model_name}_perplexity")
    dataset["perplexity"] = perplexity["perplexity"]

    # persist the completed frame, then preview it
    save_models_df(dataset, f"multilingual_{model_name}_evaluation_results")

    display(dataset.head())

# 3. Benchmarking and Analysis
Compare the performance of LSTM BPE, LSTM Unigram, mBERT, and XLM-R using BLEU, ROUGE, and Perplexity.

## Data Preprocessing

In [None]:
# Model family -> model keys whose evaluation results are benchmarked
benchmarking_models = dict(
    lstm=["bpe", "unigram"],
    multilingual=["mbert", "xlmr"],
)

In [None]:
# Load and process dataset
def load_and_rename_columns_benchmarking(key, model_name, rouge_column="rouge-l"):
    """Load one model's evaluation results and suffix its score columns with the model name.

    Args:
        key: model family prefix of the stored file (e.g. "lstm", "multilingual").
        model_name: model key; used in the file name and as the column suffix.
        rouge_column: which ROUGE column represents "ROUGE" in the benchmark when
            the frame has no plain ``rouge`` column. The evaluation frames store
            ``rouge-1``, ``rouge-2`` and ``rouge-l``; ROUGE-L is the default.

    Returns:
        DataFrame with columns ``english``, ``bleu_<model_name>``,
        ``rouge_<model_name>`` and ``perplexity_<model_name>``.

    Raises:
        KeyError: if a required column is missing from the loaded frame.
    """
    df = load_models_df(f"{key}_{model_name}_evaluation_results")

    # A plain "rouge" column is still honoured if a stored frame provides one.
    rouge_source = "rouge" if "rouge" in df.columns else rouge_column

    # Ensure only required columns exist
    df = df[["english", "bleu", rouge_source, "perplexity"]]

    column_mapping = {
        "bleu": f"bleu_{model_name}",
        rouge_source: f"rouge_{model_name}",
        "perplexity": f"perplexity_{model_name}",
    }

    return df.rename(columns=column_mapping)

In [None]:
# load datasets: one frame per (family, model) pair, kept in a flat list so the
# merge step can treat every element as a DataFrame.
# Uses the benchmarking loader (evaluation results), not the multilingual
# raw-dataset loader.
benchmarking_loaded_datasets = [
    load_and_rename_columns_benchmarking(key, model_name)
    for key, model_names in benchmarking_models.items()
    for model_name in model_names
]

In [None]:
# Merge all datasets on 'english' using an outer join.
# 'english' is not a unique key (the same sentence occurs in several rows), so each
# frame is first collapsed to one row per sentence by averaging its scores;
# otherwise every duplicated sentence would be multiplied across the joined frames.
benchmarking_unique_datasets = [
    df.groupby("english", as_index=False).mean(numeric_only=True)
    for df in benchmarking_loaded_datasets
]
benchmarking_results = benchmarking_unique_datasets[0]
for df in benchmarking_unique_datasets[1:]:
    # validate guards against the key becoming non-unique again
    benchmarking_results = benchmarking_results.merge(df, on="english", how="outer", validate="one_to_one")

In [None]:
# display merge dataset
display(benchmarking_results.head())

## Compute Average Scores for Comparison
Get mean BLEU, ROUGE, and Perplexity for LSTM (BPE & Unigram), mBERT, and XLM-R.

In [None]:
# Define the display names and the column suffixes of each model.
# The suffixes must match the model keys used when the score columns were renamed
# ("mbert"/"xlmr", not the display names).
benchmarking_model_names = ["LSTM BPE", "LSTM Unigram", "mBERT", "XLM-R"]
benchmarking_column_prefixes = ["bpe", "unigram", "mbert", "xlmr"]

# Compute mean scores from the merged benchmarking frame
benchmarking_mean_scores = {
    model: {
        "BLEU": benchmarking_results[f"bleu_{prefix}"].mean(),
        "ROUGE": benchmarking_results[f"rouge_{prefix}"].mean(),
        "Perplexity": benchmarking_results[f"perplexity_{prefix}"].mean(),
    }
    for model, prefix in zip(benchmarking_model_names, benchmarking_column_prefixes)
}

# Display mean scores
pprint.pprint(benchmarking_mean_scores)

## Visualize Benchmarking Results
Plot BLEU, ROUGE, and Perplexity scores for comparison.

In [None]:
# One bar chart per metric, one bar per model
benchmarking_metrics = ["BLEU", "ROUGE", "Perplexity"]
for metric in benchmarking_metrics:
    values = [
        benchmarking_mean_scores[name][metric]
        for name in ("LSTM BPE", "LSTM Unigram", "mBERT", "XLM-R")
    ]

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(benchmarking_model_names, values, color=["blue", "green", "orange", "red"])
    ax.set_title(f"{metric} Score Comparison")
    ax.set_xlabel("Models")
    ax.set_ylabel(metric)
    plt.show()

### Save Benchmarking Results

In [None]:
# save results
save_models_df(benchmarking_results, "benchmarking_results")