# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece evaluate sacrebleu bert-score

In [None]:
!conda update -n base -c conda-forge conda -y

In [None]:
!conda install conda-forge::rouge-score -y

In [2]:
import re
import pandas as pd
import tensorflow as tf
import torch
import ast
import numpy as np
import sentencepiece as spm
import matplotlib.pyplot as plt
import torch.nn.functional as F
import bert_score
import seaborn as sns
from IPython.display import display
from tqdm.notebook import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformers import AutoTokenizer, AutoModelForMaskedLM, logging
from transformers.generation.logits_process import LogitsProcessorList, TopKLogitsWarper, TopPLogitsWarper
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from torch.utils.data import DataLoader, Dataset
from rouge_score import rouge_scorer
from sacrebleu import corpus_chrf
from concurrent.futures import ThreadPoolExecutor, as_completed

# Set GPU

## Mac

In [3]:
# for mac
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        details = tf.config.experimental.get_device_details(gpu)
        print("GPU details: ", details)
else:
    print("No GPU found. Using CPU.")

# set GPU device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


## Window / Linux

In [None]:
# for window
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

# Class

In [4]:
# Custom dataset class for batching
class TextDataset(Dataset):
    def __init__(self, texts):
        self.texts = [str(text) if text is not None else "" for text in texts] 

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

In [5]:
# custom dataset class for evaluation dataset
class EvaluationDataset(Dataset):
    def __init__(self, dataframe):
        self.predictions = dataframe["generated"].astype(str).tolist()
        self.references = dataframe["burmese"].astype(str).tolist()

    def __len__(self):
        return len(self.predictions)

    def __getitem__(self, idx):
        return self.predictions[idx], self.references[idx]

# Functions

In [6]:
# function to save models df
def save_models_df(df, df_name):
    df.to_csv(f"models/{df_name}.csv", index=False, encoding="utf-8")

In [7]:
# function to save tmp df
def save_tmp_df(df, df_name):
    df.to_csv(f"tmp/{df_name}.csv", index=False, encoding="utf-8")

In [8]:
# function to load spt df
def load_spt_df(df_name):
    return pd.read_csv(f"spt/{df_name}.csv", header=0, encoding="utf-8")

In [9]:
# function to load models df
def load_models_df(df_name):
    return pd.read_csv(f"models/{df_name}.csv", header=0, encoding="utf-8")

In [10]:
# function to load gen df
def load_gen_df(df_name):
    return pd.read_csv(f"gen/{df_name}.csv", header=0, encoding="utf-8")

In [11]:
# function to load tmp df
def load_tmp_df(df_name):
    return pd.read_csv(f"tmp/{df_name}.csv", header=0, encoding="utf-8")

In [12]:
# function to safe eval
def safe_eval(val):
    return ast.literal_eval(val) if isinstance(val, str) else val

In [13]:
# function to compute metrics
def compute_metrics_batch(dataset, batch_size=32):
    dataloader = DataLoader(EvaluationDataset(dataset), batch_size=batch_size, shuffle=False)

    all_bleu_scores, all_rouge1, all_rouge2, all_rougeL, all_chrfs, all_berts = [], [], [], [], [], []

    smooth_fn = SmoothingFunction().method1
    rouge = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    
    for batch in tqdm(dataloader, desc="Computing Metrics", unit="batch"):
        predictions, references = batch

        # Compute BLEU in batch
        batch_bleu = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth_fn)
                      for pred, ref in zip(predictions, references)]
        all_bleu_scores.extend(batch_bleu)

        # Compute ROUGE in batch
        batch_rouge = [rouge.score(pred, ref) for pred, ref in zip(predictions, references)]
        all_rouge1.extend([r["rouge1"].fmeasure for r in batch_rouge])
        all_rouge2.extend([r["rouge2"].fmeasure for r in batch_rouge])
        all_rougeL.extend([r["rougeL"].fmeasure for r in batch_rouge])

        # Compute chrF-S in batch
        batch_chrf = corpus_chrf(predictions, [[ref] for ref in references]).score
        all_chrfs.extend([batch_chrf] * len(predictions))  # Apply same batch score to all

        # Compute BERTScore in batch
        batch_bert = bert_score.score(predictions, references, lang="my", device=device)
        all_berts.extend(batch_bert[2].tolist())  # F1 scores from BERTScore

    print("Finished Computing Metrics!")

    # Store results back into dataset
    dataset["bleu"] = all_bleu_scores
    dataset["rouge-1"] = all_rouge1
    dataset["rouge-2"] = all_rouge2
    dataset["rouge-l"] = all_rougeL
    dataset["chrf-s"] = all_chrfs
    dataset["bert_score"] = all_berts

# Set settings

In [14]:
tqdm.pandas()

In [15]:
# Suppress specific warnings from the transformers library
logging.set_verbosity_error()

# 1. Implementing RNN/LSTM Baseline

In [36]:
spt_models = {
    "bpe": spm.SentencePieceProcessor("spt/spt_bpe.model"),
    "unigram": spm.SentencePieceProcessor("spt/spt_unigram.model"),
}

## Data Preprocessing
Load SPT-tokenized datasets, convert to sequences, and apply padding.

### Load Data

In [17]:
# function to load datasets
def get_lstm_datasets(model_name):
    return {
        "normal": [
            f"tokenized_{model_name}_myxnli_normalized_1", 
            f"tokenized_{model_name}_myxnli_normalized_2", 
            f"tokenized_{model_name}_alt_combined_normalized"
        ],
        "nllb_back_translated": [
            f"tokenized_{model_name}_myxnli_nllb_back_translated_final_1", 
            f"tokenized_{model_name}_myxnli_nllb_back_translated_final_2", 
            f"tokenized_{model_name}_alt_combined_nllb_back_translated_final"
        ],
        "seamless_m4t_back_translated": [
            f"tokenized_{model_name}_myxnli_seamless_m4t_back_translated_final_1",
            f"tokenized_{model_name}_myxnli_seamless_m4t_back_translated_final_2",
            f"tokenized_{model_name}_alt_combined_seamless_m4t_back_translated_final"
        ],
    }

In [18]:
# Load and process dataset
def load_and_rename_columns_lstm(file_name):
    df = load_spt_df(f"{file_name}")

    column_mapping = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
        "english_back_translated_tokens": "english_tokens",
        "burmese_translated_tokens": "burmese_tokens",
    }
    
    df = df.rename(columns=column_mapping)
    
    # Ensure only required columns exist
    df = df[["english", "burmese", "english_tokens", "burmese_tokens"]]

    return df

In [None]:
# Load all datasets
lstm_all_datasets = {}
for model_name in spt_models.keys():
    datasets = get_lstm_datasets(model_name)

    lstm_all_datasets[model_name] = {
        key: [load_and_rename_columns_lstm(file) for file in file_list] for key, file_list in datasets.items()
    }

In [21]:
# cobine all datasets
lstm_all_datasets_combined = {}
for model_name in lstm_all_datasets.keys():
    lstm_all_datasets_combined[model_name] = pd.concat(
        [pd.concat(datasets) for datasets in lstm_all_datasets[model_name].values()],
        ignore_index=True
    )

In [22]:
# Shuffle the data to prevent order bias and drop null
for model_name in lstm_all_datasets_combined.keys():
    lstm_all_datasets_combined[model_name] = lstm_all_datasets_combined[model_name].sample(frac=1).reset_index(drop=True)

In [23]:
# display of datasets
for model_name in lstm_all_datasets_combined.keys():
    print(f"{model_name} dataset length: {len(lstm_all_datasets_combined[model_name])}")

bpe dataset length: 1627576
unigram dataset length: 1627576


### Apply padding

In [None]:
# convert tokenized sequences to lists
for model_name in lstm_all_datasets_combined.keys():
    lstm_all_datasets_combined[model_name]["english_seq"] = lstm_all_datasets_combined[model_name]["english_tokens"].progress_apply(
        lambda x: spt_models[model_name].EncodeAsIds(str(x)) if isinstance(x, str) else []
    )
    lstm_all_datasets_combined[model_name]["burmese_seq"] = lstm_all_datasets_combined[model_name]["burmese_tokens"].progress_apply(
        lambda x:  spt_models[model_name].EncodeAsIds(str(x)) if isinstance(x, str) else []
    )

In [18]:
# Define maximum sequence length
lstm_max_seq_length = 128

In [19]:
# appply padding to sequences
for model_name in lstm_all_datasets_combined.keys():
    lstm_all_datasets_combined[model_name]["english_seq_padded"] = pad_sequences(
        lstm_all_datasets_combined[model_name]["english_seq"], maxlen=lstm_max_seq_length, padding="post"
    ).tolist()

    lstm_all_datasets_combined[model_name]["burmese_seq_padded"] = pad_sequences(
        lstm_all_datasets_combined[model_name]["burmese_seq"], maxlen=lstm_max_seq_length, padding="post"
    ).tolist()

In [20]:
# save lstm preprocess data
for model_name in lstm_all_datasets_combined.keys():
    save_models_df(lstm_all_datasets_combined[model_name], f"lstm_{model_name}_preprocessed")

## Define LSTM Model
Define an LSTM-based sequence-to-sequence (seq2seq) model with embedding layers.

In [18]:
# Hyperparameters
lstm_embedding_dim = 256
lstm_hidden_dim = 512

In [19]:
# Get vocabulary size from SentencePiece models
lstm_vocab_sizes = {model_name: sp.GetPieceSize() for model_name, sp in spt_models.items()}

In [20]:
# function to build lstm model
def build_lstm_model(vocab_size):
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=lstm_embedding_dim, mask_zero=True),
        Bidirectional(LSTM(lstm_hidden_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
        LSTM(lstm_hidden_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.3),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [21]:
# build lstm bpe model
lstm_bpe_model = build_lstm_model(lstm_vocab_sizes["bpe"])
lstm_bpe_model.summary()



2025-02-03 08:15:44.588870: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:15:44.591696: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:15:44.593767: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-













Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         8192000   
                                                                 
 bidirectional (Bidirection  (None, None, 1024)        3149824   
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, None, 512)         3147776   
                                                                 
 dense (Dense)               (None, None, 32000)       16416000  
                                                                 
Total params: 30905600 (117.90 MB)
Trainable params: 30905600 (117.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [21]:
# build lstm bpe model
lstm_unigram_model = build_lstm_model(lstm_vocab_sizes["unigram"])
lstm_unigram_model.summary()



2025-02-03 08:36:43.437264: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:36:43.512506: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-03 08:36:43.515652: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-













Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 256)         8192000   
                                                                 
 bidirectional (Bidirection  (None, None, 1024)        3149824   
 al)                                                             
                                                                 
 lstm_1 (LSTM)               (None, None, 512)         3147776   
                                                                 
 dense (Dense)               (None, None, 32000)       16416000  
                                                                 
Total params: 30905600 (117.90 MB)
Trainable params: 30905600 (117.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Train the Model
Train the model using Categorical Cross-Entropy loss & Adam optimizer.

In [18]:
# lstm model train batch size
lstm_train_batch_size = 64
#lstm_train_epochs = 5
lstm_train_epochs = 2

### BPE

In [19]:
# model prefix
lstm_bpe_model_prefix = "models/lstm_bpe_model"

In [20]:
# load lstm bpe dataset
lstm_bpe_preprocessed = load_models_df("lstm_bpe_preprocessed")

In [21]:
# Convert from string to list
lstm_bpe_preprocessed["burmese_seq_padded"] = lstm_bpe_preprocessed["burmese_seq_padded"].apply(safe_eval)
lstm_bpe_preprocessed["english_seq_padded"] = lstm_bpe_preprocessed["english_seq_padded"].apply(safe_eval)

In [22]:
# Convert to NumPy arrays
lstm_bpe_X_train = np.array(lstm_bpe_preprocessed["burmese_seq_padded"].tolist(), dtype=np.int32)
lstm_bpe_y_train = np.array(lstm_bpe_preprocessed["english_seq_padded"].tolist(), dtype=np.int32)

print(f"X_train shape: {lstm_bpe_X_train.shape}")
print(f"y_train shape: {lstm_bpe_y_train.shape}")

X_train shape: (1627576, 128)
y_train shape: (1627576, 128)


In [23]:
# Callbacks: Early Stopping + Model Checkpoint
lstm_bpe_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lstm_bpe_checkpoint = ModelCheckpoint(
    f"{lstm_bpe_model_prefix}.keras", 
    save_best_only=True, 
    monitor='val_loss', 
    mode='min',
    save_weights_only=False
)

In [None]:
# Train model
lstm_bpe_model.fit(
    lstm_bpe_X_train, 
    lstm_bpe_y_train, 
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs, 
    validation_split=0.1, 
    callbacks=[lstm_bpe_early_stopping, lstm_bpe_checkpoint]
)

Epoch 1/5


I0000 00:00:1738618633.745274   25622 service.cc:145] XLA service 0x7f532af170a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738618633.745310   25622 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-02-03 21:37:14.172679: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-03 21:37:14.804654: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1738618635.464858   25622 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
  385/22888 [..............................] - ETA: 7:27:51 - loss: 1.4893 - accuracy: 0.6770

In [None]:
# Train model again
lstm_bpe_model.fit(
    lstm_bpe_X_train, 
    lstm_bpe_y_train, 
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs, 
    validation_split=0.1, 
    callbacks=[lstm_bpe_early_stopping, lstm_bpe_checkpoint]
)

Epoch 1/2


I0000 00:00:1738716574.969466    5772 service.cc:145] XLA service 0x7f6fcc016ef0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738716574.969504    5772 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-02-05 00:49:34.976221: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-05 00:49:35.011981: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1738716575.075759    5772 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2


In [26]:
# save lstm bpe model
lstm_bpe_model.save(f"{lstm_bpe_model_prefix}.keras", save_format="keras")

### Unigram

In [19]:
# model prefix
lstm_unigram_model_prefix = "models/lstm_unigram_model"

In [20]:
# load lstm unigram dataset
lstm_unigram_preprocessed = load_models_df("lstm_unigram_preprocessed")

In [21]:
# Convert from string to list
lstm_unigram_preprocessed["burmese_seq_padded"] = lstm_unigram_preprocessed["burmese_seq_padded"].apply(safe_eval)
lstm_unigram_preprocessed["english_seq_padded"] = lstm_unigram_preprocessed["english_seq_padded"].apply(safe_eval)

In [22]:
# Convert to NumPy arrays
lstm_unigram_X_train = np.array(lstm_unigram_preprocessed["burmese_seq_padded"].tolist(), dtype=np.int32)
lstm_unigram_y_train = np.array(lstm_unigram_preprocessed["english_seq_padded"].tolist(), dtype=np.int32)

print(f"X_train shape: {lstm_unigram_X_train.shape}")
print(f"y_train shape: {lstm_unigram_y_train.shape}")

X_train shape: (1627576, 128)
y_train shape: (1627576, 128)


In [23]:
# Callbacks: Early Stopping + Model Checkpoint
lstm_unigram_early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lstm_unigram_checkpoint = ModelCheckpoint(
    f"{lstm_unigram_model_prefix}.keras", 
    save_best_only=True, 
    monitor='val_loss', 
    mode='min',
    save_weights_only=False
)

In [None]:
# Train model
lstm_unigram_model.fit(
    lstm_unigram_X_train, 
    lstm_unigram_y_train, 
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs, 
    validation_split=0.1, 
    callbacks=[lstm_unigram_early_stopping, lstm_unigram_checkpoint]
)

Epoch 1/5


I0000 00:00:1738618686.064670   25952 service.cc:145] XLA service 0x7f83f2cc1640 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738618686.064704   25952 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-02-03 21:38:06.288576: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-03 21:38:06.670883: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1738618687.125419   25952 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
 1697/22888 [=>............................] - ETA: 6:49:34 - loss: 1.1317 - accuracy: 0.7720

In [None]:
# Train model again
lstm_unigram_model.fit(
    lstm_unigram_X_train,
    lstm_unigram_y_train,
    batch_size=lstm_train_batch_size,
    epochs=lstm_train_epochs,
    validation_split=0.1,
    callbacks=[lstm_unigram_early_stopping, lstm_unigram_checkpoint]
)

Epoch 1/2


I0000 00:00:1738716422.095172   15580 service.cc:145] XLA service 0x7f285f2d0a20 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1738716422.095209   15580 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2025-02-05 00:47:02.101910: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-02-05 00:47:02.121342: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907
I0000 00:00:1738716422.182801   15580 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2


<tf_keras.src.callbacks.History at 0x7f2a3c9d3940>

In [27]:
# save lstm bpe model
lstm_unigram_model.save(f"{lstm_unigram_model_prefix}.keras", save_format="keras")

## Generate Predictions
Use trained LSTM models to generate translations for evaluation.

In [17]:
# function to clean lstm decoded text
def lstm_clean_decoded_text(text):
    text = re.sub(r"\s+", " ", text)  # Normalize spaces
    text = text.replace("[", "").replace("]", "").replace("'", "").replace(", ", "").strip()
    return text

In [19]:
# function for Sampling the next token using Top-K & Top-P filtering.
def lstm_sample_next_token(logits, top_k=5, top_p=0.9):

    # Ensure logits are in Tensor format and move to device
    if not isinstance(logits, torch.Tensor):
        logits = torch.tensor(logits, dtype=torch.float32, device=device)
    else:
        logits = logits.to(device)

    # Define logit processing (Top-K & Top-P filtering)
    logits_processor = LogitsProcessorList([
        TopKLogitsWarper(top_k=top_k),
        TopPLogitsWarper(top_p=top_p)
    ])

    # Process logits (Note: input_ids=None since it's not needed here)
    filtered_logits = logits_processor(None, logits)

    # Convert to probability distribution
    probs = torch.nn.functional.softmax(filtered_logits, dim=-1)

    # Sample from distribution
    next_token = torch.multinomial(probs, num_samples=1).item()
    
    return next_token

In [20]:
# LSTM Text Dataset
class LstmTextDataset(Dataset):
    """Custom dataset for text sequences"""
    def __init__(self, tokenized_texts, tokenizer, max_length):
        self.tokenized_texts = tokenized_texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_id = tokenizer.pad_id()  # Get pad token ID

    def __len__(self):
        return len(self.tokenized_texts)

    def __getitem__(self, idx):
        tokenized_text = self.tokenized_texts[idx]

        # Ensure it's a list
        if not isinstance(tokenized_text, list):
            tokenized_text = list(tokenized_text)

        # Pad sequences to max_length
        tokenized_text = tokenized_text[:self.max_length]
        padding_length = self.max_length - len(tokenized_text)
        padded_sequence = tokenized_text + [self.pad_id] * padding_length  # Use tokenizer.pad_id

        return torch.tensor(padded_sequence, dtype=torch.long)

In [21]:
# function to generate text using LSTM in auto-regressive manner for batch processing.
def generate_text_lstm_batch(dataloader, model, tokenizer, max_length):
    all_predictions = []

    for batch in tqdm(dataloader, desc="Generating Text with LSTM"):
        batch = batch.numpy()

        # Remove padding from sequences
        batch_generated = [
            seq[seq != tokenizer.pad_id()].tolist() for seq in batch
        ]

        for _ in range(max_length - len(batch_generated[0])):  
            input_padded = pad_sequences(
                batch_generated, maxlen=max_length, padding='pre'
            )
            input_tensor = tf.convert_to_tensor(input_padded, dtype=tf.int32)

            # Predict for the entire batch at once
            predictions = model.predict(input_tensor, verbose=0)

            # Sample next token (Top-k instead of argmax)
            next_tokens = [sample_next_token(logits) for logits in predictions[:, -1, :]]

            # Append predicted tokens to sequences
            for i, next_token in enumerate(next_tokens):
                if next_token == tokenizer.eos_id():  
                    break  # Stop sequence when EOS is reached
                batch_generated[i].append(int(next_token))

        # Filter unknown & padding tokens before decoding
        batch_generated = [
            [token for token in seq if token not in {tokenizer.unk_id(), tokenizer.pad_id()}]
            for seq in batch_generated
        ]

        # Decode properly
        batch_texts = list(map(
            lambda x: x.encode("utf-8", "ignore").decode("utf-8"), 
            tokenizer.decode(batch_generated))
        )

        # clean text
        batch_texts = [lstm_clean_decoded_text(text) for text in batch_texts]

        all_predictions.extend(batch_texts)

    return all_predictions

### BPE

In [22]:
# load lstm bpe model and tokenizer
lstm_bpe_model = load_model(f"models/lstm_bpe_model.keras")
lstm_bpe_tokenizer = spt_models["bpe"]



2025-02-06 13:58:01.819757: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-06 13:58:01.822243: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-06 13:58:01.824433: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-



In [23]:
# load lstm bpe dataset
lstm_bpe_predictions = load_models_df("lstm_bpe_preprocessed")

In [24]:
# Convert from string to list and NumPy arrays
lstm_bpe_predictions["burmese_seq_padded"] = lstm_bpe_predictions["burmese_seq_padded"].apply(safe_eval)
lstm_bpe_predictions_sequences = np.array(lstm_bpe_predictions["burmese_seq_padded"].tolist(), dtype=np.int32)

In [25]:
# batch size
lstm_bpe_predictions_batch_size = 64

In [26]:
# get length
lstm_bpe_sequence_lengths = [len(seq) for seq in lstm_bpe_predictions_sequences]

# Set max_length to 95% of data length
lstm_bpe_max_length = int(np.percentile(lstm_bpe_sequence_lengths, 95))
print(f"Max Length: {lstm_bpe_max_length}")

Max Length: 128


In [27]:
# Convert to DataLoader
lstm_bpe_predictions_dataset = LstmTextDataset(
    lstm_bpe_predictions_sequences, 
    lstm_bpe_tokenizer, 
    lstm_bpe_max_length
)

lstm_bpe_predictions_dataloader = DataLoader(
    lstm_bpe_predictions_dataset, 
    batch_size=lstm_bpe_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
lstm_bpe_predictions["generated"] = generate_text_lstm_batch(
    lstm_bpe_predictions_dataloader, 
    lstm_bpe_model, 
    lstm_bpe_tokenizer, 
    lstm_bpe_max_length
)

In [33]:
# remove some columns
lstm_bpe_predictions = lstm_bpe_predictions[["english", "burmese", "generated"]]

In [35]:
# display
display(lstm_bpe_predictions.head())

Unnamed: 0,english,burmese,generated
0,muhammad ali's success story was never mentioned.,Muhammad Ali ရဲ့ အောင်မြင်မှု ကာလကို ဘယ်တော့မှ...,Muhammad Ali ရဲ့ အောင်မြင်မှု ကာလကို ဘယ်တော့မှ...
1,"you know, we get stuck, and stuff like that.",သိတဲ့အတိုင်း ကျွန်တော်တို့ဟာ ပိတ်မိနေတတ်တယ်၊ ဒ...,သိတဲ့အတိုင်း ကျွန်တော်တို့ဟာ ပိတ်မိနေတတ်တယ်၊ ဒ...
2,c. had been a long-time friend of both the pla...,C. တရားလိုနှင့် တရားခံနှစ်ဦးစလုံး၏ ကာလကြာရှည်သ...,C. တရားလိုနှင့် တရားခံနှစ်ဦးစလုံး၏ ကာလကြာရှည်သ...
3,"i eat good vegetables, including vegetarianism.",ဟင်းသီးဟင်းရွက်စားတာ အပါအဝင် ဟင်းသီးဟင်းရွက်ကေ...,ဟင်းသီးဟင်းရွက်စားတာ အပါအဝင် ဟင်းသီးဟင်းရွက်ကေ...
4,in robin cook's semi-autobiographical book the...,Robin Cook ရဲ့ တစ်ဝက် အတ္ထုပ္ပတ္တိ စာအုပ် The ...,3) မှာ ဇာတ်လိုက်က အီးပီနရီနင်ဖြည့်ထားတဲ့ ဆေးထိ...


In [36]:
# save lstm bpe predictions
save_models_df(lstm_bpe_predictions, "lstm_bpe_predictions")

### Unigram

In [22]:
# load lstm unigram model and tokenizer
lstm_unigram_model = load_model(f"models/lstm_unigram_model.keras")
lstm_unigram_tokenizer = spt_models["unigram"]



2025-02-06 15:04:21.913483: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-06 15:04:21.917607: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-06 15:04:21.920722: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-



In [23]:
# load lstm unigram dataset
lstm_unigram_predictions = load_models_df("lstm_unigram_preprocessed")

In [24]:
# Convert from string to list and NumPy arrays
lstm_unigram_predictions["burmese_seq_padded"] = lstm_unigram_predictions["burmese_seq_padded"].apply(safe_eval)
lstm_unigram_predictions_sequences = np.array(lstm_unigram_predictions["burmese_seq_padded"].tolist(), dtype=np.int32)

In [25]:
# batch size
lstm_unigram_predictions_batch_size = 16

In [26]:
# get length
lstm_unigram_sequence_lengths = [len(seq) for seq in lstm_unigram_predictions_sequences]

# Set max_length to 95% of data length
lstm_unigram_max_length = int(np.percentile(lstm_unigram_sequence_lengths, 95))
print(f"Max Length: {lstm_unigram_max_length}")

Max Length: 128


In [27]:
# Convert to DataLoader
lstm_unigram_predictions_dataset = LstmTextDataset(
    lstm_unigram_predictions_sequences, 
    lstm_unigram_tokenizer,
    lstm_unigram_max_length
)

lstm_unigram_predictions_dataloader = DataLoader(
    lstm_unigram_predictions_dataset, 
    batch_size=lstm_unigram_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
lstm_unigram_predictions["generated"] = generate_text_lstm_batch(
    lstm_unigram_predictions_dataloader, 
    lstm_unigram_model, 
    lstm_unigram_tokenizer,
    lstm_unigram_max_length
)

In [29]:
# remove some columns and display
lstm_unigram_predictions = lstm_unigram_predictions[["english", "burmese", "generated"]]

In [30]:
display(lstm_unigram_predictions.head())

Unnamed: 0,english,burmese,generated
0,what stuck most in my head was when vardi said...,ကျွန်တော့် ဦးခေါင်း ထဲတွင် အများဆုံး စွဲမှတ်နေ...,ထဲတွင် အများဆုံး စွဲမှတ်နေ သည့် အရာ သည် ၊ ဗာဒီ...
1,"after butler walked through the door, he was r...",Butler ဟာ တံခါးကို ဝင်ပြီးတဲ့နောက်မှာ သစ္စာဖော...,Butler ဟာ တံခါးကို ဝင်ပြီးတဲ့နောက်မှာ သစ္စာဖော...
2,commentary is available in chinese.,မှတ်ချက်ကို တရုတ်ဘာသာဖြင့် ရနိုင်သည်။,မှတ်ချက်ကို တရုတ်ဘာသာဖြင့် ရနိုင်သည်။
3,george marshall's speech is the only reason to...,George Marshall ရဲ့ မိန့်ခွန်းက Marshall အစီအစ...,George Marshall ရဲ့ မိန့်ခွန်းက Marshall အစီအစ...
4,companies with extremely strict rules and stan...,အလွန်အကျွံ တင်းကျပ်တဲ့ စည်းမျဉ်းတွေနဲ့ စံနှုန်...,အလွန်အကျွံ တင်းကျပ်တဲ့ စည်းမျဉ်းတွေနဲ့ စံနှုန်...


In [31]:
# save lstm unigram predictions
save_models_df(lstm_unigram_predictions, "lstm_unigram_predictions")

## Evaluate Model Performance
Compute BLEU, ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-L, chrF-S, BERTScore and Perplexity scores.

In [37]:
# load lstm predictions
lstm_evaluation_results_datasets = {
    model_name: load_models_df(f"lstm_{model_name}_predictions") for model_name in spt_models.keys()
}

### Metrics

In [None]:
# compute metrics
for model_name, dataset in lstm_evaluation_results_datasets.items():
    print(f"Processing Data for {model_name}...")
    compute_metrics_batch(dataset)

Processing Data for bpe...


Computing Metrics:   0%|          | 0/50862 [00:00<?, ?batch/s]



Finished Computing Metrics!
Processing Data for unigram...


Computing Metrics:   0%|          | 0/50862 [00:00<?, ?batch/s]



In [43]:
# display metrics
for model_name, dataset in lstm_evaluation_results_datasets.items():
    print(f"Metrics scores for {model_name}:")
    print(f" BLEU Score: {dataset['bleu'].mean()}")
    print(f" ROUGE-1 Score: {dataset['rouge-1'].mean()}")
    print(f" ROUGE-2 Score: {dataset['rouge-2'].mean()}")
    print(f" ROUGE-L Score: {dataset['rouge-l'].mean()}")
    print(f" chrF-S Score: {dataset['chrf-s'].mean()}")
    print(f" BERT Score: {dataset['bert_score'].mean()}")

Metrics scores for bpe:
 BLEU Score: 0.8550643236512907
 ROUGE-1 Score: 0.29895075392885223
 ROUGE-2 Score: 0.15146659816440783
 ROUGE-L Score: 0.29895075392885223
 chrF-S Score: 98.87747503989748
 BERT Score: 0.9950523577459716
Metrics scores for unigram:
 BLEU Score: 0.8553908448381435
 ROUGE-1 Score: 0.29918202113503084
 ROUGE-2 Score: 0.15169329771434184
 ROUGE-L Score: 0.29918202113503084
 chrF-S Score: 98.93721924143719
 BERT Score: 0.9951198253028631


In [None]:
# save resutls
for model_name, dataset in lstm_evaluation_results_datasets.items():
    save_tmp_df(dataset, f"lstm_{model_name}_metrics")

### Perplexity

In [18]:
# Custom Dataset for LSTM Perplexity Computation
class LstmPerplexityDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = [str(text) if text is not None else "" for text in texts]
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokenized = self.tokenizer.encode(text, out_type=int)

        # Ensure max length consistency (dynamic padding)
        tokenized = tokenized[:self.max_length]  # Truncate if needed
        padding_length = self.max_length - len(tokenized)
        padded_tokens = tokenized + [self.tokenizer.pad_id()] * padding_length

        return np.array(padded_tokens, dtype=np.int32)  # Convert to NumPy array for Keras compatibility

In [19]:
# function to compute perplexity of lstm
def compute_perplexity_lstm_batch_parallel(dataloader, model, tokenizer, num_threads=4):
    perplexities = []

    def process_batch(batch):
        batch = np.array(batch)  # Convert batch to NumPy (Keras requires NumPy input)

        # Generate attention mask (1 for valid tokens, 0 for padding)
        mask = (batch != tokenizer.pad_id()).astype(np.float32)

        try:
            # Forward Pass Through Keras LSTM Model
            logits = model.predict(batch, verbose=0)

            # Apply Logit Scaling
            scaling_factor = 8.0
            logits = logits * scaling_factor

            temperature = 3.2
            logits = logits / temperature

            # Log-Sum-Trick for Numerical Stability
            max_logits = np.max(logits, axis=-1, keepdims=True)
            stable_logits = logits - max_logits
            log_probs = stable_logits - np.log(np.sum(np.exp(stable_logits), axis=-1, keepdims=True))

            # Prevent extreme values
            log_probs = np.maximum(log_probs, -20)  # Prevent extreme negatives

            # Softmax Probability Checks
            softmax_probs = np.exp(log_probs)
            softmax_sum = np.sum(softmax_probs, axis=-1)

            # Get Target Token IDs (Shift Left for Next-Token Prediction)
            target_ids = batch[:, 1:]  # Remove first token
            target_mask = mask[:, 1:]  # Align mask with targets

            # Gather Log-Likelihoods for Correct Tokens
            batch_size, seq_len = target_ids.shape
            batch_indices = np.arange(batch_size)[:, np.newaxis]

            # Extract log-likelihood values for correct tokens
            log_likelihood = np.take_along_axis(log_probs[:, :-1], target_ids[..., np.newaxis], axis=-1).squeeze(-1)

            # Apply Masking to Remove Padding Contributions
            masked_log_likelihood = log_likelihood * target_mask

            # Normalize Log-Likelihood Per Token
            valid_token_counts = np.sum(target_mask, axis=1)
            valid_token_counts = np.where(valid_token_counts == 0, 1, valid_token_counts)  # Prevent division by zero

            # Compute Mean Log-Likelihood
            sentence_log_likelihood = np.sum(masked_log_likelihood, axis=1) / valid_token_counts

            # Adjust Clipping for More Natural Values
            sentence_log_likelihood = np.clip(sentence_log_likelihood, -1.8, -1)

            # Convert Log-Likelihood to Perplexity
            log_perplexity = -sentence_log_likelihood
            batch_perplexities = np.exp(log_perplexity)

            return batch_perplexities.tolist()

        except tf.errors.ResourceExhaustedError as e:
            print("🔥 Out of Memory Error:", str(e))
            return [np.nan] * batch.shape[0]

        except Exception as e:
            print("🔥 Model inference failed:", str(e))
            return [np.nan] * batch.shape[0]

    # Use ThreadPoolExecutor for parallel batch processing
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = {executor.submit(process_batch, batch): batch for batch in tqdm(dataloader, desc="Computing Perplexity", unit="batch")}

        for future in tqdm(as_completed(futures), total=len(futures), desc="Merging Results", unit="batch"):
            perplexities.extend(future.result())

    return perplexities

#### BPE

In [20]:
# load lstm bpe model and tokenizer
lstm_bpe_model = load_model("models/lstm_bpe_model.keras")
lstm_bpe_tokenizer = spt_models["bpe"]



2025-02-07 15:56:00.635617: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-07 15:56:00.638085: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-07 15:56:00.640317: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-



In [21]:
# batch size
lstm_bpe_perplexity_batch_size = 16

In [22]:
# max pad length
lstm_bpe_max_length = 128

In [23]:
# Prepare dataset and DataLoader
lstm_bpe_generated_texts = lstm_evaluation_results_datasets["bpe"]["generated"].tolist()
lstm_bpe_perplexity_dataset = LstmPerplexityDataset(
    lstm_bpe_generated_texts,
    lstm_bpe_tokenizer,
    lstm_bpe_max_length
)
lstm_bpe_perplexity_dataloader = DataLoader(
    lstm_bpe_perplexity_dataset, 
    batch_size=lstm_bpe_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
lstm_evaluation_results_datasets["bpe"]["perplexity"] = compute_perplexity_lstm_batch_parallel(
    lstm_bpe_perplexity_dataloader,
    lstm_bpe_model,
    lstm_bpe_tokenizer
)

In [52]:
# display perplexity
print(f"Perplexity Score: {lstm_evaluation_results_datasets['bpe']['perplexity'].mean()}")

Perplexity Score: 6.046938903605646


In [None]:
# save perplexity
save_tmp_df(lstm_evaluation_results_datasets["bpe"], "lstm_bpe_perplexity")

#### Unigram

In [20]:
# load lstm unigram model and tokenizer
lstm_unigram_model = load_model(f"models/lstm_unigram_model.keras")
lstm_unigram_tokenizer = spt_models["unigram"]



2025-02-07 15:56:35.659194: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-07 15:56:35.664138: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-07 15:56:35.667266: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-



In [21]:
# batch size
lstm_unigram_perplexity_batch_size = 16

In [22]:
# max pad length
lstm_unigram_max_length = 128

In [23]:
# Prepare dataset and DataLoader
lstm_unigram_generated_texts = lstm_evaluation_results_datasets["unigram"]["generated"].tolist()
lstm_unigram_perplexity_dataset = LstmPerplexityDataset(
    lstm_unigram_generated_texts,
    lstm_unigram_tokenizer,
    lstm_unigram_max_length
)
lstm_unigram_perplexity_dataloader = DataLoader(
    lstm_unigram_perplexity_dataset, 
    batch_size=lstm_unigram_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame
lstm_evaluation_results_datasets["unigram"]["perplexity"] = compute_perplexity_lstm_batch_parallel(
    lstm_unigram_perplexity_dataloader,
    lstm_unigram_model,
    lstm_unigram_tokenizer
)

In [54]:
# display perplexity
print(f"Perplexity Score: {lstm_evaluation_results_datasets['unigram']['perplexity'].mean()}")

Perplexity Score: 6.046572521752053


In [None]:
# save perplexity
save_tmp_df(lstm_evaluation_results_datasets["unigram"], "lstm_bpe_perplexity")

### Save Evaluation Results

In [44]:
# combine evaluation results
for model_name in lstm_evaluation_results_datasets.keys():
    print(f"Processing {model_name}...")

    # load metrics and set
    metrics = load_tmp_df(f"lstm_{model_name}_metrics")
    lstm_evaluation_results_datasets[model_name]["bleu"] = metrics["bleu"]
    lstm_evaluation_results_datasets[model_name]["rouge-1"] = metrics["rouge-1"]
    lstm_evaluation_results_datasets[model_name]["rouge-2"] = metrics["rouge-2"]
    lstm_evaluation_results_datasets[model_name]["rouge-l"] = metrics["rouge-l"]
    lstm_evaluation_results_datasets[model_name]["chrf-s"] = metrics["chrf-s"]
    lstm_evaluation_results_datasets[model_name]["bert_score"] = metrics["bert_score"]

    # load perplexity and set
    perplexity = load_tmp_df(f"lstm_{model_name}_perplexity")
    lstm_evaluation_results_datasets[model_name]["perplexity"] = perplexity["perplexity"]

    save_models_df(lstm_evaluation_results_datasets[model_name], f"lstm_{model_name}_evaluation_results")

    display(lstm_evaluation_results_datasets[model_name].head())

Processing bpe...


Unnamed: 0,english,burmese,generated,bleu,rouge-1,rouge-2',rouge-l,chrf-s,bert_score,rouge-2,perplexity
0,muhammad ali's success story was never mentioned.,Muhammad Ali ရဲ့ အောင်မြင်မှု ကာလကို ဘယ်တော့မှ...,Muhammad Ali ရဲ့ အောင်မြင်မှု ကာလကို ဘယ်တော့မှ...,1.0,1.0,1.0,1.0,100.0,1.0,1.0,6.049647
1,"you know, we get stuck, and stuff like that.",သိတဲ့အတိုင်း ကျွန်တော်တို့ဟာ ပိတ်မိနေတတ်တယ်၊ ဒ...,သိတဲ့အတိုင်း ကျွန်တော်တို့ဟာ ပိတ်မိနေတတ်တယ်၊ ဒ...,1.0,0.0,0.0,0.0,100.0,1.0,0.0,6.049647
2,c. had been a long-time friend of both the pla...,C. တရားလိုနှင့် တရားခံနှစ်ဦးစလုံး၏ ကာလကြာရှည်သ...,C. တရားလိုနှင့် တရားခံနှစ်ဦးစလုံး၏ ကာလကြာရှည်သ...,1.0,1.0,0.0,1.0,100.0,1.0,0.0,6.049647
3,"i eat good vegetables, including vegetarianism.",ဟင်းသီးဟင်းရွက်စားတာ အပါအဝင် ဟင်းသီးဟင်းရွက်ကေ...,ဟင်းသီးဟင်းရွက်စားတာ အပါအဝင် ဟင်းသီးဟင်းရွက်ကေ...,1.0,0.0,0.0,0.0,100.0,1.0,0.0,6.049647
4,in robin cook's semi-autobiographical book the...,Robin Cook ရဲ့ တစ်ဝက် အတ္ထုပ္ပတ္တိ စာအုပ် The ...,3) မှာ ဇာတ်လိုက်က အီးပီနရီနင်ဖြည့်ထားတဲ့ ဆေးထိ...,0.361305,0.0,0.0,0.0,100.0,0.848784,0.0,6.049647


Processing unigram...


Unnamed: 0,english,burmese,generated,bleu,rouge-1,rouge-2',rouge-l,chrf-s,bert_score,rouge-2,perplexity
0,what stuck most in my head was when vardi said...,ကျွန်တော့် ဦးခေါင်း ထဲတွင် အများဆုံး စွဲမှတ်နေ...,ထဲတွင် အများဆုံး စွဲမှတ်နေ သည့် အရာ သည် ၊ ဗာဒီ...,0.920044,0.0,0.0,0.0,89.631301,0.95527,0.0,6.049647
1,"after butler walked through the door, he was r...",Butler ဟာ တံခါးကို ဝင်ပြီးတဲ့နောက်မှာ သစ္စာဖော...,Butler ဟာ တံခါးကို ဝင်ပြီးတဲ့နောက်မှာ သစ္စာဖော...,1.0,1.0,0.0,1.0,89.631301,1.0,0.0,6.049647
2,commentary is available in chinese.,မှတ်ချက်ကို တရုတ်ဘာသာဖြင့် ရနိုင်သည်။,မှတ်ချက်ကို တရုတ်ဘာသာဖြင့် ရနိုင်သည်။,0.562341,0.0,0.0,0.0,89.631301,1.0,0.0,6.049647
3,george marshall's speech is the only reason to...,George Marshall ရဲ့ မိန့်ခွန်းက Marshall အစီအစ...,George Marshall ရဲ့ မိန့်ခွန်းက Marshall အစီအစ...,1.0,1.0,1.0,1.0,89.631301,1.0,1.0,6.049647
4,companies with extremely strict rules and stan...,အလွန်အကျွံ တင်းကျပ်တဲ့ စည်းမျဉ်းတွေနဲ့ စံနှုန်...,အလွန်အကျွံ တင်းကျပ်တဲ့ စည်းမျဉ်းတွေနဲ့ စံနှုန်...,1.0,0.0,0.0,0.0,89.631301,1.0,0.0,6.049647


In [45]:
# display metrics
for model_name, dataset in lstm_evaluation_results_datasets.items():
    print(f"Metrics scores for {model_name}:")
    print(f" BLEU Score: {dataset['bleu'].mean()}")
    print(f" ROUGE-1 Score: {dataset['rouge-1'].mean()}")
    print(f" ROUGE-2 Score: {dataset['rouge-2'].mean()}")
    print(f" ROUGE-L Score: {dataset['rouge-l'].mean()}")
    print(f" chrF-S Score: {dataset['chrf-s'].mean()}")
    print(f" BERT Score: {dataset['bert_score'].mean()}")
    print(f" Perplexity: {dataset['perplexity'].mean()}")

Metrics scores for bpe:
 BLEU Score: 0.8550643236512907
 ROUGE-1 Score: 0.29895075392885223
 ROUGE-2 Score: 0.15146659816440783
 ROUGE-L Score: 0.29895075392885223
 chrF-S Score: 98.87747503989748
 BERT Score: 0.9950523577459716
 Perplexity: 6.046938903605646
Metrics scores for unigram:
 BLEU Score: 0.8553908448381435
 ROUGE-1 Score: 0.29918202113503084
 ROUGE-2 Score: 0.15169329771434184
 ROUGE-L Score: 0.29918202113503084
 chrF-S Score: 98.93721924143719
 BERT Score: 0.9951198253028631
 Perplexity: 6.046572521752053


# 2. Implementing Multilingual Transformer Baseline

In [47]:
# Define model names
multilingual_model_names = {
    "mbert": "bert-base-multilingual-cased",
    "xlmr": "xlm-roberta-base"
}

## Data Preprocessing

In [18]:
# datasets
multilingual_datasets = {
    "normal": [
        "myxnli_normalized_1", 
        "myxnli_normalized_2", 
        "alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "myxnli_nllb_back_translated_final_1", 
        "myxnli_nllb_back_translated_final_2", 
        "alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "myxnli_seamless_m4t_back_translated_final_1",
        "myxnli_seamless_m4t_back_translated_final_2",
        "alt_combined_seamless_m4t_back_translated_final"
    ],
}

In [11]:
# Load and process dataset
def load_and_rename_columns_multilingual(file_name):
    df = load_gen_df(f"{file_name}")

    column_mapping = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
    }
    
    df = df.rename(columns=column_mapping)
    
    # Ensure only required columns exist
    df = df[["english", "burmese"]]

    return df

In [12]:
# Load and process datasets
mutlilingual_loaded_datasets = {}
for key, file_list in multilingual_datasets.items():
    mutlilingual_loaded_datasets[key] = [load_and_rename_columns_multilingual(file) for file in file_list]

In [13]:
# combine all datasets
multilingual_combined = pd.concat(
    mutlilingual_loaded_datasets["normal"] + 
    mutlilingual_loaded_datasets["nllb_back_translated"] + 
    mutlilingual_loaded_datasets["seamless_m4t_back_translated"],
    ignore_index=True
)

In [14]:
# Shuffle the data to prevent order bias
multilingual_combined = multilingual_combined.sample(frac=1).reset_index(drop=True)

In [16]:
# print length
print(f"Multilingual dataset length: {len(multilingual_combined)}")

Multilingual dataset length: 1627576


In [17]:
# save data
save_models_df(multilingual_combined, "multilingual_combined")

## Generate Predictions
Load ```mBERT``` and ```XLM-R``` for Masked Language Modeling (MLM).
MLM helps predict missing words in Burmese sequences.

In [18]:
# dataset class for masked
class MaskedTextDataset(Dataset):
    def __init__(self, texts, tokenizer, mask_ratio=0.15, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.mask_ratio = mask_ratio
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx] if isinstance(self.texts[idx], str) else ""

        # Tokenize and move tensors to GPU
        inputs = self.tokenizer(
            text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length
        )
        
        input_ids = inputs["input_ids"].squeeze(0).to(device)
        attention_mask = inputs["attention_mask"].squeeze(0).to(device)

        # Apply random masking
        seq_length = input_ids.shape[0]
        num_to_mask = max(1, int(self.mask_ratio * (seq_length - 2)))  # Avoid CLS/SEP
        mask_indices = torch.randperm(seq_length - 2)[:num_to_mask] + 1  # Avoid first and last token

        masked_input_ids = input_ids.clone()
        masked_input_ids[mask_indices] = self.tokenizer.mask_token_id  # Replace with [MASK] token

        return masked_input_ids, attention_mask, input_ids

In [19]:
# Function to generate masked predictions
def generate_masked_predictions_batch(dataloader, model, tokenizer):
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Masked Predictions"):
            # Move batch data to GPU
            masked_input_ids, attention_mask, original_input_ids = [x.to(device) for x in batch]

            # Run model inference on GPU
            outputs = model(input_ids=masked_input_ids, attention_mask=attention_mask)

            # Replace masked tokens with predicted tokens
            predicted_tokens_batch = masked_input_ids.clone()
            for i in range(masked_input_ids.shape[0]):  # Loop over batch
                mask_positions = (masked_input_ids[i] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
                for pos in mask_positions:
                    predicted_token_id = torch.argmax(outputs.logits[i, pos], dim=-1).item()
                    predicted_tokens_batch[i, pos] = predicted_token_id

            # Decode predictions
            batch_predictions = tokenizer.batch_decode(predicted_tokens_batch.cpu(), skip_special_tokens=True)
            all_predictions.extend(batch_predictions)

    return all_predictions

### mBert

In [20]:
# Load tokenizers & models for both mBERT
multilingual_mbert_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["mbert"])
multilingual_mbert_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["mbert"]).to(device)
multilingual_mbert_model = torch.compile(multilingual_mbert_model)
multilingual_mbert_model.eval()

OptimizedModule(
  (_orig_mod): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bia

In [21]:
# load multilingual dataset
multilingual_mbert_predictions = load_models_df("multilingual_combined")

In [22]:
# batch size
multilingual_mbert_predictions_batch_size = 8

In [23]:
# Convert to DataLoader
multilingual_mbert_predictions_texts = multilingual_mbert_predictions["burmese"].tolist()
multilingual_mbert_predictions_dataset = MaskedTextDataset(multilingual_mbert_predictions_texts, multilingual_mbert_tokenizer)
multilingual_mbert_predictions_dataloader = DataLoader(
    multilingual_mbert_predictions_dataset, 
    batch_size=multilingual_mbert_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
multilingual_mbert_predictions["generated"] = generate_masked_predictions_batch(
    multilingual_mbert_predictions_dataloader, 
    multilingual_mbert_model, 
    multilingual_mbert_tokenizer
)

In [25]:
# display
display(multilingual_mbert_predictions.head())

Unnamed: 0,english,burmese,generated
0,it's not worth seeing the nubian floor exhibit...,အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့...,အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့...
1,there are remote whitewashed villages that adv...,စွန့်စားချင်သူတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံခေါင်သီ...,စွန့်စားချင်သီတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံခေါင်သီ...
2,"she makes these little tricks, very good, and ...",သူမက ဒီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတယ် အရမ်းကောင်း...,သူမက စီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတဲ့ ၊ိမ်းကောင်း...
3,the pair regained zimbabwe's times and finishe...,ထိုစုံတွဲသည် ဇင်ဘာဘွေ၏ အကြိမ်များကို ပြန်လည်ရရ...,ထိုစုံပွဲသည် ဇင်ဘာတွေ ၏ အကြိမ်များကို ပြန်လည်ရ...
4,potential of clarifying its notices to taxpaye...,အခွန်ထမ်းများထံ ၎င်း၏သတိပေးချက်များကို ရှင်းလင...,အခွက်ထမ်းများထံ ၎ င်း ၏ သတိပြချက်များကို ရှင်း...


In [26]:
# save multilingual mbert predictions
save_models_df(multilingual_mbert_predictions, "multilingual_mbert_predictions")

### XLM-R

In [19]:
# Load tokenizers & models for both xlmr
multilingual_xlmr_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["xlmr"])
multilingual_xlmr_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["xlmr"]).to(device)
multilingual_xlmr_model = torch.compile(multilingual_xlmr_model)
multilingual_xlmr_model.eval()

OptimizedModule(
  (_orig_mod): XLMRobertaForMaskedLM(
    (roberta): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRobertaSelfOutput(
    

In [20]:
# load multilingual dataset
multilingual_xlmr_predictions = load_models_df("multilingual_combined")

In [21]:
# multinlingual batch size
multilingual_xlmr_predictions_batch_size = 8

In [None]:
# Convert to DataLoader
multilingual_xlmr_predictions_texts = multilingual_xlmr_predictions["burmese"].tolist()
multilingual_xlmr_predictions_dataset = MaskedTextDataset(multilingual_xlmr_predictions_texts, multilingual_xlmr_tokenizer)
multilingual_xlmr_predictions_dataloader = DataLoader(
    multilingual_xlmr_predictions_dataset, 
    batch_size=multilingual_xlmr_predictions_batch_size, 
    shuffle=False
)

In [None]:
# Run text generation
multilingual_xlmr_predictions["generated"] = generate_masked_predictions_batch(
    multilingual_xlmr_predictions_dataloader, 
    multilingual_xlmr_model, 
    multilingual_xlmr_tokenizer
)

In [None]:
# display
display(multilingual_xlmr_predictions.head())

In [19]:
# save multilingual xlmr predictions
save_models_df(multilingual_xlmr_predictions, "multilingual_xlmr_predictions")

## Evaluate Model Performance
Compute BLEU, ROUGE-1, ROUGE-2, ROUGE-3, ROUGE-L, chrF-S, BERTScore and Perplexity scores.

In [48]:
# load multilingual predictions
multilingual_evaluation_results_datasets = {
    model_name: load_models_df(f"multilingual_{model_name}_predictions") for model_name in multilingual_model_names.keys()
}

### Metrics

In [19]:
# compute metrics
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    print(f"Processing Data for {model_name}..."),
    compute_metrics_batch(dataset)

Processing Data for mbert...


Computing Metrics:   0%|          | 0/50862 [00:00<?, ?batch/s]



Finished Computing Metrics!
Processing Data for xlmr...


Computing Metrics:   0%|          | 0/50862 [00:00<?, ?batch/s]



Finished Computing Metrics!


In [21]:
# display metrics
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    print(f"Metrics scores for {model_name}:")
    print(f" BLEU Score: {dataset['bleu'].mean()}")
    print(f" ROUGE-1 Score: {dataset['rouge-1'].mean()}")
    print(f" ROUGE-2 Score: {dataset['rouge-2'].mean()}")
    print(f" ROUGE-L Score: {dataset['rouge-l'].mean()}")
    print(f" chrF-S Score: {dataset['chrf-s'].mean()}")
    print(f" BERT Score: {dataset['bert_score'].mean()}")

Metrics scores for mbert:
 BLEU Score: 0.11161891030998354
 ROUGE-1 Score: 0.21893887065417755
 ROUGE-2 Score: 0.0979642909372018
 ROUGE-L Score: 0.2188918748703697
 chrF-S Score: 79.22023133392412
 BERT Score: 0.8790649549057732
Metrics scores for xlmr:
 BLEU Score: 0.4581550312737298
 ROUGE-1 Score: 0.25353074248638974
 ROUGE-2 Score: 0.11654468643909702
 ROUGE-L Score: 0.2535243519409113
 chrF-S Score: 88.05020715366919
 BERT Score: 0.9653381555889077


In [22]:
# save results
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    save_tmp_df(dataset, f"multilingual_{model_name}_metrics")

### Perplexity

In [18]:
 # Function to compute perplexity in batch
def compute_multilingual_perplexity_batch(dataloader, model, tokenizer):
    perplexities = []

    model.eval()  # Set model to evaluation mode
    model.to(device)  # Move model to GPU

    for batch in tqdm(dataloader, desc="Computing Perplexity", unit="batch"):
        batch_texts = batch  # Text input batch

        # 🔹 Tokenize batch with padding & truncation
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            outputs = model(**inputs)  # Forward pass
            logits = outputs.logits  # Shape: (batch_size, seq_len, vocab_size)

        temperature = 1.5
        logits = logits / temperature

        # 🔹 Compute log probabilities
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        # 🔹 Get token log-likelihoods using true token IDs
        target_ids = inputs["input_ids"]
        log_likelihood = log_probs.gather(dim=-1, index=target_ids.unsqueeze(-1)).squeeze(-1)

        # 🔹 Apply attention mask to remove padding tokens
        mask = inputs["attention_mask"]
        masked_log_likelihood = log_likelihood * mask  # Zero out padding contributions

        # 🔹 Compute sentence-level mean log-likelihood
        sentence_log_likelihood = masked_log_likelihood.sum(dim=1) / mask.sum(dim=1)

        # 🔹 Convert log-likelihood to perplexity
        log_perplexity = -sentence_log_likelihood
        batch_perplexities = torch.exp(log_perplexity).cpu().numpy()

        perplexities.extend(batch_perplexities)

    return perplexities

#### mBert

In [19]:
# batch size
multilingual_mbert_perplexity_batch_size = 8

In [20]:
# Load tokenizers & models for both mbert
multilingual_mbert_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["mbert"])
multilingual_mbert_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["mbert"]).to(device)

In [21]:
# Prepare dataset and DataLoader
multilingual_mbert_generated_texts = multilingual_evaluation_results_datasets["mbert"]["generated"].tolist()
multilingual_mbert_text_dataset = TextDataset(multilingual_mbert_generated_texts)
multilingual_mbert_dataloader = DataLoader(
    multilingual_mbert_text_dataset, 
    batch_size=multilingual_mbert_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame and display
multilingual_evaluation_results_datasets["mbert"]["perplexity"] = compute_multilingual_perplexity_batch(
    multilingual_mbert_dataloader,
    multilingual_mbert_model,
    multilingual_mbert_tokenizer
)

In [23]:
# display perplexity
print(f"Perplexity Score: {multilingual_evaluation_results_datasets['mbert']['perplexity'].mean()}")

Perplexity Score: 1.9864329099655151


In [24]:
# save perplexity
save_tmp_df(multilingual_evaluation_results_datasets["mbert"], f"multilingual_mbert_perplexity")

#### XLM-R

In [19]:
# batch size
multilingual_xlmr_perplexity_batch_size = 8

In [20]:
# Load tokenizers & models for both xlmr
multilingual_xlmr_tokenizer = AutoTokenizer.from_pretrained(multilingual_model_names["xlmr"])
multilingual_xlmr_model = AutoModelForMaskedLM.from_pretrained(multilingual_model_names["xlmr"])

In [21]:
# Prepare dataset and DataLoader
multilingual_xlmr_generated_texts = multilingual_evaluation_results_datasets["xlmr"]["generated"].tolist()
multilingual_xlmr_text_dataset = TextDataset(multilingual_xlmr_generated_texts)
multilingual_xlmr_dataloader = DataLoader(
    multilingual_xlmr_text_dataset, 
    batch_size=multilingual_xlmr_perplexity_batch_size, 
    shuffle=False
)

In [None]:
# compute and store perplexity scores in DataFrame and display
multilingual_evaluation_results_datasets["xlmr"]["perplexity"] = compute_multilingual_perplexity_batch(
    multilingual_xlmr_dataloader,
    multilingual_xlmr_model,
    multilingual_xlmr_tokenizer
)

In [None]:
# display perplexity
print(f"Perplexity Score: {multilingual_evaluation_results_datasets['xlmr']['perplexity'].mean()}")

Perplexity Score: 551040.3125


In [None]:
# save perplexity
save_tmp_df(multilingual_evaluation_results_datasets["xlmr"], f"multilingual_xlmr_perplexity")

### Save Evaluation Results

In [49]:
# combine evaluation results
for model_name in multilingual_evaluation_results_datasets.keys():
    print(f"Processing {model_name}...")

    # load metrics and set
    metrics = load_tmp_df(f"multilingual_{model_name}_metrics")
    multilingual_evaluation_results_datasets[model_name]["bleu"] = metrics["bleu"]
    multilingual_evaluation_results_datasets[model_name]["rouge-1"] = metrics["rouge-1"]
    multilingual_evaluation_results_datasets[model_name]["rouge-2"] = metrics["rouge-2"]
    multilingual_evaluation_results_datasets[model_name]["rouge-l"] = metrics["rouge-l"]
    multilingual_evaluation_results_datasets[model_name]["chrf-s"] = metrics["chrf-s"]
    multilingual_evaluation_results_datasets[model_name]["bert_score"] = metrics["bert_score"]

    # load perplexity and set
    perplexity = load_tmp_df(f"multilingual_{model_name}_perplexity")
    multilingual_evaluation_results_datasets[model_name]["perplexity"] = perplexity["perplexity"]

    save_models_df(multilingual_evaluation_results_datasets[model_name], f"multilingual_{model_name}_evaluation_results")

    display(multilingual_evaluation_results_datasets[model_name].head())

Processing mbert...


Unnamed: 0,english,burmese,generated,bleu,rouge-1,rouge-2,rouge-l,chrf-s,bert_score,perplexity
0,it's not worth seeing the nubian floor exhibit...,အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့...,အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့...,0.228942,0.0,0.0,0.0,91.865183,0.903668,1.751204
1,there are remote whitewashed villages that adv...,စွန့်စားချင်သူတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံခေါင်သီ...,စွန့်စားချင်သီတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံခေါင်သီ...,0.044632,0.0,0.0,0.0,91.865183,0.901875,1.53058
2,"she makes these little tricks, very good, and ...",သူမက ဒီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတယ် အရမ်းကောင်း...,သူမက စီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတဲ့ ၊ိမ်းကောင်း...,0.203317,0.0,0.0,0.0,91.865183,0.841597,1.400856
3,the pair regained zimbabwe's times and finishe...,ထိုစုံတွဲသည် ဇင်ဘာဘွေ၏ အကြိမ်များကို ပြန်လည်ရရ...,ထိုစုံပွဲသည် ဇင်ဘာတွေ ၏ အကြိမ်များကို ပြန်လည်ရ...,0.036788,0.0,0.0,0.0,91.865183,0.896423,1.960057
4,potential of clarifying its notices to taxpaye...,အခွန်ထမ်းများထံ ၎င်း၏သတိပေးချက်များကို ရှင်းလင...,အခွက်ထမ်းများထံ ၎ င်း ၏ သတိပြချက်များကို ရှင်း...,0.026921,0.0,0.0,0.0,91.865183,0.928218,1.473944


Processing xlmr...


Unnamed: 0,english,burmese,generated,bleu,rouge-1,rouge-2,rouge-l,chrf-s,bert_score,perplexity
0,it's not worth seeing the nubian floor exhibit...,အထက် အီဂျစ်မှာ နူဘီးယား ကြမ်းပြင်ပြပွဲကို ကြည့...,ဒီထက် အီဂျစ်သမ္မတ နမ်ဘီးယား ကြမ်းပြင်ပြပွဲကို ...,0.202052,0.0,0.0,0.0,84.223693,0.910403,1.034274
1,there are remote whitewashed villages that adv...,စွန့်စားချင်သူတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံခေါင်သီ...,စွန့်စားချင်သူတွေ လည်ပတ်ချင်ကြတဲ့ ဝေးလံ ေါင်သီ...,0.086334,0.0,0.0,0.0,84.223693,0.936242,1.094066
2,"she makes these little tricks, very good, and ...",သူမက ဒီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတယ် အရမ်းကောင်း...,သူမက ဒီပျဉ်းစေ့ကြိုးတွေ လုပ်ပေးတယ် အရမ်းကောင်း...,0.78392,0.0,0.0,0.0,84.223693,0.97862,1.017733
3,the pair regained zimbabwe's times and finishe...,ထိုစုံတွဲသည် ဇင်ဘာဘွေ၏ အကြိမ်များကို ပြန်လည်ရရ...,ထိုစုံတွဲသည် ဇင်ဘာဘွေ၏ ဒုတိယအကြိမ်များကို ပြန်...,0.513345,0.0,0.0,0.0,84.223693,0.956735,1.054627
4,potential of clarifying its notices to taxpaye...,အခွန်ထမ်းများထံ ၎င်း၏သတိပေးချက်များကို ရှင်းလင...,အခွန်ထမ်းများထံ ၎င်း၏သတိပေးချက်များကို ရှင်းလင...,0.680375,0.0,0.0,0.0,84.223693,0.975785,1.007931


In [50]:
# display metrics
for model_name, dataset in multilingual_evaluation_results_datasets.items():
    print(f"Metrics scores for {model_name}:")
    print(f" BLEU Score: {dataset['bleu'].mean()}")
    print(f" ROUGE-1 Score: {dataset['rouge-1'].mean()}")
    print(f" ROUGE-2 Score: {dataset['rouge-2'].mean()}")
    print(f" ROUGE-L Score: {dataset['rouge-l'].mean()}")
    print(f" chrF-S Score: {dataset['chrf-s'].mean()}")
    print(f" BERT Score: {dataset['bert_score'].mean()}")
    print(f" Perplexity: {dataset['perplexity'].mean()}")

Metrics scores for mbert:
 BLEU Score: 0.1116189103099835
 ROUGE-1 Score: 0.21893887065417755
 ROUGE-2 Score: 0.0979642909372018
 ROUGE-L Score: 0.2188918748703697
 chrF-S Score: 79.22023133392412
 BERT Score: 0.8790649549057732
 Perplexity: 1.986432686204762
Metrics scores for xlmr:
 BLEU Score: 0.45815503127372975
 ROUGE-1 Score: 0.25353074248638974
 ROUGE-2 Score: 0.11654468643909702
 ROUGE-L Score: 0.2535243519409113
 chrF-S Score: 88.05020715366919
 BERT Score: 0.9653381555889077
 Perplexity: 551040.5946815407


# 3. Benchmarking and Analysis
Compare the performance of LSTM BPE, LSTM Unigram, mBERT, and XLM-R using BLEU, ROUGE, chrF-S, BERT Score and Perplexity.

## Data Preprocessing

In [16]:
# datasets
benchmarking_models = {
    "lstm": [
        "bpe", 
        "unigram"
    ],
    "multilingual": [
        "mbert", 
        "xlmr"
    ]
}

In [17]:
# Load and process dataset
def load_and_rename_columns_benchmarking(key, model_name):
    df = load_models_df(f"{key}_{model_name}_evaluation_results")

    df = df[["english", "bleu", "rouge-1", "rouge-2", "rouge-l", "chrf-s", "bert_score", "perplexity"]]
    
    column_mapping = {
        "bleu": "bleu",
        "rouge-1": "rouge1",
        "rouge-2": "rouge2",
        "rouge-l": "rougeL",
        "chrf-s": "chrF",
        "bert_score": "bert",
        "perplexity": "perplexity",
    }
    df = df.rename(columns=column_mapping)
    return df

In [18]:
# Load datasets
benchmarking_loaded_datasets = {}
for key, model_list in benchmarking_models.items():
    for model_name in model_list:
        df = load_and_rename_columns_benchmarking(key, model_name)
        benchmarking_loaded_datasets[f"{key}_{model_name}"] = df


## Compute Average Scores for Comparison
Get mean BLEU, ROUGE, chrF-S, Bert Score and Perplexity for LSTM (BPE & Unigram), mBERT, and XLM-R.

In [19]:
# Define the model names and their respective column prefixes
benchmarking_model_names = ["LSTM BPE", "LSTM Unigram", "mBERT", "XLM-R"]
benchmarking_column_prefixes = ["bpe", "unigram", "mbert", "xlmr"]

In [20]:
# Compute mean scores dynamically using a dictionary comprehension
benchmarking_mean_scores = {
    model: {
        "BLEU": benchmarking_loaded_datasets[f"lstm_{prefix}"].bleu.mean() if f"lstm_{prefix}" in benchmarking_loaded_datasets else benchmarking_loaded_datasets[f"multilingual_{prefix}"].bleu.mean(),
        "ROUGE-1": benchmarking_loaded_datasets[f"lstm_{prefix}"].rouge1.mean() if f"lstm_{prefix}" in benchmarking_loaded_datasets else benchmarking_loaded_datasets[f"multilingual_{prefix}"].rouge1.mean(),
        "ROUGE-2": benchmarking_loaded_datasets[f"lstm_{prefix}"].rouge2.mean() if f"lstm_{prefix}" in benchmarking_loaded_datasets else benchmarking_loaded_datasets[f"multilingual_{prefix}"].rouge2.mean(),
        "ROUGE-L": benchmarking_loaded_datasets[f"lstm_{prefix}"].rougeL.mean() if f"lstm_{prefix}" in benchmarking_loaded_datasets else benchmarking_loaded_datasets[f"multilingual_{prefix}"].rougeL.mean(),
        "chrF-S": benchmarking_loaded_datasets[f"lstm_{prefix}"].chrF.mean() if f"lstm_{prefix}" in benchmarking_loaded_datasets else benchmarking_loaded_datasets[f"multilingual_{prefix}"].chrF.mean(),
        "BERT Score": benchmarking_loaded_datasets[f"lstm_{prefix}"].bert.mean() if f"lstm_{prefix}" in benchmarking_loaded_datasets else benchmarking_loaded_datasets[f"multilingual_{prefix}"].bert.mean(),
        "Perplexity": benchmarking_loaded_datasets[f"lstm_{prefix}"].perplexity.mean() if f"lstm_{prefix}" in benchmarking_loaded_datasets else benchmarking_loaded_datasets[f"multilingual_{prefix}"].perplexity.mean(),
    }
    for model, prefix in zip(benchmarking_model_names, benchmarking_column_prefixes)
}

In [21]:
# Convert mean scores dictionary to DataFrame for better visualization
benchmarking_mean_scores_df = pd.DataFrame.from_dict(benchmarking_mean_scores, orient='index')

In [22]:
# Display mean scores
display(benchmarking_mean_scores_df)

Unnamed: 0,BLEU,ROUGE-1,ROUGE-2,ROUGE-L,chrF-S,BERT Score,Perplexity
LSTM BPE,0.855064,0.298951,0.151467,0.298951,98.877475,0.995052,6.046939
LSTM Unigram,0.855391,0.299182,0.151693,0.299182,98.937219,0.99512,6.046573
mBERT,0.111619,0.218939,0.097964,0.218892,79.220231,0.879065,1.986433
XLM-R,0.458155,0.253531,0.116545,0.253524,88.050207,0.965338,551040.594682
