# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece

In [1]:
import pandas as pd
import tensorflow as tf
import torch
import numpy as np
from tqdm.notebook import tqdm
import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer, AutoModelForMaskedLM
from rouge_score import rouge_scorer
import torch.nn.functional as F

# Set GPU

## Mac

In [2]:
# for mac
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        details = tf.config.experimental.get_device_details(gpu)
        print("GPU details: ", details)
else:
    print("No GPU found. Using CPU.")

# set GPU device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


## Window / Linux

In [None]:
# for window
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

# Functions

In [3]:
# function to save models df
def save_models_df(df, df_name):
    df.to_csv(f"models/{df_name}.csv", index=False, encoding="utf-8")

In [4]:
# function to load spt df
def load_spt_df(df_name):
    return pd.read_csv(f"spt/{df_name}.csv", header=0, encoding="utf-8")

In [5]:
# function to load models df
def load_models_df(df_name):
    return pd.read_csv(f"models/{df_name}.csv", header=0, encoding="utf-8")

# Set settings

In [6]:
tqdm.pandas()

# Dataset Preparation

In [7]:
# Datasets and file paths
datasets = {
    "normal": [
        "tokenized_bpe_myxnli_normalized_1", 
        "tokenized_bpe_myxnli_normalized_2", 
        "tokenized_bpe_alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "tokenized_bpe_myxnli_nllb_back_translated_final_1", 
        "tokenized_bpe_myxnli_nllb_back_translated_final_2", 
        "tokenized_bpe_alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "tokenized_bpe_myxnli_seamless_m4t_back_translated_final_1",
        "tokenized_bpe_myxnli_seamless_m4t_back_translated_final_2",
        "tokenized_bpe_alt_combined_seamless_m4t_back_translated_final"
    ],
}

In [8]:
# Load and process dataset
def load_and_rename_columns(file_name):
    df = load_spt_df(f"{file_name}")

    column_mapping = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
        "english_back_translated_tokens": "english_tokens",
        "burmese_translated_tokens": "burmese_tokens",
    }
    
    df = df.rename(columns=column_mapping)
    
    # Ensure only required columns exist
    df = df[["english", "burmese", "english_tokens", "burmese_tokens"]]

    return df

In [9]:
# Load all datasets
all_datasets = {
    key: [load_and_rename_columns(file) for file in file_list] for key, file_list in datasets.items()
}

In [10]:
# Combine all datasets
combined_data = pd.concat(
    all_datasets["normal"] + 
    all_datasets["nllb_back_translated"] + 
    all_datasets["seamless_m4t_back_translated"],
    ignore_index=True  # Reset index to avoid duplicates
)

In [11]:
# Shuffle the data to prevent order bias
combined_data = combined_data.sample(frac=1).reset_index(drop=True)

In [12]:
print(f"Total training samples: {len(combined_data)}")
display(combined_data.head())

Total training samples: 1627576


Unnamed: 0,english,burmese,english_tokens,burmese_tokens
0,the correspondent must be given a different rate.,စာပို့သူကိုနှုန်း ကွာခြားချက်တစ်ခု ပေးအပ်ရပါမယ်။,"['▁the', '▁correspond', 'ent', '▁must', '▁be',...","['▁စာပို့', 'သူကို', 'နှုန်း', '▁ကွာခြား', 'ချ..."
1,octopi can be found in tiny crevices in the me...,Octopi ကို မြေထဲပင်လယ်ရှိ သေးငယ်သော အပေါက်များ...,"['▁oct', 'op', 'i', '▁can', '▁be', '▁found', '...","['▁O', 'ct', 'op', 'i', '▁ကို', '▁မြေထဲပင်လယ်'..."
2,the model shows where we can cut costs most ef...,မော်ဒယ်သည် ကုန်ကျစရိတ်များကို အထိရောက်ဆုံး လျှ...,"['▁the', '▁model', '▁shows', '▁where', '▁we', ...","['▁မော်ဒယ်', 'သည်', '▁ကုန်ကျစရိတ်', 'များကို',..."
3,he always wanted to be a journalist.,သူဟာ သတင်းစာသမားဖြစ်ချင်ခဲ့တာ အမြဲတမ်းပါ။,"['▁he', '▁always', '▁wanted', '▁to', '▁be', '▁...","['▁သူဟာ', '▁သတင်းစာ', 'သမား', 'ဖြစ်', 'ချင်', ..."
4,"the oldest rooms are near the western gate, an...",ရှေးအကျဆုံး အခန်းတွေဟာ အနောက်ဘက် တံခါးနားမှာရှ...,"['▁the', '▁oldest', '▁rooms', '▁are', '▁near',...","['▁ရှေးအကျဆုံး', '▁အခန်းတွေဟာ', '▁အနောက်ဘက်', ..."


In [13]:
# save combined datasets
save_models_df(combined_data, "combined_data")

# 1. Implementing RNN/LSTM Baseline

In [14]:
# Load SentencePiece BPE tokenizer
sp = spm.SentencePieceProcessor()
sp.Load("spt/spt_bpe.model")

True

## Data Preprocessing

In [15]:
# load combined data
lstm_preprocess_data = load_models_df("combined_data")

In [16]:
# Convert tokenized sequences into lists
lstm_preprocess_data["burmese_seq"] = lstm_preprocess_data["burmese_tokens"].progress_apply(
    lambda x: sp.EncodeAsIds(str(x)) if isinstance(x, str) else []
)
lstm_preprocess_data["english_seq"] = lstm_preprocess_data["english_tokens"].progress_apply(
    lambda x: sp.EncodeAsIds(str(x)) if isinstance(x, str) else []
)

display(lstm_preprocess_data.head())

  0%|          | 0/1627576 [00:00<?, ?it/s]

  0%|          | 0/1627576 [00:00<?, ?it/s]

Unnamed: 0,english,burmese,english_tokens,burmese_tokens,burmese_seq,english_seq
0,the correspondent must be given a different rate.,စာပို့သူကိုနှုန်း ကွာခြားချက်တစ်ခု ပေးအပ်ရပါမယ်။,"['▁the', '▁correspond', 'ent', '▁must', '▁be',...","['▁စာပို့', 'သူကို', 'နှုန်း', '▁ကွာခြား', 'ချ...","[8777, 30887, 2847, 30887, 30883, 1519, 5855, ...","[8777, 30887, 12, 30887, 30883, 1519, 18489, 3..."
1,octopi can be found in tiny crevices in the me...,Octopi ကို မြေထဲပင်လယ်ရှိ သေးငယ်သော အပေါက်များ...,"['▁oct', 'op', 'i', '▁can', '▁be', '▁found', '...","['▁O', 'ct', 'op', 'i', '▁ကို', '▁မြေထဲပင်လယ်'...","[8777, 30887, 1602, 30887, 30883, 1519, 283, 3...","[8777, 30887, 8904, 30887, 30883, 1519, 221, 3..."
2,the model shows where we can cut costs most ef...,မော်ဒယ်သည် ကုန်ကျစရိတ်များကို အထိရောက်ဆုံး လျှ...,"['▁the', '▁model', '▁shows', '▁where', '▁we', ...","['▁မော်ဒယ်', 'သည်', '▁ကုန်ကျစရိတ်', 'များကို',...","[8777, 30887, 7224, 30887, 30883, 1519, 68, 30...","[8777, 30887, 12, 30887, 30883, 1519, 4047, 30..."
3,he always wanted to be a journalist.,သူဟာ သတင်းစာသမားဖြစ်ချင်ခဲ့တာ အမြဲတမ်းပါ။,"['▁he', '▁always', '▁wanted', '▁to', '▁be', '▁...","['▁သူဟာ', '▁သတင်းစာ', 'သမား', 'ဖြစ်', 'ချင်', ...","[8777, 30887, 1499, 30887, 30883, 1519, 3143, ...","[8777, 30887, 153, 30887, 30883, 1519, 1321, 3..."
4,"the oldest rooms are near the western gate, an...",ရှေးအကျဆုံး အခန်းတွေဟာ အနောက်ဘက် တံခါးနားမှာရှ...,"['▁the', '▁oldest', '▁rooms', '▁are', '▁near',...","['▁ရှေးအကျဆုံး', '▁အခန်းတွေဟာ', '▁အနောက်ဘက်', ...","[8777, 30887, 14623, 30887, 30883, 1519, 29639...","[8777, 30887, 12, 30887, 30883, 1519, 8573, 30..."


In [17]:
# Define maximum sequence length
max_seq_length = 128

In [18]:
# Apply padding
lstm_preprocess_data["burmese_seq_padded"] = pad_sequences(
    lstm_preprocess_data["burmese_seq"], maxlen=max_seq_length, padding="post"
).tolist()
lstm_preprocess_data["english_seq_padded"] = pad_sequences(
    lstm_preprocess_data["english_seq"], maxlen=max_seq_length, padding="post"
).tolist()

In [19]:
# save lstm preprocess data
save_models_df(lstm_preprocess_data, "lstm_preprocess_data")

## Define LSTM Model
Define an LSTM-based sequence-to-sequence (seq2seq) model with embedding layers.

In [6]:
# Hyperparameters
lstm_embedding_dim = 256
lstm_hidden_dim = 512
lstm_vocab_size = sp.GetPieceSize()  # Get vocabulary size from SentencePiece

In [None]:
# Build LSTM Model
with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):  
    lstm_model = Sequential([
        Embedding(input_dim=lstm_vocab_size, output_dim=lstm_embedding_dim, input_length=max_seq_length, mask_zero=True),
        Bidirectional(LSTM(lstm_hidden_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
        LSTM(lstm_hidden_dim, return_sequences=False, dropout=0.3, recurrent_dropout=0.3),
        Dense(lstm_vocab_size, activation='softmax')
    ])

    # Compile model
    lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display Model Summary
lstm_model.summary()

## Train the Model
Train the model using Categorical Cross-Entropy loss & Adam optimizer.

In [None]:
# Load LSTM preprocess data
lstm_data = load_models_df("lstm_preprocess_data")

In [None]:
# Convert to NumPy arrays
X_train = np.array(lstm_data["burmese_seq_padded"].tolist())
y_train = np.array(lstm_data["english_seq_padded"].tolist())

In [None]:
# Train model
with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):  
    lstm_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

In [None]:
# save lstm model
lstm_model.save("models/lstm_model")

## Evaluate Model with BLEU Score
Compute BLEU, ROUGE, and Perplexity scores.

In [None]:
# Load LSTM preprocess data
lstm_evaluation_results = load_models_df("lstm_preprocess_data")

### Compute BLEU Score

In [None]:
# function to compute bleu score
def compute_bleu_lstm(reference, prediction):
    return sentence_bleu([reference.split()], prediction.split())

In [None]:
# Generate predictions
lstm_evaluation_results["generated_lstm"] = lstm_evaluation_results["burmese"].progress_apply(
    lambda x: " ".join(sp.EncodeAsPieces(x))
)

In [None]:
# Compute BLEU scores
lstm_evaluation_results["bleu_lstm"] = lstm_evaluation_results.progress_apply(
    lambda row: compute_bleu_lstm(row["english"], row["generated_lstm"]), axis=1
)

In [None]:
# Display BLEU score results
print(f"LSTM BLEU Score: {lstm_evaluation_results['bleu_lstm'].mean()}")

### Compute ROUGE Score

In [None]:
# Compute ROUGE scores
lstm_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

In [None]:
lstm_evaluation_results["rouge_lstm"] = lstm_evaluation_results.progress_apply(
    lambda row: lstm_scorer.score(row["english"], row["generated_lstm"])["rougeL"].fmeasure, axis=1
)

In [None]:
# Display ROUGE score results
print(f"LSTM ROUGE Score: {lstm_evaluation_results['rouge_lstm'].mean()}")

### Compute Perplexity Score

In [None]:
def compute_perplexity_lstm(text, model_name="bert-base-multilingual-cased"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    log_likelihood = F.log_softmax(outputs.logits, dim=-1)
    perplexity = torch.exp(-log_likelihood.mean()).item()
    
    return perplexity

In [None]:
# Compute Perplexity for LSTM-generated text
lstm_evaluation_results["perplexity_lstm"] = lstm_evaluation_results["generated_lstm"].progress_apply(
    lambda x: compute_perplexity_lstm(x)
)

In [None]:
# Display Perplexity Score Results
print(f"LSTM Perplexity Score: {lstm_evaluation_results['perplexity_lstm'].mean()}")

### Save Model Results

In [None]:
save_models_df(lstm_evaluation_results, "lstm_evaluation_results")

# 2. Implementing Multilingual Transformer Baseline

## Load Pre-Trained Models
Load ```mBERT``` and ```XLM-R``` for Masked Language Modeling (MLM).
MLM helps predict missing words in Burmese sequences.

In [7]:
# Define model names
multilingual_model_names = {
    "mBERT": "bert-base-multilingual-cased",
    "XLM-R": "xlm-roberta-base"
}

In [None]:
# Load tokenizers & models for both mBERT and XLM-R
multilingual_tokenizers = {
    name: AutoTokenizer.from_pretrained(model) for name, model in multilingual_model_names.items()
}
multilingual_models = {
    name: AutoModelForMaskedLM.from_pretrained(model).to(device) for name, model in multilingual_model_names.items()
}

## Load Processed Dataset

In [9]:
# Load pre-processed dataset
mBERT_XLMR_evaludation_results = load_models_df("combined_data")

## Run Inference Without Fine-Tuning
Pass Burmese text through ```mBERT``` & ```XLM-R``` using Masked Language Modeling (MLM). mBERT/XLM-R predict missing words in Burmese sequences.

In [12]:
# Function to generate masked predictions
def generate_masked_predictions(text, model_name):
    tokenizer = multilingual_tokenizers[model_name]
    model = multilingual_models[model_name]

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Mask a random token in the sequence
    mask_idx = torch.randint(1, inputs["input_ids"].shape[1] - 1, (1,))
    inputs["input_ids"][0, mask_idx] = tokenizer.mask_token_id  # Replace one token with [MASK]

    # Run the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get highest probability prediction for the masked token
    predicted_token_id = torch.argmax(outputs.logits[0, mask_idx], dim=-1).item()
    predicted_token = tokenizer.decode([predicted_token_id])

    return text.replace(tokenizer.mask_token, predicted_token)

In [None]:
# Test on sample Burmese sentences
multilingual_sample_texts = mBERT_XLMR_evaludation_results["burmese"].sample(5).tolist()
for model_name in multilingual_model_names:
    print(f"Model: {model_name}")
    for text in multilingual_sample_texts:
        print(f"Original: {text}")
        print(f"Generated: {generate_masked_predictions(text, model_name)}\n")

## Evaluate Model Performance
Compare BLEU, ROUGE, and Perplexity scores between ```mBERT``` and ```XLM-R```.

### Compute BLEU Score

In [14]:
# Function to compute BLEU score
def compute_bleu_multilingual(reference, prediction):
    return sentence_bleu([reference.split()], prediction.split())

In [None]:
# Evaluate on dataset
for model_name in multilingual_model_names:
    mBERT_XLMR_evaludation_results[f"{model_name}_generated"] = mBERT_XLMR_evaludation_results["burmese"].process_apply(
        lambda x: generate_masked_predictions(x, model_name)
    )
    mBERT_XLMR_evaludation_results[f"{model_name}_bleu"] = mBERT_XLMR_evaludation_results.process_apply(
        lambda row: compute_bleu_multilingual(row["english"], row[f"{model_name}_generated"]), axis=1
    )

In [None]:
# Display BLEU scores
for model_name in multilingual_model_names:
    print(f"{model_name} BLEU Score: {mBERT_XLMR_evaludation_results[f'{model_name}_bleu'].mean()}")

### Compute ROUGE Score

In [None]:
# Compute ROUGE Score
multilingual_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
for model_name in multilingual_model_names:
    mBERT_XLMR_evaludation_results[f"{model_name}_rouge"] = mBERT_XLMR_evaludation_results.process_apply(
        lambda row: multilingual_rouge_scorer.score(row["english"], row[f"{model_name}_generated"])["rougeL"].fmeasure, axis=1
    )

In [None]:
# Display ROUGE scores
for model_name in multilingual_model_names:
    print(f"{model_name} ROUGE Score: {mBERT_XLMR_evaludation_results[f'{model_name}_rouge'].mean()}")

### Compute Perplexity Score

In [None]:
# function to compute perplexity
def compute_perplexity_multilingual(text, model_name):
    tokenizer = multilingual_tokenizers[model_name]
    model = multilingual_models[model_name]

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Compute log-likelihood
    log_likelihood = F.log_softmax(outputs.logits, dim=-1)
    
    # Compute Perplexity
    perplexity = torch.exp(-log_likelihood.mean()).item()
    return perplexity

In [None]:
# Compute Perplexity for both models
for model_name in multilingual_model_names:
    mBERT_XLMR_evaludation_results[f"{model_name}_perplexity"] = mBERT_XLMR_evaludation_results[f"{model_name}_generated"].process_apply(
        lambda x: compute_perplexity_multilingual(x, model_name)
    )

In [None]:
# Display Perplexity scores
for model_name in multilingual_model_names:
    print(f"{model_name} Perplexity Score: {compute_perplexity_multilingual[f'{model_name}_perplexity'].mean()}")

### Save Model Results

In [None]:
# Save results
save_models_df(mBERT_XLMR_evaludation_results, "mBERT_XLMR_evaludation_results")
print("Results saved successfully!")

# 3. Benchmarking and Analysis
Compare the performance of LSTM, mBERT, and XLM-R using BLEU, ROUGE, and Perplexity.