# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece

In [1]:
import pandas as pd
import tensorflow as tf
import torch
import numpy as np
import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoTokenizer, AutoModelForMaskedLM
from rouge_score import rouge_scorer
import torch.nn.functional as F

# Set GPU

## Mac

In [None]:
# for mac
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        details = tf.config.experimental.get_device_details(gpu)
        print("GPU details: ", details)
else:
    print("No GPU found. Using CPU.")

# set GPU device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

## Window / Linux

In [None]:
# for window
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

# Functions

In [3]:
# function to save models df
def save_models_df(df, df_name):
    df.to_csv(f"models/{df_name}.csv", index=False, encoding="utf-8")

In [4]:
# function to load spt df
def load_spt_df(df_name):
    return pd.read_csv(f"spt/{df_name}.csv", header=0, encoding="utf-8")

In [None]:
# function to load models df
def load_models_df(df_name):
    return pd.read_csv(f"models/{df_name}.csv", header=0, encoding="utf-8")

# Data Preprocessing
Before training our RNN/LSTM model, we need to load, process, and prepare the dataset. This step ensures that our input data is structured correctly.

### Load and Process Tokenized Sentences 
Load BPE tokenized datasets, convert tokens into sequences, and apply padding.

In [6]:
# columns
english_columns = [
     "english_tokens"
    ,"english_back_translated_tokens"
]
burmese_columns = [
    "burmese_tokens"
    ,"burmese_translated_tokens"
]

In [7]:
# Datasets and file paths
datasets = {
    "normal": [
        "tokenized_bpe_myxnli_normalized_1", 
        "tokenized_bpe_myxnli_normalized_2", 
        "tokenized_bpe_alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "tokenized_bpe_myxnli_nllb_back_translated_final_1", 
        "tokenized_bpe_myxnli_nllb_back_translated_final_2", 
        "tokenized_bpe_alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "tokenized_bpe_myxnli_seamless_m4t_back_translated_final_1",
        "tokenized_bpe_myxnli_seamless_m4t_back_translated_final_2",
        "tokenized_bpe_alt_combined_seamless_m4t_back_translated_final"
    ],
}

In [8]:
# Rename columns
def rename_columns(df):
    column_mapping = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
        "english_back_translated_tokens": "english_tokens",
        "burmese_translated_tokens": "burmese_tokens",
    }
    
    df = df.rename(columns=column_mapping)
    
    # Ensure only required columns exist
    df = df[["english", "burmese", "english_tokens", "burmese_tokens"]]
    
    return df

In [9]:
# Load and process dataset
def load_and_process_dataset(file_name):
    df = load_spt_df(f"{file_name}")

    # Rename columns
    df = rename_columns(df)

    for column in english_columns:
        if column in df.columns:
            df["english_seq"] = df[column].apply(lambda x: sp.EncodeAsIds(str(x)) if isinstance(x, str) else [])

    for column in burmese_columns:
        if column in df.columns:
            df["burmese_seq"] = df[column].apply(lambda x: sp.EncodeAsIds(str(x)) if isinstance(x, str) else [])

    return df

In [10]:
# Load all datasets
processed_datasets = {
    key: [load_and_process_dataset(file) for file in file_list] for key, file_list in datasets.items()
}

In [11]:
# Combine all datasets
full_data = pd.concat(
    processed_datasets["normal"] + 
    processed_datasets["nllb_back_translated"] + 
    processed_datasets["seamless_m4t_back_translated"],
    ignore_index=True  # Reset index to avoid duplicates
)

In [12]:
# Shuffle the data to prevent order bias
full_data = full_data.sample(frac=1).reset_index(drop=True)

In [13]:
print(f"Total training samples: {len(full_data)}")
display(full_data.head())

Total training samples: 1627576


Unnamed: 0,english,burmese,english_tokens,burmese_tokens,english_seq,burmese_seq
0,the palace is empty except for antiquities and...,နန်းတော်မှာ ရှေးဟောင်းပစ္စည်းတွေနဲ့ အခန်းလေးခန...,"['▁the', '▁palace', '▁is', '▁empty', '▁except'...","['▁နန်းတော်', 'မှာ', '▁ရှေးဟောင်း', 'ပစ္စည်းတွ...","[8777, 30887, 12, 30887, 30883, 1519, 5187, 30...","[8777, 30887, 4879, 30887, 30883, 1519, 79, 30..."
1,these things are all thought of as classic fra...,ဤအရာအားလုံးကို ပြင်သစ်ဂန္ထဝင်အဖြစ် ယူဆထားသည်။,"['▁these', '▁things', '▁are', '▁all', '▁though...","['▁ဤအရာ', 'အားလုံးကို', '▁ပြင်သစ်', 'ဂ', 'န္',...","[8777, 30887, 857, 30887, 30883, 1519, 1298, 3...","[8777, 30887, 15390, 30887, 30883, 1519, 2002,..."
2,the federal government's monetary budget proce...,ပြည်ထောင်စုအစိုးရရဲ့ ငွေကြေးအခြေခံ ဘတ်ဂျက်လုပ်...,"['▁the', '▁federal', '▁government', ""'"", 's', ...","['▁ပြည်ထောင်စု', 'အစိုးရရဲ့', '▁ငွေကြေး', 'အခြ...","[8777, 30887, 12, 30887, 30883, 1519, 1707, 30...","[8777, 30887, 2315, 30887, 30883, 1519, 19809,..."
3,the house was huge.,အိမ်က ကြီးမားခဲ့တယ်။,"['▁the', '▁house', '▁was', '▁huge', '.']","['▁အိမ်က', '▁ကြီးမား', 'ခဲ့တယ်။']","[8777, 30887, 12, 30887, 30883, 1519, 1334, 30...","[8777, 30887, 13319, 30887, 30883, 1519, 1809,..."
4,you won't find a single spice shop on ibiza.,Ibiza မှာ ဟင်းခတ်အမွှေးအကြိုင်ဆိုင် တစ်ခုမှ မတ...,"['▁you', '▁won', ""'"", 't', '▁find', '▁a', '▁si...","['▁Ibiza', '▁မှာ', '▁ဟင်းခတ်', 'အမွှေးအကြိုင်'...","[8777, 30887, 173, 30887, 30883, 1519, 1892, 3...","[8777, 30887, 11090, 30887, 30883, 1519, 571, ..."


### Apply Padding to Sequences
Ensure that all sequences have the same length for batch processing.

In [14]:
# Adjust based on dataset analysis
max_seq_length = 128

In [15]:
# Apply padding
full_data["burmese_seq_padded"] = pad_sequences(full_data["burmese_seq"], maxlen=max_seq_length, padding="post").tolist()
full_data["english_seq_padded"] = pad_sequences(full_data["english_seq"], maxlen=max_seq_length, padding="post").tolist()

print("Sequences padded successfully!")
display(full_data.head())

Sequences padded successfully!


Unnamed: 0,english,burmese,english_tokens,burmese_tokens,english_seq,burmese_seq,burmese_seq_padded,english_seq_padded
0,the palace is empty except for antiquities and...,နန်းတော်မှာ ရှေးဟောင်းပစ္စည်းတွေနဲ့ အခန်းလေးခန...,"['▁the', '▁palace', '▁is', '▁empty', '▁except'...","['▁နန်းတော်', 'မှာ', '▁ရှေးဟောင်း', 'ပစ္စည်းတွ...","[8777, 30887, 12, 30887, 30883, 1519, 5187, 30...","[8777, 30887, 4879, 30887, 30883, 1519, 79, 30...","[8777, 30887, 4879, 30887, 30883, 1519, 79, 30...","[8777, 30887, 12, 30887, 30883, 1519, 5187, 30..."
1,these things are all thought of as classic fra...,ဤအရာအားလုံးကို ပြင်သစ်ဂန္ထဝင်အဖြစ် ယူဆထားသည်။,"['▁these', '▁things', '▁are', '▁all', '▁though...","['▁ဤအရာ', 'အားလုံးကို', '▁ပြင်သစ်', 'ဂ', 'န္',...","[8777, 30887, 857, 30887, 30883, 1519, 1298, 3...","[8777, 30887, 15390, 30887, 30883, 1519, 2002,...","[8777, 30887, 15390, 30887, 30883, 1519, 2002,...","[8777, 30887, 857, 30887, 30883, 1519, 1298, 3..."
2,the federal government's monetary budget proce...,ပြည်ထောင်စုအစိုးရရဲ့ ငွေကြေးအခြေခံ ဘတ်ဂျက်လုပ်...,"['▁the', '▁federal', '▁government', ""'"", 's', ...","['▁ပြည်ထောင်စု', 'အစိုးရရဲ့', '▁ငွေကြေး', 'အခြ...","[8777, 30887, 12, 30887, 30883, 1519, 1707, 30...","[8777, 30887, 2315, 30887, 30883, 1519, 19809,...","[8777, 30887, 2315, 30887, 30883, 1519, 19809,...","[30887, 30883, 1519, 1707, 30887, 30883, 1519,..."
3,the house was huge.,အိမ်က ကြီးမားခဲ့တယ်။,"['▁the', '▁house', '▁was', '▁huge', '.']","['▁အိမ်က', '▁ကြီးမား', 'ခဲ့တယ်။']","[8777, 30887, 12, 30887, 30883, 1519, 1334, 30...","[8777, 30887, 13319, 30887, 30883, 1519, 1809,...","[8777, 30887, 13319, 30887, 30883, 1519, 1809,...","[8777, 30887, 12, 30887, 30883, 1519, 1334, 30..."
4,you won't find a single spice shop on ibiza.,Ibiza မှာ ဟင်းခတ်အမွှေးအကြိုင်ဆိုင် တစ်ခုမှ မတ...,"['▁you', '▁won', ""'"", 't', '▁find', '▁a', '▁si...","['▁Ibiza', '▁မှာ', '▁ဟင်းခတ်', 'အမွှေးအကြိုင်'...","[8777, 30887, 173, 30887, 30883, 1519, 1892, 3...","[8777, 30887, 11090, 30887, 30883, 1519, 571, ...","[8777, 30887, 11090, 30887, 30883, 1519, 571, ...","[8777, 30887, 173, 30887, 30883, 1519, 1892, 3..."


In [16]:
# save the processed data
save_models_df(full_data, "processed_data")

# 1. Implementing RNN/LSTM Baseline

In [5]:
# Load SentencePiece BPE tokenizer
sp = spm.SentencePieceProcessor()
sp.Load("spt/spt_bpe.model")

True

## Define LSTM Model
Use a Bidirectional LSTM encoder-decoder with attention.

In [6]:
# Hyperparameters
lstm_embedding_dim = 256
lstm_hidden_dim = 512
lstm_vocab_size = sp.GetPieceSize()  # Get vocabulary size from SentencePiece

In [7]:
# Build LSTM Model
with tf.device('/GPU:0'):  # Explicitly assign to GPU if available
    lstm_model = Sequential([
        Embedding(input_dim=lstm_vocab_size, output_dim=lstm_embedding_dim, mask_zero=True),
        Bidirectional(LSTM(lstm_hidden_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
        LSTM(lstm_hidden_dim, return_sequences=False, dropout=0.3, recurrent_dropout=0.3),
        Dense(lstm_vocab_size, activation='softmax')
    ])

    # Compile model
    lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display Model Summary
lstm_model.summary()

2025-01-30 12:40:46.000879: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-01-30 12:40:46.000945: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-01-30 12:40:46.000960: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
2025-01-30 12:40:46.000996: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-30 12:40:46.001014: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Train the Model
Train the model using full dataset (Normal + Back-Translated).

In [None]:
# load processed data
lstm_processed_data = load_models_df("processed_data")

In [None]:
X_train_lstm = np.array(lstm_processed_data["burmese_seq_padded"].tolist())
y_train_lstm = np.array(lstm_processed_data["english_seq_padded"].tolist())

# Train on GPU
with tf.device('/GPU:0'):  
    lstm_model.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=64, validation_split=0.1)

## Evaluate Model with BLEU Score
Compute BLEU Score for translation accuracy.

In [None]:
# Function to decode tokenized text back to sentences
def decode_sentence_lstm(tokenized_seq):
    return sp.DecodeIds([int(token) for token in tokenized_seq if token > 0])

In [None]:
# Generate Predictions
lstm_sample_inputs = X_train_lstm[:5]
lstm_sample_outputs = lstm_model.predict(lstm_sample_inputs)

In [None]:
# Convert predictions back to text
lstm_generated_sentences = [decode_sentence_lstm(seq) for seq in lstm_sample_outputs]
lstm_reference_sentences = [decode_sentence_lstm(seq) for seq in y_train_lstm[:5]]

In [None]:
# Compute BLEU Score
lstm_bleu_scores = [sentence_bleu([ref.split()], gen.split()) for ref, gen in zip(lstm_reference_sentences, lstm_generated_sentences)]

In [None]:
# Print results
for i in range(5):
    print(f"Reference: {lstm_reference_sentences[i]}")
    print(f"Generated: {lstm_generated_sentences[i]}")
    print(f"BLEU Score: {lstm_bleu_scores[i]}\n")

## Save & Export Model
Save trained model for future use.

In [None]:
lstm_model.save("models/bpe_lstm_baseline_model.h5")
sp.Save("models/bpe_model_trained.model")

print("Model and tokenizer saved successfully!")

# 2. Implementing Multilingual Transformer Baseline

## Load And Save Pre-Trained Models
Load ```mBERT``` and ```XLM-R``` for Masked Language Modeling (MLM).
MLM helps predict missing words in Burmese sequences.

In [None]:
# Define model names
multilingual_model_names = {
    "mBERT": "bert-base-multilingual-cased",
    "XLM-R": "xlm-roberta-base"
}

In [None]:
# Load tokenizers & models for both mBERT and XLM-R
multilingual_tokenizers = {name: AutoTokenizer.from_pretrained(model) for name, model in multilingual_model_names.items()}
multilingual_models = {name: AutoModelForMaskedLM.from_pretrained(model).to("cuda" if torch.cuda.is_available() else "cpu") for name, model in multilingual_model_names.items()}

In [None]:
# save the models
for model_name in multilingual_model_names:
    multilingual_models[model_name].save_pretrained(f"models/{model_name}")
    multilingual_tokenizers[model_name].save_pretrained(f"models/{model_name}")
    print(f"{model_name} saved at models/{model_name}")

## Load Processed Dataset

In [None]:
# Load pre-processed dataset
multilingual_processed_data = load_models_df("processed_data")

In [None]:
# Keep only required columns
multilingual_processed_data = multilingual_processed_data[["english", "burmese", "english_tokens", "burmese_tokens"]]
display(multilingual_processed_data.head())

In [None]:
# Convert token sequences to list format
multilingual_processed_data["burmese_tokens"] = multilingual_processed_data["burmese_tokens"].apply(lambda x: eval(x) if isinstance(x, str) else x)
display(multilingual_processed_data.head())

## Load Saved Models
Load mBERT & XLM-R from disk without re-downloading.

In [None]:
# Load saved models and tokenizers
multilingual_models = {name: AutoModelForMaskedLM.from_pretrained(f"models/${name}") for name in multilingual_model_names}
multilingual_tokenizers = {name: AutoTokenizer.from_pretrained(f"models/${name}") for name in multilingual_model_names}

print("Saved models loaded successfully!")

## Run Inference Without Fine-Tuning
Pass Burmese text through ```mBERT``` & ```XLM-R``` using Masked Language Modeling (MLM). mBERT/XLM-R predict missing words in Burmese sequences.

In [None]:
# Function to generate masked predictions
def generate_masked_predictions(text, model_name):
    tokenizer = multilingual_tokenizers[model_name]
    model = multilingual_models[model_name]

    inputs = tokenizer(text, return_tensors="pt").to(device)

    # Mask a random token in the sequence
    mask_idx = torch.randint(1, inputs["input_ids"].shape[1] - 1, (1,))
    inputs["input_ids"][0, mask_idx] = tokenizer.mask_token_id  # Replace one token with [MASK]

    # Run the model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get highest probability prediction for the masked token
    predicted_token_id = torch.argmax(outputs.logits[0, mask_idx], dim=-1).item()
    predicted_token = tokenizer.decode([predicted_token_id])

    return text.replace(tokenizer.mask_token, predicted_token)

In [None]:
# Test on sample Burmese sentences
sample_texts = multilingual_processed_data["burmese"].sample(5).tolist()
for model_name in multilingual_model_names:
    print(f"🔹 Model: {model_name}")
    for text in sample_texts:
        print(f"Original: {text}")
        print(f"Generated: {generate_masked_predictions(text, model_name)}\n")

## Evaluate Model Performance
Compare BLEU, ROUGE, and Perplexity scores between ```mBERT``` and ```XLM-R```.

### Compute BLEU Score

In [None]:
# Function to compute BLEU score
def compute_bleu(reference, prediction):
    return sentence_bleu([reference.split()], prediction.split())

In [None]:
# Evaluate on dataset
for model_name in multilingual_model_names:
    multilingual_processed_data[f"{model_name}_generated"] = multilingual_processed_data["burmese"].apply(lambda x: generate_masked_predictions(x, model_name))
    multilingual_processed_data[f"{model_name}_bleu"] = multilingual_processed_data.apply(lambda row: compute_bleu(row["english"], row[f"{model_name}_generated"]), axis=1)

In [None]:
# Display BLEU scores
for model_name in multilingual_model_names:
    print(f"{model_name} BLEU Score: {multilingual_processed_data[f'{model_name}_bleu'].mean()}")

### Compute ROUGE Score

In [None]:
# Compute ROUGE Score
multilingual_rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
for model_name in multilingual_model_names:
    multilingual_processed_data[f"{model_name}_rouge"] = multilingual_processed_data.apply(lambda row: multilingual_rouge_scorer.score(row["english"], row[f"{model_name}_generated"])["rougeL"].fmeasure, axis=1)

In [None]:
# Display ROUGE scores
for model_name in multilingual_model_names:
    print(f"{model_name} ROUGE Score: {multilingual_processed_data[f'{model_name}_rouge'].mean()}")

### Compute Perplexity Score
Lower perplexity = Better fluency

In [None]:
# function to compute perplexity
def compute_perplexity(text, model_name):
    tokenizer = multilingual_tokenizers[model_name]
    model = multilingual_models[model_name]

    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Compute log-likelihood
    log_likelihood = F.log_softmax(outputs.logits, dim=-1)
    
    # Compute Perplexity
    perplexity = torch.exp(-log_likelihood.mean()).item()
    return perplexity

In [None]:
# Compute Perplexity for both models
for model_name in multilingual_model_names:
    multilingual_processed_data[f"{model_name}_perplexity"] = multilingual_processed_data[f"{model_name}_generated"].apply(lambda x: compute_perplexity(x, model_name))

In [None]:
# Display Perplexity scores
for model_name in multilingual_model_names:
    print(f"{model_name} Perplexity Score: {multilingual_processed_data[f'{model_name}_perplexity'].mean()}")

### Save Model Results

In [None]:
# Save results
save_models_df(multilingual_processed_data, "mBERT_XLMR_results")
print("Results saved successfully!")