# Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece

In [1]:
import pandas as pd
import tensorflow as tf
import torch
import numpy as np
import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam
from nltk.translate.bleu_score import sentence_bleu

# Functions

In [3]:
# function to save models df
def save_models_df(df, df_name):
    df.to_csv(f"models/{df_name}.csv", index=False, encoding="utf-8")

In [4]:
# function to load spt df
def load_spt_df(df_name):
    return pd.read_csv(f"spt/{df_name}.csv", header=0, encoding="utf-8")

# 1. Implementing RNN/LSTM Baseline

In [5]:
# Load SentencePiece BPE tokenizer
sp = spm.SentencePieceProcessor()
sp.Load("spt/spt_bpe.model")

True

## Data Preprocessing
Before training our RNN/LSTM model, we need to load, process, and prepare the dataset. This step ensures that our input data is structured correctly.

### Load and Process Tokenized Sentences 
Load BPE tokenized datasets, convert tokens into sequences, and apply padding.

In [6]:
# columns
english_columns = [
     "english_tokens"
    ,"english_back_translated_tokens"
]
burmese_columns = [
    "burmese_tokens"
    ,"burmese_translated_tokens"
]

In [7]:
# Datasets and file paths
datasets = {
    "normal": [
        "tokenized_bpe_myxnli_normalized_1", 
        "tokenized_bpe_myxnli_normalized_2", 
        "tokenized_bpe_alt_combined_normalized"
    ],
    "nllb_back_translated": [
        "tokenized_bpe_myxnli_nllb_back_translated_final_1", 
        "tokenized_bpe_myxnli_nllb_back_translated_final_2", 
        "tokenized_bpe_alt_combined_nllb_back_translated_final"
    ],
    "seamless_m4t_back_translated": [
        "tokenized_bpe_myxnli_seamless_m4t_back_translated_final_1",
        "tokenized_bpe_myxnli_seamless_m4t_back_translated_final_2",
        "tokenized_bpe_alt_combined_seamless_m4t_back_translated_final"
    ],
}

In [8]:
# Rename columns
def rename_columns(df):
    column_mapping = {
        "english_back_translated": "english",
        "burmese_translated": "burmese",
        "english_back_translated_tokens": "english_tokens",
        "burmese_translated_tokens": "burmese_tokens",
    }
    
    df = df.rename(columns=column_mapping)
    
    # Ensure only required columns exist
    df = df[["english", "burmese", "english_tokens", "burmese_tokens"]]
    
    return df

In [9]:
# Load and process dataset
def load_and_process_dataset(file_name):
    df = load_spt_df(f"{file_name}")

    # Rename columns
    df = rename_columns(df)

    for column in english_columns:
        if column in df.columns:
            df["english_seq"] = df[column].apply(lambda x: sp.EncodeAsIds(str(x)) if isinstance(x, str) else [])

    for column in burmese_columns:
        if column in df.columns:
            df["burmese_seq"] = df[column].apply(lambda x: sp.EncodeAsIds(str(x)) if isinstance(x, str) else [])

    return df

In [10]:
# Load all datasets
processed_datasets = {
    key: [load_and_process_dataset(file) for file in file_list] for key, file_list in datasets.items()
}

In [11]:
# Combine all datasets
full_data = pd.concat(
    processed_datasets["normal"] + 
    processed_datasets["nllb_back_translated"] + 
    processed_datasets["seamless_m4t_back_translated"],
    ignore_index=True  # Reset index to avoid duplicates
)

In [12]:
# Shuffle the data to prevent order bias
full_data = full_data.sample(frac=1).reset_index(drop=True)

In [13]:
print(f"Total training samples: {len(full_data)}")
display(full_data.head())

Total training samples: 1627576


Unnamed: 0,english,burmese,english_tokens,burmese_tokens,english_seq,burmese_seq
0,the palace is empty except for antiquities and...,နန်းတော်မှာ ရှေးဟောင်းပစ္စည်းတွေနဲ့ အခန်းလေးခန...,"['▁the', '▁palace', '▁is', '▁empty', '▁except'...","['▁နန်းတော်', 'မှာ', '▁ရှေးဟောင်း', 'ပစ္စည်းတွ...","[8777, 30887, 12, 30887, 30883, 1519, 5187, 30...","[8777, 30887, 4879, 30887, 30883, 1519, 79, 30..."
1,these things are all thought of as classic fra...,ဤအရာအားလုံးကို ပြင်သစ်ဂန္ထဝင်အဖြစ် ယူဆထားသည်။,"['▁these', '▁things', '▁are', '▁all', '▁though...","['▁ဤအရာ', 'အားလုံးကို', '▁ပြင်သစ်', 'ဂ', 'န္',...","[8777, 30887, 857, 30887, 30883, 1519, 1298, 3...","[8777, 30887, 15390, 30887, 30883, 1519, 2002,..."
2,the federal government's monetary budget proce...,ပြည်ထောင်စုအစိုးရရဲ့ ငွေကြေးအခြေခံ ဘတ်ဂျက်လုပ်...,"['▁the', '▁federal', '▁government', ""'"", 's', ...","['▁ပြည်ထောင်စု', 'အစိုးရရဲ့', '▁ငွေကြေး', 'အခြ...","[8777, 30887, 12, 30887, 30883, 1519, 1707, 30...","[8777, 30887, 2315, 30887, 30883, 1519, 19809,..."
3,the house was huge.,အိမ်က ကြီးမားခဲ့တယ်။,"['▁the', '▁house', '▁was', '▁huge', '.']","['▁အိမ်က', '▁ကြီးမား', 'ခဲ့တယ်။']","[8777, 30887, 12, 30887, 30883, 1519, 1334, 30...","[8777, 30887, 13319, 30887, 30883, 1519, 1809,..."
4,you won't find a single spice shop on ibiza.,Ibiza မှာ ဟင်းခတ်အမွှေးအကြိုင်ဆိုင် တစ်ခုမှ မတ...,"['▁you', '▁won', ""'"", 't', '▁find', '▁a', '▁si...","['▁Ibiza', '▁မှာ', '▁ဟင်းခတ်', 'အမွှေးအကြိုင်'...","[8777, 30887, 173, 30887, 30883, 1519, 1892, 3...","[8777, 30887, 11090, 30887, 30883, 1519, 571, ..."


### Apply Padding to Sequences
Ensure that all sequences have the same length for batch processing.

In [14]:
# Adjust based on dataset analysis
max_seq_length = 128

In [15]:
# Apply padding
full_data["burmese_seq_padded"] = pad_sequences(full_data["burmese_seq"], maxlen=max_seq_length, padding="post").tolist()
full_data["english_seq_padded"] = pad_sequences(full_data["english_seq"], maxlen=max_seq_length, padding="post").tolist()

print("Sequences padded successfully!")
display(full_data.head())

Sequences padded successfully!


Unnamed: 0,english,burmese,english_tokens,burmese_tokens,english_seq,burmese_seq,burmese_seq_padded,english_seq_padded
0,the palace is empty except for antiquities and...,နန်းတော်မှာ ရှေးဟောင်းပစ္စည်းတွေနဲ့ အခန်းလေးခန...,"['▁the', '▁palace', '▁is', '▁empty', '▁except'...","['▁နန်းတော်', 'မှာ', '▁ရှေးဟောင်း', 'ပစ္စည်းတွ...","[8777, 30887, 12, 30887, 30883, 1519, 5187, 30...","[8777, 30887, 4879, 30887, 30883, 1519, 79, 30...","[8777, 30887, 4879, 30887, 30883, 1519, 79, 30...","[8777, 30887, 12, 30887, 30883, 1519, 5187, 30..."
1,these things are all thought of as classic fra...,ဤအရာအားလုံးကို ပြင်သစ်ဂန္ထဝင်အဖြစ် ယူဆထားသည်။,"['▁these', '▁things', '▁are', '▁all', '▁though...","['▁ဤအရာ', 'အားလုံးကို', '▁ပြင်သစ်', 'ဂ', 'န္',...","[8777, 30887, 857, 30887, 30883, 1519, 1298, 3...","[8777, 30887, 15390, 30887, 30883, 1519, 2002,...","[8777, 30887, 15390, 30887, 30883, 1519, 2002,...","[8777, 30887, 857, 30887, 30883, 1519, 1298, 3..."
2,the federal government's monetary budget proce...,ပြည်ထောင်စုအစိုးရရဲ့ ငွေကြေးအခြေခံ ဘတ်ဂျက်လုပ်...,"['▁the', '▁federal', '▁government', ""'"", 's', ...","['▁ပြည်ထောင်စု', 'အစိုးရရဲ့', '▁ငွေကြေး', 'အခြ...","[8777, 30887, 12, 30887, 30883, 1519, 1707, 30...","[8777, 30887, 2315, 30887, 30883, 1519, 19809,...","[8777, 30887, 2315, 30887, 30883, 1519, 19809,...","[30887, 30883, 1519, 1707, 30887, 30883, 1519,..."
3,the house was huge.,အိမ်က ကြီးမားခဲ့တယ်။,"['▁the', '▁house', '▁was', '▁huge', '.']","['▁အိမ်က', '▁ကြီးမား', 'ခဲ့တယ်။']","[8777, 30887, 12, 30887, 30883, 1519, 1334, 30...","[8777, 30887, 13319, 30887, 30883, 1519, 1809,...","[8777, 30887, 13319, 30887, 30883, 1519, 1809,...","[8777, 30887, 12, 30887, 30883, 1519, 1334, 30..."
4,you won't find a single spice shop on ibiza.,Ibiza မှာ ဟင်းခတ်အမွှေးအကြိုင်ဆိုင် တစ်ခုမှ မတ...,"['▁you', '▁won', ""'"", 't', '▁find', '▁a', '▁si...","['▁Ibiza', '▁မှာ', '▁ဟင်းခတ်', 'အမွှေးအကြိုင်'...","[8777, 30887, 173, 30887, 30883, 1519, 1892, 3...","[8777, 30887, 11090, 30887, 30883, 1519, 571, ...","[8777, 30887, 11090, 30887, 30883, 1519, 571, ...","[8777, 30887, 173, 30887, 30883, 1519, 1892, 3..."


In [16]:
# save the processed data
save_models_df(full_data, "processed_data")

## Define LSTM Model
Use a Bidirectional LSTM encoder-decoder with attention.

In [6]:
# Hyperparameters
embedding_dim = 256
hidden_dim = 512
vocab_size = sp.GetPieceSize()  # Get vocabulary size from SentencePiece

In [7]:
# Build LSTM Model
with tf.device('/GPU:0'):  # Explicitly assign to GPU if available
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True),
        Bidirectional(LSTM(hidden_dim, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
        LSTM(hidden_dim, return_sequences=False, dropout=0.3, recurrent_dropout=0.3),
        Dense(vocab_size, activation='softmax')
    ])

    # Compile model
    model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display Model Summary
model.summary()

2025-01-30 12:40:46.000879: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-01-30 12:40:46.000945: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-01-30 12:40:46.000960: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
2025-01-30 12:40:46.000996: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-30 12:40:46.001014: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Train the Model
Train the model using full dataset (Normal + Back-Translated).

In [None]:
# load processed data
processed_data = load_spt_df("processed_data")

In [None]:
X_train = np.array(processed_data["burmese_seq_padded"].tolist())
y_train = np.array(processed_data["english_seq_padded"].tolist())

# Train on GPU
with tf.device('/GPU:0'):  
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

## Evaluate Model with BLEU Score
Compute BLEU Score for translation accuracy.

In [None]:
# Function to decode tokenized text back to sentences
def decode_sentence(tokenized_seq):
    return sp.DecodeIds([int(token) for token in tokenized_seq if token > 0])

In [None]:
# Generate Predictions
sample_inputs = X_train[:5]
sample_outputs = model.predict(sample_inputs)

In [None]:
# Convert predictions back to text
generated_sentences = [decode_sentence(seq) for seq in sample_outputs]
reference_sentences = [decode_sentence(seq) for seq in y_train[:5]]

In [None]:
# Compute BLEU Score
bleu_scores = [sentence_bleu([ref.split()], gen.split()) for ref, gen in zip(reference_sentences, generated_sentences)]

In [None]:
# Print results
for i in range(5):
    print(f"Reference: {reference_sentences[i]}")
    print(f"Generated: {generated_sentences[i]}")
    print(f"BLEU Score: {bleu_scores[i]}\n")

## Save & Export Model
Save trained model for future use.

In [None]:
model.save("models/bpe_lstm_baseline_model.h5")
sp.Save("models/bpe_model_trained.model")

print("Model and tokenizer saved successfully!")