### Importing all the important libraries

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
!pip install transformers sentence_transformers sentencepiece

In [1]:
import os
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from IPython.display import display
import torch
from torch.utils.data import DataLoader, Dataset
import unicodedata
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, SeamlessM4TForTextToText, AutoProcessor
from sentence_transformers import SentenceTransformer, util

2025-01-25 11:58:10.288905: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-25 11:58:10.305283: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-25 11:58:10.330040: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-25 11:58:10.330066: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-25 11:58:10.345238: I tensorflow/core/platform/cpu_feature_gua

### Set GPU

#### Mac

In [2]:
# for mac
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        details = tf.config.experimental.get_device_details(gpu)
        print("GPU details: ", details)
else:
    print("No GPU found. Using CPU.")

# set GPU device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

2025-01-25 11:58:16.026596: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-25 11:58:16.076044: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-25 11:58:16.078112: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'compute_capability': (7, 5), 'device_name': 'Tesla T4'}
Using device: cpu


L355


#### Window / Linux

In [3]:
# for window
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

Tensorflow GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using PyTorch device: cuda
GPU Name: Tesla T4


### Functions

In [4]:
# function to save gen df
def save_gen_df(df, df_name):
    df.to_csv(f"gen/{df_name}.csv", index=False)

In [5]:
# function to save tmp df
def save_tmp_df(df, df_name):
    df.to_csv(f"tmp/{df_name}.csv", index=False)

In [6]:
# function to append tmp df
def append_tmp_df(df, df_name):
    df.to_csv(f"tmp/{df_name}.csv", index=False, mode='a', header=False)

In [7]:
# function to load generated df
def load_gen_df(df_name):
    return pd.read_csv(f"gen/{df_name}.csv", header=0)

In [8]:
# function to load tmp df
def load_tmp_df(df_name):
    return pd.read_csv(f"tmp/{df_name}.csv", header=0)

### Set settings

In [9]:
tqdm.pandas()

### 1. Dataset Loading

This step involves loading the datasets `myXNLI` and `ALT Corpus` into pandas DataFrames. 
The English and Burmese datasets from the ALT Corpus are combined to create a bilingual parallel corpus.

In [6]:
# Load myXNLI dataset
myxnli_path = './data/myXNLI.train.tsv'  # Path to the file
myxnli_data = pd.read_csv(myxnli_path, sep='\t', header=0)
print(f"myXNLI dataset loaded successfully with {len(myxnli_data)} records.")
display(myxnli_data.head())  # Display the first few rows of the dataset

myXNLI dataset loaded successfully with 392702 records.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,သဘောတရားအရ ခရင်မ်စိမ်ခြင်းတွင် အခြေခံအတိုင်းအတ...,ထုတ်ကုန်နှင့် ပထဝီဝင်အနေအထားသည် ခရင်မ် skimmin...
1,telephone,entailment,you know during the season and i guess at at y...,You lose the things to the following level if ...,ရာသီအတွင်း မင်းသိတယ်၊ မင်းရဲ့အဆင့်ကို ငါ ခန့်မ...,လူတွေပြန်ခေါ်ရင် အောက်ပါအဆင့်အထိ ဆုံးရှုံးသွား...
2,fiction,entailment,One of our number will carry out your instruct...,A member of my team will execute your orders w...,ကျွန်ုပ်တို့၏နံပါတ်တစ်ခုသည် သင့်ညွှန်ကြားချက်မ...,ကျွန်ုပ်၏အဖွဲ့သားတစ်ဦးသည် သင်၏အမိန့်စာများကို ...
3,fiction,entailment,How do you know? All this is their information...,This information belongs to them.,သင်ဘယ်လိုသိသလဲ? ဒါတွေအားလုံးဟာ သူတို့ရဲ့ အချက်...,ဒီအချက်အလက်က သူတို့ပိုင်တယ်။
4,telephone,neutral,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,ဟုတ်တယ် ငါမင်းကိုပြောပြမယ် ဒီတင်းနစ်ဖိနပ်တချို...,တင်းနစ်ဖိနပ်များသည် ဈေးနှုန်းအမျိုးမျိုးရှိသည်။


In [7]:
# Load ALT English data
alt_en_path = './data/ALT_data_en.txt'  # Path to the English ALT corpus
alt_en_data = pd.read_csv(alt_en_path, sep='\t', header=None, names=["ID", "English_Sentence"])
print(f"ALT English dataset loaded successfully with {len(alt_en_data)} records.")
display(alt_en_data.head())

ALT English dataset loaded successfully with 19908 records.


Unnamed: 0,ID,English_Sentence
0,SNT.80188.1,Italy have defeated Portugal 31-5 in Pool C of...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...
2,SNT.80188.3,Despite controlling the game for much of the f...
3,SNT.80188.4,Portugal never gave up and David Penalva score...
4,SNT.80188.5,Italy led 16-5 at half time but were matched b...


In [8]:
# Load ALT Burmese data
alt_my_path = './data/ALT_data_my.txt'  # Path to the Burmese ALT corpus
alt_my_data = pd.read_csv(alt_my_path, sep='\t', header=None, names=["ID", "Burmese_Sentence"])
print(f"ALT Burmese dataset loaded successfully with {len(alt_my_data)} records.")
display(alt_my_data.head())

ALT Burmese dataset loaded successfully with 19265 records.


Unnamed: 0,ID,Burmese_Sentence
0,SNT.80188.1,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


In [9]:
# Combine ALT datasets (if IDs match)
alt_combined = pd.merge(alt_en_data, alt_my_data, on="ID")
print(f"ALT combined dataset created successfully with {len(alt_combined)} records.")
display(alt_combined.head())

ALT combined dataset created successfully with 19173 records.


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,Italy have defeated Portugal 31-5 in Pool C of...,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,Despite controlling the game for much of the f...,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,Portugal never gave up and David Penalva score...,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,Italy led 16-5 at half time but were matched b...,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


### 2. Data Cleaning
This step focuses on cleaning the datasets to prepare them for further processing. The cleaning operations include:
1. Removing duplicate entries.
2. Handling missing values.
3. Removing non-standard characters or symbols unrelated to the Burmese or English language.
4. Ensuring consistent formatting.

The cleaned datasets will be ready for normalization and tokenization in the next steps.

In [10]:
# Cleaning myXNLI dataset
print("Cleaning myXNLI dataset...")
myxnli_cleaned = myxnli_data.drop_duplicates()  # Remove duplicates
myxnli_cleaned = myxnli_cleaned.dropna()  # Remove rows with missing values
#myxnli_cleaned = myxnli_cleaned.replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print(f"myXNLI dataset cleaned successfully.")
print(f"Original Records: {len(myxnli_data)}.")
print(f"Remaining records: {len(myxnli_cleaned)}.")
display(myxnli_cleaned.head())

Cleaning myXNLI dataset...
myXNLI dataset cleaned successfully.
Original Records: 392702.
Remaining records: 392682.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,သဘောတရားအရ ခရင်မ်စိမ်ခြင်းတွင် အခြေခံအတိုင်းအတ...,ထုတ်ကုန်နှင့် ပထဝီဝင်အနေအထားသည် ခရင်မ် skimmin...
1,telephone,entailment,you know during the season and i guess at at y...,You lose the things to the following level if ...,ရာသီအတွင်း မင်းသိတယ်၊ မင်းရဲ့အဆင့်ကို ငါ ခန့်မ...,လူတွေပြန်ခေါ်ရင် အောက်ပါအဆင့်အထိ ဆုံးရှုံးသွား...
2,fiction,entailment,One of our number will carry out your instruct...,A member of my team will execute your orders w...,ကျွန်ုပ်တို့၏နံပါတ်တစ်ခုသည် သင့်ညွှန်ကြားချက်မ...,ကျွန်ုပ်၏အဖွဲ့သားတစ်ဦးသည် သင်၏အမိန့်စာများကို ...
3,fiction,entailment,How do you know? All this is their information...,This information belongs to them.,သင်ဘယ်လိုသိသလဲ? ဒါတွေအားလုံးဟာ သူတို့ရဲ့ အချက်...,ဒီအချက်အလက်က သူတို့ပိုင်တယ်။
4,telephone,neutral,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,ဟုတ်တယ် ငါမင်းကိုပြောပြမယ် ဒီတင်းနစ်ဖိနပ်တချို...,တင်းနစ်ဖိနပ်များသည် ဈေးနှုန်းအမျိုးမျိုးရှိသည်။


In [11]:
# save cleaned myXNLI dataset
save_gen_df(myxnli_cleaned, "myxnli_cleaned")

In [12]:
# Cleaning ALT English data
print("Cleaning ALT English dataset...")
alt_en_cleaned = alt_en_data.drop_duplicates()  # Remove duplicates
alt_en_cleaned = alt_en_cleaned.dropna()  # Remove rows with missing values
alt_en_cleaned["English_Sentence"] = alt_en_cleaned["English_Sentence"].replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print(f"ALT English dataset cleaned successfully.")
print(f"Original records: {len(alt_en_data)}.")
print(f"Remaining records: {len(alt_en_cleaned)}.")
display(alt_en_cleaned.head())

Cleaning ALT English dataset...
ALT English dataset cleaned successfully.
Original records: 19908.
Remaining records: 19908.


Unnamed: 0,ID,English_Sentence
0,SNT.80188.1,Italy have defeated Portugal 315 in Pool C of ...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...
2,SNT.80188.3,Despite controlling the game for much of the f...
3,SNT.80188.4,Portugal never gave up and David Penalva score...
4,SNT.80188.5,Italy led 165 at half time but were matched by...


In [13]:
# Cleaning ALT Burmese data
print("Cleaning ALT Burmese dataset...")
alt_my_cleaned = alt_my_data.drop_duplicates()  # Remove duplicates
alt_my_cleaned = alt_my_cleaned.dropna()  # Remove rows with missing values
#alt_my_cleaned["Burmese_Sentence"] = alt_my_cleaned["Burmese_Sentence"].replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print(f"ALT Burmese dataset cleaned successfully.")
print(f"Original records: {len(alt_my_data)}")
print(f"Remaining records: {len(alt_my_cleaned)}")
display(alt_my_cleaned.head())

Cleaning ALT Burmese dataset...
ALT Burmese dataset cleaned successfully.
Original records: 19265
Remaining records: 19258


Unnamed: 0,ID,Burmese_Sentence
0,SNT.80188.1,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


In [14]:
# Combine cleaned ALT datasets
print("Cleaning combined ALT dataset...")
alt_combined_cleaned = pd.merge(alt_en_cleaned, alt_my_cleaned, on="ID")
print(f"Combined ALT dataset cleaned successfully.")
print(f"Original records: {len(alt_combined)}")
print(f"Remaining records: {len(alt_combined_cleaned)}")
display(alt_combined_cleaned.head())

Cleaning combined ALT dataset...
Combined ALT dataset cleaned successfully.
Original records: 19173
Remaining records: 19166


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,Italy have defeated Portugal 315 in Pool C of ...,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,Despite controlling the game for much of the f...,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,Portugal never gave up and David Penalva score...,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,Italy led 165 at half time but were matched by...,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


In [15]:
# save cleaned combined ALT dataset
save_gen_df(alt_combined_cleaned, "alt_combined_cleaned")

### 3. Data Normalization
This step normalizes the text data to ensure consistency across datasets. The normalization process includes:
1. Applying Unicode normalization to handle encoding inconsistencies.
2. Standardizing text formatting by converting all text to lowercase and standardizing punctuation.
3. Normalizing diacritical marks and stacked consonants in the Burmese text to improve text representation.

In [16]:
# Function to normalize text
def normalize_text(text):
    if pd.isnull(text):
        return text  # Skip null values
    # Apply Unicode normalization
    normalized_text = unicodedata.normalize('NFKC', text)
    # Convert to lowercase
    normalized_text = normalized_text.lower()
    # Standardize punctuation (e.g., replace unusual punctuation marks)
    normalized_text = normalized_text.replace('“', '"').replace('”', '"').replace('’', "'")
    return normalized_text

In [17]:
# Function to normalize Burmese text (handles diacritical marks and stacked consonants)
def normalize_burmese(text):
    if pd.isnull(text):
        return text  # Skip null values
    normalized_text = unicodedata.normalize('NFKC', text)
    # Additional Burmese-specific normalization can be added here if needed
    return normalized_text

In [18]:
# Normalize myXNLI cleaned dataset
print("Normalizing myXNLI dataset...")
myxnli_normalized = load_gen_df("myxnli_cleaned")

# Normalize English columns
myxnli_normalized["sentence1_en"] = myxnli_normalized["sentence1_en"].apply(normalize_text)
myxnli_normalized["sentence2_en"] = myxnli_normalized["sentence2_en"].apply(normalize_text)

# Normalize Burmese columns
myxnli_normalized["sentence1_my"] = myxnli_normalized["sentence1_my"].apply(normalize_burmese)
myxnli_normalized["sentence2_my"] = myxnli_normalized["sentence2_my"].apply(normalize_burmese)

print(f"myXNLI dataset normalized successfully.")
display(myxnli_normalized.head())

Normalizing myXNLI dataset...
myXNLI dataset normalized successfully.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,conceptually cream skimming has two basic dime...,product and geography are what make cream skim...,သဘောတရားအရ ခရင်မ်စိမ်ခြင်းတွင် အခြေခံအတိုင်းအတ...,ထုတ်ကုန်နှင့် ပထဝီဝင်အနေအထားသည် ခရင်မ် skimmin...
1,telephone,entailment,you know during the season and i guess at at y...,you lose the things to the following level if ...,ရာသီအတွင်း မင်းသိတယ်၊ မင်းရဲ့အဆင့်ကို ငါ ခန့်မ...,လူတွေပြန်ခေါ်ရင် အောက်ပါအဆင့်အထိ ဆုံးရှုံးသွား...
2,fiction,entailment,one of our number will carry out your instruct...,a member of my team will execute your orders w...,ကျွန်ုပ်တို့၏နံပါတ်တစ်ခုသည် သင့်ညွှန်ကြားချက်မ...,ကျွန်ုပ်၏အဖွဲ့သားတစ်ဦးသည် သင်၏အမိန့်စာများကို ...
3,fiction,entailment,how do you know? all this is their information...,this information belongs to them.,သင်ဘယ်လိုသိသလဲ? ဒါတွေအားလုံးဟာ သူတို့ရဲ့ အချက်...,ဒီအချက်အလက်က သူတို့ပိုင်တယ်။
4,telephone,neutral,yeah i tell you what though if you go price so...,the tennis shoes have a range of prices.,ဟုတ်တယ် ငါမင်းကိုပြောပြမယ် ဒီတင်းနစ်ဖိနပ်တချို...,တင်းနစ်ဖိနပ်များသည် ဈေးနှုန်းအမျိုးမျိုးရှိသည်။


In [19]:
# save normalized myXNLI dataset
save_gen_df(myxnli_normalized, "myxnli_normalized")

In [20]:
# Normalize combined ALT cleaned dataset
print("Normalizing combined ALT dataset...")
alt_combined_normalized = load_gen_df("alt_combined_cleaned")
alt_combined_normalized["English_Sentence"] = alt_combined_normalized["English_Sentence"].apply(normalize_text)
alt_combined_normalized["Burmese_Sentence"] = alt_combined_normalized["Burmese_Sentence"].apply(normalize_burmese)
print(f"Combined ALT dataset normalized successfully.")
display(alt_combined_normalized.head())

Normalizing combined ALT dataset...
Combined ALT dataset normalized successfully.


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,italy have defeated portugal 315 in pool c of ...,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,andrea masi opened the scoring in the fourth m...,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,despite controlling the game for much of the f...,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,portugal never gave up and david penalva score...,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,italy led 165 at half time but were matched by...,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


In [21]:
# save normalized ALT dataset
save_gen_df(alt_combined_normalized, "alt_combined_normalized")

### 4. Back-Translation Augmentation
This step applies back-translation augmentation to the `myXNLI` and `ALT Combined` datasets. Models (`facebook/nllb-200-distilled-600M` and `facebook/hf-seamless-m4t-large`) are used to generate synthetic data. 
Results are stored in additional columns for evaluation and comparison.

In [10]:
# load normalized myXNLI and Combined ALT  dataset
myxnli_back_translate = load_gen_df("myxnli_normalized")
alt_combined_back_translate = load_gen_df("alt_combined_normalized")

In [11]:
# Custom Dataset class for loading sentences
class SentenceDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]

#### facebook/nllb-200-distilled-600M

In [7]:
# Load NLLB model and tokenizer
nllb_model_name = "facebook/nllb-200-distilled-600M"
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(nllb_model_name).to(device)
nllb_tokenizer = AutoTokenizer.from_pretrained(nllb_model_name)

In [8]:
# Function for back-translation using NLLB
def back_translate_nllb(text, src_lang="eng_Latn", tgt_lang="mya_Mymr"):
    try:
        # Forward translation: English -> Burmese
        inputs = nllb_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        translated_tokens = nllb_model.generate(
            **inputs, forced_bos_token_id=nllb_tokenizer.convert_tokens_to_ids(tgt_lang)
        )
        translated_text = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        
        # Backward translation: Burmese -> English
        back_inputs = nllb_tokenizer(translated_text, return_tensors="pt", truncation=True, padding=True).to(device)
        back_translated_tokens = nllb_model.generate(
            **back_inputs, forced_bos_token_id=nllb_tokenizer.convert_tokens_to_ids(src_lang)
        )
        back_translated_text = nllb_tokenizer.batch_decode(back_translated_tokens, skip_special_tokens=True)[0]
        
        return translated_text, back_translated_text
    except Exception as e:
        print(f"Error processing text: {text}. Error: {e}")
        return None, None

##### myXNLI dataset

In [11]:
# Apply back-translation to the sentences 1 in myXNLI
print("Applying back-translation to myXNLI dataset (sentences 1)...")
myxnli_nllb_back_translated_start_index_1 = 0

with open("tmp/myxnli_nllb_back_translated_1.csv", mode="a", encoding="utf-8") as f:
    if myxnli_nllb_back_translated_start_index_1 == 0:
        # Write CSV header
        f.write("original,translated,back_translated\n")
    
    # Process rows and write results
    for index, row in tqdm(myxnli_back_translate.iloc[myxnli_nllb_back_translated_start_index_1:].iterrows(), 
                           total=len(myxnli_back_translate) - myxnli_nllb_back_translated_start_index_1):
        original = row["sentence1_en"]
        translated, back_translated = back_translate_nllb(original)
        
        # Replace double quotes with single quotes
        original = original.replace('"', "'")
        translated = translated.replace('"', "'")
        back_translated = back_translated.replace('"', "'")
        
        # Wrap text in double quotes if it contains a comma
        original = f'"{original}"' if ',' in original else original
        translated = f'"{translated}"' if ',' in translated else translated
        back_translated = f'"{back_translated}"' if ',' in back_translated else back_translated
        
        # Write to CSV
        if translated and back_translated:
            f.write(f"{original},{translated},{back_translated}\n")
        else:
            f.write(f"{original},,\n")


Applying back-translation to myXNLI dataset (sentences 1)...


  0%|          | 0/203657 [00:00<?, ?it/s]

In [None]:
# set and display back-translated myXNLI dataset (sentences 1)
tmp_myxnli_nllb_back_translated_1 = load_tmp_df("myxnli_nllb_back_translated_1")
myxnli_back_translate["nllb_translated_s1"] = tmp_myxnli_nllb_back_translated_1["translated"]
myxnli_back_translate["nllb_back_translated_s1"] = tmp_myxnli_nllb_back_translated_1["back_translated"]
display(myxnli_back_translate.head())

In [None]:
# Apply back-translation to the sentences 2 in myXNLI
print("Applying back-translation to myXNLI dataset (sentences 2)...")
myxnli_nllb_back_translated_start_index_2 = 97494
with open("tmp/myxnli_nllb_back_translated_2.csv", mode="w", encoding="utf-8") as f:
    if myxnli_nllb_back_translated_start_index_2 == 0:
        # Write CSV header
        f.write("original,translated,back_translated\n")
    
    # Process rows and write results
    for index, row in tqdm(myxnli_back_translate.iloc[myxnli_nllb_back_translated_start_index_2:].iterrows(), 
                       total=len(myxnli_back_translate) - myxnli_nllb_back_translated_start_index_2):
        original = row["sentence2_en"]
        translated, back_translated = back_translate_nllb(original)
        
        # Replace double quotes with single quotes
        original = original.replace('"', "'") if original is not None else None
        translated = translated.replace('"', "'") if translated is not None else None
        back_translated = back_translated.replace('"', "'") if back_translated is not None else None
        
        # Wrap text in double quotes if it contains a comma
        original = f'"{original}"' if ',' in original else original
        translated = f'"{translated}"' if ',' in translated else translated
        back_translated = f'"{back_translated}"' if ',' in back_translated else back_translated
        
        # Write to CSV
        if translated and back_translated:
            f.write(f"{original},{translated},{back_translated}\n")
        else:
            f.write(f"{original},,\n")

Applying back-translation to myXNLI dataset (sentences 2)...


  0%|          | 0/295188 [00:00<?, ?it/s]

In [None]:
# set and display back-translated myXNLI dataset (sentences 2)
tmp_myxnli_nllb_back_translated_2 = load_tmp_df("myxnli_nllb_back_translated_2")
myxnli_back_translate["nllb_translated_s2"] = tmp_myxnli_nllb_back_translated_2["translated"]
myxnli_back_translate["nllb_back_translated_s2"] = tmp_myxnli_nllb_back_translated_2["back_translated"]
display(myxnli_back_translate.head())

##### ALT dataset

In [None]:
# Apply back-translation to the alt combined dataset
print("Applying back-translation to alt combined dataset...")
alt_combined_nllb_back_translated_start_index = 0
with open("tmp/alt_combined_nllb_back_translated.csv", mode="a", encoding="utf-8") as f:
    if alt_combined_nllb_back_translated_start_index == 0:
        # Write CSV header
        f.write("original,translated,back_translated\n")
    
    # Process rows and write results
    for index, row in tqdm(alt_combined_back_translate.iloc[alt_combined_nllb_back_translated_start_index:].iterrows(),
                       total=len(alt_combined_back_translate) - alt_combined_nllb_back_translated_start_index):
        original = row["English_Sentence"]
        translated, back_translated = back_translate_nllb(original)
        
        # Replace double quotes with single quotes
        original = original.replace('"', "'")
        translated = translated.replace('"', "'")
        back_translated = back_translated.replace('"', "'")
        
        # Wrap text in double quotes if it contains a comma
        original = f'"{original}"' if ',' in original else original
        translated = f'"{translated}"' if ',' in translated else translated
        back_translated = f'"{back_translated}"' if ',' in back_translated else back_translated
        
        # Write to CSV
        if translated and back_translated:
            f.write(f"{original},{translated},{back_translated}\n")
        else:
            f.write(f"{original},,\n")

Applying back-translation to alt combined dataset...


  0%|          | 0/19166 [00:00<?, ?it/s]

In [None]:
# set and display back-translated combined ALT dataset
tmp_alt_combined_nllb_back_translated = load_tmp_df("alt_combined_nllb_back_translated")
alt_combined_back_translate["nllb_translated"] = tmp_alt_combined_nllb_back_translated["translated"]
alt_combined_back_translate["nllb_back_translated"] = tmp_alt_combined_nllb_back_translated["back_translated"]
display(alt_combined_back_translate.head())

#### facebook/hf-seamless-m4t-large

In [12]:
# Load seamless m4t model and processor
seamless_m4t_model_name = "facebook/hf-seamless-m4t-large"
seamless_m4t_model = SeamlessM4TForTextToText.from_pretrained(seamless_m4t_model_name).to(device)
seamless_m4t_processor = AutoProcessor.from_pretrained(seamless_m4t_model_name)

In [13]:
# Function for back-translation using seamless_m4t
def back_translate_seamless_m4t_batch(batch, src_lang="eng", tgt_lang="mya"):
    text_inputs = seamless_m4t_processor(batch, src_lang=src_lang, return_tensors="pt", padding=True).to(device)
    
    # Forward translation: English -> Burmese
    output_tokens = seamless_m4t_model.generate(**text_inputs, tgt_lang=tgt_lang)
    translated_texts = seamless_m4t_processor.batch_decode(output_tokens, skip_special_tokens=True)
    
    # Backward translation: Burmese -> English
    back_text_inputs = seamless_m4t_processor(translated_texts, src_lang=tgt_lang, return_tensors="pt", padding=True).to(device)
    back_output_tokens = seamless_m4t_model.generate(**back_text_inputs, tgt_lang=src_lang)
    back_translated_texts = seamless_m4t_processor.batch_decode(back_output_tokens, skip_special_tokens=True)
    
    return translated_texts, back_translated_texts

In [14]:
# Parameters
seamless_m4t_batch_size = 50

##### myXNLI dataset

In [15]:
# Dataset and DataLoader for sentence 1
myxnli_seamless_m4t_back_translate_sentences_1 = myxnli_back_translate["sentence1_en"].tolist()
myxnli_seamless_m4t_back_translate_dataset_1 = SentenceDataset(myxnli_seamless_m4t_back_translate_sentences_1)
myxnli_seamless_m4t_back_translate_dataloader_1 = DataLoader(myxnli_seamless_m4t_back_translate_dataset_1, batch_size=seamless_m4t_batch_size, shuffle=False)

In [None]:
# Apply back-translation to the sentences 1 in myXNLI
print("Applying back-translation to myXNLI dataset (sentences 1)...")
for batch_idx, batch in enumerate(tqdm(myxnli_seamless_m4t_back_translate_dataloader_1, desc="Processing batches")):
    try:
        # Perform back-translation
        translated_texts, back_translated_texts = back_translate_seamless_m4t_batch(batch)
        
        # Collect results for this batch
        batch_results = []
        for original, translated, back_translated in zip(batch, translated_texts, back_translated_texts):            
            # Add to results list
            batch_results.append({
                "original": original,
                "translated": translated,
                "back_translated": back_translated,
            })
        
        # Convert batch results to DataFrame and save incrementally
        batch_df = pd.DataFrame(batch_results)
        if batch_idx == 0:
            # Save with header for the first batch
            save_tmp_df(batch_df, "myxnli_seamless_m4t_back_translated_1")
        else:
            # Append without header for subsequent batches
            append_tmp_df(batch_df, "myxnli_seamless_m4t_back_translated_1")

    except Exception as e:
        print(f"Error processing batch {batch_idx}: {e}")
        continue

print("Back-translation completed.")

Applying back-translation to myXNLI dataset (sentences 1)...


Processing batches:   0%|          | 0/7854 [00:00<?, ?it/s]

In [None]:
# Dataset and DataLoader for sentence 2
myxnli_seamless_m4t_back_translate_sentences_2 = myxnli_back_translate["sentence2_en"].tolist()
myxnli_seamless_m4t_back_translate_dataset_2 = SentenceDataset(myxnli_seamless_m4t_back_translate_sentences_2)
myxnli_seamless_m4t_back_translate_dataloader_2 = DataLoader(myxnli_seamless_m4t_back_translate_dataset_2, batch_size=seamless_m4t_batch_size, shuffle=False)

In [None]:
# Apply back-translation to the sentences 2 in myXNLI
print("Applying back-translation to myXNLI dataset (sentences 2)...")
for batch_idx, batch in enumerate(tqdm(myxnli_seamless_m4t_back_translate_dataloader_2, desc="Processing batches")):
    try:
        # Perform back-translation
        translated_texts, back_translated_texts = back_translate_seamless_m4t_batch(batch)
        
        # Collect results for this batch
        batch_results = []
        for original, translated, back_translated in zip(batch, translated_texts, back_translated_texts):            
            # Add to results list
            batch_results.append({
                "original": original,
                "translated": translated,
                "back_translated": back_translated,
            })
        
        # Convert batch results to DataFrame and save incrementally
        batch_df = pd.DataFrame(batch_results)
        if batch_idx == 0:
            # Save with header for the first batch
            save_tmp_df(batch_df, "myxnli_seamless_m4t_back_translated_2")
        else:
            # Append without header for subsequent batches
            append_tmp_df(batch_df, "myxnli_seamless_m4t_back_translated_2")

    except Exception as e:
        print(f"Error processing batch {batch_idx}: {e}")
        continue

print("Back-translation completed.")

##### ALT dataset

In [None]:
# Dataset and DataLoader for combined ALT dataset
alt_combined_seamless_m4t_back_translate_sentences = alt_combined_back_translate["English_Sentence"].tolist()
alt_combined_seamless_m4t_back_translate_dataset = SentenceDataset(alt_combined_seamless_m4t_back_translate_sentences)
alt_combined_seamless_m4t_back_translate_dataloader = DataLoader(alt_combined_seamless_m4t_back_translate_dataset, batch_size=seamless_m4t_batch_size, shuffle=False)

In [None]:
# Apply back-translation to the combined ALT dataset
print("Applying back-translation to alt combined dataset...")
for batch_idx, batch in enumerate(tqdm(alt_combined_seamless_m4t_back_translate_dataloader, desc="Processing batches")):
    try:
        # Perform back-translation
        translated_texts, back_translated_texts = back_translate_seamless_m4t_batch(batch)
        
        # Collect results for this batch
        batch_results = []
        for original, translated, back_translated in zip(batch, translated_texts, back_translated_texts):            
            # Add to results list
            batch_results.append({
                "original": original,
                "translated": translated,
                "back_translated": back_translated,
            })
        
        # Convert batch results to DataFrame and save incrementally
        batch_df = pd.DataFrame(batch_results)
        if batch_idx == 0:
            # Save with header for the first batch
            save_tmp_df(batch_df, "alt_combined_seamless_m4t_back_translated")
        else:
            # Append without header for subsequent batches
            append_tmp_df(batch_df, "alt_combined_seamless_m4t_back_translated")

    except Exception as e:
        print(f"Error processing batch {batch_idx}: {e}")
        continue

print("Back-translation completed.")

### 5. Pseudo-Parallel Corpus Creation

This step involves aligning monolingual English and Burmese text from the datasets to create pseudo-parallel corpora by using models (`sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`). 
Semantic similarity methods are used to identify pairs of sentences with similar meanings. 
The resulting aligned corpus enhances the dataset and is valuable for low-resource language modeling.

In [9]:
# load normalized myXNLI and Combined ALT  dataset
myxnli_corpus = load_gen_df("myxnli_normalized")
alt_combined_corpus = load_gen_df("alt_combined_normalized")

In [10]:
# Extract monolingual text
corpus_english_sentences = list(myxnli_corpus["sentence1_en"]) + list(myxnli_corpus["sentence2_en"]) + list(alt_combined_corpus["English_Sentence"])
corpus_burmese_sentences = list(myxnli_corpus["sentence1_my"]) + list(myxnli_corpus["sentence2_my"]) + list(alt_combined_corpus["Burmese_Sentence"])

#### sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2

In [12]:
# Load pretrained multilingual embedding model
minilm_embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
minilm_embedding_model = SentenceTransformer(minilm_embedding_model_name, device=device)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

##### Generate Embeddings

In [13]:
# File paths for embeddings
minilm_corpus_english_embedding_file = "gen/minilm_corpus_english_embeddings.npy"
minilm_corpus_burmese_embedding_file = "gen/minilm_corpus_burmese_embeddings.npy"

In [14]:
# Initialize progress
if os.path.exists(minilm_corpus_english_embedding_file) and os.path.exists(minilm_corpus_burmese_embedding_file):
    print("Loading existing embeddings and checkpoint...")
    minilm_corpus_english_embeddings = np.load(minilm_corpus_english_embedding_file)
    minilm_corpus_burmese_embeddings = np.load(minilm_corpus_burmese_embedding_file)
else:
    print("Starting fresh embedding computation...")
    # Compute embeddings for English and Burmese sentences
    minilm_corpus_english_embeddings = minilm_embedding_model.encode(corpus_english_sentences, convert_to_tensor=False)
    minilm_corpus_burmese_embeddings = minilm_embedding_model.encode(corpus_burmese_sentences, convert_to_tensor=False)

    # Save the embeddings to files
    print("Saving embeddings to files...")
    np.save(minilm_corpus_english_embedding_file, minilm_corpus_english_embeddings)
    np.save(minilm_corpus_burmese_embedding_file, minilm_corpus_burmese_embeddings)

Loading existing embeddings and checkpoint...


##### Semantic Similarity and Filter High-Similarity

In [15]:
minilm_threshold = 0.8  # Similarity threshold

In [16]:
# Apply to embeddings
minilm_corpus_english_embeddings = torch.tensor(minilm_corpus_english_embeddings).to(device)
minilm_corpus_burmese_embeddings = torch.tensor(minilm_corpus_burmese_embeddings).to(device)

In [18]:
# Batch size for processing similarity computation
minilm_batch_size = 1000
minilm_threshold = 0.8

minilm_corpus_aligned_pairs = []

# Compute similarity in batches
print("Processing similarity in batches...")
for batch_start in tqdm(range(0, len(minilm_corpus_english_embeddings), minilm_batch_size)):
    batch_end = min(batch_start + minilm_batch_size, len(minilm_corpus_english_embeddings))
    english_batch = minilm_corpus_english_embeddings[batch_start:batch_end]

    # Compute similarity matrix for the batch
    similarity_matrix = util.cos_sim(english_batch, minilm_corpus_burmese_embeddings)

    # Filter pairs exceeding the threshold
    aligned_indices = (similarity_matrix > minilm_threshold).nonzero(as_tuple=True)

    # Extract aligned pairs for the current batch
    for i, j in zip(*aligned_indices):
        minilm_corpus_aligned_pairs.append({
            "english": corpus_english_sentences[batch_start + i.item()],  # Adjust for batch offset
            "burmese": corpus_burmese_sentences[j.item()],
            "similarity_score": similarity_matrix[i, j].item(),
        })

    # Save progress incrementally after each batch
    aligned_df = pd.DataFrame(minilm_corpus_aligned_pairs)
    save_gen_df(aligned_df, "minilm_pseudo_parallel_corpus")

print("Pseudo-parallel corpus creation completed.")

Processing similarity in batches...


  0%|          | 0/805 [00:00<?, ?it/s]

Pseudo-parallel corpus creation completed.


#### sentence-transformers/LaBSE

In [11]:
# Load pretrained multilingual embedding model
labse_embedding_model_name = "sentence-transformers/LaBSE"
labse_embedding_model = SentenceTransformer(labse_embedding_model_name, device=device)

##### Generate Embeddings

In [12]:
# File paths for embeddings
labse_corpus_english_embedding_file = "gen/labse_corpus_english_embeddings.npy"
labse_corpus_burmese_embedding_file = "gen/labse_corpus_burmese_embeddings.npy"

In [13]:
# Initialize progress
if os.path.exists(labse_corpus_english_embedding_file) and os.path.exists(labse_corpus_burmese_embedding_file):
    print("Loading existing embeddings and checkpoint...")
    labse_corpus_english_embeddings = np.load(labse_corpus_english_embedding_file)
    labse_corpus_burmese_embeddings = np.load(labse_corpus_burmese_embedding_file)
else:
    print("Starting fresh embedding computation...")
    # Compute embeddings for English and Burmese sentences
    labse_corpus_english_embeddings = labse_embedding_model.encode(corpus_english_sentences, convert_to_tensor=False)
    labse_corpus_burmese_embeddings = labse_embedding_model.encode(corpus_burmese_sentences, convert_to_tensor=False)

    # Save the embeddings to files
    print("Saving embeddings to files...")
    np.save(labse_corpus_english_embedding_file, labse_corpus_english_embeddings)
    np.save(labse_corpus_burmese_embedding_file, labse_corpus_burmese_embeddings)

Loading existing embeddings and checkpoint...


##### Semantic Similarity and Filter High-Similarity

In [14]:
labse_threshold = 0.8  # Similarity threshold

In [15]:
# Apply to embeddings
labse_corpus_english_embeddings = torch.tensor(labse_corpus_english_embeddings).to(device)
labse_corpus_burmese_embeddings = torch.tensor(labse_corpus_burmese_embeddings).to(device)

In [14]:
# Batch size for processing similarity computation
labse_batch_size = 500
labse_threshold = 0.8

labse_corpus_aligned_pairs = []

# Compute similarity in batches
print("Processing similarity in batches...")
for batch_start in tqdm(range(0, len(labse_corpus_english_embeddings), labse_batch_size)):
    batch_end = min(batch_start + labse_batch_size, len(labse_corpus_english_embeddings))
    english_batch = labse_corpus_english_embeddings[batch_start:batch_end]

    # Compute similarity matrix for the batch
    similarity_matrix = util.cos_sim(english_batch, labse_corpus_burmese_embeddings)

    # Filter pairs exceeding the threshold
    aligned_indices = (similarity_matrix > labse_threshold).nonzero(as_tuple=True)

    # Extract aligned pairs for the current batch
    for i, j in zip(*aligned_indices):
        labse_corpus_aligned_pairs.append({
            "english": corpus_english_sentences[batch_start + i.item()],  # Adjust for batch offset
            "burmese": corpus_burmese_sentences[j.item()],
            "similarity_score": similarity_matrix[i, j].item(),
        })

    # Save progress incrementally after each batch
    aligned_df = pd.DataFrame(labse_corpus_aligned_pairs)
    save_gen_df(aligned_df, "labse_pseudo_parallel_corpus")

print("Pseudo-parallel corpus creation completed.")

Processing similarity in batches...


  0%|          | 0/1610 [00:00<?, ?it/s]

Pseudo-parallel corpus creation completed.
