#### Importing all the important libraries

In [1]:
import tensorflow as tf
import pandas as pd
from IPython.display import display, clear_output
import unicodedata
import sentencepiece as spm
import re
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, MBartForConditionalGeneration, MBart50TokenizerFast
from sentence_transformers import SentenceTransformer, util

2025-01-11 00:41:33.957233: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-11 00:41:33.968071: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736523693.978259    5385 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736523693.981195    5385 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-11 00:41:33.992963: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

#### Set GPU

In [None]:
# for mac
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
        details = tf.config.experimental.get_device_details(gpu)
        print("GPU details: ", details)
else:
    print("No GPU found. Using CPU.")

# set GPU device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

In [2]:
# for window
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using PyTorch device:", device)
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

Tensorflow GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using PyTorch device: cuda
GPU Name: NVIDIA GeForce RTX 3070 Ti Laptop GPU


#### 1. Data Loading
This step loads the primary datasets:
1. `myXNLI.train.tsv`: English-Burmese parallel dataset in TSV format.
2. `ALT_data_en.txt` and `ALT_data_my.txt`: English and Burmese parts of the ALT corpus, respectively.

The datasets will be loaded into Pandas DataFrames for analysis and preprocessing.

In [3]:
# Load myXNLI dataset
myxnli_path = './data/myXNLI.train.tsv'  # Path to the file
myxnli_data = pd.read_csv(myxnli_path, sep='\t', header=0)
print(f"myXNLI dataset loaded successfully with {len(myxnli_data)} records.")
display(myxnli_data.head())  # Display the first few rows of the dataset

myXNLI dataset loaded successfully with 392702 records.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,သဘောတရားအရ ခရင်မ်စိမ်ခြင်းတွင် အခြေခံအတိုင်းအတ...,ထုတ်ကုန်နှင့် ပထဝီဝင်အနေအထားသည် ခရင်မ် skimmin...
1,telephone,entailment,you know during the season and i guess at at y...,You lose the things to the following level if ...,ရာသီအတွင်း မင်းသိတယ်၊ မင်းရဲ့အဆင့်ကို ငါ ခန့်မ...,လူတွေပြန်ခေါ်ရင် အောက်ပါအဆင့်အထိ ဆုံးရှုံးသွား...
2,fiction,entailment,One of our number will carry out your instruct...,A member of my team will execute your orders w...,ကျွန်ုပ်တို့၏နံပါတ်တစ်ခုသည် သင့်ညွှန်ကြားချက်မ...,ကျွန်ုပ်၏အဖွဲ့သားတစ်ဦးသည် သင်၏အမိန့်စာများကို ...
3,fiction,entailment,How do you know? All this is their information...,This information belongs to them.,သင်ဘယ်လိုသိသလဲ? ဒါတွေအားလုံးဟာ သူတို့ရဲ့ အချက်...,ဒီအချက်အလက်က သူတို့ပိုင်တယ်။
4,telephone,neutral,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,ဟုတ်တယ် ငါမင်းကိုပြောပြမယ် ဒီတင်းနစ်ဖိနပ်တချို...,တင်းနစ်ဖိနပ်များသည် ဈေးနှုန်းအမျိုးမျိုးရှိသည်။


In [4]:
# Load ALT English data
alt_en_path = './data/ALT_data_en.txt'  # Path to the English ALT corpus
alt_en_data = pd.read_csv(alt_en_path, sep='\t', header=None, names=["ID", "English_Sentence"])
print(f"ALT English dataset loaded successfully with {len(alt_en_data)} records.")
display(alt_en_data.head())

ALT English dataset loaded successfully with 19908 records.


Unnamed: 0,ID,English_Sentence
0,SNT.80188.1,Italy have defeated Portugal 31-5 in Pool C of...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...
2,SNT.80188.3,Despite controlling the game for much of the f...
3,SNT.80188.4,Portugal never gave up and David Penalva score...
4,SNT.80188.5,Italy led 16-5 at half time but were matched b...


In [5]:
# Load ALT Burmese data
alt_my_path = './data/ALT_data_my.txt'  # Path to the Burmese ALT corpus
alt_my_data = pd.read_csv(alt_my_path, sep='\t', header=None, names=["ID", "Burmese_Sentence"])
print(f"ALT Burmese dataset loaded successfully with {len(alt_my_data)} records.")
display(alt_my_data.head())

ALT Burmese dataset loaded successfully with 19265 records.


Unnamed: 0,ID,Burmese_Sentence
0,SNT.80188.1,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


In [6]:
# Combine ALT datasets (if IDs match)
alt_combined = pd.merge(alt_en_data, alt_my_data, on="ID")
print(f"ALT combined dataset created successfully with {len(alt_combined)} records.")
display(alt_combined.head())

ALT combined dataset created successfully with 19173 records.


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,Italy have defeated Portugal 31-5 in Pool C of...,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,Despite controlling the game for much of the f...,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,Portugal never gave up and David Penalva score...,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,Italy led 16-5 at half time but were matched b...,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


#### 2. Data Cleaning
This step focuses on cleaning the datasets to prepare them for further processing. The cleaning operations include:
1. Removing duplicate entries.
2. Handling missing values.
3. Removing non-standard characters or symbols unrelated to the Burmese or English language.
4. Ensuring consistent formatting.

The cleaned datasets will be ready for normalization and tokenization in the next steps.

In [7]:
# Cleaning myXNLI dataset
print("Cleaning myXNLI dataset...")
myxnli_cleaned = myxnli_data.drop_duplicates()  # Remove duplicates
myxnli_cleaned = myxnli_cleaned.dropna()  # Remove rows with missing values
#myxnli_cleaned = myxnli_cleaned.replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print(f"myXNLI dataset cleaned successfully.")
print(f"Original Records: {len(myxnli_data)}.")
print(f"Remaining records: {len(myxnli_cleaned)}.")
display(myxnli_cleaned.head())

Cleaning myXNLI dataset...
myXNLI dataset cleaned successfully.
Original Records: 392702.
Remaining records: 392682.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,သဘောတရားအရ ခရင်မ်စိမ်ခြင်းတွင် အခြေခံအတိုင်းအတ...,ထုတ်ကုန်နှင့် ပထဝီဝင်အနေအထားသည် ခရင်မ် skimmin...
1,telephone,entailment,you know during the season and i guess at at y...,You lose the things to the following level if ...,ရာသီအတွင်း မင်းသိတယ်၊ မင်းရဲ့အဆင့်ကို ငါ ခန့်မ...,လူတွေပြန်ခေါ်ရင် အောက်ပါအဆင့်အထိ ဆုံးရှုံးသွား...
2,fiction,entailment,One of our number will carry out your instruct...,A member of my team will execute your orders w...,ကျွန်ုပ်တို့၏နံပါတ်တစ်ခုသည် သင့်ညွှန်ကြားချက်မ...,ကျွန်ုပ်၏အဖွဲ့သားတစ်ဦးသည် သင်၏အမိန့်စာများကို ...
3,fiction,entailment,How do you know? All this is their information...,This information belongs to them.,သင်ဘယ်လိုသိသလဲ? ဒါတွေအားလုံးဟာ သူတို့ရဲ့ အချက်...,ဒီအချက်အလက်က သူတို့ပိုင်တယ်။
4,telephone,neutral,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,ဟုတ်တယ် ငါမင်းကိုပြောပြမယ် ဒီတင်းနစ်ဖိနပ်တချို...,တင်းနစ်ဖိနပ်များသည် ဈေးနှုန်းအမျိုးမျိုးရှိသည်။


In [8]:
# Cleaning ALT English data
print("Cleaning ALT English dataset...")
alt_en_cleaned = alt_en_data.drop_duplicates()  # Remove duplicates
alt_en_cleaned = alt_en_cleaned.dropna()  # Remove rows with missing values
alt_en_cleaned["English_Sentence"] = alt_en_cleaned["English_Sentence"].replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print(f"ALT English dataset cleaned successfully.")
print(f"Original records: {len(alt_en_data)}.")
print(f"Remaining records: {len(alt_en_cleaned)}.")
display(alt_en_cleaned.head())

Cleaning ALT English dataset...
ALT English dataset cleaned successfully.
Original records: 19908.
Remaining records: 19908.


Unnamed: 0,ID,English_Sentence
0,SNT.80188.1,Italy have defeated Portugal 315 in Pool C of ...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...
2,SNT.80188.3,Despite controlling the game for much of the f...
3,SNT.80188.4,Portugal never gave up and David Penalva score...
4,SNT.80188.5,Italy led 165 at half time but were matched by...


In [9]:
# Cleaning ALT Burmese data
print("Cleaning ALT Burmese dataset...")
alt_my_cleaned = alt_my_data.drop_duplicates()  # Remove duplicates
alt_my_cleaned = alt_my_cleaned.dropna()  # Remove rows with missing values
#alt_my_cleaned["Burmese_Sentence"] = alt_my_cleaned["Burmese_Sentence"].replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print(f"ALT Burmese dataset cleaned successfully.")
print(f"Original records: {len(alt_my_data)}")
print(f"Remaining records: {len(alt_my_cleaned)}")
display(alt_my_cleaned.head())

Cleaning ALT Burmese dataset...
ALT Burmese dataset cleaned successfully.
Original records: 19265
Remaining records: 19258


Unnamed: 0,ID,Burmese_Sentence
0,SNT.80188.1,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


In [10]:
# Combine cleaned ALT datasets
print("Cleaning combined ALT dataset...")
alt_combined_cleaned = pd.merge(alt_en_cleaned, alt_my_cleaned, on="ID")
print(f"Combined ALT dataset cleaned successfully.")
print(f"Original records: {len(alt_combined)}")
print(f"Remaining records: {len(alt_combined_cleaned)}")
display(alt_combined_cleaned.head())

Cleaning combined ALT dataset...
Combined ALT dataset cleaned successfully.
Original records: 19173
Remaining records: 19166


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,Italy have defeated Portugal 315 in Pool C of ...,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,Andrea Masi opened the scoring in the fourth m...,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,Despite controlling the game for much of the f...,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,Portugal never gave up and David Penalva score...,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,Italy led 165 at half time but were matched by...,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


#### 3. Data Normalization
This step normalizes the text data to ensure consistency across datasets. The normalization process includes:
1. Applying Unicode normalization to handle encoding inconsistencies.
2. Standardizing text formatting by converting all text to lowercase and standardizing punctuation.
3. Normalizing diacritical marks and stacked consonants in the Burmese text to improve text representation.

In [11]:
# Function to normalize text
def normalize_text(text):
    if pd.isnull(text):
        return text  # Skip null values
    # Apply Unicode normalization
    normalized_text = unicodedata.normalize('NFKC', text)
    # Convert to lowercase
    normalized_text = normalized_text.lower()
    # Standardize punctuation (e.g., replace unusual punctuation marks)
    normalized_text = normalized_text.replace('“', '"').replace('”', '"').replace('’', "'")
    return normalized_text

In [12]:
# Function to normalize Burmese text (handles diacritical marks and stacked consonants)
def normalize_burmese(text):
    if pd.isnull(text):
        return text  # Skip null values
    normalized_text = unicodedata.normalize('NFKC', text)
    # Additional Burmese-specific normalization can be added here if needed
    return normalized_text

In [13]:
# Normalize myXNLI cleaned dataset
print("Normalizing myXNLI dataset...")
myxnli_normalized = myxnli_cleaned.copy()

# Normalize English columns
myxnli_normalized["sentence1_en"] = myxnli_normalized["sentence1_en"].apply(normalize_text)
myxnli_normalized["sentence2_en"] = myxnli_normalized["sentence2_en"].apply(normalize_text)

# Normalize Burmese columns
myxnli_normalized["sentence1_my"] = myxnli_normalized["sentence1_my"].apply(normalize_burmese)
myxnli_normalized["sentence2_my"] = myxnli_normalized["sentence2_my"].apply(normalize_burmese)

print(f"myXNLI dataset normalized successfully.")
display(myxnli_normalized.head())

Normalizing myXNLI dataset...
myXNLI dataset normalized successfully.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,conceptually cream skimming has two basic dime...,product and geography are what make cream skim...,သဘောတရားအရ ခရင်မ်စိမ်ခြင်းတွင် အခြေခံအတိုင်းအတ...,ထုတ်ကုန်နှင့် ပထဝီဝင်အနေအထားသည် ခရင်မ် skimmin...
1,telephone,entailment,you know during the season and i guess at at y...,you lose the things to the following level if ...,ရာသီအတွင်း မင်းသိတယ်၊ မင်းရဲ့အဆင့်ကို ငါ ခန့်မ...,လူတွေပြန်ခေါ်ရင် အောက်ပါအဆင့်အထိ ဆုံးရှုံးသွား...
2,fiction,entailment,one of our number will carry out your instruct...,a member of my team will execute your orders w...,ကျွန်ုပ်တို့၏နံပါတ်တစ်ခုသည် သင့်ညွှန်ကြားချက်မ...,ကျွန်ုပ်၏အဖွဲ့သားတစ်ဦးသည် သင်၏အမိန့်စာများကို ...
3,fiction,entailment,how do you know? all this is their information...,this information belongs to them.,သင်ဘယ်လိုသိသလဲ? ဒါတွေအားလုံးဟာ သူတို့ရဲ့ အချက်...,ဒီအချက်အလက်က သူတို့ပိုင်တယ်။
4,telephone,neutral,yeah i tell you what though if you go price so...,the tennis shoes have a range of prices.,ဟုတ်တယ် ငါမင်းကိုပြောပြမယ် ဒီတင်းနစ်ဖိနပ်တချို...,တင်းနစ်ဖိနပ်များသည် ဈေးနှုန်းအမျိုးမျိုးရှိသည်။


In [14]:
# Normalize ALT English cleaned dataset
print("Normalizing ALT English dataset...")
alt_en_normalized = alt_en_cleaned.copy()
alt_en_normalized["English_Sentence"] = alt_en_normalized["English_Sentence"].apply(normalize_text)
print(f"ALT English dataset normalized successfully.")
display(alt_en_normalized.head())

Normalizing ALT English dataset...
ALT English dataset normalized successfully.


Unnamed: 0,ID,English_Sentence
0,SNT.80188.1,italy have defeated portugal 315 in pool c of ...
1,SNT.80188.2,andrea masi opened the scoring in the fourth m...
2,SNT.80188.3,despite controlling the game for much of the f...
3,SNT.80188.4,portugal never gave up and david penalva score...
4,SNT.80188.5,italy led 165 at half time but were matched by...


In [15]:
# Normalize ALT Burmese cleaned dataset
print("Normalizing ALT Burmese dataset...")
alt_my_normalized = alt_my_cleaned.copy()
alt_my_normalized["Burmese_Sentence"] = alt_my_normalized["Burmese_Sentence"].apply(normalize_burmese)
print(f"ALT Burmese dataset normalized successfully.")
display(alt_my_normalized.head())

Normalizing ALT Burmese dataset...
ALT Burmese dataset normalized successfully.


Unnamed: 0,ID,Burmese_Sentence
0,SNT.80188.1,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


In [16]:
# Normalize combined ALT cleaned dataset
print("Normalizing combined ALT dataset...")
alt_combined_normalized = alt_combined_cleaned.copy()
alt_combined_normalized["English_Sentence"] = alt_combined_normalized["English_Sentence"].apply(normalize_text)
alt_combined_normalized["Burmese_Sentence"] = alt_combined_normalized["Burmese_Sentence"].apply(normalize_burmese)
print(f"Combined ALT dataset normalized successfully.")
display(alt_combined_normalized.head())

Normalizing combined ALT dataset...
Combined ALT dataset normalized successfully.


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,italy have defeated portugal 315 in pool c of ...,ပြင်သစ်နိုင်ငံ ပါရီမြို့ ပါ့ဒက်စ် ပရင့်စက် ၌ ၂...
1,SNT.80188.2,andrea masi opened the scoring in the fourth m...,အန်ဒရီယာ မာစီ သည် အီတလီ အတွက် စမ်းသပ်မှု တစ်ခု...
2,SNT.80188.3,despite controlling the game for much of the f...,ပထမ တစ်ဝက် ၏ တော်တော်များများ အတွက် ကစားပွဲ ကိ...
3,SNT.80188.4,portugal never gave up and david penalva score...,ပေါ်တူဂီ သည် ဘယ်သောအခါမှ စွန့်လွှတ်မှု မရှိခဲ့...
4,SNT.80188.5,italy led 165 at half time but were matched by...,အီတလီ သည် ပထမပိုင်း ၌ ၁၆-၅ ဖြင့် ဦးဆောင်ခဲ့ သေ...


#### 4. Sentence Segmentation
This step segments text into subword units using SentencePiece Tokenization (SPT). 
The process includes:
1. Training a SentencePiece model using the English and Burmese text from the `myXNLI` dataset and the combined ALT dataset.
2. Applying the trained model to segment sentences in both datasets.
3. Validating the segmentation results with manual or automated benchmarks.

In [17]:
# Prepare paths for SentencePiece model
sp_model_prefix = "sentencepiece_model"
sp_train_input = "combined_texts.txt"  # A temporary file to hold combined dataset text for training
sp_model_path = f"{sp_model_prefix}.model"

In [18]:
# Combine text from myXNLI and ALT datasets for SentencePiece training
print("Preparing data for SentencePiece training...")
with open(sp_train_input, "w", encoding="utf-8") as f:
    # Add text from myXNLI dataset
    for text in myxnli_normalized["sentence1_en"].tolist() + myxnli_normalized["sentence2_en"].tolist() + myxnli_normalized["sentence1_my"].tolist() + myxnli_normalized["sentence2_my"].tolist():
        if pd.notnull(text):  # Avoid writing NaN values
            f.write(f"{text}\n")
        
    # Add text from combined ALT dataset
    for text in alt_combined_normalized["English_Sentence"].tolist() + alt_combined_normalized["Burmese_Sentence"].tolist():
        if pd.notnull(text):  # Avoid writing NaN values
            f.write(f"{text}\n")

print(f"Data prepared in {sp_train_input}.")


Preparing data for SentencePiece training...
Data prepared in combined_texts.txt.


In [19]:
# Train SentencePiece model
print("Training SentencePiece model...")
spm.SentencePieceTrainer.train(
    input=sp_train_input,
    model_prefix=sp_model_prefix,
    vocab_size=8000,
    character_coverage=0.9995,
    model_type="unigram"  # Use unigram language model
)
print(f"SentencePiece model trained and saved as {sp_model_path}.")

Training SentencePiece model...
SentencePiece model trained and saved as sentencepiece_model.model.


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: combined_texts.txt
  input_format: 
  model_prefix: sentencepiece_model
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_p

In [20]:
# Load trained SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)
print(f"SentencePiece model loaded from {sp_model_path}.")

SentencePiece model loaded from sentencepiece_model.model.


In [21]:
# Apply SentencePiece Tokenization
def apply_sentencepiece(data, column_name):
    return data[column_name].apply(lambda x: " ".join(sp.encode_as_pieces(x)) if pd.notnull(x) else x)

In [22]:
# Apply SentencePiece Tokenization to myXNLI dataset
print("Applying SentencePiece tokenization to myXNLI dataset...")
myxnli_segmented = myxnli_normalized.copy()
myxnli_segmented["sentence1_en"] = apply_sentencepiece(myxnli_segmented, "sentence1_en")
myxnli_segmented["sentence2_en"] = apply_sentencepiece(myxnli_segmented, "sentence2_en")
myxnli_segmented["sentence1_my"] = apply_sentencepiece(myxnli_segmented, "sentence1_my")
myxnli_segmented["sentence2_my"] = apply_sentencepiece(myxnli_segmented, "sentence2_my")
print("SentencePiece tokenization applied to myXNLI dataset successfully.")
display(myxnli_segmented.head())

Applying SentencePiece tokenization to myXNLI dataset...
SentencePiece tokenization applied to myXNLI dataset successfully.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,▁concept ual ly ▁cre am ▁ski m ming ▁has ▁two ...,▁product ▁and ▁ ge ography ▁are ▁what ▁make ▁c...,▁ သဘော တရား အရ ▁ ခ ရင် မ် စ ိ မ် ခြင်း တွင် ▁အ...,▁ထုတ်ကုန် နှင့် ▁ပ ထ ဝီ ဝင် အနေအထား သည် ▁ ခ ရင...
1,telephone,entailment,▁you ▁know ▁during ▁the ▁season ▁and ▁i ▁guess...,▁you ▁lose ▁the ▁things ▁to ▁the ▁following ▁l...,▁ ရာသီ အတွင်း ▁မင်းသိတယ် ၊ ▁မင်းရဲ့ အဆင့် ကို ...,▁လူ တွေ ပြန် ခေါ် ရင် ▁အောက် ပါ အဆင့် အထိ ▁ဆုံ...
2,fiction,entailment,▁one ▁of ▁our ▁number ▁will ▁carry ▁out ▁your ...,▁a ▁member ▁of ▁my ▁team ▁will ▁execut e ▁your...,▁ကျွန်ုပ်တို့၏ နံပါတ် တစ်ခုသည် ▁သင့် ညွှန ် ကြ...,▁ကျွန်ုပ်၏ အဖွဲ့ သား တစ်ဦးသည် ▁သင်၏ အမိန့် စာ ...
3,fiction,entailment,▁how ▁do ▁you ▁know ? ▁all ▁this ▁is ▁their ▁i...,▁this ▁information ▁be lo ng s ▁to ▁them .,▁သင် ဘယ်လို သိ သ လဲ ? ▁ဒါ တွေ အားလုံး ဟာ ▁သူတိ...,▁ဒီ အချက်အလက် က ▁သူတို့ ပိုင် တယ်။
4,telephone,neutral,▁yeah ▁i ▁tell ▁you ▁what ▁though ▁if ▁you ▁go...,▁the ▁ten ni s ▁ s ho es ▁have ▁a ▁range ▁of ▁...,▁ဟုတ်တယ် ▁ငါမင်းကို ပြောပြ မယ် ▁ဒီ တင်း နစ် ဖိ...,▁ တင်း နစ် ဖိနပ် များသည် ▁ ဈ ေး နှုန်း အမျိုးမ...


In [23]:
# Apply SentencePiece Tokenization to ALT English and Burmese datasets
print("Applying SentencePiece tokenization to combined ALT dataset...")
alt_combined_segmented = alt_combined_normalized.copy()
alt_combined_segmented["English_Sentence"] = apply_sentencepiece(alt_combined_segmented, "English_Sentence")
alt_combined_segmented["Burmese_Sentence"] = apply_sentencepiece(alt_combined_segmented, "Burmese_Sentence")
print("SentencePiece tokenization applied to combined ALT dataset successfully.")
display(alt_combined_segmented.head())

Applying SentencePiece tokenization to combined ALT dataset...
SentencePiece tokenization applied to combined ALT dataset successfully.


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,▁italy ▁have ▁defeat ed ▁port u gal ▁3 15 ▁in ...,▁ပြင်သစ် နိုင်ငံ ▁ပါရီ မြို့ ▁ပါ ့ ဒ က်စ် ▁ပ ရ...
1,SNT.80188.2,▁and re a ▁ma s i ▁open ed ▁the ▁sc or ing ▁in...,▁အ န် ဒ ရီ ယာ ▁ မာ စီ ▁သည် ▁အီတလီ ▁အတွက် ▁ စမ်...
2,SNT.80188.3,▁de spite ▁control ling ▁the ▁game ▁for ▁much ...,▁ပထမ ▁ တစ်ဝက် ▁၏ ▁ တော်တော်များများ ▁အတွက် ▁ကစ...
3,SNT.80188.4,▁port u gal ▁never ▁gave ▁up ▁and ▁david ▁pen ...,▁ပေါ်တူဂီ ▁သည် ▁ဘယ်သောအခါမှ ▁စွန့်လွှတ် မှု ▁မ...
4,SNT.80188.5,▁italy ▁ led ▁16 5 ▁at ▁half ▁time ▁but ▁were ...,▁အီတလီ ▁သည် ▁ပထမ ပိုင်း ▁ ၌ ▁၁ ၆ - ၅ ▁ ဖြင့် ▁...


#### 5. Morphological Processing
This step involves advanced processing to capture morphological nuances in the text. The operations include:
1. Segmenting words into morphemes, handling prefixes, suffixes, and compound words.
2. Normalizing compounded forms while preserving semantic meanings.
3. Incorporating loanwords for better representation in the text data.

In [24]:
# Function to segment words into morphemes
def segment_morphemes(text):
    if pd.isnull(text):
        return text  # Skip null values
    # Example: Handle prefixes, suffixes, and compounds
    # For demonstration, splitting by common Burmese and English morphemes
    segmented_text = re.sub(r'(\bpre|un|re|in|dis|mis|non)(\w+)', r'\1-\2', text)  # English prefixes
    segmented_text = re.sub(r'(\w+)(ing|ly|ed|er|ion|able|ible|ment|ness|ship|ous|ive|ish|ize)\b', r'\1-\2', segmented_text)  # English suffixes
    # Add custom Burmese rules here for morpheme segmentation
    return segmented_text

In [25]:
# Function to normalize compounded forms
def normalize_compounds(text):
    if pd.isnull(text):
        return text
    # Example: Handle English hyphenated compounds (adjust as needed for Burmese)
    normalized_text = re.sub(r'(\w+)-(\w+)', r'\1 \2', text)
    return normalized_text

In [26]:
# Function to incorporate loanwords
def incorporate_loanwords(text, loanword_dict):
    if pd.isnull(text):
        return text
    # Replace loanwords based on a predefined dictionary
    for loanword, replacement in loanword_dict.items():
        text = re.sub(rf'\b{loanword}\b', replacement, text)
    return text

In [27]:
# Sample loanword dictionary for Burmese
loanword_dict = {
    "ဘဏ်": "bank",  # Example: Burmese word for 'bank'
    "အင်တာနက်": "internet",  # Example: Burmese word for 'internet'
}

In [28]:
# Apply morphological processing to myXNLI dataset
print("Processing myXNLI dataset...")
myxnli_processed = myxnli_segmented.copy()
# Apply morpheme segmentation
myxnli_processed["sentence1_en"] = myxnli_processed["sentence1_en"].apply(segment_morphemes)
myxnli_processed["sentence2_en"] = myxnli_processed["sentence2_en"].apply(segment_morphemes)
myxnli_processed["sentence1_my"] = myxnli_processed["sentence1_my"].apply(segment_morphemes)
myxnli_processed["sentence2_my"] = myxnli_processed["sentence2_my"].apply(segment_morphemes)
# Normalize compounded forms
myxnli_processed["sentence1_en"] = myxnli_processed["sentence1_en"].apply(normalize_compounds)
myxnli_processed["sentence2_en"] = myxnli_processed["sentence2_en"].apply(normalize_compounds)
myxnli_processed["sentence1_my"] = myxnli_processed["sentence1_my"].apply(normalize_compounds)
myxnli_processed["sentence2_my"] = myxnli_processed["sentence2_my"].apply(normalize_compounds)
# Incorporate loanwords
myxnli_processed["sentence1_my"] = myxnli_processed["sentence1_my"].apply(lambda x: incorporate_loanwords(x, loanword_dict))
myxnli_processed["sentence2_my"] = myxnli_processed["sentence2_my"].apply(lambda x: incorporate_loanwords(x, loanword_dict))
    
print("Morphological processing applied to myXNLI dataset successfully.")
display(myxnli_processed.head())

Processing myXNLI dataset...
Morphological processing applied to myXNLI dataset successfully.


Unnamed: 0,genre,label,sentence1_en,sentence2_en,sentence1_my,sentence2_my
0,government,neutral,▁concept ual ly ▁cre am ▁ski m min g ▁has ▁two...,▁product ▁and ▁ ge ography ▁are ▁what ▁make ▁c...,▁ သဘော တရား အရ ▁ ခ ရင် မ် စ ိ မ် ခြင်း တွင် ▁အ...,▁ထုတ်ကုန် နှင့် ▁ပ ထ ဝီ ဝင် အနေအထား သည် ▁ ခ ရင...
1,telephone,entailment,▁you ▁know ▁durin g ▁the ▁season ▁and ▁i ▁gues...,▁you ▁lose ▁the ▁thin gs ▁to ▁the ▁followin g ...,▁ ရာသီ အတွင်း ▁မင်းသိတယ် ၊ ▁မင်းရဲ့ အဆင့် ကို ...,▁လူ တွေ ပြန် ခေါ် ရင် ▁အောက် ပါ အဆင့် အထိ ▁ဆုံ...
2,fiction,entailment,▁one ▁of ▁our ▁numb er ▁will ▁carry ▁out ▁your...,▁a ▁memb er ▁of ▁my ▁team ▁will ▁execut e ▁you...,▁ကျွန်ုပ်တို့၏ နံပါတ် တစ်ခုသည် ▁သင့် ညွှန ် ကြ...,▁ကျွန်ုပ်၏ အဖွဲ့ သား တစ်ဦးသည် ▁သင်၏ အမိန့် စာ ...
3,fiction,entailment,▁how ▁do ▁you ▁know ? ▁all ▁this ▁is ▁their ▁i...,▁this ▁in format-ion ▁be lo ng s ▁to ▁them .,▁သင် ဘယ်လို သိ သ လဲ ? ▁ဒါ တွေ အားလုံး ဟာ ▁သူတိ...,▁ဒီ အချက်အလက် က ▁သူတို့ ပိုင် တယ်။
4,telephone,neutral,▁yeah ▁i ▁tell ▁you ▁what ▁though ▁if ▁you ▁go...,▁the ▁ten ni s ▁ s ho es ▁have ▁a ▁range ▁of ▁...,▁ဟုတ်တယ် ▁ငါမင်းကို ပြောပြ မယ် ▁ဒီ တင်း နစ် ဖိ...,▁ တင်း နစ် ဖိနပ် များသည် ▁ ဈ ေး နှုန်း အမျိုးမ...


In [29]:
# Apply morphological processing to combined ALT dataset
print("Processing combined ALT dataset...")
alt_combined_processed = alt_combined_segmented.copy()
# Apply morpheme segmentation
alt_combined_processed["English_Sentence"] = alt_combined_processed["English_Sentence"].apply(segment_morphemes)
alt_combined_processed["Burmese_Sentence"] = alt_combined_processed["Burmese_Sentence"].apply(segment_morphemes)
# Normalize compounded forms
alt_combined_processed["English_Sentence"] = alt_combined_processed["English_Sentence"].apply(normalize_compounds)
alt_combined_processed["Burmese_Sentence"] = alt_combined_processed["Burmese_Sentence"].apply(normalize_compounds)
# Incorporate loanwords
alt_combined_processed["Burmese_Sentence"] = alt_combined_processed["Burmese_Sentence"].apply(lambda x: incorporate_loanwords(x, loanword_dict))

print("Morphological processing applied to combined ALT dataset successfully.")
display(alt_combined_processed.head())

Processing combined ALT dataset...
Morphological processing applied to combined ALT dataset successfully.


Unnamed: 0,ID,English_Sentence,Burmese_Sentence
0,SNT.80188.1,▁ita ly ▁have ▁defeat ed ▁port u gal ▁3 15 ▁in...,▁ပြင်သစ် နိုင်ငံ ▁ပါရီ မြို့ ▁ပါ ့ ဒ က်စ် ▁ပ ရ...
1,SNT.80188.2,▁and re a ▁ma s i ▁open ed ▁the ▁sc or in g ▁i...,▁အ န် ဒ ရီ ယာ ▁ မာ စီ ▁သည် ▁အီတလီ ▁အတွက် ▁ စမ်...
2,SNT.80188.3,▁de spite ▁control lin g ▁the ▁game ▁for ▁much...,▁ပထမ ▁ တစ်ဝက် ▁၏ ▁ တော်တော်များများ ▁အတွက် ▁ကစ...
3,SNT.80188.4,▁port u gal ▁nev er ▁gave ▁up ▁and ▁david ▁pen...,▁ပေါ်တူဂီ ▁သည် ▁ဘယ်သောအခါမှ ▁စွန့်လွှတ် မှု ▁မ...
4,SNT.80188.5,▁ita ly ▁ l ed ▁16 5 ▁at ▁half ▁time ▁but ▁wer...,▁အီတလီ ▁သည် ▁ပထမ ပိုင်း ▁ ၌ ▁၁ ၆ - ၅ ▁ ဖြင့် ▁...


#### 6. Data Augmentation
This step enhances the dataset by generating additional data using the following methods:
1. **Back-Translation**:
    - Translate Burmese sentences to English and back to Burmese using `facebook/m2m100_418M` and `facebook/mbart-large-50` to create diverse translations while preserving semantic meaning for both `myXNLI` and `combined ALT dataset`.
2. **Pseudo-Parallel Corpus Creation**:
    - Use semantic similarity alignment to identify and align semantically similar sentences from monolingual data to generate pseudo-parallel corpora for the `combined ALT dataset` only.

In [30]:
# function to create temp df
def create_temp_df():
    return pd.DataFrame(columns=["isNull", "original", "translated", "back_translated"])

In [31]:
# function to add row to temp df
def add_row_to_temp_df(df, row):
    # Convert the row dictionary to a DataFrame
    row_df = pd.DataFrame([row])

    # Use pd.concat to add the row
    updated_df = pd.concat([df, row_df], ignore_index=True)

    return updated_df

In [32]:
# function to save and display temp df
def save_display_temp_df(temp_df, tmp_df_name):
    temp_df.to_csv(f"{tmp_df_name}.csv", index=False)
    display(temp_df.tail(1))

##### Back-Translation (facebook/m2m100_418M)

In [34]:
# Load M2M100 model and tokenizer
m2m_model_name = "facebook/m2m100_418M"
m2m_translation_model = M2M100ForConditionalGeneration.from_pretrained(m2m_model_name).to(device)
m2m_translation_tokenizer = M2M100Tokenizer.from_pretrained(m2m_model_name)

In [35]:
# Function for back-translation using M2M100
def m2m_back_translate(text, src_lang, tgt_lang, df, df_name):
    clear_output(wait=True)
    
    if pd.isnull(text):
        df.append({"isNull": True})
        save_display_temp_df(df, df_name)
        return text  # Skip null values
    # Translate to the target language
    m2m_translation_tokenizer.src_lang = src_lang
    encoded = m2m_translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    translated = m2m_translation_model.generate(**encoded)
    translated_text = m2m_translation_tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

    # Translate back to the source language
    m2m_translation_tokenizer.src_lang = tgt_lang
    encoded_back = m2m_translation_tokenizer(translated_text, return_tensors="pt", padding=True, truncation=True).to(device)
    back_translated = m2m_translation_model.generate(**encoded_back)
    back_translated_text = m2m_translation_tokenizer.batch_decode(back_translated, skip_special_tokens=True)[0]
    
    new_row = {"isNull": False, "original": text, "translated": translated_text, "back_translated": back_translated_text}
    df.loc[len(df)] = new_row

    save_display_temp_df(df, df_name)

    return back_translated_text

In [36]:
# Apply back-translation to the Burmese sentences in myXNLI
myxnli_m2m_back_translated = myxnli_processed.copy()

In [None]:
# Apply back-translation to the Burmese sentences 1 in myXNLI
myxnli_m2m_back_translated_temp_df1 = create_temp_df()
myxnli_m2m_back_translated["sentence1_my"] = myxnli_m2m_back_translated["sentence1_my"].apply(
    lambda x: m2m_back_translate(x, src_lang="my", tgt_lang="en", df=myxnli_m2m_back_translated_temp_df1, df_name='myxnli_m2m_back_translated_temp_df1')
)

In [None]:
# Apply back-translation to the Burmese sentences 2 in myXNLI
myxnli_m2m_back_translated_temp_df2 = create_temp_df()
myxnli_m2m_back_translated["sentence2_my"] = myxnli_m2m_back_translated["sentence2_my"].apply(
    lambda x: m2m_back_translate(x, src_lang="my", tgt_lang="en", df=myxnli_m2m_back_translated_temp_df2, df_name='myxnli_m2m_back_translated_temp_df2')
)

In [None]:
# display and save back-translated myXNLI dataset
print("Back-translation applied to myXNLI dataset with m2m100.")
display(myxnli_m2m_back_translated.head())
myxnli_m2m_back_translated.to_csv('myxnli_m2m_back_translated.csv', index=False)

In [None]:
# Apply back-translation to combined ALT dataset
print("Applying back-translation to combined ALT dataset with m2m100...")
alt_m2m_back_translated = alt_combined_processed.copy()
alt_m2m_back_translated["Burmese_Sentence"] = alt_m2m_back_translated["Burmese_Sentence"].apply(
    lambda x: m2m_back_translate(x, src_lang="my", tgt_lang="en") if pd.notnull(x) else x
)
print("Back-translation applied to combined ALT dataset with m2m100.")
display(alt_m2m_back_translated.head())

In [None]:
# save back-translated alt dataset
alt_m2m_back_translated.to_csv('alt_m2m_back_translated.csv', index=False)

##### Back-Translation (facebook/mbart-large-50)

In [36]:
# Load mBART-50 model and tokenizer
mbart_model_name = "facebook/mbart-large-50"
mbart_translation_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name).to(device)
mbart_translation_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)

In [37]:
# Function for back-translation using mBART-50
def mbart_back_translate(text, src_lang, tgt_lang, df, df_name):
    clear_output(wait=True)
    
    if pd.isnull(text):
        df.append({"isNull": True})
        save_display_temp_df(df, df_name)
        return text  # Skip null values

    # Translate to the target language
    mbart_translation_tokenizer.src_lang = src_lang
    encoded = mbart_translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    translated = mbart_translation_model.generate(
        **encoded,
        forced_bos_token_id=mbart_translation_tokenizer.lang_code_to_id[tgt_lang]
    )
    translated_text = mbart_translation_tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

    # Translate back to the source language
    mbart_translation_tokenizer.src_lang = tgt_lang
    encoded_back = mbart_translation_tokenizer(translated_text, return_tensors="pt", padding=True, truncation=True).to(device)
    back_translated = mbart_translation_model.generate(
        **encoded_back,
        forced_bos_token_id=mbart_translation_tokenizer.lang_code_to_id[src_lang]
    )
    back_translated_text = mbart_translation_tokenizer.batch_decode(back_translated, skip_special_tokens=True)[0]

    new_row = {"isNull": False, "original": text, "translated": translated_text, "back_translated": back_translated_text}
    df.loc[len(df)] = new_row

    save_display_temp_df(df, df_name)

    return back_translated_text

In [38]:
# Apply back-translation to the Burmese sentences in myXNLI
myxnli_mbart_back_translated = myxnli_processed.copy()

In [None]:
# Apply back-translation to the Burmese sentences 1 in myXNLI
myxnli_mbart_back_translated_temp_df1 = create_temp_df()
myxnli_mbart_back_translated["sentence1_my"] = myxnli_mbart_back_translated["sentence1_my"].apply(
    lambda x: mbart_back_translate(x, src_lang="my_MM", tgt_lang="en_XX", df=myxnli_mbart_back_translated_temp_df1, df_name="myxnli_mbart_back_translated_temp_df1")
)

Unnamed: 0,isNull,original,translated,back_translated
120,False,▁ဆေး ဖက် ဝင် အ ပင် များ ▁ ထုတ် ယူ မှုသည် ▁အ ပျ...,ဆေး ဖက် ဝင် အ ပင် များ ထုတ် ယူ မှုသည် အ ပျော့ ...,ဆေး ဖက် ဝင် အ ပင် များ ထုတ် ယူ မှုသည် အ ပျော့ ...


In [None]:
# Apply back-translation to the Burmese sentences 2 in myXNLI
myxnli_mbart_back_translated_temp_df2 = create_temp_df()
myxnli_mbart_back_translated["sentence2_my"] = myxnli_mbart_back_translated["sentence2_my"].apply(
    lambda x: mbart_back_translate(x, src_lang="my_MM", tgt_lang="en_XX", df=myxnli_mbart_back_translated_temp_df2, df_name="myxnli_mbart_back_translated_temp_df2")
)

In [None]:
# display and save back-translated myXNLI dataset
print("Back-translation applied to myXNLI dataset with mBART-50.")
display(myxnli_mbart_back_translated.head())
myxnli_mbart_back_translated.to_csv('myxnli_mbart_back_translated.csv', index=False)

In [None]:
# Apply back-translation to combined ALT dataset
alt_mbart_back_translated_temp_df = create_temp_df()
alt_mbart_back_translated = alt_combined_processed.copy()
alt_mbart_back_translated["Burmese_Sentence"] = alt_mbart_back_translated["Burmese_Sentence"].apply(
    lambda x: mbart_back_translate(x, src_lang="my_MM", tgt_lang="en_XX", df=alt_mbart_back_translated_temp_df, df_name="alt_mbart_back_translated_temp_df")
)
print("Back-translation applied to combined ALT dataset with mBART-50.")
display(alt_mbart_back_translated.head())
alt_mbart_back_translated.to_csv('alt_mbart_back_translated.csv', index=False)

##### Pseudo-Parallel Corpus Creation

In [33]:
# Load semantic similarity model
similarity_model_name = "all-MiniLM-L6-v2"
similarity_model = SentenceTransformer(similarity_model_name).to(device)

In [34]:
# Function to create pseudo-parallel corpus
def create_pseudo_parallel(data_en, data_my, similarity_model, top_k=1):
    pseudo_parallel = []
    embeddings_en = similarity_model.encode(data_en, convert_to_tensor=True, device=device)
    embeddings_my = similarity_model.encode(data_my, convert_to_tensor=True, device=device)
    similarity_scores = util.pytorch_cos_sim(embeddings_en, embeddings_my)

    for idx_en, scores in enumerate(similarity_scores):
        top_matches = scores.topk(k=top_k)
        for match_idx in top_matches.indices:
            pseudo_parallel.append((data_en[idx_en], data_my[match_idx.item()], scores[match_idx].item()))
    
    return pseudo_parallel

In [35]:
# Apply pseudo-parallel creation to combined ALT dataset
print("Creating pseudo-parallel corpus from combined ALT dataset...")
alt_combined_en = alt_combined_processed["English_Sentence"].dropna().tolist()
alt_combined_my = alt_combined_processed["Burmese_Sentence"].dropna().tolist()
pseudo_parallel_data = create_pseudo_parallel(alt_combined_en, alt_combined_my, similarity_model)
    
pseudo_parallel_df = pd.DataFrame(pseudo_parallel_data, columns=["English_Sentence", "Burmese_Sentence", "Similarity_Score"])
print("Pseudo-parallel corpus created successfully.")
display(pseudo_parallel_df.head())

Creating pseudo-parallel corpus from combined ALT dataset...
Pseudo-parallel corpus created successfully.


Unnamed: 0,English_Sentence,Burmese_Sentence,Similarity_Score
0,▁ita ly ▁have ▁defeat ed ▁port u gal ▁3 15 ▁in...,▁ ရီ ဂ ႀ င်း နင်း ▁ ဆန်း ရှ ိုင်း ▁ကို ႕ ▁စ ကူ...,0.766668
1,▁and re a ▁ma s i ▁open ed ▁the ▁sc or in g ▁i...,▁သူတို့ ▁သည် ▁I C A O ▁၊ ▁A T S B ▁နှင့် ▁ “ ▁...,0.645176
2,▁de spite ▁control lin g ▁the ▁game ▁for ▁much...,▁ဘ န် ကောက် မြို့ ▁၏ ▁P ra we t ▁ ခရိုင် ▁ ဆင်...,0.647617
3,▁port u gal ▁nev er ▁gave ▁up ▁and ▁david ▁pen...,▁I H S ▁G lo b al ▁In s ight ▁၏ ▁ဆန်းစစ် လေ့လာ...,0.709018
4,▁ita ly ▁ l ed ▁16 5 ▁at ▁half ▁time ▁but ▁wer...,▁ယင်း သည် ▁200 5 ခု နှစ် ▁တွင် ▁ပြုလုပ်ခဲ့ သော...,0.655455


In [None]:
# save pseudo-parallel alt dataset
pseudo_parallel_df.to_csv('pseudo_parallel_df.csv', index=False)