#### Importing all the important libraries

In [None]:
import tensorflow as tf
import pandas as pd
from IPython.display import display, clear_output
import unicodedata
import sentencepiece as spm
import re
import torch
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, MBartForConditionalGeneration, MBart50TokenizerFast
from sentence_transformers import SentenceTransformer, util

#### Set GPU

In [None]:
# macOS: list what TensorFlow can see, then choose the PyTorch device (MPS when available).
devices = tf.config.list_physical_devices()
print("\nDevices: ", devices)

gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found. Using CPU.")
for gpu in gpus:
    # Enable on-demand memory growth on each visible GPU.
    tf.config.experimental.set_memory_growth(gpu, True)
    details = tf.config.experimental.get_device_details(gpu)
    print("GPU details: ", details)

# PyTorch device used by the translation and similarity models below
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Windows: report TensorFlow GPUs and use CUDA for PyTorch when it is available.
print("Tensorflow GPUs: ", tf.config.list_physical_devices('GPU'))

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using PyTorch device:", device)
    print("GPU Name:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using PyTorch device:", device)

#### 1. Data Loading
This step loads the primary datasets:
1. `myXNLI.train.tsv`: English-Burmese parallel dataset in TSV format.
2. `ALT_data_en.txt` and `ALT_data_my.txt`: English and Burmese parts of the ALT corpus, respectively.

The datasets will be loaded into Pandas DataFrames for analysis and preprocessing.

In [None]:
# Read the myXNLI training split; the first row of the TSV supplies the column names.
# NOTE(review): default CSV quoting is in effect here. If any field begins with a double
# quote, pandas will parse it as a quoted field -- confirm against the raw file.
myxnli_path = './data/myXNLI.train.tsv'
myxnli_data = pd.read_csv(myxnli_path, header=0, sep='\t')
print(f"myXNLI dataset loaded successfully with {myxnli_data.shape[0]} records.")
display(myxnli_data.head())  # preview of the first rows

In [None]:
# Load ALT English data
import csv  # local import: only needed for the quoting constant below

alt_en_path = './data/ALT_data_en.txt'  # Path to the English ALT corpus
# The file is read as two tab-separated columns with no header row. Quote handling is
# disabled so that a sentence beginning with a double quote is kept as literal text
# instead of being parsed as a quoted field (which can swallow following lines).
# NOTE(review): assumes the file is plain "ID<TAB>sentence" text, not quoted CSV -- confirm.
alt_en_data = pd.read_csv(
    alt_en_path,
    sep='\t',
    header=None,
    names=["ID", "English_Sentence"],
    quoting=csv.QUOTE_NONE,
)
print(f"ALT English dataset loaded successfully with {len(alt_en_data)} records.")
display(alt_en_data.head())

In [None]:
# Load ALT Burmese data
import csv  # local import: only needed for the quoting constant below

alt_my_path = './data/ALT_data_my.txt'  # Path to the Burmese ALT corpus
# The file is read as two tab-separated columns with no header row. Quote handling is
# disabled so that a sentence beginning with a double quote is kept as literal text
# instead of being parsed as a quoted field (which can swallow following lines).
# NOTE(review): assumes the file is plain "ID<TAB>sentence" text, not quoted CSV -- confirm.
alt_my_data = pd.read_csv(
    alt_my_path,
    sep='\t',
    header=None,
    names=["ID", "Burmese_Sentence"],
    quoting=csv.QUOTE_NONE,
)
print(f"ALT Burmese dataset loaded successfully with {len(alt_my_data)} records.")
display(alt_my_data.head())

In [None]:
# Pair English and Burmese ALT rows that share the same ID (default inner join).
alt_combined = alt_en_data.merge(alt_my_data, on="ID")
print(f"ALT combined dataset created successfully with {alt_combined.shape[0]} records.")
display(alt_combined.head())

#### 2. Data Cleaning
This step focuses on cleaning the datasets to prepare them for further processing. The cleaning operations include:
1. Removing duplicate entries.
2. Handling missing values.
3. Removing non-standard characters or symbols unrelated to the Burmese or English language.
4. Ensuring consistent formatting.

The cleaned datasets will be ready for normalization and tokenization in the next steps.

In [None]:
# Clean myXNLI: drop exact duplicate rows, then any row containing a missing value.
print("Cleaning myXNLI dataset...")
myxnli_cleaned = myxnli_data.drop_duplicates().dropna()
# Symbol stripping is currently switched off for this dataset:
#myxnli_cleaned = myxnli_cleaned.replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print("myXNLI dataset cleaned successfully.")
print(f"Original Records: {myxnli_data.shape[0]}.")
print(f"Remaining records: {myxnli_cleaned.shape[0]}.")
display(myxnli_cleaned.head())

In [None]:
# Clean ALT English: drop duplicate rows and rows with missing values, then strip every
# character that is neither a word character nor whitespace from the sentences.
print("Cleaning ALT English dataset...")
alt_en_cleaned = alt_en_data.drop_duplicates().dropna()
alt_en_cleaned["English_Sentence"] = alt_en_cleaned["English_Sentence"].replace(
    to_replace=r'[^\w\s]', value='', regex=True
)
print("ALT English dataset cleaned successfully.")
print(f"Original records: {alt_en_data.shape[0]}.")
print(f"Remaining records: {alt_en_cleaned.shape[0]}.")
display(alt_en_cleaned.head())

In [None]:
# Clean ALT Burmese: drop duplicate rows, then rows with missing values.
print("Cleaning ALT Burmese dataset...")
alt_my_cleaned = alt_my_data.drop_duplicates().dropna()
# Symbol stripping is currently switched off for the Burmese sentences:
#alt_my_cleaned["Burmese_Sentence"] = alt_my_cleaned["Burmese_Sentence"].replace(r'[^\w\s]', '', regex=True)  # Remove non-standard characters
print("ALT Burmese dataset cleaned successfully.")
print(f"Original records: {alt_my_data.shape[0]}")
print(f"Remaining records: {alt_my_cleaned.shape[0]}")
display(alt_my_cleaned.head())

In [None]:
# Re-join the cleaned English and Burmese ALT frames on their shared ID.
print("Cleaning combined ALT dataset...")
alt_combined_cleaned = alt_en_cleaned.merge(alt_my_cleaned, on="ID")
print("Combined ALT dataset cleaned successfully.")
print(f"Original records: {alt_combined.shape[0]}")
print(f"Remaining records: {alt_combined_cleaned.shape[0]}")
display(alt_combined_cleaned.head())

#### 3. Data Normalization
This step normalizes the text data to ensure consistency across datasets. The normalization process includes:
1. Applying Unicode normalization to handle encoding inconsistencies.
2. Standardizing text formatting by converting all text to lowercase and standardizing punctuation.
3. Applying Unicode (NFKC) normalization to the Burmese text so that diacritical marks and stacked consonants are encoded consistently.

In [11]:
# Function to normalize text
# Typographic quotes mapped to their ASCII equivalents (both opening and closing forms).
_QUOTE_TABLE = str.maketrans({
    "\u201c": '"',  # left double quotation mark
    "\u201d": '"',  # right double quotation mark
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
})


def normalize_text(text):
    """Return ``text`` NFKC-normalized, lowercased, and with curly quotes made ASCII.

    Args:
        text: A string, or a null value (``None``/NaN).

    Returns:
        The normalized string; null values are returned unchanged.
    """
    if pd.isnull(text):
        return text  # Skip null values
    normalized_text = unicodedata.normalize('NFKC', text).lower()
    # One pass over the string handles all four quote characters.
    return normalized_text.translate(_QUOTE_TABLE)

In [12]:
# Burmese normalization: currently Unicode NFKC only.
def normalize_burmese(text):
    """Return ``text`` in NFKC form; null values are passed through untouched.

    Burmese-specific rules can be added after the NFKC step if needed.
    """
    return text if pd.isnull(text) else unicodedata.normalize('NFKC', text)

In [None]:
# Normalize a copy of the cleaned myXNLI frame, column by column.
print("Normalizing myXNLI dataset...")
myxnli_normalized = myxnli_cleaned.copy()

# English columns get the general normalizer, Burmese columns the Burmese one.
for column, normalizer in (
    ("sentence1_en", normalize_text),
    ("sentence2_en", normalize_text),
    ("sentence1_my", normalize_burmese),
    ("sentence2_my", normalize_burmese),
):
    myxnli_normalized[column] = myxnli_normalized[column].apply(normalizer)

print("myXNLI dataset normalized successfully.")
display(myxnli_normalized.head())

In [None]:
# Normalize the English sentences on a copy of the cleaned ALT English frame.
print("Normalizing ALT English dataset...")
alt_en_normalized = alt_en_cleaned.copy()
alt_en_normalized["English_Sentence"] = (
    alt_en_normalized["English_Sentence"].apply(normalize_text)
)
print("ALT English dataset normalized successfully.")
display(alt_en_normalized.head())

In [None]:
# Normalize the Burmese sentences on a copy of the cleaned ALT Burmese frame.
print("Normalizing ALT Burmese dataset...")
alt_my_normalized = alt_my_cleaned.copy()
alt_my_normalized["Burmese_Sentence"] = (
    alt_my_normalized["Burmese_Sentence"].apply(normalize_burmese)
)
print("ALT Burmese dataset normalized successfully.")
display(alt_my_normalized.head())

In [None]:
# Normalize both language columns on a copy of the cleaned, merged ALT frame.
print("Normalizing combined ALT dataset...")
alt_combined_normalized = alt_combined_cleaned.copy()
for column, normalizer in (
    ("English_Sentence", normalize_text),
    ("Burmese_Sentence", normalize_burmese),
):
    alt_combined_normalized[column] = alt_combined_normalized[column].apply(normalizer)
print("Combined ALT dataset normalized successfully.")
display(alt_combined_normalized.head())

#### 4. Sentence Segmentation
This step segments text into subword units using SentencePiece Tokenization (SPT). 
The process includes:
1. Training a SentencePiece model using the English and Burmese text from the `myXNLI` dataset and the combined ALT dataset.
2. Applying the trained model to segment sentences in both datasets.
3. Validating the segmentation results with manual or automated benchmarks.

In [17]:
# File names used by the SentencePiece steps below.
sp_model_prefix = "sentencepiece_model"
sp_train_input = "combined_texts.txt"  # scratch file holding the text used for training
sp_model_path = sp_model_prefix + ".model"

In [None]:
# Write every normalized sentence (myXNLI first, then combined ALT) to the training file,
# one sentence per line.
print("Preparing data for SentencePiece training...")
with open(sp_train_input, "w", encoding="utf-8") as f:
    for frame, columns in (
        (myxnli_normalized, ("sentence1_en", "sentence2_en", "sentence1_my", "sentence2_my")),
        (alt_combined_normalized, ("English_Sentence", "Burmese_Sentence")),
    ):
        for column in columns:
            for text in frame[column].tolist():
                if pd.isnull(text):
                    continue  # NaN values are not written
                f.write(f"{text}\n")

print(f"Data prepared in {sp_train_input}.")


In [None]:
# Fit a unigram SentencePiece model on the combined text file.
print("Training SentencePiece model...")
spm.SentencePieceTrainer.train(
    model_type="unigram",  # unigram language model
    vocab_size=8000,
    character_coverage=0.9995,
    input=sp_train_input,
    model_prefix=sp_model_prefix,
)
print(f"SentencePiece model trained and saved as {sp_model_path}.")

In [None]:
# Load the trained SentencePiece model from sp_model_path into the processor `sp`,
# which apply_sentencepiece uses to encode text into pieces.
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)
print(f"SentencePiece model loaded from {sp_model_path}.")

In [21]:
# SentencePiece tokenization of one DataFrame column
def apply_sentencepiece(data, column_name):
    """Return ``data[column_name]`` with each value encoded into space-joined pieces.

    Null values are returned unchanged. Uses the module-level processor ``sp``.
    """
    def _to_pieces(value):
        if pd.isnull(value):
            return value
        return " ".join(sp.encode_as_pieces(value))

    return data[column_name].apply(_to_pieces)

In [None]:
# Tokenize all four sentence columns on a copy of the normalized myXNLI frame.
print("Applying SentencePiece tokenization to myXNLI dataset...")
myxnli_segmented = myxnli_normalized.copy()
for column in ("sentence1_en", "sentence2_en", "sentence1_my", "sentence2_my"):
    myxnli_segmented[column] = apply_sentencepiece(myxnli_segmented, column)
print("SentencePiece tokenization applied to myXNLI dataset successfully.")
display(myxnli_segmented.head())

In [None]:
# Tokenize both language columns on a copy of the normalized combined ALT frame.
print("Applying SentencePiece tokenization to combined ALT dataset...")
alt_combined_segmented = alt_combined_normalized.copy()
for column in ("English_Sentence", "Burmese_Sentence"):
    alt_combined_segmented[column] = apply_sentencepiece(alt_combined_segmented, column)
print("SentencePiece tokenization applied to combined ALT dataset successfully.")
display(alt_combined_segmented.head())

#### 5. Morphological Processing
This step involves advanced processing to capture morphological nuances in the text. The operations include:
1. Segmenting words into morphemes, handling prefixes, suffixes, and compound words.
2. Normalizing compounded forms while preserving semantic meanings.
3. Incorporating loanwords for better representation in the text data.

In [28]:
# Function to segment words into morphemes
# The word-boundary anchor sits outside the group so it applies to every prefix
# alternative, not only the first one.
_PREFIX_PATTERN = re.compile(r'\b(pre|un|re|in|dis|mis|non)(\w+)')
_SUFFIX_PATTERN = re.compile(
    r'(\w+)(ing|ly|ed|er|ion|able|ible|ment|ness|ship|ous|ive|ish|ize)\b'
)


def segment_morphemes(text):
    """Insert hyphens after common English prefixes and before common English suffixes.

    This is a purely pattern-based demonstration: a prefix is split only at the start
    of a word and a suffix only at the end of a word. No Burmese rules are applied.

    Args:
        text: A string, or a null value (``None``/NaN).

    Returns:
        The hyphen-segmented string; null values are returned unchanged.
    """
    if pd.isnull(text):
        return text  # Skip null values
    segmented_text = _PREFIX_PATTERN.sub(r'\1-\2', text)  # English prefixes
    segmented_text = _SUFFIX_PATTERN.sub(r'\1-\2', segmented_text)  # English suffixes
    # Add custom Burmese rules here for morpheme segmentation
    return segmented_text

In [24]:
# Compound normalization: turn hyphen-joined word pairs into space-separated words.
def normalize_compounds(text):
    """Replace the hyphen in each ``word-word`` match with a space.

    Null values are returned unchanged. Matches are non-overlapping, so in a chain
    such as ``a-b-c`` only the first hyphen is replaced.
    """
    if pd.isnull(text):
        return text
    hyphenated_pair = r'(\w+)-(\w+)'
    return re.sub(hyphenated_pair, r'\1 \2', text)

In [25]:
# Function to incorporate loanwords
def incorporate_loanwords(text, loanword_dict):
    """Replace whole-word occurrences of each dictionary key with its mapped value.

    Keys are matched literally (regex metacharacters in a key have no special meaning)
    and replacements are inserted literally. A match must not be directly preceded or
    followed by a word character. Lookarounds are used instead of ``\\b`` because
    ``\\b`` needs a word character on one side, so it never matches after a key that
    ends in a combining mark (not a ``\\w`` character) when whitespace or the end of
    the string follows.

    Args:
        text: A string, or a null value (``None``/NaN).
        loanword_dict: Mapping of the word to find to the text that replaces it.

    Returns:
        The text with replacements applied; null values are returned unchanged.
    """
    if pd.isnull(text):
        return text
    for loanword, replacement in loanword_dict.items():
        pattern = rf'(?<!\w){re.escape(loanword)}(?!\w)'
        # A callable replacement keeps backslashes in the value from being interpreted.
        text = re.sub(pattern, lambda _match, _value=replacement: _value, text)
    return text

In [26]:
# Sample loanword dictionary for Burmese.
# Each key is the text searched for in a sentence and each value is the text that
# replaces it when the dictionary is passed to incorporate_loanwords.
loanword_dict = {
    "ဘဏ်": "bank",  # Example: Burmese word for 'bank'
    "အင်တာနက်": "internet",  # Example: Burmese word for 'internet'
}

In [None]:
# Morphological processing of myXNLI: morpheme segmentation and compound normalization
# on every sentence column, then loanword replacement on the Burmese columns.
print("Processing myXNLI dataset...")
myxnli_processed = myxnli_segmented.copy()

burmese_columns = ("sentence1_my", "sentence2_my")
sentence_columns = ("sentence1_en", "sentence2_en") + burmese_columns

# Each step is applied to all columns before the next step starts.
for step in (segment_morphemes, normalize_compounds):
    for column in sentence_columns:
        myxnli_processed[column] = myxnli_processed[column].apply(step)

for column in burmese_columns:
    myxnli_processed[column] = myxnli_processed[column].apply(
        lambda x: incorporate_loanwords(x, loanword_dict)
    )

print("Morphological processing applied to myXNLI dataset successfully.")
display(myxnli_processed.head())

In [None]:
# Morphological processing of the combined ALT frame: morpheme segmentation and compound
# normalization on both columns, then loanword replacement on the Burmese column.
print("Processing combined ALT dataset...")
alt_combined_processed = alt_combined_segmented.copy()

# Each step is applied to both columns before the next step starts.
for step in (segment_morphemes, normalize_compounds):
    for column in ("English_Sentence", "Burmese_Sentence"):
        alt_combined_processed[column] = alt_combined_processed[column].apply(step)

alt_combined_processed["Burmese_Sentence"] = alt_combined_processed["Burmese_Sentence"].apply(
    lambda x: incorporate_loanwords(x, loanword_dict)
)

print("Morphological processing applied to combined ALT dataset successfully.")
display(alt_combined_processed.head())

#### 6. Data Augmentation
This step enhances the dataset by generating additional data using the following methods:
1. **Back-Translation**:
    - Translate Burmese sentences to English and back to Burmese using `facebook/m2m100_418M` and `facebook/mbart-large-50`, producing diverse paraphrases that preserve the original meaning. This is applied to both the `myXNLI` dataset and the combined ALT dataset.
2. **Pseudo-Parallel Corpus Creation**:
    - Use semantic similarity alignment to identify and align semantically similar sentences from monolingual data to generate pseudo-parallel corpora for the `combined ALT dataset` only.

In [31]:
# Build the empty progress log used during back-translation
def create_temp_df():
    """Return an empty DataFrame with the back-translation log columns."""
    log_columns = ["isNull", "original", "translated", "back_translated"]
    return pd.DataFrame(columns=log_columns)

In [32]:
# Append one row to a log frame without modifying the input
def add_row_to_temp_df(df, row):
    """Return a new DataFrame made of ``df`` followed by ``row`` (a dict), reindexed from 0."""
    single_row = pd.DataFrame([row])
    return pd.concat([df, single_row], ignore_index=True)

In [33]:
# Persist the log frame and show its newest row
def save_display_temp_df(temp_df, tmp_df_name):
    """Write ``temp_df`` to ``<tmp_df_name>.csv`` (no index) and display its last row."""
    csv_path = f"{tmp_df_name}.csv"
    temp_df.to_csv(csv_path, index=False)
    latest_row = temp_df.tail(1)
    display(latest_row)

##### Back-Translation (facebook/m2m100_418M)

In [None]:
# Load the M2M100 tokenizer and model, then move the model to the selected device.
m2m_model_name = "facebook/m2m100_418M"
m2m_translation_tokenizer = M2M100Tokenizer.from_pretrained(m2m_model_name)
m2m_translation_model = M2M100ForConditionalGeneration.from_pretrained(m2m_model_name)
m2m_translation_model = m2m_translation_model.to(device)

In [36]:
# Function for back-translation using M2M100
def _m2m_translate(text, src_lang, tgt_lang):
    """Translate one string from ``src_lang`` to ``tgt_lang`` with the loaded M2M100 model."""
    m2m_translation_tokenizer.src_lang = src_lang
    encoded = m2m_translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    # M2M100 needs the target language id forced as the first generated token;
    # without it the output language is not selected.
    generated = m2m_translation_model.generate(
        **encoded,
        forced_bos_token_id=m2m_translation_tokenizer.get_lang_id(tgt_lang),
    )
    return m2m_translation_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]


def m2m_back_translate(text, src_lang, tgt_lang, df, df_name):
    """Back-translate ``text`` (src -> tgt -> src) and log the result in ``df``.

    Args:
        text: Sentence to back-translate, or a null value.
        src_lang: M2M100 language code of ``text`` (e.g. ``"my"``).
        tgt_lang: M2M100 language code of the pivot language (e.g. ``"en"``).
        df: Log DataFrame with columns isNull/original/translated/back_translated;
            one row is appended in place per call.
        df_name: Base name of the CSV file the log is saved to after each call.

    Returns:
        The back-translated string, or ``text`` unchanged when it is null.
    """
    # Keep only the latest progress row in the notebook output (same as the mBART variant).
    clear_output(wait=True)

    if pd.isnull(text):
        # DataFrame.append no longer exists in pandas 2 and never modified the frame in
        # place; enlarge through .loc exactly like the non-null path below.
        df.loc[len(df)] = {"isNull": True, "original": None, "translated": None, "back_translated": None}
        save_display_temp_df(df, df_name)
        return text  # Skip null values

    translated_text = _m2m_translate(text, src_lang, tgt_lang)
    back_translated_text = _m2m_translate(translated_text, tgt_lang, src_lang)

    new_row = {"isNull": False, "original": text, "translated": translated_text, "back_translated": back_translated_text}
    df.loc[len(df)] = new_row

    save_display_temp_df(df, df_name)

    return back_translated_text

In [34]:
# Work on a copy of the processed myXNLI frame so the M2M100 back-translation
# results do not overwrite myxnli_processed.
myxnli_m2m_back_translated = myxnli_processed.copy()

In [None]:
# Back-translate myXNLI Burmese sentence 1 with M2M100, logging progress to its own frame.
myxnli_m2m_back_translated_temp_df1 = create_temp_df()


def _m2m_sentence1(sentence):
    return m2m_back_translate(
        sentence,
        src_lang="my",
        tgt_lang="en",
        df=myxnli_m2m_back_translated_temp_df1,
        df_name='myxnli_m2m_back_translated_temp_df1',
    )


myxnli_m2m_back_translated["sentence1_my"] = myxnli_m2m_back_translated["sentence1_my"].apply(_m2m_sentence1)

In [None]:
# Back-translate myXNLI Burmese sentence 2 with M2M100, logging progress to its own frame.
myxnli_m2m_back_translated_temp_df2 = create_temp_df()


def _m2m_sentence2(sentence):
    return m2m_back_translate(
        sentence,
        src_lang="my",
        tgt_lang="en",
        df=myxnli_m2m_back_translated_temp_df2,
        df_name='myxnli_m2m_back_translated_temp_df2',
    )


myxnli_m2m_back_translated["sentence2_my"] = myxnli_m2m_back_translated["sentence2_my"].apply(_m2m_sentence2)

In [None]:
# Preview the M2M100 back-translated myXNLI frame and write it to CSV (index omitted).
print("Back-translation applied to myXNLI dataset with m2m100.")
myxnli_m2m_preview = myxnli_m2m_back_translated.head()
display(myxnli_m2m_preview)
myxnli_m2m_back_translated.to_csv('myxnli_m2m_back_translated.csv', index=False)

In [None]:
# Apply back-translation to combined ALT dataset
print("Applying back-translation to combined ALT dataset with m2m100...")
# m2m_back_translate requires a log frame and a CSV base name; create them here.
alt_m2m_back_translated_temp_df = create_temp_df()
alt_m2m_back_translated = alt_combined_processed.copy()
alt_m2m_back_translated["Burmese_Sentence"] = alt_m2m_back_translated["Burmese_Sentence"].apply(
    # Null sentences are passed through without calling the translator.
    lambda x: m2m_back_translate(
        x,
        src_lang="my",
        tgt_lang="en",
        df=alt_m2m_back_translated_temp_df,
        df_name="alt_m2m_back_translated_temp_df",
    ) if pd.notnull(x) else x
)
print("Back-translation applied to combined ALT dataset with m2m100.")
display(alt_m2m_back_translated.head())

In [None]:
# Save the M2M100 back-translated ALT frame to CSV without the index column.
alt_m2m_back_translated.to_csv('alt_m2m_back_translated.csv', index=False)

##### Back-Translation (facebook/mbart-large-50)

In [33]:
# Load mBART-50 model and tokenizer
# "facebook/mbart-large-50" is the pretrained checkpoint intended for fine-tuning; the
# many-to-many checkpoint below is the one fine-tuned for multilingual translation.
mbart_model_name = "facebook/mbart-large-50-many-to-many-mmt"
mbart_translation_model = MBartForConditionalGeneration.from_pretrained(mbart_model_name).to(device)
mbart_translation_tokenizer = MBart50TokenizerFast.from_pretrained(mbart_model_name)

In [42]:
# Function for back-translation using mBART-50
def _mbart_translate(text, src_lang, tgt_lang):
    """Translate one string from ``src_lang`` to ``tgt_lang`` with the loaded mBART-50 model."""
    mbart_translation_tokenizer.src_lang = src_lang
    encoded = mbart_translation_tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    # The target language code is forced as the first generated token.
    generated = mbart_translation_model.generate(
        **encoded,
        forced_bos_token_id=mbart_translation_tokenizer.lang_code_to_id[tgt_lang],
    )
    return mbart_translation_tokenizer.batch_decode(generated, skip_special_tokens=True)[0]


def mbart_back_translate(text, src_lang, tgt_lang, df, df_name):
    """Back-translate ``text`` (src -> tgt -> src) and log the result in ``df``.

    Args:
        text: Sentence to back-translate, or a null value.
        src_lang: mBART-50 language code of ``text`` (e.g. ``"my_MM"``).
        tgt_lang: mBART-50 language code of the pivot language (e.g. ``"en_XX"``).
        df: Log DataFrame with columns isNull/original/translated/back_translated;
            one row is appended in place per call.
        df_name: Base name of the CSV file the log is saved to after each call.

    Returns:
        The back-translated string, or ``text`` unchanged when it is null.
    """
    # Keep only the latest progress row in the notebook output.
    clear_output(wait=True)

    if pd.isnull(text):
        # DataFrame.append no longer exists in pandas 2 and never modified the frame in
        # place; enlarge through .loc exactly like the non-null path below.
        df.loc[len(df)] = {"isNull": True, "original": None, "translated": None, "back_translated": None}
        save_display_temp_df(df, df_name)
        return text  # Skip null values

    translated_text = _mbart_translate(text, src_lang, tgt_lang)
    back_translated_text = _mbart_translate(translated_text, tgt_lang, src_lang)

    new_row = {"isNull": False, "original": text, "translated": translated_text, "back_translated": back_translated_text}
    df.loc[len(df)] = new_row

    save_display_temp_df(df, df_name)

    return back_translated_text

In [35]:
# Work on a copy of the processed myXNLI frame so the mBART-50 back-translation
# results do not overwrite myxnli_processed.
myxnli_mbart_back_translated = myxnli_processed.copy()

In [None]:
# Back-translate myXNLI Burmese sentence 1 with mBART-50, logging progress to its own frame.
myxnli_mbart_back_translated_temp_df1 = create_temp_df()


def _mbart_sentence1(sentence):
    return mbart_back_translate(
        sentence,
        src_lang="my_MM",
        tgt_lang="en_XX",
        df=myxnli_mbart_back_translated_temp_df1,
        df_name="myxnli_mbart_back_translated_temp_df1",
    )


myxnli_mbart_back_translated["sentence1_my"] = myxnli_mbart_back_translated["sentence1_my"].apply(_mbart_sentence1)

In [None]:
# Back-translate myXNLI Burmese sentence 2 with mBART-50, logging progress to its own frame.
myxnli_mbart_back_translated_temp_df2 = create_temp_df()


def _mbart_sentence2(sentence):
    return mbart_back_translate(
        sentence,
        src_lang="my_MM",
        tgt_lang="en_XX",
        df=myxnli_mbart_back_translated_temp_df2,
        df_name="myxnli_mbart_back_translated_temp_df2",
    )


myxnli_mbart_back_translated["sentence2_my"] = myxnli_mbart_back_translated["sentence2_my"].apply(_mbart_sentence2)

In [None]:
# Preview the mBART-50 back-translated myXNLI frame and write it to CSV (index omitted).
print("Back-translation applied to myXNLI dataset with mBART-50.")
myxnli_mbart_preview = myxnli_mbart_back_translated.head()
display(myxnli_mbart_preview)
myxnli_mbart_back_translated.to_csv('myxnli_mbart_back_translated.csv', index=False)

In [None]:
# Back-translate the Burmese column of the combined ALT frame with mBART-50, then
# preview and save the result.
alt_mbart_back_translated_temp_df = create_temp_df()
alt_mbart_back_translated = alt_combined_processed.copy()


def _mbart_alt_sentence(sentence):
    return mbart_back_translate(
        sentence,
        src_lang="my_MM",
        tgt_lang="en_XX",
        df=alt_mbart_back_translated_temp_df,
        df_name="alt_mbart_back_translated_temp_df",
    )


alt_mbart_back_translated["Burmese_Sentence"] = alt_mbart_back_translated["Burmese_Sentence"].apply(_mbart_alt_sentence)
print("Back-translation applied to combined ALT dataset with mBART-50.")
display(alt_mbart_back_translated.head())
alt_mbart_back_translated.to_csv('alt_mbart_back_translated.csv', index=False)

##### Pseudo-Parallel Corpus Creation

In [None]:
# Load the sentence-embedding model used for similarity alignment and move it to the device.
# NOTE(review): confirm this checkpoint gives meaningful embeddings for Burmese text; if it
# is English-focused, English<->Burmese similarity scores may not be reliable.
similarity_model_name = "all-MiniLM-L6-v2"
similarity_model = SentenceTransformer(similarity_model_name)
similarity_model = similarity_model.to(device)

In [None]:
# Function to create pseudo-parallel corpus
def create_pseudo_parallel(data_en, data_my, similarity_model, top_k=1):
    """Pair each English sentence with its most similar Burmese sentences.

    Both lists are embedded with ``similarity_model`` and compared by cosine similarity.

    Args:
        data_en: List of English sentences.
        data_my: List of Burmese sentences.
        similarity_model: Model exposing ``encode(..., convert_to_tensor=True, device=...)``.
        top_k: Number of best Burmese matches kept per English sentence. It is capped
            at ``len(data_my)`` so a small candidate list does not make ``topk`` fail.

    Returns:
        A list of ``(english_sentence, burmese_sentence, similarity_score)`` tuples,
        grouped by English sentence and ordered from best to worst match. Empty when
        either input list is empty.
    """
    if len(data_en) == 0 or len(data_my) == 0:
        return []

    embeddings_en = similarity_model.encode(data_en, convert_to_tensor=True, device=device)
    embeddings_my = similarity_model.encode(data_my, convert_to_tensor=True, device=device)
    # One row per English sentence, one column per Burmese sentence.
    similarity_scores = util.pytorch_cos_sim(embeddings_en, embeddings_my)

    k = min(top_k, len(data_my))
    # A single batched top-k over the Burmese axis replaces one call per row.
    top_scores, top_indices = similarity_scores.topk(k=k, dim=1)

    pseudo_parallel = []
    for idx_en, (row_scores, row_indices) in enumerate(zip(top_scores.tolist(), top_indices.tolist())):
        for score, idx_my in zip(row_scores, row_indices):
            pseudo_parallel.append((data_en[idx_en], data_my[idx_my], score))

    return pseudo_parallel

In [None]:
# Build the pseudo-parallel corpus from the non-null sentences of the combined ALT frame.
print("Creating pseudo-parallel corpus from combined ALT dataset...")
alt_combined_en = alt_combined_processed["English_Sentence"].dropna().tolist()
alt_combined_my = alt_combined_processed["Burmese_Sentence"].dropna().tolist()
pseudo_parallel_data = create_pseudo_parallel(alt_combined_en, alt_combined_my, similarity_model)

# One row per (English sentence, matched Burmese sentence, score) tuple.
pseudo_parallel_df = pd.DataFrame(
    pseudo_parallel_data,
    columns=["English_Sentence", "Burmese_Sentence", "Similarity_Score"],
)
print("Pseudo-parallel corpus created successfully.")
display(pseudo_parallel_df.head())

In [None]:
# Save the pseudo-parallel corpus to CSV without the index column.
pseudo_parallel_df.to_csv('pseudo_parallel_df.csv', index=False)