# Importing all the important libraries

In [2]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from utils.dataframe import (
    save_model_variants_chunk_hf,
    save_model_variants_hf, load_model_variants_hf
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device
from sklearn.decomposition import PCA

# Settings

In [3]:
# Register tqdm's pandas integration (enables the `progress_apply`-style methods on pandas objects).
tqdm.pandas()

# Common

In [4]:
# Select the compute device through the project helper `utils.gpu.get_device`.
# The recorded output of this cell reports "Using device: mps".
# NOTE(review): `device` is not referenced again in the visible cells — confirm it is still needed.
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT and XLM-R on the Burmese dataset after adding contextual embeddings.
- Train models again using combined embeddings.

## Data Preprocessing

### Split Train Data

In [5]:
# # split train data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_train"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", 20)

In [6]:
# # split test data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_test"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", 5)

### Generate Embeddings

In [7]:
# LASER sentence encoder (from `laserembeddings`), constructed with its default arguments.
# `generate_embeddings` uses it through `laser.embed_sentences`.
laser = Laser()

In [8]:
# mUSE (Multilingual Universal Sentence Encoder), "large" variant version 3, loaded from TF Hub.
# `generate_embeddings` calls this object directly on a tensor of sentences.
# NOTE(review): presumably the `tensorflow_text` import at the top is what registers the ops this
# model needs — confirm before removing that seemingly unused import.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-22 12:06:41.007928: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-22 12:06:41.007955: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-22 12:06:41.007960: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1740193601.007980 7309482 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1740193601.007999 7309482 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-22 12:06:42.973471: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [9]:
# FastText for Morphology-Aware Training
# `download_model` is called with if_exists='ignore' so an already-present model file is reused
# instead of being fetched again; the model is then loaded from 'cc.my.300.bin', a path relative
# to the current working directory.
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [10]:
def generate_embeddings(batch, text_column="source", lang="my"):
    """
    Build one combined sentence embedding per row of a batch.

    Every sentence is embedded with mUSE, LASER and FastText (the module-level
    `muse`, `laser` and `fasttext_model` objects) and the three vectors are
    concatenated feature-wise in the order [mUSE | LASER | FastText].

    Designed to be passed to `Dataset.map(..., batched=True)`. Errors are NOT
    caught or logged here: any exception raised by one of the encoders
    propagates to the caller and aborts the `map` call.

    Args:
        batch: Mapping of column name -> list of values, as supplied by
            `Dataset.map` in batched mode.
        text_column: Name of the column holding the sentences to embed.
            Defaults to "source".
        lang: Language code forwarded to `laser.embed_sentences`.
            Defaults to "my".

    Returns:
        dict with the single key "contextual_embeddings"; its value has one
        row per input sentence. For an empty batch the value is an empty list
        and no encoder is invoked.
    """
    # NOTE(review): the preview rows recorded in this notebook show English text
    # in `source` and Burmese in `target`, while the LASER language code and the
    # FastText model are Burmese — confirm that `source` is the intended column.

    # Normalise every entry to a single-line string: None becomes "", and
    # newlines are replaced by spaces before stripping surrounding whitespace.
    sentences = [
        "" if raw is None else str(raw).replace("\n", " ").strip()
        for raw in batch[text_column]
    ]

    # Nothing to embed: return early rather than feeding empty inputs to the
    # encoders (an empty Python list would not even convert to a string tensor
    # by default, and the FastText array would be 1-D and break the hstack).
    if not sentences:
        return {"contextual_embeddings": []}

    # Generate mUSE embeddings; dtype is pinned so the tensor is always a string tensor.
    batch_tensor = tf.convert_to_tensor(sentences, dtype=tf.string)
    embeddings_muse = muse(batch_tensor).numpy()  # Shape: (batch_size, 512)

    # Generate FastText embeddings, one sentence vector per row.
    embeddings_fasttext = np.array([
        fasttext_model.get_sentence_vector(sentence) for sentence in sentences
    ])

    # Generate LASER embeddings for the whole batch in one call.
    embeddings_laser = laser.embed_sentences(sentences, lang=lang)

    # Concatenate along the feature axis: [mUSE | LASER | FastText].
    combined_embeddings = np.hstack([
        embeddings_muse,
        embeddings_laser,
        embeddings_fasttext
    ])

    return {"contextual_embeddings": combined_embeddings}

In [11]:
# Model / tokenizer variant to process; both values are interpolated into the dataset path
# built in a later cell. The (commented-out) split cells above iterate over
# "mbert"/"xlm-r" and "bpe"/"unigram".
model_name = "mbert"
spt_name = "bpe"

#### Train

In [12]:
# Index of the chunk to embed in this run; passed as `chunk_num` to the load and save helpers.
chunk_num = 4

In [13]:
# Dataset path of the chunked data to embed.
# NOTE(review): this points at the *test* split even though the section heading says "Train" —
# confirm which split is intended before running.
chunk_dataset_path = f"embedded_{model_name}_{spt_name}_test"

In [14]:
# Load the selected chunk, report its length and preview the first rows.
# Note that `to_pandas()` converts the whole chunk before `head()` trims it to a few rows.
chunk_dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=chunk_num)
print(f"Chunk Dataset Length: {len(chunk_dataset)}")
display(chunk_dataset.to_pandas().head())

model-variants/data/embedded_mbert_bpe_test_hf_dataset/chunk_4
Chunk Dataset Length: 65103


Unnamed: 0,source,target,input_ids,token_type_ids,attention_mask,labels
0,some of them are only part of the basic buildi...,တချို့ကို ဖယ်ရှားထားလို့ အခြေခံ အဆောက်အအုံ အပိ...,"[101, 100, 100, 100, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 100, 1508, 35089, 41866, 49816, 100..."
1,"well, i don't think it would be good if we los...",ကောင်းပြီ၊ ကျွန်တော်တို့ ဒါကို လုံးဝ ဆုံးရှုံး...,"[101, 100, 117, 100, 100, 112, 188, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 1558, 100, 100, 100, 100, 1521, 111..."
2,i was born in new mexico in 1995.,၁၉၉၅ မှာ New Mexico မှာမွေးတယ်။,"[101, 100, 100, 100, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[101, 100, 100, 100, 100, 11419, 177, 10812, 1..."
3,cynthia was a pharmacist.,Cynthia သည် ဆေးဆိုင်ဖြစ်သည်။,"[101, 100, 100, 100, 100, 13621, 10298, 119, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 100, 100, 100, 1498, 46974, 22683, 1559,..."
4,it's not good to forget something because the ...,စူးစမ်းရေးမှူးဟာ သေးငယ်တာကြောင့် တစ်ခုခုကို လျ...,"[101, 100, 112, 187, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 1518, 59512, 35108, 100303, 1522, 1..."


In [15]:
# Run `generate_embeddings` over the chunk in batches of 512 rows; the dict it returns adds
# the "contextual_embeddings" column. The result is rebound to `chunk_dataset`.
chunk_dataset = chunk_dataset.map(generate_embeddings, batched=True, batch_size=512)



Map:   0%|          | 0/65103 [00:00<?, ? examples/s]

In [16]:
# Preview the first rows again to confirm the "contextual_embeddings" column is present.
display(chunk_dataset.to_pandas().head())

Unnamed: 0,source,target,input_ids,token_type_ids,attention_mask,labels,contextual_embeddings
0,some of them are only part of the basic buildi...,တချို့ကို ဖယ်ရှားထားလို့ အခြေခံ အဆောက်အအုံ အပိ...,"[101, 100, 100, 100, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 100, 1508, 35089, 41866, 49816, 100...","[0.05255283, -0.04685146, 0.07002552, -0.05051..."
1,"well, i don't think it would be good if we los...",ကောင်းပြီ၊ ကျွန်တော်တို့ ဒါကို လုံးဝ ဆုံးရှုံး...,"[101, 100, 117, 100, 100, 112, 188, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 1558, 100, 100, 100, 100, 1521, 111...","[0.0620354, -0.046208937, 0.07468706, -0.05485..."
2,i was born in new mexico in 1995.,၁၉၉၅ မှာ New Mexico မှာမွေးတယ်။,"[101, 100, 100, 100, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[101, 100, 100, 100, 100, 11419, 177, 10812, 1...","[0.05329533, -0.038748924, 0.07143976, -0.0572..."
3,cynthia was a pharmacist.,Cynthia သည် ဆေးဆိုင်ဖြစ်သည်။,"[101, 100, 100, 100, 100, 13621, 10298, 119, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[101, 100, 100, 100, 1498, 46974, 22683, 1559,...","[0.0554102, -0.04346907, 0.06986986, -0.050670..."
4,it's not good to forget something because the ...,စူးစမ်းရေးမှူးဟာ သေးငယ်တာကြောင့် တစ်ခုခုကို လျ...,"[101, 100, 112, 187, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 1518, 59512, 35108, 100303, 1522, 1...","[0.056105405, -0.04841557, 0.07068223, -0.0525..."


In [17]:
# Persist the embedded chunk using the same path and chunk index it was loaded with.
# NOTE(review): presumably this replaces the un-embedded chunk on disk — verify against
# `save_model_variants_chunk_hf` before re-running on a chunk that is already embedded.
save_model_variants_chunk_hf(chunk_dataset, chunk_dataset_path, chunk_num=chunk_num)

Saving the dataset (0/2 shards):   0%|          | 0/65103 [00:00<?, ? examples/s]

In [None]:
# not_generated_list = []
# for i in range(19):
#     dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=i)

#     if "contextual_embeddings" not in dataset.to_pandas().columns:
#         not_generated_list.append(i)

# print(not_generated_list)

model-variants/data/embedded_mbert_bpe_test_hf_dataset/chunk_0
model-variants/data/embedded_mbert_bpe_test_hf_dataset/chunk_1
model-variants/data/embedded_mbert_bpe_test_hf_dataset/chunk_2
model-variants/data/embedded_mbert_bpe_test_hf_dataset/chunk_3
[]
