# Importing all the important libraries

In [1]:
import tensorflow as tf
import torch.nn as nn
import torch
import numpy as np
from tqdm import tqdm
from utils.dataframe import (
    save_model_variants_chunk_hf,
    load_model_variants_hf, save_model_variants_hf
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device

# Settings

In [2]:
# Enable tqdm's pandas integration (registers the `progress_*` helpers on pandas objects).
# NOTE(review): no `progress_apply` call is visible in this notebook section — confirm this is still needed.
tqdm.pandas()

# Common

In [3]:
# Resolve the compute device through the project helper (the captured output below reports "mps").
# NOTE(review): `device` is not referenced again in the visible cells; the projection
# layer and tensors further down are created without an explicit device — confirm intended.
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT and XLM-R on the Burmese dataset after adding contextual embeddings.
- Train models again using combined embeddings.

## Data Preprocessing

### Split Train Data

In [4]:
# # split train data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_train"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", 20)

In [5]:
# # split test data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_test"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", 5)

### Generate Embeddings

In [6]:
# LASER sentence encoder, constructed with its default arguments.
# Used by generate_embeddings via `laser.embed_sentences(sentences, lang="my")`.
laser = Laser()

In [7]:
# mUSE (Multilingual Universal Sentence Encoder), "large" variant, version 3, loaded from TF Hub.
# The returned object is called directly on a string tensor in generate_embeddings.
# NOTE(review): loading from a URL needs network access on first use — presumably cached afterwards; verify.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-24 21:32:51.739562: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-24 21:32:51.739604: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-24 21:32:51.739611: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1740400371.739632 12534025 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1740400371.739654 12534025 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-24 21:32:53.925074: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [8]:
# FastText for Morphology-Aware Training
# Fetch the Burmese ('my') model; if_exists='ignore' presumably skips the download
# when the file is already present — confirm against the fasttext.util docs.
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
# NOTE(review): relative path — the .bin file is expected in the current working directory.
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [9]:
# Initialize projection layer (1836 → 768) in bf16.
# 1836 is the width of the concatenated sentence vector built in generate_embeddings:
# 512 (mUSE, per the shape comment there) + 300 (cc.my.300 FastText) + 1024 (LASER, the remainder).
#
# The layer is never trained in this notebook, so its random initialisation IS the
# projection. Seed it explicitly: without a fixed seed every kernel restart would
# project chunks with different weights and embeddings from different runs would
# not be comparable.
# NOTE(review): seeding resets torch's global RNG; chunks embedded before this seed
# was introduced used other weights and need to be regenerated.
PROJECTION_SEED = 42
torch.manual_seed(PROJECTION_SEED)
projection_layer = nn.Linear(1836, 768).to(torch.bfloat16)

In [10]:
def generate_embeddings(batch):
    """
    Build one projected contextual embedding per sentence of a batch.

    Written for ``Dataset.map(..., batched=True)``. Every sentence in
    ``batch["target"]`` is embedded with mUSE, LASER and FastText; the three
    vectors are concatenated per sentence and passed through the module-level
    ``projection_layer``.

    Args:
        batch: Mapping with a ``"target"`` entry holding a sequence of
            sentences. ``None`` entries become empty strings; any other value
            is converted with ``str``.

    Returns:
        dict: ``{"contextual_embeddings": rows}`` where ``rows`` contains one
        list of floats per input sentence (an empty list for an empty batch).

    Note:
        Nothing is caught or logged here: an exception raised by any encoder
        propagates to the caller.
    """
    # Normalise in one pass: None -> "", newlines -> spaces, surrounding
    # whitespace trimmed. (FastText's sentence API is line-based, so embedded
    # newlines must not reach it.)
    sentences = [
        str(raw).replace("\n", " ").strip() if raw is not None else ""
        for raw in batch["target"]
    ]

    # Nothing to embed: return early instead of handing the encoders an empty,
    # dtype-less batch.
    if not sentences:
        return {"contextual_embeddings": []}

    # mUSE embeddings — shape (batch_size, 512)
    embeddings_muse = muse(tf.convert_to_tensor(sentences)).numpy()

    # FastText embeddings, one sentence vector per input
    embeddings_fasttext = np.array(
        [fasttext_model.get_sentence_vector(sentence) for sentence in sentences]
    )

    # LASER embeddings
    embeddings_laser = laser.embed_sentences(sentences, lang="my")

    # Concatenate per sentence in a fixed order: mUSE | LASER | FastText
    combined_embeddings = np.hstack(
        [embeddings_muse, embeddings_laser, embeddings_fasttext]
    )

    # Convert to a bf16 tensor to match the projection layer's dtype
    tensor_embeddings = torch.tensor(combined_embeddings, dtype=torch.bfloat16)

    # Apply projection (1836 → 768). Inference only, so skip autograd tracking
    # rather than building a graph and detaching afterwards.
    with torch.no_grad():
        projected_embeddings = projection_layer(tensor_embeddings).tolist()

    return {"contextual_embeddings": projected_embeddings}

In [11]:
# model and spt
model_name = "xlm-r"
spt_name = "bpe"

#### Train

In [12]:
# chunk num
# Index of the chunk handled in this run; passed as `chunk_num` to the load and save helpers below.
chunk_num = 4

In [13]:
# Base name of the chunked dataset to embed.
# NOTE(review): this section is headed "Train" and this comment used to read
# "train path", but the name ends in "_test" (the captured load output also shows
# the test dataset). Confirm which split is intended before running.
chunk_dataset_path = f"embedded_{model_name}_{spt_name}_test"

In [14]:
# Load the selected chunk, then print its length and show the first rows as a sanity check.
chunk_dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=chunk_num)
print(f"Chunk Dataset Length: {len(chunk_dataset)}")
# NOTE(review): to_pandas() converts the whole chunk just to display head().
display(chunk_dataset.to_pandas().head())

model-variants/data/embedded_xlm-r_bpe_test_hf_dataset/chunk_4
Chunk Dataset Length: 65103


Unnamed: 0,source,target,input_ids,attention_mask,labels
0,some of them are only part of the basic buildi...,တချို့ကို ဖယ်ရှားထားလို့ အခြေခံ အဆောက်အအုံ အပိ...,"[0, 3060, 111, 2856, 621, 4734, 2831, 111, 70,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 14725, 191256, 149272, 43871, 39208, 10882..."
1,"well, i don't think it would be good if we los...",ကောင်းပြီ၊ ကျွန်တော်တို့ ဒါကို လုံးဝ ဆုံးရှုံး...,"[0, 5299, 6, 4, 17, 2301, 242, 808, 5351, 442,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 6, 44373, 64043, 1586, 6, 86960, 20409, 25..."
2,i was born in new mexico in 1995.,၁၉၉၅ မှာ New Mexico မှာမွေးတယ်။,"[0, 17, 509, 103122, 23, 3525, 163, 73140, 23,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[0, 54586, 29722, 21963, 81987, 2356, 276, 111..."
3,cynthia was a pharmacist.,Cynthia သည် ဆေးဆိုင်ဖြစ်သည်။,"[0, 18986, 4450, 11, 509, 10, 189258, 39, 1030...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[0, 49715, 4450, 11, 6, 8885, 6, 62520, 6, 743..."
4,it's not good to forget something because the ...,စူးစမ်းရေးမှူးဟာ သေးငယ်တာကြောင့် တစ်ခုခုကို လျ...,"[0, 442, 242, 91, 959, 4127, 47, 90820, 9844, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 16528, 36371, 196008, 6, 9177, 155551, 887..."


In [None]:
# Run generate_embeddings over the chunk in batches of 1024 rows; the function's
# returned "contextual_embeddings" key becomes a new column.
# NOTE: this rebinds chunk_dataset, so the un-embedded chunk is only recoverable by re-running the load cell.
chunk_dataset = chunk_dataset.map(generate_embeddings, batched=True, batch_size=1024)



Map:   0%|          | 0/65103 [00:00<?, ? examples/s]

In [None]:
# Show the first rows again to inspect the result of the embedding step.
display(chunk_dataset.to_pandas().head())

In [None]:
# Persist the embedded chunk through the project helper, using the same base name
# and chunk index it was loaded with.
# NOTE(review): presumably this overwrites the un-embedded chunk on disk — verify in utils.dataframe.
save_model_variants_chunk_hf(chunk_dataset, chunk_dataset_path, chunk_num=chunk_num)

In [None]:
# Report which chunks of the current dataset still lack the "contextual_embeddings" column.
# NOTE(review): 5 is the chunk count passed for the *test* split in the commented-out
# split cells (train uses 20) — adjust NUM_CHUNKS if chunk_dataset_path points at train.
NUM_CHUNKS = 5

not_generated_list = []
for i in range(NUM_CHUNKS):
    dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=i)

    # Read the schema directly; converting every chunk to pandas just to look
    # at its column names materialises all rows for no benefit.
    if "contextual_embeddings" not in dataset.column_names:
        not_generated_list.append(i)

print(not_generated_list)