# Importing all the important libraries

In [1]:
import tensorflow as tf
import torch.nn as nn
import torch
import numpy as np
from tqdm import tqdm
from utils.dataframe import (
    save_model_variants_chunk_hf,
    load_model_variants_hf, save_model_variants_hf
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device

# Set Settings

In [2]:
# Register tqdm's pandas integration so pandas objects gain progress-bar
# variants of apply (e.g. `progress_apply`).
tqdm.pandas()

# Common

In [3]:
# Select the compute device through the project helper `utils.gpu.get_device`
# (the recorded output of this cell reports "Using device: mps").
# NOTE(review): `device` is not referenced by any later cell visible in this notebook.
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT, XLM-R on Burmese dataset after adding contextual embeddings.
- Train models again using combined embeddings.

## Data Preprocessing

### Split Train Data

In [4]:
# Per-model chunk settings for the embedded train/test datasets, keyed by model name.
# NOTE(review): despite the "sizes" name these look like chunk *counts* passed to
# save_model_variants_hf — confirm against that helper.
embedded_train_chunk_sizes = dict.fromkeys(("mbert", "xlm-r"), 20)
embedded_test_chunk_sizes = dict.fromkeys(("mbert", "xlm-r"), 5)

In [5]:
# # split train data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_train"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", embedded_train_chunk_sizes[model_name])

In [6]:
# # split test data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_test"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", embedded_test_chunk_sizes[model_name])

### Generate Embeddings

In [7]:
# LASER sentence encoder (laserembeddings), built with its default constructor arguments.
# Used in generate_embeddings via laser.embed_sentences(sentences, lang="my").
laser = Laser()

In [8]:
# mUSE (Multilingual Universal Sentence Encoder), "large" variant, version 3, loaded from TF Hub.
# generate_embeddings calls it as muse(tensor) and records a (batch_size, 512) output.
# NOTE(review): `tensorflow_text` (imported at the top as `text`) is otherwise unused in this
# notebook — presumably imported for its side effect of registering ops this model needs;
# keep that import.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-25 16:39:18.138296: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-25 16:39:18.138329: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-25 16:39:18.138334: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1740469158.138354 14332168 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1740469158.138375 14332168 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-25 16:39:20.137072: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [9]:
# FastText for Morphology-Aware Training
# Fetch the Burmese ('my') model unless it is already present, then load the binary by file name.
# NOTE(review): the load path is relative, so this assumes the download lands in the current
# working directory — confirm when running from another directory.
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [10]:
# Initialize projection layer (1836 → 768) in bf16.
# 1836 is the width of the concatenated mUSE + LASER + FastText vector built in
# generate_embeddings.
#
# The layer is never trained or saved in this notebook, so its random weights are the
# projection. Chunks are embedded one at a time (see chunk_num), potentially across kernel
# restarts; without a fixed seed every restart would produce a different projection and the
# chunks would end up in incompatible embedding spaces. Seed the RNG immediately before
# construction so the weights are reproducible across sessions.
PROJECTION_SEED = 42
torch.manual_seed(PROJECTION_SEED)
projection_layer = nn.Linear(1836, 768).to(torch.bfloat16)

In [11]:
def generate_embeddings(batch):
    """
    Build one projected sentence embedding per row of a Hugging Face Datasets batch.

    Every sentence in ``batch["target"]`` is encoded with mUSE (TensorFlow), FastText
    and LASER. The three vectors are concatenated feature-wise and passed through the
    module-level ``projection_layer`` (1836 → 768, bf16).

    Args:
        batch: Mapping with a ``"target"`` column holding the sentences. ``None``
            entries are treated as empty strings; other values are converted with
            ``str``. Intended for ``Dataset.map(..., batched=True)``.

    Returns:
        dict: ``{"contextual_embeddings": rows}`` where ``rows`` is a list with one
        list of floats (the projected vector) per input sentence.

    Note:
        Nothing is caught or logged here: any failure in an encoder or in the
        projection propagates to the caller and aborts the surrounding ``map``.
    """
    # Normalise in a single pass: None -> "", everything else -> str with newlines
    # flattened to spaces and surrounding whitespace removed (FastText's sentence
    # API rejects newline characters). The loop variable is deliberately not called
    # `text`, which is the notebook's alias for the tensorflow_text import.
    sentences = [
        str(raw).replace("\n", " ").strip() if raw is not None else ""
        for raw in batch["target"]
    ]

    # Generate mUSE embeddings
    embeddings_muse = muse(tf.convert_to_tensor(sentences)).numpy()  # Shape: (batch_size, 512)

    # Generate FastText embeddings (one sentence vector per row)
    embeddings_fasttext = np.array([
        fasttext_model.get_sentence_vector(sentence) for sentence in sentences
    ])

    # Generate LASER embeddings
    embeddings_laser = laser.embed_sentences(sentences, lang="my")

    # Concatenate along the feature axis; the order (mUSE, LASER, FastText) must stay
    # fixed because the projection weights are tied to column positions.
    combined_embeddings = np.hstack([
        embeddings_muse,
        embeddings_laser,
        embeddings_fasttext
    ])

    # The projection is used for inference only, so skip autograd bookkeeping
    # instead of building a graph and detaching it afterwards.
    with torch.no_grad():
        tensor_embeddings = torch.tensor(combined_embeddings, dtype=torch.bfloat16)
        projected_embeddings = projection_layer(tensor_embeddings).tolist()

    return {"contextual_embeddings": projected_embeddings}

In [12]:
# Which base model and which SentencePiece tokenizer variant this run processes.
model_name, spt_name = "mbert", "unigram"

#### Train

In [None]:
# Index of the train chunk handled in this run; passed as `chunk_num` to both the
# load and the save helper below.
# NOTE(review): presumably bumped by hand and the following cells re-run for each chunk.
chunk_num = 3

In [14]:
# Dataset name of the embedded train split for the selected model / tokenizer pair,
# e.g. "embedded_mbert_unigram_train". Used for both loading and saving the chunk.
chunk_dataset_path = f"embedded_{model_name}_{spt_name}_train"

In [15]:
# Load only the selected chunk of the embedded train dataset, then report its row
# count and preview the first rows as a pandas frame.
chunk_dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=chunk_num)
print(f"Chunk Dataset Length: {len(chunk_dataset)}")
display(chunk_dataset.to_pandas().head())

model-variants/data/embedded_mbert_unigram_train_hf_dataset/chunk_3
Chunk Dataset Length: 65103


Unnamed: 0,source,target,input_ids,token_type_ids,attention_mask,labels
0,you can find the center of blackpool by follow...,Blackpool ရဲ့ ဗဟိုကို အမှတ်အသားတွေကို လိုက်ပြီ...,"[101, 100, 100, 100, 100, 100, 100, 100, 10514...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 10514, 30668, 100, 100, 39194, 100,..."
1,we must only hope that our kashmir can be reso...,ကျွန်တော်တို့ရဲ့ ကက်ရှ်မီးယားကို ငြိမ်းချမ်းစွ...,"[101, 100, 100, 100, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...","[101, 100, 1518, 89114, 100, 39194, 100, 100, ..."
2,"each time, he sent photos to his friends and a...",အကြိမ်တိုင်း သူ့သူငယ်ချင်းတွေကို ဓာတ်ပုံတွေပို...,"[101, 100, 100, 117, 100, 100, 100, 100, 100, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[101, 100, 1507, 47341, 100, 1507, 111505, 241..."
3,there were no castes.,ဇာတ်မရှိခဲ့ပါ။,"[101, 100, 100, 100, 100, 10196, 119, 102, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[101, 100, 1516, 57586, 111470, 89114, 111489,..."
4,but the oil is gone.,ဒါပေမဲ့ ဆီအသားအိတ်က ပျောက်နေတယ်။,"[101, 100, 100, 100, 100, 100, 119, 102, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","[101, 100, 100, 1524, 69046, 1524, 53474, 2814..."


In [None]:
# Run generate_embeddings over the chunk in batches of 1024 rows; the function returns
# a "contextual_embeddings" entry for each batch.
# Note: this rebinds `chunk_dataset`, so re-running the cell recomputes the embeddings
# on the already-mapped dataset.
chunk_dataset = chunk_dataset.map(generate_embeddings, batched=True, batch_size=1024)



Map:   0%|          | 0/65103 [00:00<?, ? examples/s]

In [None]:
# Preview the first rows after mapping to check the generated embeddings.
display(chunk_dataset.to_pandas().head())

In [None]:
# Persist the processed chunk under the same dataset name and chunk index it was
# loaded from.
# NOTE(review): presumably overwrites the stored chunk in place — confirm in
# save_model_variants_chunk_hf.
save_model_variants_chunk_hf(chunk_dataset, chunk_dataset_path, chunk_num=chunk_num)

In [None]:
# not_generated_list = []
# for i in range(20):
#     dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=i)

#     if "contextual_embeddings" not in dataset.to_pandas().columns:
#         not_generated_list.append(i)

# print(not_generated_list)