# Importing all the important libraries

In [1]:
import tensorflow as tf
import torch.nn as nn
import torch
import numpy as np
from tqdm import tqdm
from utils.dataframe import (
    save_model_variants_chunk_hf,
    load_model_variants_hf, save_model_variants_hf
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device

# Set Settings

In [2]:
# Enable tqdm's pandas integration for this kernel session.
tqdm.pandas()

# Common

In [3]:
# Resolve the compute device through the project helper (its log output is shown below).
# NOTE(review): `device` is not referenced by any other cell visible in this section —
# confirm it is still needed, or move the torch work onto it.
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT and XLM-R on the Burmese dataset after adding the contextual embeddings.
- Train the models again using the combined embeddings.

## Data Preprocessing

### Split Train and Test Data

In [4]:
# Per-model chunk settings handed to the split/save helper, keyed by model name.
# Presumably these are chunk counts rather than rows per chunk — TODO confirm against
# utils.dataframe.save_model_variants_hf.
embedded_train_chunk_sizes = dict.fromkeys(("mbert", "xlm-r"), 20)
embedded_test_chunk_sizes = dict.fromkeys(("mbert", "xlm-r"), 5)

In [5]:
# # split train data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_train"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", embedded_train_chunk_sizes[model_name])

In [6]:
# # split test data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_test"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", embedded_test_chunk_sizes[model_name])

### Generate Embeddings

In [7]:
# LASER sentence encoder; generate_embeddings calls laser.embed_sentences(..., lang="my") on it.
laser = Laser()

In [8]:
# mUSE (Multilingual Universal Sentence Encoder), loaded from TF Hub.
# NOTE(review): `tensorflow_text` is imported at the top of the notebook but never called;
# presumably that import registers the custom ops this model needs, so do not
# remove it as "unused" — confirm against the model's TF Hub page.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-25 14:08:39.035048: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-25 14:08:39.035076: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-25 14:08:39.035081: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1740460119.035102 14023414 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1740460119.035132 14023414 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-25 14:08:41.068787: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [9]:
# FastText for morphology-aware training.
# Fetch the pretrained Burmese ("my") vectors, then load the binary by its relative
# file name, i.e. from the current working directory.
fasttext.util.download_model('my', if_exists='ignore')  # 'ignore': presumably skips the download when the file already exists
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [10]:
# Projection layer (1836 → 768) in bf16.
# 1836 is the width of the concatenated sentence vector built in generate_embeddings:
# 512 (mUSE) + 1024 (LASER) + 300 (FastText cc.my.300).
#
# The layer is not trained in any cell visible here, so its weights are exactly what
# the random initialisation produces. Chunks are embedded one per run; without a fixed
# seed every kernel restart would project into a different 768-d space and embeddings
# from different chunks would not be comparable.
# NOTE: chunks embedded before this seed was introduced need to be regenerated.
PROJECTION_SEED = 42
with torch.random.fork_rng(devices=[]):  # restores the global CPU RNG state afterwards
    torch.manual_seed(PROJECTION_SEED)
    projection_layer = nn.Linear(1836, 768).to(torch.bfloat16)

In [11]:
def generate_embeddings(batch):
    """
    Build one projected sentence embedding per entry of ``batch["target"]``.

    Every target sentence is encoded with mUSE (TensorFlow), LASER and FastText.
    The three vectors are concatenated in that order and passed through the
    module-level ``projection_layer`` in bfloat16. Intended for use with
    Hugging Face ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a ``"target"`` entry holding the batch's sentences.
            ``None`` entries become empty strings; anything else is converted
            with ``str``.

    Returns:
        dict: ``{"contextual_embeddings": rows}`` where ``rows`` holds one list
        of floats per input sentence, in input order.

    Note:
        No error handling or logging happens here: an exception raised by any
        encoder propagates to the caller and aborts the surrounding ``map``.
    """
    # Normalise each entry to a stripped, single-line string. Newlines are
    # replaced because fastText's get_sentence_vector rejects text containing "\n".
    sentences = [
        "" if raw is None else str(raw).replace("\n", " ").strip()
        for raw in batch["target"]
    ]

    # Nothing to encode; also avoids handing an empty, untyped tensor to mUSE.
    if not sentences:
        return {"contextual_embeddings": []}

    # mUSE embeddings — shape (batch_size, 512)
    embeddings_muse = muse(tf.convert_to_tensor(sentences)).numpy()

    # FastText embeddings, one sentence vector per row
    embeddings_fasttext = np.array([
        fasttext_model.get_sentence_vector(sentence) for sentence in sentences
    ])

    # LASER embeddings
    embeddings_laser = laser.embed_sentences(sentences, lang="my")

    # Concatenate along the feature axis; the order must stay mUSE, LASER, FastText
    # so the columns line up with what projection_layer expects.
    combined_embeddings = np.hstack([
        embeddings_muse,
        embeddings_laser,
        embeddings_fasttext
    ])

    # Pure inference: skip autograd bookkeeping instead of detaching afterwards.
    with torch.no_grad():
        tensor_embeddings = torch.tensor(combined_embeddings, dtype=torch.bfloat16)
        # Apply projection (1836 → 768)
        projected_embeddings = projection_layer(tensor_embeddings)

    return {"contextual_embeddings": projected_embeddings.tolist()}

In [12]:
# Model / subword-tokenizer variant processed by the cells below.
# The other names appearing in this notebook are "mbert" (model) and "bpe" (tokenizer).
model_name, spt_name = "xlm-r", "unigram"

#### Train

In [None]:
# Index of the chunk handled by the load / embed / save cells below.
# NOTE(review): the recorded output of the load cell shows `chunk_18`, so the saved
# outputs predate this value — re-run the cells below after changing it.
chunk_num = 19

In [14]:
# Name of the chunked training dataset, passed to both the load and the save helper.
# It is a dataset name rather than a full path: the recorded load output shows the
# helper resolving it under model-variants/data/ with an `_hf_dataset/chunk_N` suffix.
chunk_dataset_path = f"embedded_{model_name}_{spt_name}_train"

In [15]:
# Load the selected chunk, then report its row count and preview the first rows.
chunk_dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=chunk_num)
print(f"Chunk Dataset Length: {len(chunk_dataset)}")
display(chunk_dataset.to_pandas().head())

model-variants/data/embedded_xlm-r_unigram_train_hf_dataset/chunk_18
Chunk Dataset Length: 65103


Unnamed: 0,source,target,input_ids,attention_mask,labels
0,the author of the book,ဆရာဝန် Menard ရဲ့,"[0, 70, 42179, 111, 70, 12877, 2, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 67230, 163701, 1111, 46760, 100556, 2, 1, ..."
1,but i don't know if i should tell you that i h...,ဒါပေမဲ့ ကျွန်မမှာ သေနတ်တစ်လက်ရှိတာကို ပြောသင့်...,"[0, 1284, 17, 2301, 242, 808, 3714, 2174, 17, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 21993, 81109, 61097, 6, 86960, 2196, 7457,..."
2,the nuremberg trials satisfied the public's ne...,Nuremberg စမ်းသပ်မှုများသည် Holocaust ကိုလက်စာ...,"[0, 70, 315, 456, 347, 40302, 110324, 7, 21452...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 1429, 456, 347, 40302, 6, 196008, 5123, 22..."
3,britain helping the starving has never been co...,ဗြိတိန်သည် အစာရေစာ ငတ်မွတ်မှုကို ကူညီပေးသည်ဟု ...,"[0, 14799, 25500, 120592, 70, 6057, 6496, 1556...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 52119, 12831, 9388, 20326, 18013, 6, 8885,..."
4,it's between .75 and 1.75 liters.,.75 လီတာကနေ 1.75 လီတာကြားမှာ ရှိတယ်။,"[0, 442, 242, 91, 17721, 6, 5, 4948, 136, 615,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 6, 5, 4948, 6, 35514, 6, 4279, 6, 99568, 6..."


In [None]:
# Run generate_embeddings over the chunk in batches of 1024 rows; the function returns
# a "contextual_embeddings" entry for every batch.
# The result is bound back to `chunk_dataset`, so the save cell persists the embedded version.
chunk_dataset = chunk_dataset.map(generate_embeddings, batched=True, batch_size=1024)



Map:   0%|          | 0/65103 [00:00<?, ? examples/s]

In [None]:
# Preview the first rows of the chunk after the embedding step.
display(chunk_dataset.to_pandas().head())

In [None]:
# Save the embedded chunk under the same dataset name and chunk index it was loaded with
# (presumably replacing the un-embedded chunk on disk — confirm in utils.dataframe).
save_model_variants_chunk_hf(chunk_dataset, chunk_dataset_path, chunk_num=chunk_num)

In [None]:
# not_generated_list = []
# for i in range(5):
#     dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=i)

#     if "contextual_embeddings" not in dataset.to_pandas().columns:
#         not_generated_list.append(i)

# print(not_generated_list)