# Importing all the important libraries

In [None]:
import tensorflow as tf
import numpy as np
from tqdm import tqdm
from utils.dataframe import (
    save_model_variants_chunk_hf,
    save_model_variants_hf, load_model_variants_hf
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device
from sklearn.decomposition import PCA

# Configure Settings

In [2]:
# Register tqdm's progress-bar hooks (e.g. `progress_apply`) on pandas objects.
# NOTE(review): no visible cell calls a `progress_*` method — confirm this is still needed.
tqdm.pandas()

# Common

In [3]:
# Pick the compute device through the project helper `utils.gpu.get_device`.
# In the recorded run below it listed one CPU and one GPU (Metal) and reported "mps".
# NOTE(review): `device` is not referenced by any other visible cell — confirm it is still needed.
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT and XLM-R on the Burmese dataset after adding contextual embeddings.
- Train models again using combined embeddings.

## Data Preprocessing

### Split Train Data

In [None]:
# # split train data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_train"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", 20)

In [None]:
# # split test data
# for model_name in ["mbert", "xlm-r"]:
#     for spt_name in ["bpe", "unigram"]:
#         name = f"{model_name}_{spt_name}_test"
#         train_dataset = load_model_variants_hf(name)

#         # split and save
#         save_model_variants_hf(train_dataset, f"embedded_{name}", 5)

### Generate Embeddings

In [4]:
# LASER sentence encoder (from `laserembeddings`), constructed with default arguments.
# `generate_embeddings` uses it through `laser.embed_sentences(..., lang="my")`.
laser = Laser()

In [5]:
# mUSE (Multilingual Universal Sentence Encoder), "large" variant, version 3, loaded from TF Hub.
# `generate_embeddings` calls the loaded object directly on a tensor of sentences.
# NOTE(review): the top-level `import tensorflow_text as text` looks unused, but it is
# presumably required to register the custom ops this model relies on — confirm before removing it.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-22 11:34:12.401937: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-22 11:34:12.401966: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-22 11:34:12.401970: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1740191652.401993 7153676 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1740191652.402016 7153676 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-22 11:34:14.554831: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [6]:
# FastText for Morphology-Aware Training
# Step 1: fetch the pretrained Burmese ("my") vectors. `if_exists='ignore'` is presumably
# there to skip the download when the file is already present — TODO confirm against fastText docs.
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
# Step 2: load the model from a hard-coded path relative to the working directory; the
# filename has to match whatever the download step produced.
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [7]:
def generate_embeddings(batch):
    """
    Build one combined sentence-embedding matrix for a batch of source texts.

    Every entry of ``batch["source"]`` is first normalised to a single-line
    string: ``None`` becomes ``""``; any other value is passed through
    ``str()``, has newlines replaced by spaces and is stripped. The cleaned
    sentences are then encoded with mUSE, FastText and LASER, and the three
    results are joined column-wise in the order mUSE, LASER, FastText.

    Args:
        batch: Mapping with a ``"source"`` entry holding the texts of the
            batch (this function is passed to ``.map(..., batched=True)``).

    Returns:
        dict: ``{"contextual_embeddings": combined}`` where ``combined`` is
        the horizontally stacked array, expected to hold one row per text.

    Note:
        Nothing is caught or logged here; an exception raised by any encoder
        propagates unchanged to the caller.
    """

    def _to_single_line(value):
        # None -> ""; otherwise stringify, flatten newlines, trim the edges
        if value is None:
            return ""
        return str(value).replace("\n", " ").strip()

    sentences = list(map(_to_single_line, batch["source"]))

    # mUSE: feed a tensor of sentences, bring the result back as a NumPy array
    # (the original author noted the shape as (batch_size, 512))
    muse_vectors = muse(tf.convert_to_tensor(sentences)).numpy()

    # FastText: one sentence vector per cleaned sentence
    fasttext_vectors = np.array(
        list(map(fasttext_model.get_sentence_vector, sentences))
    )

    # LASER: the whole batch in a single call, using the Burmese language code
    laser_vectors = laser.embed_sentences(sentences, lang="my")

    # Column order of the combined matrix: mUSE | LASER | FastText
    stacked = np.hstack([muse_vectors, laser_vectors, fasttext_vectors])

    return {"contextual_embeddings": stacked}

In [8]:
# Model / tokenizer variant used to build the chunk dataset path in the cells below.
# The commented-out split cells iterate model_name over "mbert", "xlm-r" and
# spt_name over "bpe", "unigram".
model_name = "mbert"
spt_name = "bpe"

#### Train

In [None]:
# Index of the dataset chunk that the following cells load, embed and save.
# Change it and re-run those cells to process another chunk.
chunk_num = 0

In [None]:
# Path prefix of the chunked *train* split for the selected model / tokenizer.
# This cell lives under the "Train" heading but previously pointed at the
# "_test" chunks, so the train section silently processed test data.
chunk_dataset_path = f"embedded_{model_name}_{spt_name}_train"

In [None]:
# Load the selected chunk, report its row count, and preview the first rows.
chunk_dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=chunk_num)
print(f"Chunk Dataset Length: {len(chunk_dataset)}")
# NOTE: to_pandas() converts the whole chunk even though only head() is shown.
display(chunk_dataset.to_pandas().head())

In [None]:
# Run `generate_embeddings` over the chunk in batches of 512 rows. The function returns a
# "contextual_embeddings" entry per batch, which is presumably added as a new column
# (Hugging Face `Dataset.map` semantics — confirm what `load_model_variants_hf` returns).
# NOTE: `chunk_dataset` is rebound here, so re-running this cell recomputes the embeddings
# on the already-processed chunk.
chunk_dataset = chunk_dataset.map(generate_embeddings, batched=True, batch_size=512)

In [None]:
# Preview the first rows again; after the map step they are expected to include
# the "contextual_embeddings" column.
display(chunk_dataset.to_pandas().head())

In [None]:
# Save the embedded chunk under the same path and chunk index it was loaded from.
# NOTE(review): this presumably overwrites the stored chunk in place — confirm in utils.dataframe.
save_model_variants_chunk_hf(chunk_dataset, chunk_dataset_path, chunk_num=chunk_num)

In [None]:
# Sanity check: list the chunk indices that do not yet have embeddings.
#
# The split cells earlier in this notebook write 20 chunks for train data and
# 5 chunks for test data. The previous hard-coded range(19) matched neither and
# skipped the final train chunk, so derive the count from the path being checked.
num_chunks = 5 if chunk_dataset_path.endswith("_test") else 20

not_generated_list = []
for i in range(num_chunks):
    dataset = load_model_variants_hf(chunk_dataset_path, chunk_num=i)

    # Only the schema is needed; `column_names` avoids converting every chunk
    # to a pandas DataFrame just to read its column list.
    if "contextual_embeddings" not in dataset.column_names:
        not_generated_list.append(i)

print(not_generated_list)