# Importing all the important libraries

In [None]:
import numpy as np
import torch
from utils.dataframe import (
    convert_to_hf,
    load_model_variants_df,
    save_model_variants_hf
)
from transformers import (
    AutoTokenizer, 
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device

# Common

In [2]:
# Select the compute device through the project helper `utils.gpu.get_device`.
# The recorded cell output reports "Using device: mps".
# NOTE(review): `device` is not referenced by any other cell visible here — confirm it is still needed.
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText to add cross-lingual and morphology-aware sentence embeddings.
- Fine-tune mBERT and XLM-R on the Burmese dataset after adding the contextual embeddings.
- Train the models again using the combined embeddings.

In [None]:
# Resolve the tokenizer saved with a fine-tuned model variant
def get_fine_tuned_tokenizer(model_name, spt_name):
    """Load the tokenizer stored in a model-variant checkpoint directory.

    Args:
        model_name: Model identifier used as the directory prefix.
        spt_name: Subword-tokenizer identifier; it is upper-cased to form the
            directory suffix.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained`` for
        ``model-variants/models/<model_name>_<SPT_NAME>``.
    """
    # Only the tokenizer is read from the checkpoint directory; no model
    # weights are loaded here.
    checkpoint_dir = "model-variants/models/{}_{}".format(model_name, spt_name.upper())
    return AutoTokenizer.from_pretrained(checkpoint_dir)

## Data Preprocessing

In [4]:
# LASER Embeddings
# Instantiate the `laserembeddings` encoder once at notebook scope; it is used
# later by generate_contextual_embeddings via the global name `laser`.
laser = Laser()

In [5]:
# mUSE (Multilingual Universal Sentence Encoder)
# Load the multilingual-large (version 3) model from TF Hub; it is used later
# by generate_contextual_embeddings via the global name `muse`.
# NOTE(review): the `tensorflow_text` import at the top looks unused but is
# presumably required to register the ops this model needs — confirm before removing it.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-15 09:14:38.675295: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-15 09:14:38.675322: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-15 09:14:38.675327: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1739578478.675350 38843022 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1739578478.675374 38843022 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-15 09:14:40.801946: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [6]:
# FastText for Morphology-Aware Training
# Fetch the Burmese ('my') FastText vectors, then load the binary model from
# the working directory. `if_exists='ignore'` is intended to skip the download
# when the file is already present.
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [None]:
# Function to Generate Contextual Embeddings
@torch.no_grad()  # Prevents autograd from storing computation graphs
def generate_contextual_embeddings(sentences):
    """
    Generate one combined sentence embedding per input using LASER, mUSE, and FastText.

    The three encoders do not share an output width (typically 1024 for LASER,
    512 for mUSE and 300 for ``cc.my.300.bin``), so a plain element-wise sum
    cannot broadcast. Each matrix is therefore zero-padded on the right to the
    widest encoder before the three are averaged. When all widths already
    match, the result is identical to a plain mean of the three matrices.

    Args:
        sentences: List of sentence strings (a batch).

    Returns:
        A CPU ``torch.bfloat16`` tensor of shape (batch_size, max_encoder_width).
        The tensor is NOT moved to any accelerator here.
    """
    # FastText's get_sentence_vector rejects text that contains a newline, so
    # flatten newlines for that encoder only.
    fasttext_inputs = [sentence.replace("\n", " ") for sentence in sentences]

    # Each entry is expected to be (batch_size, encoder_width).
    encoder_outputs = [
        np.asarray(laser.embed_sentences(sentences, lang="my")),
        np.asarray(muse(sentences).numpy()),
        np.asarray([fasttext_model.get_sentence_vector(text_line) for text_line in fasttext_inputs]),
    ]

    # Zero-pad the feature axis so every encoder output has the same width.
    target_width = max(matrix.shape[-1] for matrix in encoder_outputs)
    padded_outputs = [
        np.pad(matrix, ((0, 0), (0, target_width - matrix.shape[-1])))
        for matrix in encoder_outputs
    ]

    # Mean over the three encoders.
    combined_embedding = (padded_outputs[0] + padded_outputs[1] + padded_outputs[2]) / 3

    # Convert to a bf16 torch tensor (stays on CPU).
    return torch.tensor(combined_embedding, dtype=torch.bfloat16)

In [None]:
# Tokenize a batch and attach its sentence-level contextual embeddings
def tokenize_with_contextual_embeddings(examples, tokenizer):
    """Tokenize source/target text and attach a per-token copy of each sentence embedding.

    Args:
        examples: Batch mapping with "source" and "target" entries.
        tokenizer: Callable tokenizer; invoked with truncation and
            ``max_length`` padding to 512 tokens, returning PyTorch tensors.

    Returns:
        Dict with "input_ids" and "attention_mask" from the source encoding,
        "labels" (the target input ids), and "contextual_embeds", the sentence
        embedding repeated along the sequence axis to shape
        (batch_size, sequence_length, embedding_width).
    """
    source_texts = examples["source"]

    # One vector per source sentence.
    sentence_vectors = generate_contextual_embeddings(source_texts)

    # Source and target are encoded with the same settings.
    encode_options = dict(
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt",
    )
    source_encoding = tokenizer(source_texts, **encode_options)
    target_encoding = tokenizer(examples["target"], **encode_options)

    source_ids = source_encoding["input_ids"]
    num_rows = source_ids.shape[0]
    seq_len = source_ids.shape[1]
    embed_width = sentence_vectors.shape[-1]

    # Repeat each sentence vector at every token position, then place the
    # result on the same device as the token ids.
    per_token_vectors = sentence_vectors.unsqueeze(1)
    per_token_vectors = per_token_vectors.expand(num_rows, seq_len, embed_width)
    per_token_vectors = per_token_vectors.to(source_ids.device)

    return {
        "input_ids": source_ids,
        "attention_mask": source_encoding["attention_mask"],
        "labels": target_encoding["input_ids"],
        "contextual_embeds": per_token_vectors,
    }

In [None]:
# Function to Prepare Dataset with Contextual Embeddings
def prepare_contextual_embedding_dataset(model_name, spt_name):
    """
    Build and save tokenized train/test datasets enriched with contextual embeddings.

    Loads the "combined" dataset, splits it 80/20 with a fixed seed, converts
    each split to a Hugging Face dataset, tokenizes it with the tokenizer of the
    given model variant, and saves each split as
    ``contextual_embedded_<model_name lower-cased>_<spt_name>_<split>``.

    Args:
        model_name: Model identifier (e.g. "mBERT", "XLM-R").
        spt_name: Subword-tokenizer identifier (e.g. "bpe", "unigram").

    Returns:
        None. The processed splits are written through ``save_model_variants_hf``.
    """
    # get dataset
    dataset = load_model_variants_df("combined")

    # Split into 80% train, 20% test (seeded so the split is reproducible).
    # NOTE(review): dropping by index assumes the index labels are unique — confirm.
    train_df = dataset.sample(frac=0.8, random_state=42)
    test_df = dataset.drop(train_df.index)

    # Convert to Hugging Face Datasets; dict order fixes train-before-test processing.
    splits = {
        "train": convert_to_hf(train_df),
        "test": convert_to_hf(test_df),
    }

    # get tokenizer
    tokenizer = get_fine_tuned_tokenizer(model_name, spt_name)

    for split_name, split_dataset in splits.items():
        # Apply tokenization with contextual embeddings; the description now
        # names the split actually being processed.
        tokenized_split = split_dataset.map(
            lambda batch: tokenize_with_contextual_embeddings(batch, tokenizer),
            batched=True,
            desc=f"Tokenizing {split_name} dataset for {model_name} with {spt_name} (Contextual Embeddings)",
            num_proc=10,
        )

        # Save the processed split
        save_model_variants_hf(
            tokenized_split,
            f"contextual_embedded_{model_name.lower()}_{spt_name}_{split_name}",
        )

### mBERT

In [None]:
# Build and save the contextual-embedding train/test datasets for mBERT with the "bpe" tokenizer variant
prepare_contextual_embedding_dataset("mBERT", "bpe")

In [None]:
# Build and save the contextual-embedding train/test datasets for mBERT with the "unigram" tokenizer variant
prepare_contextual_embedding_dataset("mBERT", "unigram")

### XLM-R

In [None]:
# Build and save the contextual-embedding train/test datasets for XLM-R with the "bpe" tokenizer variant
prepare_contextual_embedding_dataset("XLM-R", "bpe")

In [None]:
# Build and save the contextual-embedding train/test datasets for XLM-R with the "unigram" tokenizer variant
prepare_contextual_embedding_dataset("XLM-R", "unigram")