# Importing all the important libraries

In [1]:
import numpy as np
import torch
from utils.dataframe import (
    convert_to_hf,
    load_model_variants_df,
    save_model_variants_hf
)
from transformers import (
    AutoTokenizer, 
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device
from sklearn.decomposition import PCA

# Common

In [2]:
# gpu device 
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT, XLM-R on Burmese dataset after adding contextual embeddings.
- Train models again using combined embeddings.

In [3]:
# function to get fine tuned tokenizer
def get_fine_tuned_tokenizer(model_name, spt_name):
    # Load LoRA Weights
    lora_checkpoint_path = f"model-variants/models/{model_name}_{spt_name.upper()}"

    # Load Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(lora_checkpoint_path)

    return tokenizer

## Data Preprocessing

In [4]:
# LASER Embeddings
laser = Laser()

In [5]:
# mUSE (Multilingual Universal Sentence Encoder)
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-19 12:43:42.698602: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-19 12:43:42.698645: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-19 12:43:42.698649: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1739936622.698915  780181 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1739936622.699124  780181 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-19 12:43:44.880282: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [6]:
# FastText for Morphology-Aware Training
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [26]:
# Function to Generate Contextual Embeddings
@torch.no_grad()
def generate_contextual_embeddings(sentences):
    """Generates contextual embeddings using LASER, mUSE, and FastText."""
    sentences = [sentence.replace("\n", " ").strip() for sentence in sentences]

    embeddings_laser = laser.embed_sentences(sentences, lang="my")  # (batch_size, 1024)
    embeddings_muse = muse(sentences).numpy()  # (batch_size, 512)
    embeddings_fasttext = np.array([fasttext_model.get_sentence_vector(sentence) for sentence in sentences])  # (batch_size, 300)

    # PCA should be fitted on the entire dataset or a large sample, not per batch.
    embeddings_laser = pca.transform(embeddings_laser) # Only transform here

    # Project FastText instead of padding
    embeddings_fasttext = fasttext_projection.transform(embeddings_fasttext)

    print("LASER Embedding Shape:", embeddings_laser.shape)
    print("MUSE Embedding Shape:", embeddings_muse.shape)
    print("FastText Embedding Shape:", embeddings_fasttext.shape)


    combined_embedding = (embeddings_laser + embeddings_muse + embeddings_fasttext) / 3
    return combined_embedding

In [8]:
@torch.no_grad()
def tokenize_with_contextual_embeddings(examples, tokenizer, pca, fasttext_projection):
    sentences = examples["source"]

    contextual_embeddings = generate_contextual_embeddings(sentences)
    contextual_embeddings = torch.tensor(contextual_embeddings, dtype=torch.bfloat16).to(tokenizer.device) # Move to device early

    tokenized_input = tokenizer(sentences, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    tokenized_output = tokenizer(examples["target"], truncation=True, padding="max_length", max_length=512, return_tensors="pt")

    batch_size = tokenized_input["input_ids"].shape[0]
    hidden_size = contextual_embeddings.shape[-1]

    # Expand contextual embeddings for each token (more likely the correct approach)
    # Assuming embeddings are per sentence and should be repeated for each token
    contextual_embeddings = contextual_embeddings.unsqueeze(1).expand(batch_size, tokenized_input["input_ids"].shape[1], hidden_size).to(tokenized_input["input_ids"].device)

    return {
        "input_ids": tokenized_input["input_ids"],
        "attention_mask": tokenized_input["attention_mask"],
        "labels": tokenized_output["input_ids"],
        "contextual_embeds": contextual_embeddings
    }

In [13]:
dataset = load_model_variants_df("combined")  # Replace "combined" with your dataset name

train_df = dataset.sample(frac=0.8, random_state=42)
test_df = dataset.drop(train_df.index)

train_dataset = convert_to_hf(train_df)
test_dataset = convert_to_hf(test_df)

In [16]:
# Pre-compute contextual embeddings for PCA fitting
print("Pre-compute contextual embeddings for PCA fitting")
all_train_sentences = train_df["source"].tolist()  # Adapt to your DataFrame structure
all_train_embeddings = []
for i in range(0, len(all_train_sentences), 100): # Process in batches to avoid OOM
    batch = all_train_sentences[i:i+100]
    embeddings = laser.embed_sentences(batch, lang="my")
    all_train_embeddings.append(embeddings)
all_train_embeddings = np.concatenate(all_train_embeddings)

Pre-compute contextual embeddings for PCA fitting


In [20]:
import h5py
import numpy as np

with h5py.File("model-variants/data/train_embeddings.h5", "w") as hf:
    hf.create_dataset("embeddings", data=all_train_embeddings)
    hf.create_dataset("sentences", data=np.array(all_train_sentences, dtype=h5py.string_dtype(encoding="utf-8")))

In [None]:
with h5py.File("train_embeddings.h5", "r") as hf:
    all_train_embeddings = np.array(hf["embeddings"])
    all_train_sentences = [sentence.decode("utf-8") for sentence in hf["sentences"]]

In [28]:
print("Make pca global so it's accessible in map")
global pca # Make pca global so it's accessible in map
pca = PCA(n_components=300)
pca.fit(all_train_embeddings)

Make pca global so it's accessible in map


In [None]:
# Get FastText embeddings
all_fasttext_embeddings = np.array([
    fasttext_model.get_sentence_vector(sentence.replace("\n", " "))  # Remove newlines
    for sentence in all_train_sentences
])

# Get valid n_components value
n_samples, n_features = all_fasttext_embeddings.shape
n_components = min(n_samples, n_features, 300)  # Ensure n_components ≤ min(samples, features)

# Apply PCA with adjusted n_components
fasttext_projection = PCA(n_components=n_components)
fasttext_projection.fit(all_fasttext_embeddings)

In [24]:
# Function to Prepare Dataset with Contextual Embeddings
def prepare_contextual_embedding_dataset(model_name, spt_name):
    tokenizer = get_fine_tuned_tokenizer(model_name, spt_name)

    train_tokenized = train_dataset.map(
        lambda x: tokenize_with_contextual_embeddings(x, tokenizer, pca, fasttext_projection),
        batched=True,
        desc=f"Tokenizing train dataset for {model_name} with {spt_name} (Contextual Embeddings)",
    )

    save_model_variants_hf(train_tokenized, f"contextual_embedded_{model_name.lower()}_{spt_name}_train")

    test_tokenized = test_dataset.map(
        lambda x: tokenize_with_contextual_embeddings(x, tokenizer, pca, fasttext_projection),
        batched=True,
        desc=f"Tokenizing test dataset for {model_name} with {spt_name} (Contextual Embeddings)",
    )

    save_model_variants_hf(test_tokenized, f"contextual_embedded_{model_name.lower()}_{spt_name}_test")

### mBERT

In [27]:
# create embedding with bpe
prepare_contextual_embedding_dataset("mBERT", "bpe")

Tokenizing train dataset for mBERT with bpe (Contextual Embeddings):   0%|          | 0/1302061 [00:00<?, ? ex…

LASER Embedding Shape: (1000, 512)
MUSE Embedding Shape: (1000, 512)
FastText Embedding Shape: (1000, 300)


ValueError: operands could not be broadcast together with shapes (1000,512) (1000,300) 

In [None]:
# create embedding with unigram
prepare_contextual_embedding_dataset("mBERT", "unigram")

### XLM-R

In [None]:
# create embedding with bpe
prepare_contextual_embedding_dataset("XLM-R", "bpe")

In [None]:
# create embedding with unigram
prepare_contextual_embedding_dataset("XLM-R", "unigram")