# Importing all the important libraries

In [1]:
import tensorflow as tf
import numpy as np
import h5py
import torch
from tqdm import tqdm
from utils.dataframe import (
    convert_to_hf,
    load_model_variants_df,
    save_model_variants_hf
)
from transformers import (
    AutoTokenizer, 
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser
from utils.gpu import get_device
from sklearn.decomposition import PCA

# Settings

In [2]:
# Enable tqdm's pandas integration (progress-bar variants such as `progress_apply`).
tqdm.pandas()

# Common

In [3]:
# Select the compute device through the project helper `get_device` (utils.gpu).
# The recorded run below reported "Using device: mps".
device = get_device()


Devices:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU details:  {'device_name': 'METAL'}
Using device: mps


# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT and XLM-R on the Burmese dataset after adding contextual embeddings.
- Train models again using combined embeddings.

In [4]:
# function to get fine tuned tokenizer
def get_fine_tuned_tokenizer(model_name, spt_name):
    """Load the tokenizer stored in a fine-tuned model variant's checkpoint directory.

    The directory is ``model-variants/models/<model_name>_<SPT_NAME>``; the
    tokenizer-variant suffix is upper-cased when the path is built. Only the
    tokenizer is loaded here — no model weights are read by this function.

    Args:
        model_name: Directory prefix identifying the model (e.g. "mBERT").
        spt_name: Tokenizer variant (e.g. "bpe"); upper-cased in the path.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained`` for that directory.
    """
    variant_suffix = spt_name.upper()
    return AutoTokenizer.from_pretrained(
        f"model-variants/models/{model_name}_{variant_suffix}"
    )

## Data Preprocessing

In [5]:
# LASER sentence encoder (laserembeddings).
# Used further down via `laser.embed_sentences(sentences, lang="my")`.
laser = Laser()

In [6]:
# mUSE (Multilingual Universal Sentence Encoder), loaded from TF Hub.
# Called further down on a string tensor to obtain one embedding per sentence.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2025-02-20 23:47:41.926471: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-02-20 23:47:41.926500: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 48.00 GB
2025-02-20 23:47:41.926503: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 18.00 GB
I0000 00:00:1740062861.926519 3720185 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1740062861.926537 3720185 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-02-20 23:47:43.994127: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


In [7]:
# FastText for Morphology-Aware Training
# Fetch the pretrained Burmese ("my") vectors unless they are already present
# (`if_exists='ignore'`), then load the resulting `cc.my.300.bin` file.
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [8]:
# Load the "combined" dataset through the project helper.
dataset = load_model_variants_df("combined")

# 80/20 train/test split with a fixed seed; the test set is every row whose
# index label was not sampled into the training set.
# NOTE(review): `drop(train_df.index)` removes *all* rows sharing a sampled label,
# so this assumes the index of `dataset` is unique — TODO confirm.
train_df = dataset.sample(frac=0.8, random_state=42)
test_df = dataset.drop(train_df.index)

# Convert to Hugging Face datasets
train_dataset = convert_to_hf(train_df)
test_dataset = convert_to_hf(test_df)

### Generate Embeddings

In [9]:
import logging
import os

log_file = "jupyter_log.txt"

# Configure root logging to write to `log_file`.
# `force=True` removes *and closes* every handler already attached to the root
# logger before reconfiguring, so re-running this cell neither keeps a stale
# handler around nor leaves the previously opened log file dangling.
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filemode="w",  # "w" to overwrite, "a" to append
    encoding="utf-8",  # messages contain emoji; do not depend on the locale's default encoding
    force=True,
)

# Log a test message
logging.info("✅ Logging is initialized!")

# Check if the file was created
print(f"Log file exists: {os.path.exists(log_file)}")

Log file exists: True


In [10]:
# NOTE(review): leftover smoke-test entry written through the root logger;
# it carries no information and can be deleted once logging is confirmed to work.
logging.info("Tmp")

In [None]:
def generate_embeddings(batch):
    """
    Build combined sentence embeddings for one batch of a Hugging Face dataset.

    Every sentence in ``batch["source"]`` is embedded with mUSE (TensorFlow),
    FastText and LASER. The three matrices are cast to float16 and concatenated
    column-wise in the order [mUSE, LASER, FastText]. Progress is written to the
    root logger.

    Args:
        batch: Mapping with a "source" entry holding the batch's sentences.
            ``None`` entries are treated as empty strings; other values are
            converted with ``str``.

    Returns:
        dict with a single key "embeddings": a float16 array with one row per
        input sentence.

    Raises:
        RuntimeError: if any embedding step fails. The original exception is
            attached as ``__cause__`` and its traceback is written to the log.
    """

    # Single sanitising pass: None -> "", everything else -> str with newlines
    # flattened and surrounding whitespace removed (the embedders below are fed
    # one line per sentence).
    sentences = [
        "" if raw is None else str(raw).replace("\n", " ").strip()
        for raw in batch["source"]
    ]

    logging.info(f"Processing {len(sentences)} sentences...")

    try:
        # Generate mUSE embeddings (GPU-accelerated)
        logging.info("Generating mUSE embeddings...")
        batch_tensor = tf.convert_to_tensor(sentences)
        embeddings_muse = muse(batch_tensor).numpy()  # Shape: (batch_size, 512)
        logging.info("mUSE embeddings generated.")

        # Generate FastText embeddings
        logging.info("Generating FastText embeddings...")
        embeddings_fasttext = np.array([
            fasttext_model.get_sentence_vector(sentence) for sentence in sentences
        ])
        logging.info("FastText embeddings generated.")

        # ✅ Generate LASER embeddings
        logging.info("Generating LASER embeddings...")
        embeddings_laser = laser.embed_sentences(sentences, lang="my")
        logging.info("✅ LASER embeddings generated.")

        # ✅ Stack embeddings (column order matters: anything fitted on these
        # features, e.g. PCA, must be fed the same order later on)
        combined_embeddings = np.hstack([
            embeddings_muse.astype(np.float16),
            embeddings_laser.astype(np.float16),
            embeddings_fasttext.astype(np.float16)
        ])
        logging.info(f"✅ Stacked embeddings shape: {combined_embeddings.shape}")

        return {"embeddings": combined_embeddings}

    except Exception as e:
        # Log error with possible fixes
        error_message = f"""
        ❌ ERROR in `generate_embeddings`: {str(e)}
        🔹 Possible Fixes:
        1️⃣ Reduce batch size if running out of memory (Try `batch_size=16`).
        2️⃣ Ensure input sentences are valid strings (Check for `None` values).
        3️⃣ If running on CPU, use `tf.config.experimental.set_memory_growth()` to prevent crashes.
        4️⃣ If running on Metal (Mac M4 GPU), ensure TensorFlow is properly installed (`pip install tensorflow-metal`).
        """
        # `exception` logs at ERROR level and appends the active traceback.
        logging.exception(error_message)

        # 🔥 **Clear memory before raising the error**
        # Best effort only: a failure while cleaning up must not hide the
        # exception that brought us here.
        try:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            elif torch.backends.mps.is_available():
                # Apple-silicon backend has its own cache; the CUDA call does not touch it.
                torch.mps.empty_cache()
            tf.keras.backend.clear_session()
        except Exception:
            logging.exception("Memory cleanup failed after embedding error")

        # Raise the exception to stop execution, keeping the root cause chained
        raise RuntimeError(error_message) from e

In [None]:
# Embed every training sentence in batches of 16; `generate_embeddings` returns
# an "embeddings" entry for each batch.
# NOTE(review): the recorded run below (1,302,061 examples) was stopped with a
# KeyboardInterrupt at 0% — consider a subset or on-disk caching before re-running.
dataset_with_embeddings = train_dataset.map(generate_embeddings, batched=True, batch_size=16)



Map:   0%|          | 0/1302061 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
print("Applying PCA for dimensionality reduction...")

# Materialise the training matrix from the "embeddings" column produced by
# `train_dataset.map(generate_embeddings, ...)`; no earlier cell assigns
# `all_train_embeddings`, so it has to be built here for a fresh-kernel run.
# The numpy format avoids going through nested Python lists, and the float32
# cast gives PCA a wider dtype than the float16 the embeddings were stored in.
all_train_embeddings = np.asarray(
    dataset_with_embeddings.with_format("numpy")["embeddings"],
    dtype=np.float32,
)

# Fixed seed (same value as the train/test split) so that a randomized SVD
# solver, if scikit-learn selects one, yields the same components on re-run.
pca = PCA(n_components=300, random_state=42)
pca.fit(all_train_embeddings)  # Fit PCA on training set

In [None]:
# Project the same matrix PCA was fitted on (`all_train_embeddings`) onto the
# fitted components and report the resulting shape.
all_train_embeddings_pca = pca.transform(all_train_embeddings)
print(f"PCA applied! Reduced shape: {all_train_embeddings_pca.shape}")

In [None]:
print("Saving embeddings and sentences to HDF5 format...")

# Write two datasets into one HDF5 file (mode "w" overwrites an existing file):
#   "embeddings" – the PCA-reduced training matrix
#   "sentences"  – the training "source" column stored as UTF-8 strings
# NOTE(review): presumably row i of both datasets refers to the same example;
# this relies on `all_train_embeddings_pca` being in `train_dataset` order — confirm.
with h5py.File("model-variants/data/contextual_train_embeddings.h5", "w") as hf:
    hf.create_dataset("embeddings", data=all_train_embeddings_pca)
    hf.create_dataset("sentences", data=np.array(train_dataset["source"], dtype=h5py.string_dtype(encoding="utf-8")))

print("Data saved successfully!")

### Load Embeddings

In [None]:
# Load the HDF5 file written by the save cell and read both of its datasets
# fully into memory before the file is closed.
with h5py.File("model-variants/data/contextual_train_embeddings.h5", "r") as hf:
    all_train_embeddings_pca = np.array(hf["embeddings"])  # Load PCA-transformed embeddings
    all_train_sentences = [s.decode("utf-8") for s in hf["sentences"]]  # Decode stored UTF-8 sentences

In [None]:
# Sanity-check what was read back from the HDF5 file: shapes/counts plus the
# first sentence and the first five components of its embedding.
print(f"✅ Loaded PCA Embeddings Shape: {all_train_embeddings_pca.shape}")  # Should be (num_samples, 300)
print(f"✅ Loaded Sentences: {len(all_train_sentences)} samples")  # Should match the number of embeddings
print(f"🔍 Example Sentence: {all_train_sentences[0]}")
print(f"🔍 Example Embedding (first 5 values): {all_train_embeddings_pca[0][:5]}")

### Tokenize

In [None]:
@torch.no_grad()
def generate_contextual_embeddings(sentences):
    """Embed sentences with mUSE, LASER and FastText and project them with the fitted PCA.

    The feature matrix is built exactly like the one produced by
    ``generate_embeddings`` (same sanitising, same float16 rounding, same column
    order [mUSE, LASER, FastText]). The fitted ``pca`` is column-order
    sensitive: all three blocks together always have the same total width, so
    a different order would not raise — it would silently project the wrong
    features.

    Args:
        sentences: Iterable of sentences. ``None`` entries are treated as empty
            strings; other values are converted with ``str``.

    Returns:
        The array returned by ``pca.transform`` — one row per sentence.
    """
    # Same sanitising as generate_embeddings: None -> "", flatten newlines, strip.
    sentences = [
        "" if raw is None else str(raw).replace("\n", " ").strip()
        for raw in sentences
    ]

    # Generate embeddings on GPU
    batch_tensor = tf.convert_to_tensor(sentences)
    embeddings_muse = muse(batch_tensor).numpy()  # GPU-accelerated mUSE embeddings

    embeddings_fasttext = np.array([
        fasttext_model.get_sentence_vector(sentence) for sentence in sentences
    ])  # FastText embeddings

    embeddings_laser = laser.embed_sentences(sentences, lang="my")  # LASER embeddings

    # Stack in the training-time order and with the training-time float16
    # rounding, then widen to float32 for the PCA arithmetic.
    combined_embeddings = np.hstack([
        embeddings_muse.astype(np.float16),
        embeddings_laser.astype(np.float16),
        embeddings_fasttext.astype(np.float16),
    ]).astype(np.float32)

    # Apply PCA transformation
    contextual_embeddings = pca.transform(combined_embeddings)

    return contextual_embeddings


In [None]:
@torch.no_grad()
def tokenize_with_contextual_embeddings(examples, tokenizer):
    """Tokenizes input sentences while integrating contextual embeddings.

    Args:
        examples: Batch mapping with "source" (input sentences) and "target"
            (label sentences) entries.
        tokenizer: Callable Hugging Face tokenizer. Tokenizers carry no
            ``device`` attribute, so all tensors are created on the CPU and
            follow the device of the tokenized ``input_ids``.

    Returns:
        dict with
            "input_ids", "attention_mask": tokenized sources, padded/truncated to 512 tokens;
            "labels": token ids of the tokenized targets (same padding/truncation);
            "contextual_embeds": bfloat16 tensor of shape
                (batch_size, seq_len, embedding_dim) in which each sentence's
                embedding is repeated for every token position.
    """
    sentences = examples["source"]

    # Generate contextual embeddings (one vector per sentence)
    contextual_embeddings = torch.tensor(
        generate_contextual_embeddings(sentences), dtype=torch.bfloat16
    )

    # Tokenization
    tokenized_input = tokenizer(sentences, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    tokenized_output = tokenizer(examples["target"], truncation=True, padding="max_length", max_length=512, return_tensors="pt")

    input_ids = tokenized_input["input_ids"]
    batch_size, seq_len = input_ids.shape
    hidden_size = contextual_embeddings.shape[-1]

    # Expand contextual embeddings for each token. `expand` only creates a
    # broadcast view here.
    # NOTE(review): presumably the copies are materialised once the result is
    # stored by the caller, which would be large — confirm.
    contextual_embeddings = (
        contextual_embeddings
        .unsqueeze(1)
        .expand(batch_size, seq_len, hidden_size)
        .to(input_ids.device)
    )

    return {
        "input_ids": input_ids,
        "attention_mask": tokenized_input["attention_mask"],
        "labels": tokenized_output["input_ids"],
        "contextual_embeds": contextual_embeddings
    }

In [None]:
def prepare_contextual_embedding_dataset(model_name, spt_name, num_proc=None):
    """
    Tokenizes datasets with contextual embeddings for fine-tuning XLM-R & mBERT.
    Saves processed train and test datasets with embeddings.

    The module-level ``train_dataset`` and ``test_dataset`` are each mapped
    through ``tokenize_with_contextual_embeddings`` and saved via
    ``save_model_variants_hf`` under the name
    ``contextual_embedded_<model_name lower-cased>_<spt_name>_<train|test>``.

    Args:
        model_name: Model identifier (e.g. "mBERT", "XLM-R"); selects the tokenizer
            and is lower-cased in the output name.
        spt_name: Tokenizer variant (e.g. "bpe", "unigram").
        num_proc: Number of worker processes handed to ``Dataset.map``. Defaults
            to ``None`` (run in the current process) because the mapped function
            closes over the globally loaded TensorFlow, FastText and LASER
            models, which generally cannot be shipped to or shared with worker
            processes. Pass an integer only if that works in your environment.
    """
    print(f"Preparing dataset for {model_name} with {spt_name} embeddings...")

    # Load tokenizer
    tokenizer = get_fine_tuned_tokenizer(model_name, spt_name)

    # Train and test go through the identical pipeline; only the split name differs.
    for split_name, split_dataset in (("train", train_dataset), ("test", test_dataset)):
        print(f"⚙️ Tokenizing {split_name} dataset with contextual embeddings...")
        tokenized = split_dataset.map(
            lambda x: tokenize_with_contextual_embeddings(x, tokenizer),
            batched=True,
            num_proc=num_proc,
            load_from_cache_file=False,
            desc=f"Tokenizing {split_name} dataset for {model_name} with {spt_name} (Contextual Embeddings)",
        )

        # Save the processed split
        filename = f"contextual_embedded_{model_name.lower()}_{spt_name}_{split_name}"
        save_model_variants_hf(tokenized, filename)
        print(f"✅ Saved processed {split_name} dataset as {filename}")

    print(f"🎉 Finished processing dataset for {model_name} with {spt_name}")

#### mBERT

In [None]:
# Build and save the mBERT train/test datasets with contextual embeddings,
# using the BPE tokenizer variant.
prepare_contextual_embedding_dataset("mBERT", "bpe")

In [None]:
# Build and save the mBERT train/test datasets with contextual embeddings,
# using the unigram tokenizer variant.
prepare_contextual_embedding_dataset("mBERT", "unigram")

#### XLM-R

In [None]:
# Build and save the XLM-R train/test datasets with contextual embeddings,
# using the BPE tokenizer variant.
prepare_contextual_embedding_dataset("XLM-R", "bpe")

In [None]:
# Build and save the XLM-R train/test datasets with contextual embeddings,
# using the unigram tokenizer variant.
prepare_contextual_embedding_dataset("XLM-R", "unigram")