# Importing all the important libraries

In [3]:
import numpy as np
from utils.dataframe import (
    convert_to_hf,
    load_model_variants_df,
    save_model_variants_hf
)
from utils.common import (
    get_fine_tuned_model
)
from IPython.display import display
import tensorflow_hub as hub
import tensorflow_text as text
import fasttext
import fasttext.util
from laserembeddings import Laser

# 2. Enhance Burmese Contextual Representations
- Use LASER, mUSE, and FastText for cross-lingual and morphology-aware training.
- Fine-tune mBERT and XLM-R on the Burmese dataset after adding contextual embeddings.
- Train models again using combined embeddings.

## Data Preprocessing

In [None]:
# LASER Embeddings
# Module-level encoder instance, created once and reused:
# tokenize_contextual_embeddings calls laser.embed_sentences(...) on it
# for every batch.
laser = Laser()

In [None]:
# mUSE (Multilingual Universal Sentence Encoder)
# Loaded from TensorFlow Hub by URL and kept at module level;
# tokenize_contextual_embeddings calls it directly on a list of sentences
# and converts the result with .numpy().
# NOTE(review): presumably fetched over the network on first use - confirm
# the hub cache is configured for offline/CI runs.
muse = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

In [None]:
# FastText for Morphology-Aware Training
# NOTE(review): if_exists='ignore' presumably skips the download when the
# file is already present - confirm against the fasttext.util docs.
fasttext.util.download_model('my', if_exists='ignore')  # Download Burmese FastText model
# The file name is hard-coded here and must match what download_model wrote
# into the current working directory.
fasttext_model = fasttext.load_model('cc.my.300.bin')

In [None]:
# function to tokenize for contextual embeddings
def tokenize_contextual_embeddings(examples, tokenizer):
    """Tokenize a batch and attach one combined sentence embedding per example.

    Relies on the module-level ``laser``, ``muse`` and ``fasttext_model``
    encoders being loaded before this function is called.

    Args:
        examples: Batch mapping holding a list of source sentences under
            ``"source"`` and a list of target sentences under ``"target"``.
        tokenizer: Callable tokenizer; it is invoked with ``truncation``,
            ``padding`` and ``max_length`` keyword arguments and must return
            a mapping with ``"input_ids"`` and ``"attention_mask"``.

    Returns:
        A dict with ``"input_ids"`` and ``"attention_mask"`` for the source
        sentences, ``"labels"`` (token ids of the target sentences) and
        ``"input_embeds"``: one vector per sentence formed by concatenating
        the LASER, mUSE and FastText sentence embeddings, in that order.
    """
    sentences = examples["source"]

    # Compute Multilingual Embeddings in Batch
    embeddings_laser = np.asarray(laser.embed_sentences(sentences, lang="my"))
    embeddings_muse = np.asarray(muse(sentences).numpy())
    # fastText's get_sentence_vector rejects text containing a newline, so
    # embedded line breaks are flattened to spaces for this encoder only.
    embeddings_fasttext = np.array(
        [
            fasttext_model.get_sentence_vector(sentence.replace("\n", " "))
            for sentence in sentences
        ]
    )

    # The three encoders produce vectors of different widths (LASER 1024,
    # mUSE 512, FastText 300 for the models loaded in this file), so an
    # element-wise sum cannot broadcast and raises ValueError. Concatenating
    # along the feature axis keeps every encoder's information intact; the
    # resulting width is the sum of the individual widths.
    combined_embedding = np.concatenate(
        [embeddings_laser, embeddings_muse, embeddings_fasttext],
        axis=1,
    )

    # Tokenize Sentences in Batch
    tokenized_input = tokenizer(
        sentences, truncation=True, padding="max_length", max_length=512
    )
    tokenized_output = tokenizer(
        examples["target"], truncation=True, padding="max_length", max_length=512
    )

    return {
        "input_ids": tokenized_input["input_ids"],
        "attention_mask": tokenized_input["attention_mask"],
        "labels": tokenized_output["input_ids"],
        "input_embeds": combined_embedding.tolist(),
    }

In [None]:
# function to prepare dataset for contextual embeddings
def prepare_contextual_embeddings(model_name, spt_name):
    """Tokenize the "combined" dataset for one model/tokenizer pair and save it.

    Steps, in order: load the "combined" model-variants dataframe, convert it
    to a Hugging Face dataset, fetch the tokenizer for ``model_name`` /
    ``spt_name``, map ``tokenize_contextual_embeddings`` over the dataset in
    batches, and save the result under
    ``"<model_name lowercased>_<spt_name>_embeddings"``.

    Args:
        model_name: Model identifier passed to ``get_fine_tuned_model``
            (e.g. ``"mBERT"``).
        spt_name: Tokenizer variant passed to ``get_fine_tuned_model``
            (e.g. ``"bpe"``).
    """
    # Load the raw data and convert to Hugging Face Dataset
    hf_dataset = convert_to_hf(load_model_variants_df("combined"))

    # Only the tokenizer is used here; the model returned with it is ignored.
    _unused_model, tokenizer = get_fine_tuned_model(model_name, spt_name)

    def _tokenize_batch(batch, _indices):
        # with_indices=True makes map() pass the batch indices as a second
        # positional argument; they are not needed for tokenization.
        return tokenize_contextual_embeddings(batch, tokenizer)

    # apply tokenize
    map_options = {
        "batched": True,
        "desc": f"Tokenizing dataset for {model_name} with {spt_name}",
        "with_indices": True,  # Passing index as a second argument
        "num_proc": 10,
    }
    tokenized_dataset = hf_dataset.map(_tokenize_batch, **map_options)

    # save
    output_name = f"{model_name.lower()}_{spt_name}_embeddings"
    save_model_variants_hf(tokenized_dataset, output_name)

### mBERT

In [None]:
# create embedding with bpe
# Tokenizes the "combined" dataset for mBERT with the "bpe" tokenizer variant
# and saves it under the name "mbert_bpe_embeddings".
prepare_contextual_embeddings("mBERT", "bpe")

In [None]:
# create embedding with unigram
# Tokenizes the "combined" dataset for mBERT with the "unigram" tokenizer
# variant and saves it under the name "mbert_unigram_embeddings".
prepare_contextual_embeddings("mBERT", "unigram")

### XLM-R

In [None]:
# create embedding with bpe
# Tokenizes the "combined" dataset for XLM-R with the "bpe" tokenizer variant
# and saves it under the name "xlm-r_bpe_embeddings".
prepare_contextual_embeddings("XLM-R", "bpe")

In [None]:
# create embedding with unigram
# Tokenizes the "combined" dataset for XLM-R with the "unigram" tokenizer
# variant and saves it under the name "xlm-r_unigram_embeddings".
prepare_contextual_embeddings("XLM-R", "unigram")