In [None]:
# ✅ Install Required Libraries
!pip install torch transformers sentencepiece datasets peft accelerate pandas

In [2]:
# 📌 Import Required Libraries
import torch
import pandas as pd
import sentencepiece as spm
from datasets import Dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model

In [None]:
# ✅ Select the compute device: CUDA when PyTorch detects a GPU, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"🚀 Using device: {device}")

In [3]:
# Pick the best available accelerator instead of hard-coding "mps", so the
# notebook also runs on machines without Apple-Silicon support.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# ✅ Load the CSV dataset into a pandas DataFrame
# (the train/test split happens in the next cell).
# NOTE(review): later cells read "source" and "target" columns from this data.
dataset_path = "model-variants/combined.csv"
df = pd.read_csv(dataset_path)

In [5]:
# ✅ 80/20 split: draw a seeded 80% sample for training and keep every row
#    whose index label was not sampled as the test set
train_df = df.sample(frac=0.8, random_state=42)
test_df = df[~df.index.isin(train_df.index)]

In [6]:
# ✅ Wrap both splits as Hugging Face Datasets, discarding the pandas index
train_dataset, test_dataset = (
    Dataset.from_pandas(frame.reset_index(drop=True))
    for frame in (train_df, test_df)
)

In [7]:
# ✅ Choose the pretrained XLM-R checkpoint and load its tokenizer
# (only the tokenizer is loaded here; the model is loaded later from `model_name`).
model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# ✅ Load Pre-Trained SentencePiece Model (SPT-BPE)
# `spt_bpe` is used by `tokenize_function` to pre-segment source/target text
# before it is passed to the XLM-R tokenizer.
spt_bpe_model_path = "spt/spt_bpe.model"
spt_bpe = spm.SentencePieceProcessor(model_file=spt_bpe_model_path)

In [12]:
# ✅ Correct Tokenization Function to Handle Batches
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of source/target pairs for training.

    Each text is first segmented with the SentencePiece BPE model, the pieces
    are joined with single spaces, and the resulting strings are encoded with
    the XLM-R tokenizer, padded/truncated to ``max_length``.

    Args:
        examples: Batch mapping with ``"source"`` and ``"target"`` lists of strings.
        max_length: Fixed sequence length used for padding and truncation.

    Returns:
        The XLM-R encoding of the source texts with an added ``"labels"`` entry
        holding the target token ids; padding positions are set to ``-100``.
    """
    def to_piece_text(text):
        # Segment with SPT-BPE, then rebuild a whitespace-separated string.
        return " ".join(spt_bpe.encode(text, out_type=str))

    batch_source_texts = [to_piece_text(text) for text in examples["source"]]
    batch_target_texts = [to_piece_text(text) for text in examples["target"]]

    # ✅ Tokenize using XLM-R tokenizer
    model_inputs = xlmr_tokenizer(
        batch_source_texts, padding="max_length", truncation=True, max_length=max_length
    )
    target_encoding = xlmr_tokenizer(
        batch_target_texts, padding="max_length", truncation=True, max_length=max_length
    )

    # Replace padded label positions with -100 so the loss ignores them;
    # otherwise the model is also trained to predict the pad token.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(target_encoding["input_ids"], target_encoding["attention_mask"])
    ]

    return model_inputs

In [13]:
# ✅ Tokenize the training split in batches of 16, then persist it to disk
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=16,
    desc="Tokenizing Train Dataset",
)
tokenized_train_dataset.save_to_disk("tokenized_train_dataset")

Tokenizing Train Dataset:   0%|          | 0/1302061 [00:00<?, ? examples/s]

Saving the dataset (0/19 shards):   0%|          | 0/1302061 [00:00<?, ? examples/s]

In [14]:
# ✅ Tokenize the test split in batches of 16, then persist it to disk
tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=16,
    desc="Tokenizing Test Dataset",
)
tokenized_test_dataset.save_to_disk("tokenized_test_dataset")

Tokenizing Test Dataset:   0%|          | 0/325515 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/325515 [00:00<?, ? examples/s]

In [None]:
# ✅ Apply Tokenization to Dataset
# The test split is already tokenized by the cell above; only tokenize here when
# that result is missing, instead of re-processing every example a second time.
if "tokenized_test_dataset" not in globals():
    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, desc="Tokenizing Test Dataset")

In [None]:
# ✅ Apply LoRA Configuration
def apply_lora(model):
    """Wrap ``model`` with LoRA adapters and move it to the notebook's ``device``.

    Args:
        model: The base model to adapt.

    Returns:
        The PEFT-wrapped model, placed on ``device``.
    """
    # `type=""` was removed: it is not a LoraConfig field, so passing it raises
    # a TypeError before any adapter is created.
    lora_config = LoraConfig(
        r=8,  # ✅ Low-rank dimension
        lora_alpha=16,
        target_modules=["query", "value"],  # ✅ Apply LoRA to attention layers
        lora_dropout=0.1,
    )
    return get_peft_model(model, lora_config).to(device)

In [None]:
# ✅ Load the XLM-R masked-LM checkpoint onto the device, then wrap it with LoRA
model = apply_lora(
    AutoModelForMaskedLM.from_pretrained(model_name).to(device)
)

In [None]:
# ✅ Function to Check Which Layers Have LoRA Parameters
def get_lora_layers(model):
    """Map each parameter whose name contains "lora" (case-insensitive) to its shape."""
    return {
        param_name: tensor.shape
        for param_name, tensor in model.named_parameters()
        if "lora" in param_name.lower()
    }

In [None]:
# ✅ Tabulate every LoRA parameter name next to its shape and display the table
lora_layers_info = get_lora_layers(model)
df_lora_layers = pd.DataFrame(
    list(lora_layers_info.items()),
    columns=["Layer Name", "Shape"],
)
display(df_lora_layers)

In [None]:
# Shared training hyperparameters (these mirror the values passed to
# TrainingArguments in the next cell).
train_args = {
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "save_strategy": "epoch",
    "save_total_limit": 2,
    "fp16": False,
    "bf16": True,
    "eval_strategy": "epoch",
    "load_best_model_at_end": True,
    "metric_for_best_model": "loss",
    "greater_is_better": False,
    "logging_steps": 1000,
    "optim": "adamw_torch_fused",
    "auto_find_batch_size": True,
    "disable_tqdm": False,
    "label_names": ["labels"],
}
# Backward-compatible alias for the original (misspelled) variable name.
train_agrs = train_args

In [None]:
# ✅ Training configuration for the XLM-R + SPT-BPE run
training_args = TrainingArguments(
    # Output and logging locations
    output_dir="model-variants/results/XLM-R_BPE",
    logging_dir="logs/XLM-R_BPE",
    logging_steps=1000,
    disable_tqdm=False,
    # Batch sizes and schedule
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=True,
    num_train_epochs=5,
    # Optimizer settings
    optim="adamw_torch_fused",
    learning_rate=3e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed precision
    # NOTE(review): bf16 support depends on the hardware in use — confirm.
    fp16=False,
    bf16=True,
    # Evaluate and checkpoint every epoch; keep two checkpoints, restore the best by loss
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    label_names=["labels"],
)

In [None]:
# ✅ Build the Trainer: LoRA model, tokenized splits, and early stopping (patience 3)
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=xlmr_tokenizer,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
    ],
)

In [None]:
# ✅ Train the Model
# Runs the configured Trainer; its return value is shown as this cell's output.
trainer.train()

In [None]:
# ✅ Persist the trained model and the tokenizer side by side in one directory
save_path = "./models/XLM-R_BPE"
for artifact in (model, xlmr_tokenizer):
    artifact.save_pretrained(save_path)
print(f"✅ Fine-tuned XLM-R on English-to-Burmese Parallel Data with SPT-BPE and LoRA saved at `{save_path}`.")