In [None]:
!pip install transformers==4.45.2 sentence-transformers==3.1.1

In [1]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, TrainingArguments, EarlyStoppingCallback, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from utils.gpu import get_device
from utils.common import (
    generate_masked_predictions_hf_batch, generate_mt5_predictions,
    compute_metrics_hf,
    convert_to_mean_scores_df,
    get_fine_tuned_model, get_embedded_fine_tuned_model,
    compute_multilingual_masked_perplexity_single, compute_multilingual_mt5_perplexity_single,
    extract_extended_metrics_from_logs,
)
from utils.dataframe import (
    load_gen_df, save_tmp_df, load_tmp_df, load_models_df,
    save_model_variants_df, load_model_variants_df,
    save_model_variants_hf, load_model_variants_hf,
    save_model_variants_gen_df, load_model_variants_gen_df,
    convert_to_hf,
)

2025-02-19 05:41:58.067615: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-19 05:41:58.091754: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-19 05:41:58.091785: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-19 05:41:58.107221: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Transformers is only compatible with Keras 2, but yo

In [2]:
# Select the compute device through the project helper; every module created
# below is moved onto this device.
device = get_device()

Tensorflow GPUs:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Using PyTorch device: cuda
GPU Name: NVIDIA A10G


2025-02-19 05:42:00.879213: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-19 05:42:00.917558: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-02-19 05:42:00.919528: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [3]:
# Load the pretrained mT5-small checkpoint: the slow (non-fast) tokenizer with
# `legacy=True`, and the seq2seq model moved onto the selected device.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small", use_fast=False, legacy=True)
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-small").to(device)

In [4]:
# Prefix hyperparameters. Only the sizes are defined here; the prefix modules
# built from them are created in the following cells.
prefix_length = 10  # multiplied by prefix_projection_dim to size each raw prefix vector
num_prefixes = model.config.num_layers * 2  # number of rows in the prefix embedding table
prefix_projection_dim = 512

In [5]:
# Learnable prefix table: `num_prefixes` rows, each a flat vector of size
# `prefix_length * prefix_projection_dim`, placed on the training device.
prefixes = torch.nn.Embedding(num_prefixes, prefix_length * prefix_projection_dim).to(device)

In [6]:
# Sanity check: show which device holds the prefix embedding weights.
print("Device of prefixes:", prefixes.weight.device)

Device of prefixes: cuda:0


In [7]:
# Projection applied to each raw prefix vector: a linear map from
# `prefix_length * prefix_projection_dim` to the model width (`d_model`),
# followed by tanh, so the result can be concatenated with token embeddings.
prefix_projection = torch.nn.Sequential(
    torch.nn.Linear(prefix_length * prefix_projection_dim, model.config.d_model),
    torch.nn.Tanh()  # any other activation could be substituted here
).to(device)

In [8]:
def apply_lora(model, model_name):
    """
    Wrap `model` with LoRA adapters and return the wrapped model.

    The modules to adapt are chosen from `model_name`: names containing "t5"
    (case-insensitive) use the `q`/`v` modules, every other name uses
    `query`/`value`. The wrapped model is moved onto the module-level
    `device` before it is returned.
    """
    is_t5_family = "t5" in model_name.lower()
    target_modules = ["q", "v"] if is_t5_family else ["query", "value"]

    # Rank-8 adapters with scaling factor 16 and 10% dropout on the LoRA path.
    adapter_config = LoraConfig(
        target_modules=target_modules,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
    )

    lora_model = get_peft_model(model, adapter_config)
    lora_model.to(device)

    print(f"LoRA applied to {model_name} (Target Modules: {target_modules})")

    return lora_model

In [9]:
# Wrap the base model with LoRA adapters. NOTE: this rebinds `model`, so
# re-running this cell without reloading the base model applies LoRA to an
# already-wrapped model.
model = apply_lora(model, "mT5-small")

LoRA applied to mT5-small (Target Modules: ['q', 'v'])


In [10]:
# Run identifier reused for the output, logging and save directories.
trained_model_name = "Prefix_mT5_UNIGRAM"

In [11]:
# Shared keyword arguments, later unpacked into TrainingArguments.
train_args = dict(
    warmup_steps=500,
    weight_decay=0.01,
    # Checkpoint once per epoch and keep at most two checkpoints on disk.
    save_strategy="epoch",
    save_total_limit=2,
    # bf16 mixed precision is enabled; fp16 is explicitly disabled.
    fp16=False,
    bf16=True,
    # Evaluate every epoch and restore the checkpoint with the lowest loss.
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    logging_steps=1000,
    optim="adamw_torch_fused",
    auto_find_batch_size=True,
    disable_tqdm=False,
)

In [12]:
# Build the TrainingArguments: the shared settings from `train_args` plus the
# run-specific output/log directories, batch sizes, epoch count and learning rate.
training_args = TrainingArguments(
    **train_args,
    output_dir=f"model-variants/results/{trained_model_name}",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_dir=f"model-variants/logs/{trained_model_name}",
    # NOTE(review): `input_ids` is listed as a label column here — confirm this is intended.
    label_names=["labels", "input_ids"]
)

In [13]:
class PrefixTuningTrainer(Trainer):
    """Trainer that prepends learned prefix vectors to the encoder input.

    Every row of ``prefixes`` is passed through ``prefix_projection`` and the
    resulting vectors are concatenated in front of the token embeddings of
    each example before the forward pass.

    The prefix modules are not sub-modules of ``self.model``. This class
    therefore registers their parameters with the optimizer itself (see
    ``create_optimizer``). They are still not part of ``self.model``'s saved
    weights, so they have to be persisted separately by the caller.

    Args:
        prefixes: ``torch.nn.Embedding`` holding the raw prefix vectors.
        prefix_projection: module mapping each raw prefix vector to a vector
            of size ``model.config.d_model``.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, prefixes, prefix_projection, **kwargs):
        super().__init__(*args, **kwargs)
        device = self.model.device
        # Keep the prefix modules on the same device as the model.
        self.prefixes = prefixes.to(device)
        self.prefix_projection = prefix_projection.to(device)

    def _prefix_parameters(self):
        """Return the trainable parameters of the prefix modules."""
        modules = (self.prefixes, self.prefix_projection)
        return [p for module in modules for p in module.parameters() if p.requires_grad]

    def create_optimizer(self):
        """Create the optimizer and add the prefix parameters to it.

        The base implementation only collects parameters of ``self.model``;
        without this override the prefix modules would receive gradients but
        never be updated.
        """
        super().create_optimizer()
        optimizer = self.optimizer

        # Guard against adding the same tensors twice if this is called again
        # on an existing optimizer.
        registered = {id(p) for group in optimizer.param_groups for p in group["params"]}
        missing = [p for p in self._prefix_parameters() if id(p) not in registered]
        if missing:
            # Unspecified options (learning rate, betas, ...) fall back to the
            # optimizer defaults.
            optimizer.add_param_group(
                {"params": missing, "weight_decay": self.args.weight_decay}
            )
        return optimizer

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute the seq2seq loss with the prefix prepended to the encoder input.

        Args:
            model: the model being trained.
            inputs: batch dict providing ``input_ids``, ``attention_mask`` and
                ``labels``.
            return_outputs: when True, return ``(loss, outputs)``.
            num_items_in_batch: accepted for compatibility with newer Trainer
                versions that pass it; not used.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.

        Raises:
            ValueError: if the model output carries no loss.
        """
        device = model.device  # keep every tensor on the model's device

        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        # Labels are passed through untouched: positions set to -100 stay
        # excluded from the loss, and the model derives the right-shifted
        # decoder inputs from them. Feeding the unshifted labels as decoder
        # inputs would show the decoder the token it has to predict.
        labels = inputs["labels"].to(device)

        batch_size = input_ids.shape[0]

        # Project every prefix row once, then broadcast over the batch.
        prefix_ids = torch.arange(self.prefixes.num_embeddings, device=device)
        projected_prefixes = self.prefix_projection(self.prefixes(prefix_ids))
        assert projected_prefixes.shape[-1] == model.config.d_model, "Prefix projection mismatch!"

        # Convert token IDs to embeddings and prepend the prefix vectors.
        inputs_embeds = model.encoder.embed_tokens(input_ids)
        projected_prefixes = (
            projected_prefixes.to(inputs_embeds.dtype)
            .unsqueeze(0)
            .expand(batch_size, -1, -1)
        )
        inputs_embeds = torch.cat([projected_prefixes, inputs_embeds], dim=1)

        # Prefix positions are always attended to; the rest keeps the original mask.
        prefix_mask = torch.ones(
            (batch_size, projected_prefixes.shape[1]),
            dtype=attention_mask.dtype,
            device=device,
        )
        updated_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)

        outputs = model(
            inputs_embeds=inputs_embeds,
            attention_mask=updated_attention_mask,
            labels=labels,
        )

        # Extract loss
        loss = outputs.loss if hasattr(outputs, "loss") else outputs["loss"]

        if loss is None:
            raise ValueError("Model did not return a loss. Ensure labels are provided.")

        return (loss, outputs) if return_outputs else loss

In [14]:
# Load the stored training and validation datasets.
# NOTE(review): validation uses the split named `mt5_unigram_test`, and the
# best checkpoint is selected on it — confirm a separate held-out set exists.
train_data, val_data = (
    load_model_variants_hf(split_name)
    for split_name in ("mt5_unigram_train", "mt5_unigram_test")
)

In [15]:
# Debug aid: uncomment the two lines below to run on only the first 100 examples of each split.
#train_data = train_data.select(range(100))
#val_data = val_data.select(range(100))

In [16]:
# Build the trainer: LoRA-wrapped model, training arguments, datasets and
# tokenizer, plus the externally created prefix modules.
# NOTE(review): training is configured for 3 epochs with one evaluation per
# epoch, so an early-stopping patience of 3 can presumably never trigger —
# confirm the intended patience.
trainer = PrefixTuningTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    prefixes=prefixes,
    prefix_projection=prefix_projection
)

In [None]:
# Run fine-tuning; evaluation and checkpointing follow the strategies
# configured in `training_args`.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Save trained model and tokenizer
save_path = f"model-variants/models/{trained_model_name}"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

# The prefix modules live outside `model`, so `save_pretrained` does not
# include them. Store their weights next to the model so the exact inputs
# used during training can be rebuilt at inference time.
torch.save(
    {
        "prefixes": prefixes.state_dict(),
        "prefix_projection": prefix_projection.state_dict(),
    },
    f"{save_path}/prefix_modules.pt",
)

print(f"Model `mT5` fine-tuned and saved at `{save_path}`.")