In [None]:
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)
from peft import LoraConfig, get_peft_model

# -----------------------------
# CONFIG
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

MODEL_NAME = "google/mt5-small"
ADAPTER_SAVE_DIR = "./mt5-ceb-tl-lora-final"

# -----------------------------
# LOAD DATASET
# -----------------------------
dataset = load_dataset("csv", data_files="ceb_tgl.tsv", delimiter="\t")
dataset = dataset.rename_column("source_text", "src")
dataset = dataset.rename_column("target_text", "tgt")

split = dataset["train"].train_test_split(test_size=0.1)
train_dataset = split["train"]
eval_dataset = split["test"]

# -----------------------------
# TOKENIZER & MODEL
# -----------------------------
tokenizer = MT5Tokenizer.from_pretrained(MODEL_NAME)
model = MT5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# -----------------------------
# APPLY LoRA
# -----------------------------
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q", "k", "v", "o"],  # all attention projections
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, lora_config)
model.to(device)
model.print_trainable_parameters()  # should be non-zero

# -----------------------------
# PREPROCESS FUNCTION WITH ERROR HANDLING
# -----------------------------
def preprocess(batch):
    inputs = tokenizer(batch["src"], max_length=32, truncation=True, padding="max_length")
    labels = tokenizer(batch["tgt"], max_length=32, truncation=True, padding="max_length")

    new_labels = []
    for l in labels["input_ids"]:
        new_l = [token if token != tokenizer.pad_token_id else -100 for token in l]
        # Force at least one token
        if all(t == -100 for t in new_l):
            new_l[0] = tokenizer.eos_token_id
        new_labels.append(new_l)

    inputs["labels"] = new_labels
    return inputs


tokenized_train = train_dataset.map(preprocess, batched=True)
tokenized_eval = eval_dataset.map(preprocess, batched=True)

# -----------------------------
# DATA COLLATOR & TRAINING ARGS
# -----------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-ceb-tl-lora",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=10,
    learning_rate=1e-4,
    fp16=False,
    logging_steps=50,
    save_total_limit=2,
    eval_strategy="steps",
    save_strategy="steps",
    predict_with_generate=True,
    report_to="none"
)

# -----------------------------
# BLEU METRIC
# -----------------------------
bleu_metric = evaluate.load("sacrebleu")

def safe_decode(ids, tokenizer):
    # Replace -100 with pad_token_id
    ids = [token if 0 <= token < tokenizer.vocab_size else tokenizer.pad_token_id for token in ids]
    ids = [token if token != -100 else tokenizer.pad_token_id for token in ids]
    return tokenizer.decode(ids, skip_special_tokens=True)


def compute_bleu(eval_preds):
    preds, labels = eval_preds
    # Safe decode predictions
    decoded_preds = [safe_decode(p, tokenizer) for p in preds]

    # Safe decode labels
    decoded_labels = [[safe_decode(l, tokenizer)] for l in labels]  # sacrebleu expects list of references

    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

# -----------------------------
# TRAINER
# -----------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_bleu
)

for i in range(5):
    print("Input:", tokenizer.decode(tokenized_train[i]["input_ids"], skip_special_tokens=True))
    safe_labels = [t if t != -100 else tokenizer.pad_token_id for t in tokenized_train[i]["labels"]]
    print("Label:", tokenizer.decode(safe_labels, skip_special_tokens=True))


# -----------------------------
# TRAIN
# -----------------------------
trainer.train()



# -----------------------------
# SAVE LoRA ADAPTER
# -----------------------------
model.save_pretrained(ADAPTER_SAVE_DIR)
print("LoRA adapter saved at:", ADAPTER_SAVE_DIR)

Using device: cuda


In [11]:
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from peft import PeftModel

# ---------------------------------------------------------
# CONFIG - CHANGE THESE
# ---------------------------------------------------------
BASE_MODEL = "google/mt5-small"              # or mt5-base (must match what you trained on)
ADAPTER_DIR = "./adapter"                    # path to folder with adapter_model.safetensors
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("DEVICE =", DEVICE)

# ---------------------------------------------------------
# LOAD TOKENIZER
# ---------------------------------------------------------
print("\nLoading tokenizer...")
tokenizer = MT5Tokenizer.from_pretrained(BASE_MODEL)

# ---------------------------------------------------------
# LOAD BASE MODEL
# ---------------------------------------------------------
print("Loading base model...")
model = MT5ForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    device_map="auto" if DEVICE == "cuda" else None
)

# ---------------------------------------------------------
# LOAD LORA ADAPTER
# ---------------------------------------------------------
print("\nLoading LoRA adapter from:", ADAPTER_DIR)
model = PeftModel.from_pretrained(
    model,
    ADAPTER_DIR,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
)

model.to(DEVICE)
model.eval()

# ---------------------------------------------------------
# VERIFY LORA SUCCESSFULLY LOADED
# ---------------------------------------------------------
print("\n=== Trainable Parameters Check ===")
model.print_trainable_parameters()
# EXPECTED OUTPUT:
# trainable params: something NON-ZERO
# If it's 0 â†’ your adapter folder is wrong

# ---------------------------------------------------------
# INFERENCE FUNCTION
# ---------------------------------------------------------
def generate(text):
    inputs = tokenizer(text, return_tensors="pt").to(DEVICE)

    output_ids = model.generate(
        **inputs,
        max_length=100,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# ---------------------------------------------------------
# RUN TESTS
# ---------------------------------------------------------
tests = [
    "unsa imong pangalan",
    "ganahan ko mokaon og isda",
    "kumusta ka",
    "asa ka gikan",
]

print("\n===== TEST RESULTS =====")
for t in tests:
    print(f"\nInput: {t}")
    print("Output:", generate(t))


DEVICE = cuda

Loading tokenizer...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
`torch_dtype` is deprecated! Use `dtype` instead!


Loading base model...

Loading LoRA adapter from: ./adapter


ValueError: Can't find 'adapter_config.json' at './adapter'

In [10]:
model.print_trainable_parameters()


trainable params: 0 || all params: 300,520,832 || trainable%: 0.0000
