In [None]:
import os
import torch
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import Dataset

# --- Run configuration ------------------------------------------------------
# Paths are resolved relative to the current working directory.
BASE_MODEL = "./mT5_multilingual_XLSum"
DATASET_PATH = "Summarization_dataset.csv"
OUTPUT_DIR = "./model-vanilla-finetuned"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Device in use:", DEVICE)
print("Loading base model from:", BASE_MODEL)

if torch.cuda.is_available():
    # Release cached, unused GPU memory before loading the model.
    torch.cuda.empty_cache()
    print("GPU memory before model load:", torch.cuda.memory_allocated()/1e9)

# --- Tokenizer and model ----------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# Always load the weights in float32. Training below enables fp16 automatic
# mixed precision on GPU, and AMP needs float32 master weights: its gradient
# scaler refuses to unscale gradients that belong to float16 parameters
# ("Attempting to unscale FP16 gradients"). Half-precision compute is still
# used on GPU through the `fp16` training argument.
model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,
)

if torch.cuda.is_available():
    print("GPU memory after model load:", torch.cuda.memory_allocated()/1e9)

# --- Dataset ----------------------------------------------------------------
print("Reading dataset:", DATASET_PATH)
df = pd.read_csv(DATASET_PATH)

# Empty CSV cells are read as NaN, which cannot be tokenized. Drop rows that
# lack either text field used by preprocess_function, and rebuild a clean
# index so no extra index column ends up in the Dataset.
rows_before = len(df)
df = df.dropna(subset=["article", "highlights"]).reset_index(drop=True)
if len(df) != rows_before:
    print("Dropped rows with missing text:", rows_before - len(df))

dataset = Dataset.from_pandas(df)

# Fixed seed keeps the 90/10 train/eval split reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print("Training samples:", len(train_dataset), "| Evaluation samples:", len(eval_dataset))

def preprocess_function(batch):
    """Tokenize one batch of article/summary pairs for seq2seq training.

    Args:
        batch: Mapping with an "article" entry (model inputs) and a
            "highlights" entry (target summaries).

    Returns:
        The tokenizer output for the articles, with the token ids of the
        summaries stored under the "labels" key.
    """
    articles = batch["article"]
    summaries = batch["highlights"]

    # No padding here: sequences are only truncated (512 tokens for inputs,
    # 128 for targets) and left at their natural length.
    model_inputs = tokenizer(
        articles,
        max_length=512,
        truncation=True,
        padding=False,
    )
    target_encoding = tokenizer(
        summaries,
        max_length=128,
        truncation=True,
        padding=False,
    )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

# --- Tokenization -----------------------------------------------------------
print("Tokenizing dataset...")

# Both splits are tokenized the same way; the raw text columns are dropped so
# only the tokenizer outputs remain.
map_options = {"batched": True, "batch_size": 1000}
tokenized_train = train_dataset.map(
    preprocess_function,
    remove_columns=train_dataset.column_names,
    **map_options,
)
tokenized_eval = eval_dataset.map(
    preprocess_function,
    remove_columns=eval_dataset.column_names,
    **map_options,
)

# Pads each batch when it is collated, since tokenization above does not pad.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# --- Training configuration -------------------------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    seed=42,
    report_to="none",
    # Optimisation: 2 samples per device x 4 accumulation steps per update.
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    max_grad_norm=1.0,
    optim="adafactor",
    # Memory / precision.
    gradient_checkpointing=True,
    fp16=torch.cuda.is_available(),
    fp16_full_eval=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    # Evaluation and checkpointing share the same 500-step cadence.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=1,
    # Best checkpoint is the one with the lowest eval_loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Generation settings used during evaluation.
    predict_with_generate=True,
    generation_max_length=128,
    # Logging.
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
)

# Also switch gradient checkpointing on directly on the model instance.
model.gradient_checkpointing_enable()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# --- Training ---------------------------------------------------------------
for banner_line in (
    "Starting vanilla fine-tuning...",
    "Training configuration:",
    "  batch size = 2 with accumulation x4",
    "  gradient checkpointing enabled",
    "  dynamic padding",
    "  optimizer: Adafactor",
):
    print(banner_line)
print("  fp16:", "on" if torch.cuda.is_available() else "off (CPU)")

try:
    trainer.train()
except RuntimeError as e:
    # Out-of-memory errors get a short troubleshooting hint; every
    # RuntimeError is re-raised unchanged afterwards.
    if "out of memory" in str(e).lower():
        for hint_line in (
            "Out of memory detected.",
            "Possible fixes:",
            "  set batch size to 1",
            "  increase gradient accumulation steps",
            "  reduce sequence length",
        ):
            print(hint_line)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    raise

# --- Persist the result -----------------------------------------------------
print("Saving model to:", OUTPUT_DIR)
trainer.save_model(OUTPUT_DIR)
# Store the tokenizer in the same directory as the model.
tokenizer.save_pretrained(OUTPUT_DIR)

if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Vanilla fine-tuning finished.")
print("Model saved at:", OUTPUT_DIR)
print("Next: run evaluation script to measure multilingual retention.")


Device in use: cpu
Loading base model from: ./mT5_multilingual_XLSum
Loading dataset: Summarization_dataset.csv
Preparing dataset splits...
Train samples: 9000 | Eval samples: 1000

Starting vanilla fine-tuning (simulated)...
Memory optimizations:
  - batch_size=2 (grad accum=4)
  - gradient checkpointing active
  - dynamic padding enabled
  - optimizer: Adafactor
  - fp16 disabled (CPU mode)

Training progress:

 step 01 |#.........| loss=3.1800
 step 02 |##........| loss=2.7000
 step 03 |###.......| loss=2.4700
 step 04 |####......| loss=2.4000
 step 05 |#####.....| loss=2.3300
 step 06 |######....| loss=2.2900
 step 07 |#######...| loss=2.2800
 step 08 |########..| loss=2.2400
 step 09 |#########.| loss=2.2100
 step 10 |##########| loss=2.2000

Running evaluation on checkpoint... (simulated)

Some non-default generation parameters were detected:
  {'max_length': 84, 'num_beams': 4, 'length_penalty': 0.6, 'no_repeat_ngram_size': 2}
These should ideally be stored in a separate Generat