In [None]:
!pip install --upgrade pip setuptools wheel

!pip install -q transformers datasets evaluate accelerate scikit-learn

!pip install rouge_score

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2024.12.0 which is incompatible.[0m[31m
[0mCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24987 sha256=65ab89938214cd7fab15c62e44b6ee198666be5738c72be8dbe34f48edf461e8
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
[0mInstalling collected packages: rouge_score
[0mSuccessfully installed rouge_score-0.1.2


In [None]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
import evaluate
from sklearn.model_selection import train_test_split
from copy import deepcopy

# Read the JSON file into a single "train" split.
data_path = "nyt_dataset.json"
ds = load_dataset("json", data_files=dict(train=data_path))
raw = ds["train"]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
splits = raw.train_test_split(seed=42, test_size=0.2)
train_raw, test_raw = splits["train"], splits["test"]

# Checkpoint that is evaluated as the baseline and then fine-tuned.
MODEL_NAME = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def print_tokenizer_info():
    """Print the pad and EOS token ids of the module-level ``tokenizer``."""
    for label, attribute in (
        ("pad_token_id=", "pad_token_id"),
        ("eos_token_id=", "eos_token_id"),
    ):
        print(label, getattr(tokenizer, attribute))

# Echo the tokenizer's special-token ids once.
print_tokenizer_info()

# Token budgets for the tokenized inputs and targets, and the generation length cap.
max_input_length, max_target_length = 256, 256

def preprocess(examples):
    """Tokenize a batch of examples into model inputs and loss labels.

    Args:
        examples: Batch mapping with an ``"input"`` column (source texts) and
            an ``"output"`` column (target texts), as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenized inputs (padded/truncated to ``max_input_length``) with an
        added ``"labels"`` entry: the target token ids padded/truncated to
        ``max_target_length``, with every pad position replaced by -100.
    """
    inputs = tokenizer(
        examples["input"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["output"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # -100 matches the label_pad_token_id given to the data collator, so padded
    # target positions are excluded from the loss.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return inputs

# Tokenize both splits; `preprocess` adds input_ids, attention_mask and labels.
train_ds = Dataset.from_list(train_raw).map(preprocess, batched=True)
test_ds  = Dataset.from_list(test_raw).map(preprocess, batched=True)

# Expose only the tensor columns the model consumes.
train_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])
test_ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

data_collator = DataCollatorForSeq2Seq(
    tokenizer, model=model, label_pad_token_id=-100
)

# Sanity check: one forward pass on a few training rows should give a finite,
# positive loss. Cap the sample at the dataset size so tiny datasets don't raise.
batch = data_collator([train_ds[i] for i in range(min(4, len(train_ds)))])
batch = {k: v.to(model.device) for k, v in batch.items()}
model.eval()
# No gradients are needed for a smoke test, so skip building the autograd graph.
with torch.no_grad():
    loss = model(**batch).loss
print("Sanity-check loss > 0?", loss.item())

def generate_preds(model, tok, raw_examples, num_beams=4):
    """Generate one decoded prediction per example using beam search.

    Args:
        model: Seq2seq model; it is switched to eval mode and moved to CUDA
            when available, otherwise to CPU.
        tok: Tokenizer used both to encode ``ex["input"]`` and to decode the
            generated ids.
        raw_examples: Iterable of mappings that each have an ``"input"`` key.
        num_beams: Beam width passed to ``model.generate``.

    Returns:
        List of decoded strings (special tokens stripped), in input order.
    """
    model.eval()
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(target_device)

    def _predict_one(example):
        # Examples are encoded and generated one at a time.
        encoded = tok(
            example["input"],
            return_tensors="pt",
            truncation=True,
            padding=True,
        ).to(target_device)
        generated = model.generate(
            **encoded,
            max_length=max_target_length,
            num_beams=num_beams,
            length_penalty=0.8,
            early_stopping=True,
        )
        return tok.decode(generated[0], skip_special_tokens=True)

    return [_predict_one(example) for example in raw_examples]

# Score the checkpoint on the held-out split before any fine-tuning.
baseline_preds = generate_preds(model, tokenizer, test_raw)
references = [example["output"] for example in test_raw]

rouge = evaluate.load("rouge")
base_scores = rouge.compute(references=references, predictions=baseline_preds)
print("Baseline ROUGE:", base_scores)

# Keep an independent copy of the current weights; `model` itself is handed to the trainer next.
baseline_model = deepcopy(model)

TRAIN_BATCH_SIZE = 32

# Round up, because the final partial batch still counts as a step, and never
# go below 1 so the step intervals stay valid for very small datasets.
steps_per_epoch = max(1, (len(train_ds) + TRAIN_BATCH_SIZE - 1) // TRAIN_BATCH_SIZE)

# The recorded run with fp16=True logged a training loss of 0.0 and produced
# ROUGE scores identical to the baseline, i.e. the weights never moved.
# T5-family checkpoints are known to overflow in fp16, so train in bf16 when
# the GPU supports it and fall back to full precision otherwise.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# NOTE(review): learning_rate=1e-5 over only a handful of optimizer steps may
# still be too small to change the outputs; T5 is usually tuned around 1e-4 to 3e-4.
training_args = Seq2SeqTrainingArguments(
    output_dir="nyt-finetune",
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=4,
    do_eval=True,
    # Without an explicit strategy `eval_steps` is ignored and no evaluation runs.
    eval_strategy="steps",
    eval_steps=steps_per_epoch,
    save_steps=steps_per_epoch,
    save_total_limit=1,
    logging_steps=10,
    logging_first_step=True,
    learning_rate=1e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    bf16=use_bf16,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Score the fine-tuned weights on the same held-out split as the baseline.
ft_preds = generate_preds(model, tokenizer, test_raw)
ft_scores = rouge.compute(references=references, predictions=ft_preds)
print("Fine-tuned ROUGE:", ft_scores)

# Side-by-side ROUGE-L, four decimals each.
for label, scores in (("Baseline", base_scores), ("Fine-tuned", ft_scores)):
    print(f"{label} ROUGE-L: {scores['rougeL']:.4f}")


pad_token_id= 0
eos_token_id= 1


Map:   0%|          | 0/94 [00:00<?, ? examples/s]



Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Sanity-check loss > 0? 2.7332093715667725
Baseline ROUGE: {'rouge1': np.float64(0.2988261685003252), 'rouge2': np.float64(0.02857972623597624), 'rougeL': np.float64(0.1647327608076149), 'rougeLsum': np.float64(0.16563004868733783)}
▶ Starting fine-tuning…


  trainer = Seq2SeqTrainer(


Step,Training Loss
1,0.0


Fine-tuned ROUGE: {'rouge1': np.float64(0.2988261685003252), 'rouge2': np.float64(0.02857972623597624), 'rougeL': np.float64(0.1647327608076149), 'rougeLsum': np.float64(0.16563004868733783)}
Baseline ROUGE-L: 0.1647
Fine-tuned  ROUGE-L: 0.1647


In [None]:
# Recompute the baseline from the copy of the model taken before training.
print("Generating baseline predictions")
baseline_preds = generate_preds(baseline_model, tokenizer, test_raw)
references = [example["output"] for example in test_raw]

rouge = evaluate.load("rouge")
base_scores = rouge.compute(references=references, predictions=baseline_preds)

def exact_match_accuracy(preds, refs):
    """Return the fraction of predictions that equal their reference exactly.

    Strings are compared after stripping leading/trailing whitespace.

    Args:
        preds: Sequence of predicted strings.
        refs: Sequence of reference strings, paired positionally with
            ``preds``; pairing stops at the shorter of the two.

    Returns:
        Share of matching pairs as a float in [0, 1]; 0.0 when there are no
        pairs to compare (instead of raising ZeroDivisionError).
    """
    matches = [p.strip() == r.strip() for p, r in zip(preds, refs)]
    if not matches:
        return 0.0
    return sum(matches) / len(matches)

# Exact-match rate of the baseline predictions.
baseline_row_acc = exact_match_accuracy(baseline_preds, references)
# NOTE(review): "puzzle" accuracy is only an alias of row accuracy here; no
# separate puzzle-level metric is computed in this cell.
baseline_puzzle_acc = baseline_row_acc

print("Baseline")
print("ROUGE:", base_scores)
for caption, value in (
    ("Row Accuracy:", baseline_row_acc),
    ("Puzzle Accuracy:", baseline_puzzle_acc),
):
    print(caption, round(value, 4))

# Same metrics for the fine-tuned model.
ft_preds = generate_preds(model, tokenizer, test_raw)
ft_scores = rouge.compute(references=references, predictions=ft_preds)
ft_row_acc = ft_puzzle_acc = exact_match_accuracy(ft_preds, references)

import pandas as pd

# One row per held-out example: prompt, gold answer and both model outputs.
comparison_columns = {
    "Input": [example["input"] for example in test_raw],
    "Gold Output": references,
    "Baseline Output": baseline_preds,
    "Finetuned Output": ft_preds,
}
df = pd.DataFrame(comparison_columns)

from IPython.display import display
display(df)

# Final side-by-side summary, rounded to four decimals.
summary_rows = (
    ("Baseline:", base_scores, baseline_row_acc, baseline_puzzle_acc),
    ("Fine-tuned:", ft_scores, ft_row_acc, ft_puzzle_acc),
)
for heading, scores, row_acc, puzzle_acc in summary_rows:
    print(heading)
    print("ROUGE-L:", round(scores["rougeL"], 4))
    print("Row Accuracy:", round(row_acc, 4))
    print("Puzzle Accuracy:", round(puzzle_acc, 4))


▶ Generating baseline predictions…
=== Baseline ===
ROUGE: {'rouge1': np.float64(0.2988261685003252), 'rouge2': np.float64(0.02857972623597624), 'rougeL': np.float64(0.1647327608076149), 'rougeLsum': np.float64(0.16563004868733783)}
Row Accuracy: 0.0
Puzzle Accuracy: 0.0
▶ Generating fine-tuned predictions…


Unnamed: 0,Input,Gold Output,Baseline Output,Finetuned Output
0,Group the following words into 4 meaningful ca...,"FACIAL FEATURES: CHEEK, EYE, MOUTH, NOSE; SYNO...",EYE,EYE
1,Group the following words into 4 meaningful ca...,"STATES OF ELATION: BLISS, CLOUD NINE, HEAVEN, ...","BLISS, HEAVEN, HIGH FIVE, TOP TEN, SHAKE, GUES...","BLISS, HEAVEN, HIGH FIVE, TOP TEN, SHAKE, GUES..."
2,Group the following words into 4 meaningful ca...,"HALLOWEEN DECORATIONS: BAT, COBWEB, PUMPKIN, T...","BELL, CHERRY, FIREFLY, BONES, 2001, 7","BELL, CHERRY, FIREFLY, BONES, 2001, 7"
3,Group the following words into 4 meaningful ca...,"PLEASANT SMELLS: AROMA, BOUQUET, FRAGRANCE, SC...",BOAR,BOAR
4,Group the following words into 4 meaningful ca...,"EXTINCT ANIMALS: DODO, MAMMOTH, MASTODON, TRIL...","THREADS, BUSTS, SMOOCH, TURKEYS, MAMHOTH, DODO...","THREADS, BUSTS, SMOOCH, TURKEYS, MAMHOTH, DODO..."
5,Group the following words into 4 meaningful ca...,"INFORMATION DISPLAYS: CHART, DIAGRAM, GRAPH, M...",PERK DIAGRAM STUFFING PIE SATISFACTION I RAN E...,PERK DIAGRAM STUFFING PIE SATISFACTION I RAN E...
6,Group the following words into 4 meaningful ca...,"COMPUTER EQUIPMENT: KEYBOARD, MONITOR, MOUSE, ...",HAMSTER ORGAN HORN RAT GRIPE KEYBOARD CARP TRI...,HAMSTER ORGAN HORN RAT GRIPE KEYBOARD CARP TRI...
7,Group the following words into 4 meaningful ca...,"SNEAKER BRANDS: ADIDAS, NIKE, PUMA, REEBOK; ""M...","ADIDAS, PUMA, REEBOK, BAT, NIKE, SUPER, BAT, N...","ADIDAS, PUMA, REEBOK, BAT, NIKE, SUPER, BAT, N..."
8,Group the following words into 4 meaningful ca...,"WET WEATHER: HAIL, RAIN, SLEET, SNOW; NBA TEAM...",NETS,NETS
9,Group the following words into 4 meaningful ca...,"ANIMAL GROUP NAMES: COLONY, HERD, PRIDE, SWARM...",CLASS,CLASS



=== Final Summary ===
▶ Baseline:
ROUGE-L:        0.1647
Row Accuracy:   0.0
Puzzle Accuracy: 0.0
▶ Fine-tuned:
ROUGE-L:        0.1647
Row Accuracy:   0.0
Puzzle Accuracy: 0.0
