In [None]:
# Pinned to the same release as the main install cell so a fresh kernel always
# resolves the same rouge_score; %pip installs into the running kernel's env.
%pip install -q rouge_score==0.1.2


In [None]:
# Pinned versions for reproducibility. %pip (rather than !pip) guarantees the
# packages land in the environment of the kernel that runs this notebook.
%pip install -U transformers==4.46.3 evaluate==0.4.2 rouge_score==0.1.2


In [None]:
# Confirm the kernel imports the pinned releases
# (expected output: Transformers 4.46.3, Evaluate 0.4.2).
import transformers
import evaluate

for label, lib in (("Transformers:", transformers), ("Evaluate:", evaluate)):
    print(label, lib.__version__)


In [None]:
# =========================
# ROND -> BART Fine-tuning (Summarization) - FIXED VERSION
# =========================

# (Colab) Installs - evaluate and rouge_score carry the same pins as the
# dedicated install cell, so running this cell on its own cannot resolve to a
# different release. %pip targets the running kernel's environment.
%pip install -q transformers==4.46.3 datasets evaluate==0.4.2 accelerate sentencepiece rouge_score==0.1.2

# ---- Setup
# Core libraries used by the rest of this cell.
import os, numpy as np, pandas as pd, torch
# Both environment variables are set before `transformers` is imported below;
# keep this ordering.
os.environ["WANDB_DISABLED"] = "true"          # no wandb
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"   # quieter logs (presumably disables Hub telemetry, per the variable name)
SEED = 42  # one seed, used by set_seed() below and passed to the training arguments

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    set_seed
)
import evaluate

# Seed the random number generators once, before the model is loaded.
set_seed(SEED)

# ---- Paths (use your processed CSVs)
TRAIN = "./rond_train_processed.csv"
VAL   = "./rond_val_processed.csv"
TEST  = "./rond_test_processed.csv"

# ---- Load CSVs -> HF Datasets
def load_split(path):
    """Read one processed ROND CSV into a two-column Hugging Face ``Dataset``.

    ``source`` is the ``input`` column, prefixed with the ``instruction``
    column when the CSV has one; ``target`` is the ``output`` column.

    Args:
        path: CSV location, passed unchanged to ``pd.read_csv``.

    Returns:
        ``datasets.Dataset`` with string columns ``source`` and ``target``.

    Raises:
        KeyError: If the CSV lacks the ``input`` or ``output`` column.
    """
    df = pd.read_csv(path)

    missing = [col for col in ("input", "output") if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s): {missing}")

    # A row without a reference summary cannot be tokenized as a label; drop it
    # here (and report it) instead of failing later inside ds.map().
    has_target = df["output"].notna()
    if not has_target.all():
        print(f"[load_split] {path}: dropping {int((~has_target).sum())} row(s) with empty 'output'")
        df = df[has_target].reset_index(drop=True)

    # fillna("") before astype(str): on its own astype(str) turns a missing
    # cell into the literal text "nan" inside the model input.
    input_text = df["input"].fillna("").astype(str)
    if "instruction" in df.columns:
        instruction = df["instruction"].fillna("").astype(str)
        source = "Instruction: " + instruction + "\nInput: " + input_text
    else:
        source = input_text

    pairs = pd.DataFrame({"source": source, "target": df["output"].astype(str)})
    # preserve_index=False keeps the pandas index out of the dataset columns.
    return Dataset.from_pandas(pairs, preserve_index=False)

# One Dataset per split, loaded in train / validation / test order.
ds = DatasetDict({
    "train": load_split(TRAIN),
    "validation": load_split(VAL),
    "test": load_split(TEST),
})

# ---- Model & Tokenizer (both loaded from the same checkpoint name)
MODEL_NAME = "facebook/bart-base"
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Decoding defaults are stored on the model's generation_config;
# gen_cfg is just a handle to that same object.
gen_cfg = model.generation_config
gen_cfg.no_repeat_ngram_size = 3
gen_cfg.num_beams = 4
gen_cfg.max_length = 128
model.generation_config = gen_cfg

# ---- Tokenization limits passed as max_length when tokenizing sources / targets
max_source_len, max_target_len = 512, 128

def preprocess(ex):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        ex: Batch mapping with ``"source"`` and ``"target"`` entries, as
            supplied by ``ds.map(..., batched=True)``.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under ``"labels"``.
    """
    model_in = tokenizer(
        ex["source"],
        max_length=max_source_len,
        truncation=True,
    )
    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=ex["target"],
        max_length=max_target_len,
        truncation=True,
    )
    model_in["labels"] = labels["input_ids"]
    return model_in

# Tokenize every split in batches; the raw text columns are removed so only
# the tokenizer outputs and labels remain.
tokenized = ds.map(
    preprocess,
    batched=True,
    remove_columns=ds["train"].column_names,
)
# Collator that assembles the tokenized examples into batches for this model.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# ---- Metrics (ROUGE, computed on generated text)
rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Strip leading/trailing whitespace from every prediction and reference.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of new lists holding the stripped strings.
    """
    cleaned = tuple([text.strip() for text in batch] for batch in (preds, labels))
    return cleaned

def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair handed over by the
            trainer; ``predictions`` may itself be a tuple whose first item
            holds the token ids.

    Returns:
        Dict with the ``rouge1``, ``rouge2`` and ``rougeL`` scores.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):  # some versions return (logits, ...)
        preds = preds[0]

    # -100 cannot be decoded. Labels use it for ignored positions, and the
    # trainer may also use it to pad predictions when it concatenates batches
    # of different lengths, so both arrays are mapped back to the pad token.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"],
    }

# ---- Training Args (fine-tune settings, cleaned)
args = Seq2SeqTrainingArguments(
    output_dir="./bart_rond_fixed_run",
    eval_strategy="epoch",           # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",           # same schedule as evaluation, so the best checkpoint can be reloaded
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,

    # Hyperparameters (closer to baseline)
    learning_rate=5e-5,              # back to baseline LR
    num_train_epochs=3,              # a bit longer than 2, but not crazy
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    warmup_ratio=0.03,               # small warmup
    lr_scheduler_type="linear",
    label_smoothing_factor=0.0,      # turn off smoothing for now
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present

    predict_with_generate=True,      # still evaluate on generated summaries
    logging_steps=100,
    report_to=[],
    seed=SEED,
)

# ---- Trainer
# Early stopping with a patience of one evaluation, built separately so the
# Seq2SeqTrainer call stays short.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

trainer = Seq2SeqTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# ---- Train
print("üöÄ Fine-tuning BART on ROND (fixed script)...")
trainer.train()

# ---- Evaluate & Generate on TEST
print("üß™ Generating test summaries & computing ROUGE...")
test_out = trainer.predict(tokenized["test"], metric_key_prefix="test")
print("‚úÖ Test metrics:", {k: float(v) for k, v in test_out.metrics.items() if k.startswith("test_")})

# Decode predictions + references and save a CSV
pred_ids = test_out.predictions
if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]
# -100 cannot be decoded; the trainer may use it to pad predictions when it
# concatenates batches, so map it back to the pad token before decoding.
pred_ids = np.where(pred_ids != -100, pred_ids, tokenizer.pad_token_id)
decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

# Rebuild refs from original test split to save a nice CSV
raw_test = ds["test"].to_pandas()
out_df = pd.DataFrame({
    # (?s) lets "." span newlines and the lazy group stops at the "\nInput: "
    # separator written by load_split, so a multi-line instruction is
    # recovered whole instead of being cut at its first line break.
    "instruction": raw_test["source"].str.extract(r"(?s)Instruction:\s*(.*?)\nInput: ", expand=False),
    "input_text":  raw_test["source"],
    "reference_summary": raw_test["target"],
    "predicted_summary": decoded_preds,
})
out_df.to_csv("rond_predictions_fixed.csv", index=False)
print("üíæ Saved test summaries ‚Üí rond_predictions_fixed.csv")


In [None]:
# ==== BART on ROND - LR = 1e-5 ====
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import evaluate
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
import torch

# ==== Paths (ROND: no chunking) ====
TRAIN = "./rond_train_processed.csv"
VAL   = "./rond_val_processed.csv"
TEST  = "./rond_test_processed.csv"

# ==== Load CSVs -> HuggingFace Datasets ====
def load_split(path):
    """Read one processed ROND CSV into a two-column Hugging Face ``Dataset``.

    ``source`` is the ``input`` column, prefixed with the ``instruction``
    column when the CSV has one; ``target`` is the ``output`` column.

    Args:
        path: CSV location, passed unchanged to ``pd.read_csv``.

    Returns:
        ``datasets.Dataset`` with string columns ``source`` and ``target``.

    Raises:
        KeyError: If the CSV lacks the ``input`` or ``output`` column.
    """
    df = pd.read_csv(path)

    missing = [col for col in ("input", "output") if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s): {missing}")

    # A row without a reference summary cannot be tokenized as a label; drop it
    # here (and report it) instead of failing later inside ds.map().
    has_target = df["output"].notna()
    if not has_target.all():
        print(f"[load_split] {path}: dropping {int((~has_target).sum())} row(s) with empty 'output'")
        df = df[has_target].reset_index(drop=True)

    # fillna("") before astype(str): on its own astype(str) turns a missing
    # cell into the literal text "nan" inside the model input.
    input_text = df["input"].fillna("").astype(str)
    if "instruction" in df.columns:
        instruction = df["instruction"].fillna("").astype(str)
        source = "Instruction: " + instruction + "\nInput: " + input_text
    else:
        source = input_text

    pairs = pd.DataFrame({"source": source, "target": df["output"].astype(str)})
    # preserve_index=False keeps the pandas index out of the dataset columns.
    return Dataset.from_pandas(pairs, preserve_index=False)

# One Dataset per split, loaded in train / validation / test order.
ds = DatasetDict({
    "train": load_split(TRAIN),
    "validation": load_split(VAL),
    "test": load_split(TEST),
})

# ==== Model & Tokenizer ====
MODEL_NAME = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Truncation limits passed as max_length when tokenizing sources / targets.
max_source_len = 512
max_target_len = 128

def preprocess(batch):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        batch: Batch mapping with ``"source"`` and ``"target"`` entries, as
            supplied by ``ds.map(..., batched=True)``.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under ``"labels"``.
    """
    model_in = tokenizer(batch["source"], max_length=max_source_len, truncation=True)
    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch["target"], max_length=max_target_len, truncation=True)
    model_in["labels"] = labels["input_ids"]
    return model_in

# Tokenize every split; the raw text columns are removed afterwards.
tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ==== Metrics (ROUGE-L) ====
rouge = evaluate.load("rouge")

def preprocess_logits_for_metrics(logits, labels):
    """Collapse model outputs to predicted token ids before the Trainer stores them.

    compute_metrics only ever uses the argmax over the last axis, so taking it
    here, batch by batch, avoids keeping the full logits of the whole
    evaluation set in memory.

    Args:
        logits: Model output tensor, or a tuple whose first item is the logits.
        labels: Label tensor of the batch (unused).

    Returns:
        Tensor of token ids (argmax over the last axis of the logits).
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def compute_metrics(eval_pred):
    """Decode predictions and references and return their ROUGE-L score.

    NOTE(review): with the plain ``Trainer`` the predictions come from a
    forward pass on the reference labels, not from ``generate()``; presumably
    these scores are not comparable to generation-based ROUGE - confirm
    before reporting them side by side.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair. Predictions may be token
            ids, 3-D logits, or a tuple whose first item is one of those.

    Returns:
        Dict with a single ``rougeL`` entry.
    """
    preds, labels = eval_pred

    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert torch tensors -> numpy first, so the logits check below also
    # applies to tensors (previously a 3-D tensor skipped the argmax).
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)  # convert logits -> token IDs

    # Positions whose label is ignored (-100) carry no meaningful prediction;
    # blank them so they are not decoded as extra tokens.
    if preds.shape == labels.shape:
        preds = np.where(labels != -100, preds, tokenizer.pad_token_id)

    # -100 cannot be decoded: it marks ignored labels and is also the value
    # used to pad arrays when batches of different lengths are concatenated.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {"rougeL": scores["rougeL"]}

# ==== TrainingArguments (LR = 1e-5) ====
args = TrainingArguments(
    output_dir="./bart_rond_lr1e-5",
    eval_strategy="epoch",           # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    learning_rate=1e-5,              # the value under test in this run
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    logging_dir="./logs_lr1e-5",
    logging_steps=100,
    report_to=[],
    disable_tqdm=False,
)

# ==== Trainer ====
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

print("üöÄ Training started (LR=1e-5)...\n")
train_result = trainer.train()

print("\nüîç Evaluating on test set (LR=1e-5)...\n")
test_metrics = trainer.evaluate(tokenized["test"], metric_key_prefix="test")
print("‚úÖ Test set metrics (LR=1e-5):", test_metrics)


In [None]:
# ==== BART on ROND - LR = 3e-5 ====
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import evaluate
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
import torch

# Processed ROND splits.
TRAIN = "./rond_train_processed.csv"
VAL   = "./rond_val_processed.csv"
TEST  = "./rond_test_processed.csv"

def load_split(path):
    """Read one processed ROND CSV into a two-column Hugging Face ``Dataset``.

    ``source`` is the ``input`` column, prefixed with the ``instruction``
    column when the CSV has one; ``target`` is the ``output`` column.

    Args:
        path: CSV location, passed unchanged to ``pd.read_csv``.

    Returns:
        ``datasets.Dataset`` with string columns ``source`` and ``target``.

    Raises:
        KeyError: If the CSV lacks the ``input`` or ``output`` column.
    """
    df = pd.read_csv(path)

    missing = [col for col in ("input", "output") if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s): {missing}")

    # A row without a reference summary cannot be tokenized as a label; drop it
    # here (and report it) instead of failing later inside ds.map().
    has_target = df["output"].notna()
    if not has_target.all():
        print(f"[load_split] {path}: dropping {int((~has_target).sum())} row(s) with empty 'output'")
        df = df[has_target].reset_index(drop=True)

    # fillna("") before astype(str): on its own astype(str) turns a missing
    # cell into the literal text "nan" inside the model input.
    input_text = df["input"].fillna("").astype(str)
    if "instruction" in df.columns:
        instruction = df["instruction"].fillna("").astype(str)
        source = "Instruction: " + instruction + "\nInput: " + input_text
    else:
        source = input_text

    pairs = pd.DataFrame({"source": source, "target": df["output"].astype(str)})
    # preserve_index=False keeps the pandas index out of the dataset columns.
    return Dataset.from_pandas(pairs, preserve_index=False)

# One Dataset per split, loaded in train / validation / test order.
ds = DatasetDict({
    "train": load_split(TRAIN),
    "validation": load_split(VAL),
    "test": load_split(TEST),
})

# Model and tokenizer are loaded from the same checkpoint name.
MODEL_NAME = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Truncation limits passed as max_length when tokenizing sources / targets.
max_source_len = 512
max_target_len = 128

def preprocess(batch):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        batch: Batch mapping with ``"source"`` and ``"target"`` entries, as
            supplied by ``ds.map(..., batched=True)``.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under ``"labels"``.
    """
    model_in = tokenizer(batch["source"], max_length=max_source_len, truncation=True)
    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch["target"], max_length=max_target_len, truncation=True)
    model_in["labels"] = labels["input_ids"]
    return model_in

# Tokenize every split; the raw text columns are removed afterwards.
tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ROUGE scorer used by compute_metrics below.
rouge = evaluate.load("rouge")

def preprocess_logits_for_metrics(logits, labels):
    """Collapse model outputs to predicted token ids before the Trainer stores them.

    compute_metrics only ever uses the argmax over the last axis, so taking it
    here, batch by batch, avoids keeping the full logits of the whole
    evaluation set in memory.

    Args:
        logits: Model output tensor, or a tuple whose first item is the logits.
        labels: Label tensor of the batch (unused).

    Returns:
        Tensor of token ids (argmax over the last axis of the logits).
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def compute_metrics(eval_pred):
    """Decode predictions and references and return their ROUGE-L score.

    NOTE(review): with the plain ``Trainer`` the predictions come from a
    forward pass on the reference labels, not from ``generate()``; presumably
    these scores are not comparable to generation-based ROUGE - confirm
    before reporting them side by side.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair. Predictions may be token
            ids, 3-D logits, or a tuple whose first item is one of those.

    Returns:
        Dict with a single ``rougeL`` entry.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert torch tensors -> numpy first, so the logits check below also
    # applies to tensors (previously a 3-D tensor skipped the argmax).
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)  # logits -> token ids

    # Positions whose label is ignored (-100) carry no meaningful prediction;
    # blank them so they are not decoded as extra tokens.
    if preds.shape == labels.shape:
        preds = np.where(labels != -100, preds, tokenizer.pad_token_id)

    # -100 cannot be decoded: it marks ignored labels and is also the value
    # used to pad arrays when batches of different lengths are concatenated.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {"rougeL": scores["rougeL"]}

args = TrainingArguments(
    output_dir="./bart_rond_lr3e-5",
    eval_strategy="epoch",           # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    learning_rate=3e-5,              # the value under test in this run
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    logging_dir="./logs_lr3e-5",
    logging_steps=100,
    report_to=[],
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

print("üöÄ Training started (LR=3e-5)...\n")
train_result = trainer.train()

print("\nüîç Evaluating on test set (LR=3e-5)...\n")
test_metrics = trainer.evaluate(tokenized["test"], metric_key_prefix="test")
print("‚úÖ Test set metrics (LR=3e-5):", test_metrics)


In [None]:
# ==== BART on ROND - LR = 5e-5 (baseline) ====
import os
os.environ["WANDB_DISABLED"] = "true"

import pandas as pd
import numpy as np
import evaluate
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)
import torch

# Processed ROND splits.
TRAIN = "./rond_train_processed.csv"
VAL   = "./rond_val_processed.csv"
TEST  = "./rond_test_processed.csv"

def load_split(path):
    """Read one processed ROND CSV into a two-column Hugging Face ``Dataset``.

    ``source`` is the ``input`` column, prefixed with the ``instruction``
    column when the CSV has one; ``target`` is the ``output`` column.

    Args:
        path: CSV location, passed unchanged to ``pd.read_csv``.

    Returns:
        ``datasets.Dataset`` with string columns ``source`` and ``target``.

    Raises:
        KeyError: If the CSV lacks the ``input`` or ``output`` column.
    """
    df = pd.read_csv(path)

    missing = [col for col in ("input", "output") if col not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing required column(s): {missing}")

    # A row without a reference summary cannot be tokenized as a label; drop it
    # here (and report it) instead of failing later inside ds.map().
    has_target = df["output"].notna()
    if not has_target.all():
        print(f"[load_split] {path}: dropping {int((~has_target).sum())} row(s) with empty 'output'")
        df = df[has_target].reset_index(drop=True)

    # fillna("") before astype(str): on its own astype(str) turns a missing
    # cell into the literal text "nan" inside the model input.
    input_text = df["input"].fillna("").astype(str)
    if "instruction" in df.columns:
        instruction = df["instruction"].fillna("").astype(str)
        source = "Instruction: " + instruction + "\nInput: " + input_text
    else:
        source = input_text

    pairs = pd.DataFrame({"source": source, "target": df["output"].astype(str)})
    # preserve_index=False keeps the pandas index out of the dataset columns.
    return Dataset.from_pandas(pairs, preserve_index=False)

# One Dataset per split, loaded in train / validation / test order.
ds = DatasetDict({
    "train": load_split(TRAIN),
    "validation": load_split(VAL),
    "test": load_split(TEST),
})

# Model and tokenizer are loaded from the same checkpoint name.
MODEL_NAME = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Truncation limits passed as max_length when tokenizing sources / targets.
max_source_len = 512
max_target_len = 128

def preprocess(batch):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        batch: Batch mapping with ``"source"`` and ``"target"`` entries, as
            supplied by ``ds.map(..., batched=True)``.

    Returns:
        The tokenizer output for the sources, with the target token ids
        stored under ``"labels"``.
    """
    model_in = tokenizer(batch["source"], max_length=max_source_len, truncation=True)
    # `text_target=` is the supported way to tokenize targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch["target"], max_length=max_target_len, truncation=True)
    model_in["labels"] = labels["input_ids"]
    return model_in

# Tokenize every split; the raw text columns are removed afterwards.
tokenized = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ROUGE scorer used by compute_metrics below.
rouge = evaluate.load("rouge")

def preprocess_logits_for_metrics(logits, labels):
    """Collapse model outputs to predicted token ids before the Trainer stores them.

    compute_metrics only ever uses the argmax over the last axis, so taking it
    here, batch by batch, avoids keeping the full logits of the whole
    evaluation set in memory.

    Args:
        logits: Model output tensor, or a tuple whose first item is the logits.
        labels: Label tensor of the batch (unused).

    Returns:
        Tensor of token ids (argmax over the last axis of the logits).
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

def compute_metrics(eval_pred):
    """Decode predictions and references and return their ROUGE-L score.

    NOTE(review): with the plain ``Trainer`` the predictions come from a
    forward pass on the reference labels, not from ``generate()``; presumably
    these scores are not comparable to generation-based ROUGE - confirm
    before reporting them side by side.

    Args:
        eval_pred: ``(predictions, label_ids)`` pair. Predictions may be token
            ids, 3-D logits, or a tuple whose first item is one of those.

    Returns:
        Dict with a single ``rougeL`` entry.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert torch tensors -> numpy first, so the logits check below also
    # applies to tensors (previously a 3-D tensor skipped the argmax).
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)  # logits -> token ids

    # Positions whose label is ignored (-100) carry no meaningful prediction;
    # blank them so they are not decoded as extra tokens.
    if preds.shape == labels.shape:
        preds = np.where(labels != -100, preds, tokenizer.pad_token_id)

    # -100 cannot be decoded: it marks ignored labels and is also the value
    # used to pad arrays when batches of different lengths are concatenated.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {"rougeL": scores["rougeL"]}

args = TrainingArguments(
    output_dir="./bart_rond_lr5e-5",
    eval_strategy="epoch",           # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    learning_rate=5e-5,              # baseline
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA GPU is present
    logging_dir="./logs_lr5e-5",
    logging_steps=100,
    report_to=[],
    disable_tqdm=False,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

print("üöÄ Training started (LR=5e-5)...\n")
train_result = trainer.train()

print("\nüîç Evaluating on test set (LR=5e-5)...\n")
test_metrics = trainer.evaluate(tokenized["test"], metric_key_prefix="test")
print("‚úÖ Test set metrics (LR=5e-5):", test_metrics)
