In [2]:
# Load datasets for summarization

import pandas as pd
from datasets import Dataset

# Read the training and validation splits from Excel workbooks, using paths
# relative to the current working directory.
# hf_format (defined below) reads the 'context' and 'summary' entries of every
# row of these frames, so both workbooks need those two columns.
train_df_s = pd.read_excel("summarization_train.xlsx")
val_df_s = pd.read_excel("summarization_validation.xlsx")

# Convert to Hugging Face dataset format
def hf_format(row):
    """Project a row down to the two fields used for summarization.

    Args:
        row: Any object indexable by the string keys 'context' and
            'summary' (for example a DataFrame row or a plain dict).

    Returns:
        dict: A new dict holding exactly the keys "context" and "summary",
        in that order, with the values looked up from ``row``.
    """
    source_text = row['context']
    target_text = row['summary']
    return dict(context=source_text, summary=target_text)

# Training split: run every row through hf_format, collect the resulting dicts
# into a list, rebuild a DataFrame from that list and wrap it as a Dataset.
# Note that train_df_s is rebound here from a pandas DataFrame to a Dataset.
formatted = train_df_s.apply(hf_format, axis=1).tolist()
formatted_df = pd.DataFrame(formatted)
train_df_s = Dataset.from_pandas(formatted_df)

# Validation split: identical conversion. `formatted` and `formatted_df` are
# reused as scratch names, so after this cell they hold the validation data.
formatted = val_df_s.apply(hf_format, axis=1).tolist()
formatted_df = pd.DataFrame(formatted)
val_df_s = Dataset.from_pandas(formatted_df)

Flan-T5 for summarization

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Token budgets used by preprocess_function: inputs are cut/padded to the
# first value, reference summaries to the second.
max_input_length, max_target_length = 512, 128

# One checkpoint identifier drives both the tokenizer and the seq2seq model.
flant5 = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(flant5)
model = AutoModelForSeq2SeqLM.from_pretrained(flant5)

def preprocess_function(examples):
    """Tokenize a batch of context/summary pairs for seq2seq training.

    Args:
        examples: Batch mapping with "context" and "summary" entries, as
            handed over by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the contexts (truncated and padded to
        ``max_input_length``) with an extra "labels" entry holding the
        ``input_ids`` of the summaries (truncated and padded to
        ``max_target_length``).
    """
    model_inputs = tokenizer(
        examples["context"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` is the supported way to encode targets; it replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # NOTE(review): padded label positions keep the pad token id rather than an
    # ignore index such as -100, so they presumably count toward the training
    # loss -- confirm. Masking them here would also require the test cell's
    # direct `batch_decode` of the labels to map the sentinel back first.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenise both splits in batches; the raw text columns are dropped so only
# the tokenizer outputs and the labels remain.
tokenised_train = train_df_s.map(
    preprocess_function,
    batched=True,
    remove_columns=train_df_s.column_names,
)
tokenised_val = val_df_s.map(
    preprocess_function,
    batched=True,
    remove_columns=val_df_s.column_names,
)

# Baseline training configuration. The hyperparameter-search cell further down
# rebinds `training_args` before any trainer is built from this object.
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-summarization",
    # optimisation
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # evaluation / generation
    do_eval=True,
    eval_steps=200,
    predict_with_generate=True,
    # logging and checkpointing
    logging_steps=100,
    save_steps=400,
    save_total_limit=2,
)

In [4]:
# Define compute_metrics function and load metrics

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import numpy as np

rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    """Score generated summaries against references with ROUGE-L and BERTScore.

    Args:
        eval_pred: Pair ``(preds, labels)``. ``preds`` may be a tuple (its
            first element is used), a 3-D array of logits (arg-maxed over the
            last axis) or an array of token ids. ``labels`` is an array of
            token ids in which -100 marks positions to ignore.

    Returns:
        dict: ``{"rougeL": float, "bertscore_f1": float}``. Both values are
        0.0 when no prediction/reference pair is non-empty after decoding.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]

    # Convert logits to token IDs if needed
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds, dtype=np.int64)
    # Negative ids (such as the -100 sentinel handled for labels below) cannot
    # be decoded. Map them to the pad token explicitly instead of clipping to
    # id 0, which is only correct when the pad token happens to be id 0.
    preds = np.where(preds < 0, pad_id, preds)
    # Keep ids inside the tokenizer's vocabulary (upper bound as before).
    preds = np.minimum(preds, tokenizer.vocab_size - 1)

    # Decode
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Clean and align: pairs with an empty side are dropped before scoring,
    # so they do not enter the averages.
    decoded_preds = [p.strip() for p in decoded_preds]
    decoded_labels = [l.strip() for l in decoded_labels]
    decoded_pairs = [(p, l) for p, l in zip(decoded_preds, decoded_labels) if p and l]
    if not decoded_pairs:
        return {"rougeL": 0.0, "bertscore_f1": 0.0}

    decoded_preds, decoded_labels = (list(column) for column in zip(*decoded_pairs))

    # Compute
    rouge_result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    bertscore_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    return {
        "rougeL": float(rouge_result["rougeL"]),
        "bertscore_f1": float(np.mean(bertscore_result["f1"]))
    }

In [None]:
# Find best hyperparameters 

# Grid of configurations to try: every learning rate with every batch size.
learning_rates = [5e-6, 3e-5, 5e-5]
batch_sizes = [4, 8]

# Start below any achievable score so the first finished trial is always
# recorded, even if its ROUGE-L is 0.
best_rouge = float("-inf")
best_bert = float("-inf")
best_args = None

for lr in learning_rates:
    for bs in batch_sizes:
        print(f"\n Trying lr={lr}, batch_size={bs}")

        # Every trial must start from the same pretrained checkpoint. Reusing
        # one model object would let each trial continue from the weights the
        # previous trial produced, making the scores incomparable.
        trial_model = AutoModelForSeq2SeqLM.from_pretrained(flant5)

        training_args = Seq2SeqTrainingArguments(
            output_dir="./flan-t5-summarization-temp",
            do_train=True,
            do_eval=True,
            eval_steps=200,
            logging_steps=100,
            save_steps=400,
            save_total_limit=2,
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            num_train_epochs=3,
            weight_decay=0.01,
            predict_with_generate=True,
            generation_max_length=256, # Set max summary length
            generation_num_beams=4
        )

        trainer = Seq2SeqTrainer(
            model=trial_model,
            args=training_args,
            train_dataset=tokenised_train,
            eval_dataset=tokenised_val,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        trainer.train()

        # Evaluate on validation set
        metrics = trainer.evaluate()

        val_rouge = metrics.get("eval_rougeL", 0)
        val_bert = metrics.get("eval_bertscore_f1", 0)

        # Use ROUGE-L to track the best model, use BERTScore as tiebreaker
        # (tuple comparison: ROUGE-L decides, BERTScore only matters on a tie).
        val_metric_for_best = val_rouge
        print(f"ROUGE-L F1 for lr={lr}, bs={bs}: {val_rouge}")
        print(f"BERTScore F1 for lr={lr}, bs={bs}: {val_bert}")

        if (val_metric_for_best, val_bert) > (best_rouge, best_bert):
            best_rouge = val_metric_for_best
            best_bert = val_bert
            best_args = (lr, bs)

print("Best hyperparameters (based on ROUGE-L):", best_args)

In [None]:
# Train Flan-T5 using the best hyperparameters

# The search cell leaves best_args as None when no configuration was selected;
# fail with a clear message instead of an unpacking error.
if best_args is None:
    raise RuntimeError(
        "best_args is None: run the hyperparameter search cell and make sure "
        "at least one configuration was selected."
    )
best_lr, best_bs = best_args

# Start the final run from the pretrained checkpoint. The `model` object from
# the setup cell may already have been fine-tuned by the search trials, so it
# is not reused here.
final_model = AutoModelForSeq2SeqLM.from_pretrained(flant5)

final_training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-summarization-final",
    do_train=True,
    do_eval=True,
    eval_steps=200,
    logging_steps=100,
    save_steps=400,
    save_total_limit=2,
    learning_rate=best_lr,
    per_device_train_batch_size=best_bs,
    per_device_eval_batch_size=best_bs,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=4
)

# `trainer` is kept as the name of the final trainer: the test cell reads the
# fine-tuned weights from `trainer.model`.
trainer = Seq2SeqTrainer(
    model=final_model,
    args=final_training_args,
    train_dataset=tokenised_train,
    eval_dataset=tokenised_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Validation metrics of the final model (as produced by compute_metrics).
metrics = trainer.evaluate()

print(metrics)

In [None]:
# Test Flan-T5 on test dataset

import pandas as pd
from datasets import Dataset
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch 
from evaluate import load
from torch.utils.data import DataLoader

# Load the held-out test split and convert it exactly like train/validation.
test_df = pd.read_excel("summarization_test.xlsx")

formatted = test_df.apply(hf_format, axis=1).tolist()
formatted_df = pd.DataFrame(formatted)
test_df_s = Dataset.from_pandas(formatted_df)

tokenised_test = test_df_s.map(preprocess_function, batched=True, remove_columns=test_df_s.column_names)
tokenised_test.set_format(type='torch', columns=["input_ids", "attention_mask", "labels"])

# Evaluation-only configuration; generation settings match the ones used when
# scoring the validation set during the search and the final training run.
final_eval_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-summarization-test",
    do_train=False,
    do_eval=True,
    per_device_eval_batch_size=best_bs,
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=4,
    logging_steps=50,
)

eval_trainer = Seq2SeqTrainer(
    model=trainer.model, # use fine-tuned Flan-t5 model
    args=final_eval_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenised_test
)

rouge = load("rouge")

# Generate the test summaries once. The trainer metrics and the ROUGE-1/2/L
# scores below are then computed from the same predictions, instead of from
# two separate beam-search passes with different length limits.
# metric_key_prefix="eval" keeps the metric names under the "eval_" prefix.
test_output = eval_trainer.predict(tokenised_test, metric_key_prefix="eval")
metrics = test_output.metrics

generated_ids = test_output.predictions
if isinstance(generated_ids, tuple):
    generated_ids = generated_ids[0]
# -100 is not a decodable token id; map it to the pad token before decoding.
generated_ids = np.where(generated_ids != -100, generated_ids, tokenizer.pad_token_id)
decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

# Reference ids come from the same prediction pass, so they are aligned with
# decoded_preds row by row.
reference_ids = test_output.label_ids
reference_ids = np.where(reference_ids != -100, reference_ids, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

# Compute ROUGE-1/2/L scores
rouge_scores = rouge.compute(
    predictions=decoded_preds,
    references=decoded_labels,
    rouge_types=["rouge1", "rouge2", "rougeL"]
)

print(metrics)

print("\nROUGE-1:", rouge_scores["rouge1"])
print("ROUGE-2:", rouge_scores["rouge2"])
print("ROUGE-L:", rouge_scores["rougeL"])

In [None]:
# Compute readability metrics (Flesch-Kincaid)

from textstat import flesch_kincaid_grade, flesch_reading_ease

print("\nSample generated summaries with readability metrics:")

# Zero-based positions of the examples to show in full.
sample_indices = (9, 99, 499, 999, 1999)

# Only positions that exist in both lists can be shown; a smaller test set
# must not abort the cell with an IndexError.
n_examples = min(len(decoded_preds), len(decoded_labels))

for i in sample_indices:
    if i >= n_examples:
        print(f"\n Sample {i+1} skipped: only {n_examples} test examples available")
        continue

    pred = decoded_preds[i].strip()
    label = decoded_labels[i].strip()
    fk_pred = flesch_kincaid_grade(pred)
    fk_label = flesch_kincaid_grade(label)
    fre_pred = flesch_reading_ease(pred)
    fre_label = flesch_reading_ease(label)

    print(f"\n Sample {i+1}")
    print(f"Predicted Summary:\n{pred}")
    print(f"Actual Summary:\n{label}")
    print(f"Flesch–Kincaid Grade (Pred): {fk_pred:.2f}, (Label): {fk_label:.2f}")
    print(f"Flesch Reading Ease (Pred): {fre_pred:.2f}, (Label): {fre_label:.2f}")

# Compute average readability for the entire test set
fk_pred_all = []
fk_label_all = []
fre_pred_all = []
fre_label_all = []

for pred, label in zip(decoded_preds, decoded_labels):
    pred = pred.strip()
    label = label.strip()
    if pred and label:  # skip empty
        fk_pred_all.append(flesch_kincaid_grade(pred))
        fk_label_all.append(flesch_kincaid_grade(label))
        fre_pred_all.append(flesch_reading_ease(pred))
        fre_label_all.append(flesch_reading_ease(label))


def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN for an empty list."""
    return sum(values) / len(values) if values else float("nan")


# NaN (printed as "nan") signals that no non-empty pair was available.
avg_fk_pred = _mean(fk_pred_all)
avg_fk_label = _mean(fk_label_all)
avg_fre_pred = _mean(fre_pred_all)
avg_fre_label = _mean(fre_label_all)

print("\nAverage readability scores for entire test set:")
print(f"Flesch–Kincaid Grade (Pred): {avg_fk_pred:.2f}, (Label): {avg_fk_label:.2f}")
print(f"Flesch Reading Ease (Pred): {avg_fre_pred:.2f}, (Label): {avg_fre_label:.2f}")