# Fine-tune T5-Small for News Headline Generation

This notebook fine-tunes the T5-small model on the CNN/DailyMail dataset to generate short, headline-style summaries of news articles. The dataset's article highlights are used as the training targets.


## 1. Setup and Imports


In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
from transformers import pipeline
import numpy as np

# Prefer CUDA when PyTorch reports a usable GPU; otherwise fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


## 2. Load CNN/DailyMail Dataset

We'll use a subset of the dataset for faster training. The dataset contains news articles with highlights that we'll use as target headlines.


In [None]:
# Load the CNN/DailyMail dataset, config "3.0.0"
print("Loading CNN/DailyMail dataset...")
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Work on fixed-size subsets so training stays short:
# 10,000 training examples and 1,000 validation examples.
train_size = 10000
val_size = 1000


def _sample_split(split_name, sample_count):
    """Shuffle one split with seed 42 and keep its first `sample_count` rows."""
    return dataset[split_name].shuffle(seed=42).select(range(sample_count))


train_dataset = _sample_split("train", train_size)
val_dataset = _sample_split("validation", val_size)

# Report the subset sizes and preview the first training example
first_example = train_dataset[0]
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print("\nExample article:")
print(first_example["article"][:300])
print("\nExample highlights:")
print(first_example["highlights"])


## 3. Initialize Model and Tokenizer


In [None]:
# Checkpoint name shared by the tokenizer and the model
model_name = "t5-small"

# Instantiate the tokenizer first, then the seq2seq model, from the same checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Report what was loaded, with the parameter count formatted using thousands separators
parameter_count = model.num_parameters()
print(f"Model loaded: {model_name}")
print(f"Model parameters: {parameter_count:,}")


## 4. Preprocess Dataset

T5 is a text-to-text model that is told which task to perform through a text prefix, so each input is formatted as "summarize: [article]" and the target is the summary/headline text.


In [None]:
# Token budgets used when tokenizing; longer sequences are truncated to these lengths
max_input_length = 512  # Maximum tokens for input article
max_target_length = 128  # Maximum tokens for output headline

def preprocess_function(examples):
    """Tokenize a batch of articles and highlights for T5.

    Sequences are truncated but deliberately NOT padded here. Padding every
    example to the maximum length would store pad-token ids inside "labels",
    so the loss would also be computed on padding positions, and it would make
    every batch full-length regardless of content. Padding is instead left to
    the batch collator (DataCollatorForSeq2Seq), which pads each batch to its
    longest member and fills label padding with the ignore index (-100).

    Args:
        examples: Batch mapping with "article" and "highlights" keys, each
            holding a list of strings.

    Returns:
        The tokenizer output for the prefixed articles, with an added
        "labels" entry containing the (unpadded) target token ids.
    """
    # Prepend the task prefix the inputs are formatted with
    inputs = ["summarize: " + doc for doc in examples["article"]]

    # Tokenize inputs (truncate only; no padding at this stage)
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize targets (highlights/summaries) through the target-text path
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def _tokenize_split(split):
    """Run preprocess_function over a split in batches, dropping its original columns."""
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )


# Tokenize the training subset first, then the validation subset
print("Tokenizing datasets...")
tokenized_train = _tokenize_split(train_dataset)
tokenized_val = _tokenize_split(val_dataset)
print("Tokenization complete!")


## 5. Configure Training Arguments


In [None]:
# Make sure the checkpoint directory exists before the Trainer writes to it
output_dir = "./models/t5-small-headlines"
os.makedirs(output_dir, exist_ok=True)

# Hyperparameters, grouped by concern
training_args = TrainingArguments(
    # Output locations
    output_dir=output_dir,
    logging_dir="./logs",
    push_to_hub=False,
    # Optimisation
    num_train_epochs=3,
    learning_rate=3e-4,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is available
    # Logging / evaluation / checkpoint cadence, in steps
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    # Reload the best checkpoint, as ranked by eval_loss, once training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Echo the main settings back for a quick sanity check
print("Training configuration:")
config_summary = (
    ("Epochs", training_args.num_train_epochs),
    ("Batch size", training_args.per_device_train_batch_size),
    ("Learning rate", training_args.learning_rate),
    ("Output directory", training_args.output_dir),
)
for label, value in config_summary:
    print(f"  {label}: {value}")


## 6. Initialize Trainer and Start Training


In [None]:
# Seq2seq batch collator; it receives the model as well as the tokenizer
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)

# Gather the Trainer inputs in one place, then build the Trainer from them.
# NOTE(review): newer transformers releases appear to rename the `tokenizer`
# argument to `processing_class`; confirm against the installed version.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_val,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

# Banner printed ahead of the training cell
print("Starting training...")
print("This may take 30-60 minutes depending on your hardware.")
print("=" * 50)


In [None]:
# Run fine-tuning with the `trainer` built in the previous cell. As the last
# expression of the cell, the value returned by train() is shown as the cell output.
trainer.train()


## 7. Save the Fine-tuned Model


In [None]:
# Directory that receives the final model and tokenizer
final_model_path = "./models/t5-small-headlines-final"

# Write the model first, then the tokenizer, into the same directory
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(final_model_path)

print(f"\nModel saved to: {final_model_path}")
print("Training complete!")


## 8. Quick Test of the Fine-tuned Model


In [None]:
# Reload the model and tokenizer from the directory they were just saved to,
# so this cell exercises the on-disk artifacts rather than in-memory objects
fine_tuned_model = T5ForConditionalGeneration.from_pretrained(final_model_path)
fine_tuned_tokenizer = T5Tokenizer.from_pretrained(final_model_path)

# Create a summarization pipeline (GPU 0 when available, otherwise CPU)
summarizer = pipeline(
    "summarization",
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# Fetch the first test example once and reuse it for both fields
test_example = dataset["test"][0]
test_article = test_example["article"]
original_highlights = test_example["highlights"]

print("Original article (first 500 chars):")
print(test_article[:500])
print("\n" + "="*50)
print("\nOriginal highlights:")
print(original_highlights)
print("\n" + "="*50)
print("\nGenerated headline:")
# truncation=True asks the pipeline to cut over-long articles at the tokenizer's
# length limit instead of feeding the full text; training inputs were truncated
# to 512 tokens, so this keeps inference within the length the model was tuned on.
result = summarizer(
    test_article,
    max_length=128,
    min_length=10,
    do_sample=False,
    truncation=True,
)
print(result[0]["summary_text"])


## Next Steps

Now that the model is trained, proceed to `headline_summarizer.ipynb` to:
- Use the fine-tuned model for headline generation
- Integrate with LangChain for flexible prompting
- Test different headline styles (exciting, formal, short, etc.)
