Installations


In [None]:
!pip install Rouge
!pip install datasets

In [None]:
# Mount Google Drive at /content/drive (Colab-only module); the train/test CSV
# files are later read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import time
from collections import Counter
from rouge import Rouge
import os
import spacy
from tqdm import tqdm
import pandas as pd
import torch
import numpy as np

from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)


# Download the NLTK resources used by this notebook:
#   - 'punkt'     : tokenizer data (unzipped under tokenizers/)
#   - 'stopwords' : corpus behind the English stop-word set built further down
#   - 'wordnet'   : presumably the data backing WordNetLemmatizer — confirm
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Additional tokenizer resource (unzipped under tokenizers/punkt_tab).
# NOTE(review): presumably required by the installed NLTK version's tokenizers
# in addition to 'punkt' — confirm it is still needed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Seed every random number generator the notebook draws from so that a
# Restart & Run All reproduces the same split, training run and outputs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed all visible GPUs, not just the current one. PyTorch documents this call
# as silently ignored when CUDA is unavailable, so no availability guard (and
# no conditional-expression-as-statement) is needed.
torch.cuda.manual_seed_all(SEED)

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Read the corpus through a private alias. The public name `stopwords` is
# rebound to a plain set below; without the alias, running this cell a second
# time would call `.words` on that set and fail with AttributeError.
from nltk.corpus import stopwords as _nltk_stopwords

# English stop words as a set for O(1) membership tests
stopwords = set(_nltk_stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# Load the small English spaCy pipeline, downloading it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy reports a missing model package as OSError; anything else
    # (e.g. KeyboardInterrupt) should propagate instead of triggering a download.
    import subprocess
    import sys

    # Use the kernel's own interpreter and an argument list (no shell), and
    # fail loudly if the download itself does not succeed.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

In [None]:
print("Loading data...")
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

# Hold out 500 randomly sampled training rows as the validation set and keep
# only the remaining rows for training.
val_df = train_df.sample(n=500, random_state=42)
held_out = train_df.index.isin(val_df.index)
train_df = train_df[~held_out]



Loading data...


In [None]:
#Helper function to calculate Rouge scores
def calculate_rouge(generated_titles, reference_titles):
    """Calculate averaged ROUGE scores between generated and reference titles.

    Args:
        generated_titles: Iterable of generated (hypothesis) titles.
        reference_titles: Iterable of reference titles, paired positionally
            with ``generated_titles``. Pairs are formed with ``zip``, so any
            surplus items in the longer iterable are ignored.

    Returns:
        The dict returned by ``Rouge().get_scores(..., avg=True)``, keyed by
        'rouge-1', 'rouge-2' and 'rouge-l', each holding 'f', 'p' and 'r'.
        All-zero scores are returned when there is nothing to score or the
        scorer raises.
    """
    def _zero_scores():
        return {
            metric: {'f': 0.0, 'p': 0.0, 'r': 0.0}
            for metric in ('rouge-1', 'rouge-2', 'rouge-l')
        }

    def _scorable(text):
        # Missing values (None / NaN) appear when a DataFrame column has gaps
        if text is None or (isinstance(text, float) and text != text):
            return "empty"
        text = str(text)
        # ROUGE requires non-empty strings. The scorer also splits text on "."
        # first, so a title made only of dots/whitespace is just as unusable
        # and would otherwise make the whole batch fall into the except branch.
        if not text.replace(".", "").strip():
            return "empty"
        return text

    hyps = []
    refs = []
    for gen, ref in zip(generated_titles, reference_titles):
        hyps.append(_scorable(gen))
        refs.append(_scorable(ref))

    # Nothing to score: report it the same way as a scorer failure instead of
    # crashing while unpacking an empty list of pairs.
    if not hyps:
        print("Error calculating ROUGE scores: no title pairs to score")
        return _zero_scores()

    try:
        # Calculate ROUGE scores averaged over all pairs
        return Rouge().get_scores(hyps, refs, avg=True)
    except Exception as e:
        print(f"Error calculating ROUGE scores: {e}")
        # Return default scores in case of error (deliberate best-effort)
        return _zero_scores()

# Function to print Rouge scores
def print_rouge_scores(scores, model_name):
    """Print the F-measure of ROUGE-1, ROUGE-2 and ROUGE-L for one model.

    Args:
        scores: Mapping with 'rouge-1', 'rouge-2' and 'rouge-l' entries, each
            a mapping that contains an 'f' value.
        model_name: Label printed in the header line.
    """
    print(f"ROUGE scores for {model_name}:")
    labelled_keys = (
        ("ROUGE-1", 'rouge-1'),
        ("ROUGE-2", 'rouge-2'),
        ("ROUGE-L", 'rouge-l'),
    )
    for label, key in labelled_keys:
        f_measure = scores[key]['f']
        print(f"{label}: {f_measure:.4f}")

# Convert DataFrames to HuggingFace datasets
def convert_to_dataset(df):
    """Convert a pandas DataFrame into a HuggingFace ``Dataset``.

    The DataFrame index is deliberately not carried over: frames produced by
    sampling/dropping rows no longer have a default range index, and
    ``from_pandas`` would otherwise store that index as an extra
    ``__index_level_0__`` column next to the real data columns.

    Args:
        df: DataFrame whose columns become the dataset columns.

    Returns:
        The ``Dataset`` built from ``df``'s columns.
    """
    return Dataset.from_pandas(df, preserve_index=False)

In [None]:
def prepare_datasets_for_t5(train_df, val_df, test_df, tokenizer, max_input_length=512, max_target_length=30):
    """Tokenize the train/validation/test frames for T5 title generation.

    Every frame must provide a "text" column (model input) and a "title"
    column (target).

    Args:
        train_df, val_df, test_df: DataFrames to convert and tokenize.
        tokenizer: Tokenizer of the model being trained.
        max_input_length: Inputs are truncated/padded to this many tokens.
        max_target_length: Targets are truncated/padded to this many tokens.

    Returns:
        Tuple ``(train_dataset, val_dataset, test_dataset)`` where each
        dataset carries the tokenizer outputs plus a ``labels`` column.
    """
    pad_token_id = tokenizer.pad_token_id

    # Preprocessing function applied to batches of rows
    def preprocess_function(examples):
        # T5 expects inputs in the format: "summarize: {text}"
        inputs = ["summarize: " + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

        # Tokenize the targets through `text_target` (replaces the deprecated
        # `as_target_tokenizer()` context manager).
        labels = tokenizer(
            text_target=examples["title"],
            max_length=max_target_length,
            truncation=True,
            padding="max_length",
        )

        # Replace padding ids with -100 so the loss ignores them. Leaving the
        # pad ids in place makes the model get scored on predicting padding,
        # which deflates the reported loss and dilutes the training signal.
        model_inputs["labels"] = [
            [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]
        return model_inputs

    # Convert to HuggingFace datasets and apply preprocessing
    train_dataset = convert_to_dataset(train_df).map(preprocess_function, batched=True)
    val_dataset = convert_to_dataset(val_df).map(preprocess_function, batched=True)
    test_dataset = convert_to_dataset(test_df).map(preprocess_function, batched=True)

    return train_dataset, val_dataset, test_dataset

def _generate_t5_titles(model, tokenizer, texts, desc, **generate_kwargs):
    """Generate one title per article using the "summarize: " task prefix.

    Args:
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        texts: List of article strings.
        desc: Label shown on the tqdm progress bar.
        **generate_kwargs: Extra decoding options forwarded to
            ``model.generate`` in addition to ``max_length=30``.

    Returns:
        List of decoded titles, in the order of ``texts``.
    """
    titles = []
    for text in tqdm(texts, total=len(texts), desc=desc):
        inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
        output = model.generate(**inputs, max_length=30, **generate_kwargs)
        titles.append(tokenizer.decode(output[0], skip_special_tokens=True))
    return titles


def train_t5_model(model_name="google-t5/t5-small"):
    """Fine-tune a T5 checkpoint on the title data and score it on the test set.

    Reads the notebook-level ``train_df``, ``val_df`` and ``test_df`` frames
    and the notebook-level ``device``.

    Args:
        model_name: Hub identifier of the checkpoint to fine-tune.

    Returns:
        Tuple ``(generated_titles_greedy, generated_titles_beam,
        rouge_scores_greedy, rouge_scores_beam)`` for the test set.
    """
    start_time = time.time()
    print(f"\nTraining and evaluating {model_name}...")

    # Last path component of the hub id, used for the output/log directories
    run_name = model_name.split('/')[-1]

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    # Prepare datasets
    train_dataset, val_dataset, test_dataset = prepare_datasets_for_t5(train_df, val_df, test_df, tokenizer)

    # Data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # Training arguments
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version before upgrading.
    training_args = Seq2SeqTrainingArguments(
        output_dir=f"./results/{run_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=5,
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        logging_dir=f"./logs/{run_name}",
        logging_steps=100,
        report_to="none"  # Disable wandb, etc.
    )

    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Train the model
    print("Starting training...")
    trainer.train()

    # Make sure dropout is off for generation, whatever mode training left the
    # model in; otherwise the generated titles may vary between runs.
    model.eval()

    reference_titles = test_df["title"].tolist()
    article_texts = test_df["text"].tolist()

    # Generate titles for test set with greedy search
    print("Generating titles with greedy search...")
    generated_titles_greedy = _generate_t5_titles(
        model, tokenizer, article_texts, desc="Generating titles"
    )

    # Calculate ROUGE scores for greedy search
    rouge_scores_greedy = calculate_rouge(generated_titles_greedy, reference_titles)
    print_rouge_scores(rouge_scores_greedy, f"{model_name} with greedy search")

    # Generate titles with beam search
    print("Generating titles with beam search...")
    generated_titles_beam = _generate_t5_titles(
        model,
        tokenizer,
        article_texts,
        desc="Generating titles with beam search",
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=2,
    )

    # Calculate ROUGE scores for beam search
    rouge_scores_beam = calculate_rouge(generated_titles_beam, reference_titles)
    print_rouge_scores(rouge_scores_beam, f"{model_name} with beam search")

    end_time = time.time()
    print(f"Total time for {model_name}: {end_time - start_time:.2f} seconds")

    return generated_titles_greedy, generated_titles_beam, rouge_scores_greedy, rouge_scores_beam


In [None]:
def evaluate_flan_t5_with_prompts(model_name, prompts):
    """Score a pretrained seq2seq checkpoint on the test set, one prompt at a time.

    Reads the notebook-level ``test_df`` ("text" and "title" columns) and
    ``device``. No training takes place; titles are generated with beam search.

    Args:
        model_name: Hub identifier of the checkpoint to load.
        prompts: Iterable of template strings containing a ``{text}``
            placeholder that is filled with the article text.

    Returns:
        Dict mapping each prompt template to
        ``{"generated_titles": [...], "rouge_scores": {...}}``.
    """
    started_at = time.time()
    print(f"\nEvaluating {model_name} with different prompts...")

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    reference_titles = test_df["title"].tolist()

    def title_for(article_text, template):
        # Fill the template with the article, then decode the top beam
        encoded = tokenizer(
            template.format(text=article_text),
            return_tensors="pt",
            max_length=512,
            truncation=True,
        ).to(device)
        beams = model.generate(
            **encoded,
            max_length=30,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2
        )
        return tokenizer.decode(beams[0], skip_special_tokens=True)

    results = {}
    for prompt_template in prompts:
        print(f"Using prompt: '{prompt_template}'")
        generated_titles = [
            title_for(row["text"], prompt_template)
            for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Generating titles")
        ]

        # Score this prompt and keep both the titles and the scores
        rouge_scores = calculate_rouge(generated_titles, reference_titles)
        print_rouge_scores(rouge_scores, f"{model_name} with prompt: '{prompt_template}'")
        results[prompt_template] = {
            "generated_titles": generated_titles,
            "rouge_scores": rouge_scores
        }

    elapsed = time.time() - started_at
    print(f"Total time for {model_name}: {elapsed:.2f} seconds")

    return results



In [None]:
# Part C1: Fine-tune t5-small model
# train_t5_model returns, in order: the greedy-search titles, the beam-search
# titles, the greedy ROUGE scores and the beam-search ROUGE scores (all
# computed on the test set).
print("\n========= Part C1: Fine-tuning T5 =========")
t5_greedy, t5_beam, t5_rouge_greedy, t5_rouge_beam = train_t5_model("google-t5/t5-small")



Training and evaluating google-t5/t5-small...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/13379 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0867,0.073631
2,0.075,0.063886
3,0.0686,0.062135
4,0.0647,0.05993
5,0.0651,0.059944


Generating titles with greedy search...


Generating titles: 100%|██████████| 100/100 [00:10<00:00,  9.63it/s]


ROUGE scores for google-t5/t5-small with greedy search:
ROUGE-1: 0.8780
ROUGE-2: 0.6778
ROUGE-L: 0.8780
Generating titles with beam search...


Generating titles with beam search: 100%|██████████| 100/100 [00:16<00:00,  6.05it/s]

ROUGE scores for google-t5/t5-small with beam search:
ROUGE-1: 0.8719
ROUGE-2: 0.6678
ROUGE-L: 0.8719
Total time for google-t5/t5-small: 1706.30 seconds





In [None]:
print("\n========= Part C2: Prompt Engineering with Flan-T5 =========")

# Define prompts to try
# Each template must contain a `{text}` placeholder: evaluate_flan_t5_with_prompts
# fills it with the article via `str.format(text=...)`. The prompted input is
# tokenized with truncation at 512 tokens, so the instruction is kept ahead of
# the article (presumably so that truncation drops article text rather than
# the instruction — confirm the tokenizer's truncation side).
prompts = [
    "Generate a title for this Wikipedia article: {text}",
    "Create a concise, informative title for the following text: {text}",
    "Summarize the following article into a short title: {text}",
    "What would be an appropriate title for this article? {text}"
]





In [None]:
# Evaluate base model
# NOTE(review): the base model is scored only on the first two prompts
# (prompts[:2]) and the large model only on the remaining ones (prompts[2:]),
# so no prompt is evaluated on both models — confirm this split is intentional
# before comparing base vs. large results.
print("\nEvaluating Flan-T5-base")
flan_t5_base_results = evaluate_flan_t5_with_prompts("google/flan-t5-base", prompts[:2])

# Evaluate large model
print("\nEvaluating Flan-T5-large")
flan_t5_large_results = evaluate_flan_t5_with_prompts("google/flan-t5-large", prompts[2:])
