# **Task KDT04: Text Summarization Using NLP**

**Requirements:**
-  Use T5, BART, or PEGASUS from Hugging Face
-  Train on a small custom dataset (or use CNN/DailyMail)
-  Generate summaries for at least 5 sample paragraphs

**Bonus:**
-  Compare model-generated summaries vs. extractive method (e.g., spaCy)
- Include a web form to paste text and view a summary

## Installing Required Libraries and Modules

In [None]:
pip install -q datasets sacrebleu evaluate rouge_score

In [None]:
pip install -U datasets fsspec

In [None]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollator, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback

from datasets import load_dataset, DatasetDict

import evaluate

In [None]:
from transformers import (
    T5ForConditionalGeneration,
    T5TokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import load_dataset
import evaluate
import numpy as np
import torch


In [None]:
# from datasets import load_dataset

# ds = load_dataset("cnn_dailymail", "1.0.0", split="train[:20000]")

In [None]:
# ds

## Merging Multiple Versions of the CNN/DailyMail Dataset from Hugging Face

In [None]:
from datasets import load_dataset, concatenate_datasets, DatasetDict

# Take the first 20,000 training rows from each of the three published
# configurations of CNN/DailyMail and stack them into one dataset.
# NOTE(review): these configs are versions of the same corpus. If the first
# 20,000 rows are the same articles in each, the random splits made later will
# put copies of one article in both train and test -- confirm, and consider
# loading a single version instead.
v1, v2, v3 = (
    load_dataset("cnn_dailymail", config_name, split="train[:20000]")
    for config_name in ("1.0.0", "2.0.0", "3.0.0")
)

data = concatenate_datasets([v1, v2, v3])

In [None]:
# Drop the "id" column from the merged dataset.
data = data.remove_columns(["id"])

In [None]:
data

In [None]:
# Hold out 20% of the merged data as the final test set.
# The seed is fixed so the split is identical on every run; without it the
# test set changes each session while later cells reload a fixed checkpoint.
d1 = data.train_test_split(test_size=0.2, seed=42)

In [None]:
d1

In [None]:
# d1['train']

In [None]:
# Carve 20% of the remaining training portion off as the validation set.
# The seed is fixed so train/validation membership is reproducible.
d2 = d1['train'].train_test_split(test_size=0.2, seed=42)

In [None]:
d2

### Splitting Data into Train/Val/Test

In [None]:
# Assemble the three-way split: validation comes out of d1's training portion
# (via d2), and d1's held-out portion becomes the test set.
final_data = DatasetDict(
    {
        'train': d2['train'],
        'validation': d2['test'],
        'test': d1['test'],
    }
)

In [None]:
final_data

## Loading Pretrained T5 Model & Tokenizer

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Both the tokenizer and the model start from the same pretrained
# "google-t5/t5-small" checkpoint.
tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

## Preprocessing Function for Tokenization

In [None]:
max_input_length = 512
max_target_length = 128

def preprocess(example):
    """Tokenize a batch of articles and their highlights for seq2seq training.

    Args:
        example: A batch of rows (the function is mapped with ``batched=True``)
            exposing list-valued ``"article"`` and ``"highlights"`` columns.

    Returns:
        The tokenized article inputs, with an added ``"labels"`` entry holding
        the highlight token ids. Padding positions in the labels are replaced
        by -100.
    """
    # Every article gets the "summarize: " task prefix before tokenization.
    inputs = ["summarize: " + article for article in example["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the targets.
    labels = tokenizer(
        text_target=example["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )

    # Swap pad tokens for -100 in the labels.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_seq]
        for label_seq in labels["input_ids"]
    ]
    return model_inputs

tokenized_dataset = final_data.map(preprocess, batched=True, remove_columns=["article", "highlights"])


## Metric Computation Setup

In [None]:
# Metric
import evaluate
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        A dict with ``rouge1``, ``rouge2``, ``rougeL`` and ``rougeLsum``,
        each scaled by 100.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id and cannot be decoded. It is the ignore
    # index in the labels and may also appear as padding in the gathered
    # predictions, so replace it with the pad token in both arrays.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip leading/trailing whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Compute ROUGE scores
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Report each score scaled by 100
    return {
        name: result[name] * 100
        for name in ("rouge1", "rouge2", "rougeL", "rougeLsum")
    }


# Collator that batches the tokenized examples for the seq2seq trainer
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Define Training Arguments and Training the Model

In [None]:
# Training arguments
# Evaluation, checkpointing and logging all happen once per epoch, and the
# checkpoint with the highest validation rougeL is reloaded at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir="./T5-small/results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    predict_with_generate=True,
    # Let evaluation generate summaries as long as the tokenized references.
    # Without this the model's default generation length applies, which cuts
    # predictions short and distorts the ROUGE used for model selection.
    generation_max_length=max_target_length,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="rougeL",
    greater_is_better=True,
    run_name="T5-small-summarization",
    fp16=True
)


In [None]:

# Trainer
# Wires the model, tokenized splits, collator and ROUGE metric together.
# Early stopping is configured with a patience of 2.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train
# trainer.train() returns a training-result object, not the model, so it is
# stored under its own name; `model` keeps referring to the network being
# fine-tuned.
train_result = trainer.train()

## Load Fine-Tuned Model from Latest Checkpoint and Inference on 10 Sample Inputs

In [None]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Restore the fine-tuned tokenizer and weights from the saved checkpoint
model_path = "/content/T5-small/results/checkpoint-14400"  # path to fine-tuned model
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# 10 sample input texts for summarization.
# Each string already carries the "summarize: " task prefix, the same prefix
# that preprocess() prepends to every training article.
sample_texts = [
    "summarize: Climate change is accelerating faster than previously predicted. Global temperatures are rising, leading to extreme weather events like hurricanes, droughts, and floods. Scientists urge immediate reduction of greenhouse gas emissions to prevent irreversible damage.",

    "summarize: The Indian economy grew at 6.1% in the last quarter, driven by strong performance in manufacturing and services. The Reserve Bank of India may consider adjusting interest rates in response to inflation trends and global economic pressures.",

    "summarize: In the final match of the FIFA World Cup, Argentina beat France in a thrilling penalty shootout. Messi scored twice, securing his legacy as one of the greatest footballers of all time. The match was widely regarded as one of the most exciting in history.",

    "summarize: Apple's new iPhone 15 introduces a periscope zoom camera, titanium frame, and USB-C port. It also features improvements in battery life and AI-powered photography. Reviews have been largely positive, praising performance and design.",

    "summarize: A new study shows that a Mediterranean diet, rich in vegetables, olive oil, and fish, can significantly reduce the risk of heart disease. Participants also reported improved mental clarity and energy levels.",

    "summarize: The Artemis I mission successfully completed its journey around the moon and returned safely to Earth. NASA plans to send astronauts to the moon in the next phase, establishing a long-term human presence as part of the Artemis program.",

    "summarize: The COVID-19 pandemic exposed vulnerabilities in global healthcare systems. Many countries faced shortages of medical supplies, staff, and ICU beds. Lessons learned have led to new policies aimed at future pandemic preparedness.",

    "summarize: Elon Musk's acquisition of Twitter sparked both praise and controversy. He implemented mass layoffs, introduced paid verification, and promised more transparency in algorithms. Users and advertisers remain divided over the platform’s direction.",

    "summarize: Researchers have developed a breakthrough cancer treatment using mRNA technology. Early trials show promising results in targeting tumors with minimal side effects, potentially revolutionizing oncology in the next decade.",

    "summarize: A massive volcanic eruption in Iceland disrupted air travel across Europe. Ash clouds spread rapidly, grounding flights and affecting millions of travelers. Emergency services were deployed to monitor and manage the situation."
]

# Generate summaries
# Beam-search settings shared by every sample in this cell.
generation_options = dict(
    max_length=150,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)

for number, prompt in enumerate(sample_texts, start=1):
    # Encode one prompt, truncated to 512 tokens
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    generated = model.generate(input_ids, **generation_options)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    print(f"\nText {number} Summary:\n{decoded}")


In [None]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Reload the fine-tuned tokenizer and model from the saved checkpoint
tokenizer = T5TokenizerFast.from_pretrained("/content/T5-small/results/checkpoint-14400")
model = T5ForConditionalGeneration.from_pretrained("/content/T5-small/results/checkpoint-14400")

## Generate summaries for at least 5 sample paragraphs

In [None]:
from transformers import T5ForConditionalGeneration, T5TokenizerFast

# Point at the fine-tuned checkpoint, then restore tokenizer and model from it
model_path = "/content/T5-small/results/checkpoint-14400"
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# 5 long texts.
# Each paragraph already starts with the "summarize: " task prefix, the same
# prefix that preprocess() prepends to every training article.
long_texts = [
    "summarize: The global climate crisis has reached a critical point, with temperatures rising faster than expected. Scientists have observed a dramatic increase in ice melt in both the Arctic and Antarctic regions. Sea levels are projected to rise significantly by the end of the century, threatening coastal cities worldwide. Forest fires, heatwaves, and hurricanes are becoming more frequent and intense, indicating the need for urgent action. International agreements such as the Paris Accord have called for reducing emissions, but many countries are falling short of their targets. Experts emphasize the need for systemic change in energy production, transportation, and agriculture to curb greenhouse gases effectively.",

    "summarize: India's digital transformation has accelerated rapidly in the last decade, driven by initiatives like Digital India, UPI, and Aadhaar. The country's fintech sector is now one of the fastest-growing in the world, with millions of users adopting mobile wallets, online banking, and digital lending platforms. Government schemes have helped bring internet access to rural areas, empowering small businesses and farmers through e-commerce and mobile applications. However, challenges like digital literacy, cybersecurity, and data privacy remain significant. Experts suggest that continued investment in infrastructure and policy reform will be essential to sustain this digital revolution and make it inclusive.",

    "summarize: The Artemis program marks NASA’s ambitious return to lunar exploration. Unlike previous missions, Artemis aims to create a sustainable human presence on the moon. Artemis I successfully tested the Space Launch System and Orion spacecraft in an uncrewed mission around the moon. Artemis II will carry astronauts, and Artemis III plans to land the first woman and person of color on the lunar surface. These missions are stepping stones toward Mars exploration. NASA is collaborating with private partners like SpaceX to develop lunar landers and support systems. Scientists hope to establish a lunar base for research, resource utilization, and testing technologies for deep space travel.",

    "summarize: Advances in artificial intelligence are reshaping industries across the globe. From natural language processing to computer vision, AI technologies are improving productivity, decision-making, and customer experiences. Healthcare has seen remarkable applications, such as AI-driven diagnostics, personalized treatment plans, and drug discovery. In finance, algorithms detect fraud and automate trading. However, ethical concerns around bias, surveillance, and job displacement are growing. Policymakers are debating regulations to ensure AI is used responsibly. Transparency, accountability, and fairness are crucial to building trust in AI systems. As AI continues to evolve, experts believe human oversight and ethical frameworks will be key to guiding its impact.",

    "summarize: The COVID-19 pandemic has reshaped how societies function, highlighting the importance of resilience and preparedness. Governments worldwide implemented lockdowns, contact tracing, and mass vaccination to curb the virus's spread. While some countries managed better than others, the pandemic exposed weaknesses in healthcare infrastructure, supply chains, and crisis communication. Remote work, online education, and telemedicine became mainstream, accelerating digital adoption. Researchers developed vaccines at unprecedented speeds using mRNA technology, a breakthrough with potential beyond COVID. Moving forward, experts recommend strengthening health systems, investing in early warning mechanisms, and maintaining global cooperation to face future pandemics more effectively."
]

# Summarization with length comparison
# Beam-search settings shared by every paragraph in this cell.
beam_search_settings = {
    "max_length": 150,
    "min_length": 40,
    "length_penalty": 2.0,
    "num_beams": 4,
    "early_stopping": True,
}

for position, paragraph in enumerate(long_texts, start=1):
    # Encode the paragraph (truncated to 512 tokens) and generate its summary
    encoded = tokenizer.encode(paragraph, return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(encoded, **beam_search_settings)
    summary_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Report sizes in characters and tokens. The "original" figures are
    # measured on the full input string, including its "summarize: " prefix.
    print(f"\n--- Long Text {position} Summary ---")
    print(f"Original Text (chars): {len(paragraph)}, Tokens: {len(tokenizer.tokenize(paragraph))}")
    print(f"Summary Text  (chars): {len(summary_text)}, Tokens: {len(tokenizer.tokenize(summary_text))}")
    print(f"\nSummary:\n{summary_text}")


## Deploying the Model to the Hugging Face Hub

In [None]:
# Install the Hugging Face Hub client (notebook shell command).
!pip install -q huggingface_hub
from huggingface_hub import notebook_login

# Start the notebook login flow; presumably needed so the push_to_hub calls
# in the next cell are authenticated.
notebook_login()

In [None]:
# Upload the model and its tokenizer to the same Hub repository
repo_id = "trohith89/KDTS_T5_Summary_FineTune"
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text2text-generation pipeline around the model published on the Hub
pipe = pipeline(task="text2text-generation", model="trohith89/KDTS_T5_Summary_FineTune")

In [None]:
# Quick check of the published pipeline on a short paragraph; the input
# carries the "summarize: " task prefix.
pipe("summarize: Climate change is accelerating faster than previously predicted. Global temperatures are rising, leading to extreme weather events like hurricanes, droughts, and floods. Scientists urge immediate reduction of greenhouse gas emissions to prevent irreversible damage.")

In [None]:
# Summarize a multi-paragraph news excerpt with the published pipeline.
# The input now starts with the "summarize: " task prefix that preprocess()
# prepends to every training article and that every other inference call in
# this notebook uses.
pipe("""summarize: The Trump administration has ordered US embassies worldwide to immediately stop scheduling visa interviews for foreign students as it prepares to implement comprehensive social media screening for all international applicants.

A Tuesday state department cable instructs consular sections to pause adding “any additional student or exchange visitor (F, M, and J) visa appointment capacity until further guidance is issued” within days.

The directive, first reported by Politico and now confirmed by the Guardian, could severely delay visa processing and hurt universities – many of which Donald Trump accuses of having far-left ideologies – that rely heavily on foreign students for revenue.

“The department is conducting a review of existing operations and processes for screening and vetting of student and exchange visitor visa applicants,” the cable reads. Officials plan to issue guidance on “expanded social media vetting for all such applicants”.

""")

## Compare Model-Generated Summaries vs. an Extractive Method (e.g., spaCy)


In [None]:
import spacy
from transformers import T5ForConditionalGeneration, T5TokenizerFast
from collections import Counter
import string

# spaCy English pipeline used by the extractive baseline
nlp = spacy.load("en_core_web_sm")

# Fine-tuned T5 tokenizer and model used for the abstractive summaries
model_path = "/content/T5-small/results/checkpoint-14400"
tokenizer = T5TokenizerFast.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Input texts for the abstractive-vs-extractive comparison.
# The "summarize: " prefix is kept for the T5 model and stripped again before
# the text is handed to the extractive summarizer.
texts = [
    "summarize: The global climate crisis has reached a critical point, with temperatures rising faster than expected. Scientists have observed a dramatic increase in ice melt in both the Arctic and Antarctic regions. Sea levels are projected to rise significantly by the end of the century, threatening coastal cities worldwide. Forest fires, heatwaves, and hurricanes are becoming more frequent and intense, indicating the need for urgent action. International agreements such as the Paris Accord have called for reducing emissions, but many countries are falling short of their targets. Experts emphasize the need for systemic change in energy production, transportation, and agriculture to curb greenhouse gases effectively."
]

def extractive_summary_spacy(text, top_n=3):
    """Build an extractive summary from the highest-scoring sentences.

    Words are counted across the whole text (alphabetic, non-stop-word
    tokens, lower-cased). Each sentence is scored by summing those counts
    over its alphabetic tokens, and the ``top_n`` best sentences are kept.

    Args:
        text: The text to summarize.
        top_n: Maximum number of sentences to keep. Values below 1 produce an
            empty summary.

    Returns:
        The selected sentences joined by single spaces, in the order they
        appear in ``text``.
    """
    doc = nlp(text)
    freq = Counter(
        token.text.lower()
        for token in doc
        if token.is_alpha and not token.is_stop
    )

    # Score sentences based on word frequencies; a Counter yields 0 for
    # words it has not seen (e.g. stop words).
    sentences = list(doc.sents)
    scores = [
        sum(freq[token.text.lower()] for token in sent if token.is_alpha)
        for sent in sentences
    ]

    # Pick the indices of the top-n sentences. The sort is stable, so ties
    # are resolved in favour of the earlier sentence. Clamping top_n stops a
    # negative value from slicing from the end of the list.
    ranked = sorted(range(len(sentences)), key=scores.__getitem__, reverse=True)
    chosen = ranked[:max(top_n, 0)]

    # Emit the chosen sentences in document order so the summary reads
    # naturally rather than in score order.
    return ' '.join(sentences[index].text for index in sorted(chosen))

# Compare summaries
for number, prompt in enumerate(texts, start=1):
    # Abstractive: beam-search generation with the fine-tuned T5 model
    encoded = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
    generated = model.generate(
        encoded,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    abstractive_summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Extractive: strip the T5 task prefix first so spaCy sees only the article
    original_text = prompt.replace("summarize: ", "")
    extractive_summary = extractive_summary_spacy(original_text, top_n=3)

    # Report both summaries with their character lengths
    print(f"\n--- Text {number} Comparison ---")
    print(f"Original Length: {len(original_text)} chars")

    print(f"\nAbstractive Summary:\n{abstractive_summary}")
    print(f"Abstractive Length: {len(abstractive_summary)} chars")

    print(f"\nExtractive Summary:\n{extractive_summary}")
    print(f"Extractive Length: {len(extractive_summary)} chars")
