# Text Summarization

## Importing required libraries and Loading dataset

In [1]:
# Step 1: Load and Preprocess the Dataset

import pandas as pd
import nltk
nltk.download('punkt_tab')
import spacy
from tqdm.notebook import tqdm  
# Load the CSV files
# Load just the first 1,000 rows
train_df = pd.read_csv('train.csv', nrows=1000)
test_df = pd.read_csv('test.csv', nrows=1000)
val_df = pd.read_csv('validation.csv', nrows=1000)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# View structure
train_df.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [3]:
train_df.tail()

Unnamed: 0,id,article,highlights
995,02cda7baa5ffb66030294542e7372ed3d5742b22,After the anxiety of the first day Andy Murray...,"Andy Murray wins 6-3, 6-3, 6-4 in one hour and..."
996,02ce5810b37842c00ae90b6c7b70dbf686cd865f,By . Leon Watson and Sebastian Lander . PUBLIS...,Figures released by ABTA show Britons took few...
997,02cebc35c007eb63dc0a22f96de0541e4269793e,Mexico's government is trying to block the exe...,Defense attorneys are expected to present oral...
998,02d123388fbdf6da1466253313fe6641595c291c,By . Rob Cooper . Last updated at 5:05 PM on 2...,High-speed bed is fitted with a V8 600bhp engi...
999,02d13195c2ac5c61415b40c45712c943d9290164,Liverpool manager Brendan Rodgers felt Everton...,Everton ace Gareth Barry was lucky to escape a...


In [4]:
# Keep only necessary columns
train_df = train_df[['article', 'highlights']]
test_df = test_df[['article', 'highlights']]
val_df = val_df[['article', 'highlights']]

## Preprocessing Text

In [5]:
# Step 2: Preprocessing Text

import re
import nltk
from nltk.tokenize import sent_tokenize

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # remove multiple spaces
    text = re.sub(r'\[[^\]]*\]', '', text)  # remove brackets
    text = text.strip()
    return text

train_df['clean_article'] = train_df['article'].apply(clean_text)
train_df['sentences'] = train_df['clean_article'].apply(sent_tokenize)


## Extractive summarization using Spacy

In [6]:
# Step 3: Executive Summary using spaCy
import spacy
from tqdm.notebook import tqdm

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Fast scoring method based on named entities and sentence length
def extractive_summary_fast(text, num_sentences=3):
    doc = nlp(text)
    sentences = list(doc.sents)
    scored_sentences = []

    for sent in sentences:
        score = len(sent.ents) + len(sent.text.split())
        scored_sentences.append((sent.text, score))

    top_sentences = sorted(scored_sentences, key=lambda x: x[1], reverse=True)[:num_sentences]
    summary = ' '.join([sent[0] for sent in top_sentences])
    return summary

# Create a subset for testing
extractive_subset = train_df.head(100).copy()
tqdm.pandas()

# Apply the summarizer
extractive_subset['extractive_summary'] = extractive_subset['clean_article'].progress_apply(lambda x: extractive_summary_fast(x))


  0%|          | 0/100 [00:00<?, ?it/s]

## Abstractive Summarizatiion using BART

In [32]:
## Step 04 Abstractive summarization using BART
def summarize_articles_in_batches(articles, batch_size=4):
    summaries = []
    for i in tqdm(range(0, len(articles), batch_size), desc="Generating summaries"):
        batch = articles[i:i+batch_size]
        try:
            outputs = summarizer(batch, max_length=130, min_length=30, truncation=True)
            summaries.extend([output['summary_text'] for output in outputs])
        except Exception as e:
            print(f"Error summarizing batch {i}-{i+batch_size}: {e}")
            summaries.extend(["ERROR"] * len(batch))
    return summaries

# Only keep non-empty articles
subset_df = train_df.head(5).copy()
subset_df = subset_df[subset_df['article'].str.strip().astype(bool)]

# Summarize and assign
summaries = summarize_articles_in_batches(subset_df['article'].tolist()
subset_df['abstractive_summary'] = summaries
train_df.update(subset_df)

Generating summaries:  50%|███████████████████████████████                               | 1/2 [00:57<00:57, 57.67s/it]

Error summarizing batch 0-4: index out of range in self


Generating summaries: 100%|██████████████████████████████████████████████████████████████| 2/2 [01:51<00:00, 55.82s/it]


## Evaluation using ROGUE

In [33]:
## Evaluation using ROUGE
def evaluate_rouge_batch(references, generated_summaries):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    all_scores = []

    for ref, gen in zip(references, generated_summaries):
        if isinstance(ref, str) and isinstance(gen, str):
            score = scorer.score(ref, gen)
            all_scores.append(score)
        else:
            all_scores.append({'rouge1': None, 'rouge2': None, 'rougeL': None})

    return all_scores

# Step 5: Evaluate using 'highlights' as reference summaries
references = subset_df['highlights'][:10]
generated = subset_df['abstractive_summary'][:10]

rouge_scores = evaluate_rouge_batch(references, generated)

# Step 6: Display average ROUGE scores
import numpy as np

def average_rouge_scores(score_list):
    rouge1 = np.mean([s['rouge1'].fmeasure for s in score_list if s['rouge1']])
    rouge2 = np.mean([s['rouge2'].fmeasure for s in score_list if s['rouge2']])
    rougel = np.mean([s['rougeL'].fmeasure for s in score_list if s['rougeL']])
    return {
        "ROUGE-1": rouge1,
        "ROUGE-2": rouge2,
        "ROUGE-L": rougel
    }

avg_scores = average_rouge_scores(rouge_scores)
print("Average ROUGE Scores for Abstractive Summaries:")
for k, v in avg_scores.items():
    print(f"{k}: {v:.4f}")

Average ROUGE Scores for Abstractive Summaries:
ROUGE-1: 0.0613
ROUGE-2: 0.0207
ROUGE-L: 0.0409
