# Import and Preprocess Data

In [None]:
import re
import pandas as pd
pd.options.display.max_colwidth = 150

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%time
# WikiHow
# wikihow = pd.read_csv("wikihowAll.csv")
wikihow = pd.read_csv("/content/drive/MyDrive/NLP_data/wikihowAll.csv")

CPU times: user 8.39 s, sys: 1.14 s, total: 9.53 s
Wall time: 13.9 s


In [None]:
%%time
# CNN-Dailymail - Test
# cnn_daily_test = pd.read_csv("cnn_dailymail/test.csv")
cnn_daily_test = pd.read_csv("/content/drive/MyDrive/NLP_data/cnn_dailymail/test.csv")

CPU times: user 484 ms, sys: 65.8 ms, total: 550 ms
Wall time: 1.12 s


In [None]:
%%time
# CNN-Dailymail - Validation
# cnn_daily_valid = pd.read_csv("cnn_dailymail/validation.csv")
cnn_daily_valid = pd.read_csv("/content/drive/MyDrive/NLP_data/cnn_dailymail/validation.csv")

CPU times: user 599 ms, sys: 105 ms, total: 705 ms
Wall time: 1.07 s


In [None]:
%%time
# CNN-Dailymail - Train
# cnn_daily_train = pd.read_csv("cnn_dailymail/train.csv")
cnn_daily_train = pd.read_csv("/content/drive/MyDrive/NLP_data/cnn_dailymail/train.csv")

CPU times: user 12.7 s, sys: 2.14 s, total: 14.9 s
Wall time: 18 s


In [None]:
cnn_daily = pd.concat([cnn_daily_train, cnn_daily_valid, cnn_daily_test]).reset_index(drop=True)
cnn_daily = cnn_daily[["article", "highlights"]].rename(columns={"highlights":"summary"})
cnn_daily = cnn_daily.replace(r'\n',' ', regex=True)
cnn_daily = cnn_daily.replace(r'\s+([.,;:!?])', r'\1', regex=True)
cnn_daily = cnn_daily.astype(str)

arr_filter = cnn_daily["article"].apply(lambda x: len(x)) > 400

cnn_daily = cnn_daily.loc[arr_filter, :]

In [None]:
cnn_daily.head()

Unnamed: 0,article,summary
0,"By. Associated Press. PUBLISHED:. 14:11 EST, 25 October 2013. |. UPDATED:. 15:36 EST, 25 October 2013. The bishop of the Fargo Catholic Diocese in...","Bishop John Folda, of North Dakota, is taking time off after being diagnosed. He contracted the infection through contaminated food in Italy. Chur..."
1,"(CNN) -- Ralph Mata was an internal affairs lieutenant for the Miami-Dade Police Department, working in the division that investigates allegations...","Criminal complaint: Cop used his role to help cocaine traffickers. Ralph Mata, an internal affairs lieutenant, allegedly helped group get guns. He..."
2,"A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years. Craig Eccleston-Todd, 27...","Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight..."
3,"(CNN) -- With a breezy sweep of his pen President Vladimir Putin wrote a new chapter into Crimea's turbulent history, committing the region to a f...",Nina dos Santos says Europe must be ready to accept sanctions will hurt both sides. Targeting Russia's business community would be one way of sapp...
4,Fleetwood are the only team still to have a 100% record in Sky Bet League One as a 2-0 win over Scunthorpe sent Graham Alexander’s men top of the ...,"Fleetwood top of League One after 2-0 win at Scunthorpe. Peterborough, Bristol City, Chesterfield and Crawley all drop first points of the season...."


In [None]:
wikihow_clean = wikihow[["headline", "text"]].rename(columns={"headline":"summary", "text":"article"})
wikihow_clean = wikihow_clean.replace(r'\n',' ', regex=True)
wikihow_clean = wikihow_clean.replace(r'\s+([.,;:!?])', r'\1', regex=True)

wikihow_clean = wikihow_clean.astype(str)

arr_filter = wikihow_clean["article"].apply(lambda x: len(x)) > 400

wikihow_clean = wikihow_clean.loc[arr_filter, :]

In [None]:
wikihow_clean.head()

Unnamed: 0,summary,article
0,"Keep related supplies in the same area., Make an effort to clean a dedicated workspace after every session., Place loose supplies in large, clear...","If you're a photographer, keep all the necessary lens, cords, and batteries in the same quadrant of your home or studio. Paints should be kept wi..."
1,"Create a sketch in the NeoPopRealist manner of the future mural on a small piece of paper 8""x10"" using the black ink pen., Prepare to create your...","See the image for how this drawing develops step-by-step. However, there is an important detail: the following drawings are to examine it, and th..."
2,"Get a bachelor’s degree., Enroll in a studio-based program., Train on a number of VFX computer programs., Watch online tutorials., Nurture your a...","It is possible to become a VFX artist without a college degree, but the path is often easier with one. VFX artists usually major in fine arts, co..."
3,"Start with some experience or interest in art., Understand the difference between art collectors, art investors and art speculators., Figure out ...","The best art investors do their research on the pieces of art that they buy, so someone with some education or interest in the art world is more ..."
4,"Keep your reference materials, sketches, articles, photos, etc, in one easy to find place., Make ""studies,"" or practice sketches, to organize eff...","As you start planning for a project or work, you'll likely be gathering scraps of inspiration and test sketches. While everyone has a strategy, t..."


In [None]:
print(cnn_daily.shape)
print(wikihow_clean.shape)

(311820, 2)
(181925, 2)


# Summarization Methods

## Imports and load models (PEGASUS and BERTSUM)

In [None]:
%%capture
# Install transformers library
!pip install transformers
!pip install sentencepiece

# nltk --> for ensemble summarizer
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')

In [None]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel, BertTokenizer, EncoderDecoderModel
from transformers import pipeline, PegasusForConditionalGeneration, PegasusTokenizer
import heapq
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Abstractive summarization pipeline (PEGASUS-XSUM)
abstractive_summarizer = pipeline("summarization", model="google/pegasus-xsum", device=device)

# Abstractive summarization model and tokenizer (PEGASUS-XSUM)
abstractive_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum").to(device)
abstractive_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# Extractive summarization model and tokenizer (BERTSUM)
extractive_model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail").to(device)
extractive_tokenizer = BertTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")

# Get single example (for testing methods)
cnn_example_summary = cnn_daily["summary"][2]
cnn_example_text = cnn_daily["article"][2]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.66k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

## Baseline

In [None]:
# Baseline, with k=4
def baseline(text):
    sentences = re.split(r'(?<=[.:;!?])\s', text)
    selected_sentences = [sentence for sentence in sentences if len(sentence.split()) >= 3][:4]

    return(''.join(selected_sentences))

print(f"Reference Sumamry:\n{cnn_example_summary}\n")
print(f"Baseline:\n{baseline(cnn_example_text)}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Baseline:
A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years.Craig Eccleston-Todd, 27, was driving home from a night at a pub when he received a text message.As he was reading or replying to it, he veered across the road while driving round a bend and smashed into Rachel Titley’s car coming the other way.Craig Eccleston-Todd, 27 (left) was using his mobile phone when he crashed head-on into the car being driven by Rachel Titley, 28 (right).



## Base abstractive and extractive summarization

In [None]:
# Base function for abstractive summarization (PEGASUS)
# Base function for abstractive summarization (PEGASUS)
def abstractive_summarization(text,
                              max_length=None, min_length=None, do_sample=False,
                              truncate=True, model=abstractive_model, tokenizer=abstractive_tokenizer):

    max_input_length = 1024
    # Truncate input text if its length exceeds `max_input_length`
    if truncate and len(text) > max_input_length:
        text = text[:max_input_length]
        # Find the last complete sentence before the truncated point
        last_period_idx = text.rfind(".")
        if last_period_idx != -1:
            text = text[:last_period_idx+1]

    inputs = tokenizer(text, return_tensors="pt").to(device)

    if max_length is None:
        max_length = len(inputs["input_ids"][0]) // 2

    if min_length is None:
        min_length = len(inputs["input_ids"][0]) // 4

    summary_ids = model.generate(inputs["input_ids"].to(device), max_length=max_length, min_length=min_length,
                                 do_sample=do_sample, early_stopping=True)

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary


# Base function for extractive summarization (BERTSUM)
def extractive_summarization(text,
                             padding=True, truncation=True,
                             num_beams=5, max_length=None,
                             early_stopping=True, skip_special_tokens=True,
                             model=extractive_model, tokenizer=extractive_tokenizer):

    if max_length is None:
        max_length = len(text) // 7
    inputs = tokenizer(text, return_tensors="pt", padding=padding, truncation=truncation).to(device)

    summary_ids = model.generate(inputs["input_ids"].to(device), num_beams=num_beams, max_length=max_length,
                                 early_stopping=early_stopping)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=skip_special_tokens)
    return summary

print(f"Reference Sumamry:\n{cnn_example_summary}\n")

print(f"Abstractive Summarization:\n{abstractive_summarization(cnn_example_text)}\n")
print(f"Extractive Summarization:\n{extractive_summarization(cnn_example_text)}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Abstractive Summarization:
A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years, reports the BBC's Victoria Derbyshire programme, which is broadcast on BBC One in the Isle of Wight and on BBC Two in the south of England.

Extractive Summarization:
craig eccleston - todd, 27, was driving home from a pub when he crashed. he veered across the road and smashed into rachel titley's car coming the other way. miss titley was driving responsibly and there was'nothing she could have done to avoid'he was found guilty of causing de

## Simple two-step hybrid summarization

In [None]:
'''
Hybrid extractive-abstractive (simple two step approach)

In this example, we will create a hybrid extractive-abstractive summarizer using BERTSUM (extractive)
and PEGASUS (abstractive). The idea is to first generate an extractive summary using BERTSUM,
which will serve as a condensed version of the original text, and then use PEGASUS to create an
abstractive summary from the extractive summary. This two-step approach combines the strengths of both
extractive and abstractive methods.
'''
def hybrid_summarization(text, num_steps=3):

    # Step 1: Split the text into equal parts
    step_size = len(text) // num_steps
    text_parts = [text[i:i+step_size] for i in range(0, len(text), step_size)]

    # Step 2: Perform extractive summarization on each part
    extractive_summaries = []
    for part in text_parts:
        extractive_summary = extractive_summarization(part, max_length=step_size)
        extractive_summaries.append(extractive_summary)

    # Step 3: Concatenate the extractive summaries and perform abstractive summarization
    abstractive_summary = abstractive_summarization(" ".join(extractive_summaries))

    return abstractive_summary


print(f"Reference Sumamry:\n{cnn_example_summary}\n")

print(f"Hybrid (two-step) Summarization:\n{hybrid_summarization(cnn_example_text)}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Hybrid (two-step) Summarization:
A man has been jailed for six years for causing the death of a woman who was killed when he crashed into her car as he was reading or replying to a text message, a court has been told. eccleston - todd, of newport, was found guilty of causing death by dangerous driving following trial at portsmouth crown court.



## Two-step hybrid summarization with 'importance ranking' in extractive stage

In [None]:
'''
Hybrid extractive-abstractive (two step approach, this time focusing on "most important sentences" in extractive stage)

In this approach, we will first extract the most important sentences, measured by length from summary,
from the input text using an extractive summarizer,
then use these sentences as input for the abstractive summarization model.
This is similar to the previous hybrid approach but prioritizes the "most important sentences" (proxied by length)
from the extractive summary.
'''

def extractive_summarization_priority(text, num_sentences=20,
                                      padding=True, truncation=True,
                                      num_beams=5, max_length=200,
                                      early_stopping=False, skip_special_tokens=True,
                                      model=extractive_model, tokenizer=extractive_tokenizer):


    inputs = tokenizer(text, return_tensors="pt", padding=padding, truncation=truncation).to(device)
    summary_ids = model.generate(inputs["input_ids"], num_beams=num_beams, max_length=len(text),
                                 early_stopping=early_stopping)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=skip_special_tokens)

    # Extract the most important sentences, measured by length
    sentences = summary.split('. ')
    top_sentences = heapq.nlargest(num_sentences, sentences, key=len)
    important_sentences = '. '.join(top_sentences)

    return important_sentences

def hybrid_summarization_important_sentences(text):
    # Step 1: Extractive summarization, with important sentence selection
    important_sentences = extractive_summarization_priority(text)

    # Step 2: Abstractive summarization
    abstractive_summary = abstractive_summarization(important_sentences)

    return abstractive_summary

print(f"Reference Sumamry:\n{cnn_example_summary}\n")

print(f"Hybrid (important sentences) Summarization:\n{hybrid_summarization_important_sentences(cnn_example_text)}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Hybrid (important sentences) Summarization:
A woman has been jailed for three years for causing the death of a man in a crash in Portsmouth.



## Graph-based summarization, using TextRank

In [None]:
'''
Graph based summarization
'''

def build_similarity_matrix_graph(sentences):
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(sentence_vectors)
    return similarity_matrix

def textrank(sentences, top_n=10):
    similarity_matrix = build_similarity_matrix_graph(sentences)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    top_sentences = [ranked_sentences[i][1] for i in range(top_n)]
    return '. '.join(top_sentences)

def graph_based_summarization(text, top_n=10):
    sentences = text.split('. ')
    top_n = min(top_n, len(sentences))
    top_ranked_sentences = textrank(sentences, top_n)

    summary = abstractive_summarization(top_ranked_sentences,
                                        truncate=True,
                                        do_sample=True)
    return summary

print(f"Reference Sumamry:\n{cnn_example_summary}\n")

print(f"Graph-based Summarization:\n{graph_based_summarization(cnn_example_text)}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Graph-based Summarization:
A man has been jailed for six years for causing the death of 19-year-old Rachel Titley, who was killed when she was hit by a car as she crossed the road in Portsmouth in March last year, writes the BBC's Nick Triggle.



## Ensemble hybrid approach, ranking results of models to generate final summary

In [None]:
'''
Ensemble approach

'''

def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []

    sent1 = [w.lower() for w in sent1 if w not in stopwords]
    sent2 = [w.lower() for w in sent2 if w not in stopwords]

    all_words = list(set(sent1 + sent2))

    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)

    for w in sent1:
        vector1[all_words.index(w)] += 1

    for w in sent2:
        vector2[all_words.index(w)] += 1

    return 1 - cosine_similarity(np.array(vector1).reshape(1, -1), np.array(vector2).reshape(1, -1))[0, 0]

def build_similarity_matrix(sentences, stopwords=None):
    if stopwords is None:
        stopwords = []

    S = np.zeros((len(sentences), len(sentences)))

    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue

            S[i][j] = sentence_similarity(sentences[i], sentences[j], stopwords)

    for i in range(len(S)):
        S[i] /= S[i].sum()

    return S

def ensemble_summarization(extractive_summary, abstractive_summary, top_n=5):

    # Extract sentences
    sentences = list(set(nltk.sent_tokenize(extractive_summary) + nltk.sent_tokenize(abstractive_summary)))

    # Build similarity matrix
    S = build_similarity_matrix(sentences, stopwords.words("english"))

    # Rank sentences
    ranked_sentences = nx.pagerank(nx.from_numpy_array(S), alpha=0.85, tol=1e-8)

    # Extract the top-ranked sentences
    top_sentences = heapq.nlargest(top_n, ranked_sentences, key=ranked_sentences.get)
    summary = ". ".join([sentences[i] for i in top_sentences])

    return summary

print(f"Reference Sumamry:\n{cnn_example_summary}\n")

extractive_summary = extractive_summarization(cnn_example_text)
abstractive_summary = abstractive_summarization(cnn_example_text)
print(f"Hybrid (ensemble, using TextRank to rank) Summarization:\n{ensemble_summarization(extractive_summary, abstractive_summary)}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Hybrid (ensemble, using TextRank to rank) Summarization:
miss titley was driving responsibly and there was'nothing she could have done to avoid'he was found guilty of causing death by dangerous driving at portsmouth crown court.. he veered across the road and smashed into rachel titley's car coming the other way.. craig eccleston - todd, 27, was driving home from a pub when he crashed.. A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years, reports the BBC's Victoria Derbyshire programme, which is broadcast on BBC One in 

## Heirarchical summarization, primarily for WikiHow (will need better preprocessing)

In [None]:
'''
Heirarchical Summarization (for WikiHow primarily)
'''

def split_text_into_sections(text, delimiter='\n\n'):
    sections = text.split(delimiter)
    return [section.strip() for section in sections if section.strip()]

def hierarchical_abstractive_summarization(text):
    sections = split_text_into_sections(text)
    section_summaries = [abstractive_summarization(section, min_length=1, max_length=min(len(section.split()), 25)) for section in sections]
    summary_of_summaries = abstractive_summarization(' '.join(section_summaries))
    return summary_of_summaries

print(f"Reference Sumamry:\n{cnn_example_summary}\n")

print(f"Hierarchical Abstractive Summarization:\n{hierarchical_abstractive_summarization(wikihow['text'][0])}\n")


Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Hierarchical Abstractive Summarization:
Whether you're making art for a living or just daydreaming, here are some tips for getting the most out of, as visual people, a lot of artist clutter comes from a desire to keep track of supplies visually instead of



## Iterative abstractive summarization --> perhaps for longer texts

In [None]:
'''
Iterative summarization
'''

def merge_similar_sentences(text, similarity_threshold=0.8):
    sentences = text.split('. ')
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)

    merged_sentences = []
    for i, sentence in enumerate(sentences):
        if i == len(sentences) - 1:
            break

        similarity = cosine_similarity(sentence_vectors[i], sentence_vectors[i + 1])
        if similarity > similarity_threshold:
            merged_sentences.append(sentence + " " + sentences[i + 1])
        else:
            merged_sentences.append(sentence)

    return '. '.join(merged_sentences)

def remove_short_sentences(text, length_threshold=5):
    sentences = text.split('. ')
    long_sentences = [sentence for sentence in sentences if len(sentence.split()) > length_threshold]
    return '. '.join(long_sentences)

def iterative_abstractive_summarization(text, iterations=3):
    current_summary = text
    for _ in range(iterations):
        current_summary = abstractive_summarization(current_summary, min_length=60, max_length=100)
        current_summary = merge_similar_sentences(current_summary)
        current_summary = remove_short_sentences(current_summary)
    return current_summary

print(f"Reference Sumamry:\n{cnn_example_summary}\n")

print(f"Iterative Abstractive Summarization:\n{iterative_abstractive_summarization(cnn_example_text)}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Iterative Abstractive Summarization:




## Query-based summarization (information retreival purposes)

In [None]:
'''
Query-based summarization
'''

def retrieve_relevant_sentences(text, query, num_sentences=5):
    sentences = text.split('. ')
    all_text = [query] + sentences
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(all_text)

    query_vector = tfidf_matrix[0]
    sentence_vectors = tfidf_matrix[1:]

    similarities = cosine_similarity(query_vector, sentence_vectors)
    top_sentence_indices = heapq.nlargest(num_sentences, range(len(similarities[0])), similarities[0].__getitem__)
    relevant_sentences = [sentences[index] for index in sorted(top_sentence_indices)]
    return '. '.join(relevant_sentences)

def query_based_summarization(text, query):
    relevant_sentences = retrieve_relevant_sentences(text, query)
    summary = abstractive_summarization(relevant_sentences)
    return summary

print(f"Reference Sumamry:\n{cnn_example_summary}\n")

print(f"Query-based Summarization (driving):\n{query_based_summarization(cnn_example_text, 'driving')}\n")
print(f"Query-based Summarization (pints):\n{query_based_summarization(cnn_example_text, 'pints')}\n")

Reference Sumamry:
Craig Eccleston-Todd, 27, had drunk at least three pints before driving car. Was using phone when he veered across road in Yarmouth, Isle of Wight. Crashed head-on into 28-year-old Rachel Titley's car, who died in hospital. Police say he would have been over legal drink-drive limit at time of crash. He was found guilty at Portsmouth Crown Court of causing death by dangerous driving.

Query-based Summarization (driving):
A man has been jailed for seven years for causing the death of a woman while using a mobile phone while driving.

Query-based Summarization (pints):
A drunk driver who killed a young woman in a head-on crash while checking his mobile phone has been jailed for six years, reports the BBC's East of England news website.



# Evaluation and Metrics

## Define metrics

In [None]:
%%capture
!pip install rouge_score
!pip install bert_score
!pip install textstat
!pip install language_tool_python

In [None]:
import textstat
from rouge_score import rouge_scorer
import spacy
from nltk.translate.bleu_score import sentence_bleu
from sklearn.feature_extraction.text import CountVectorizer
import language_tool_python
import bert_score

In [None]:
# Initialize global variables
nlp = spacy.load("en_core_web_sm")
tool = language_tool_python.LanguageTool('en-US')

# Define functions to calculate various metrics
def calculate_rouge_scores(summary, reference):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL", "rougeLsum"], use_stemmer=True)
    scores = scorer.score(summary, reference)
    return scores

def calculate_bert_score(summary, reference):
    _, _, bert_score_f1 = bert_score.score([summary], [reference], verbose=False)
    return bert_score_f1.item()

def readability_flesch_score(summary):
    return textstat.flesch_reading_ease(summary)

def compression_rate(summary, original_text):
    return len(summary) / len(original_text)

def density(summary):
    word_count = len(summary.split())
    unique_word_count = len(set(summary.split()))
    return unique_word_count / word_count

def coverage(summary, original_text):
    summary_words = set(summary.split())
    original_words = set(original_text.split())
    return len(summary_words.intersection(original_words)) / len(original_words)

def calculate_bleu_score(summary, reference):
    # Convert summary and reference to lists of tokens
    summary_tokens = summary.split()
    reference_tokens = reference.split()

    # Calculate BLEU score
    bleu_score = sentence_bleu([reference_tokens], summary_tokens)
    return bleu_score


def calculate_entity_grid_score(summary):
    try:
      doc = nlp(summary)
      entity_grid = {}

      # Build entity grid
      for ent in doc.ents:
          if ent.label_ not in entity_grid:
              entity_grid[ent.label_] = set()
          entity_grid[ent.label_].add(ent.text.lower())

      # Calculate entity grid score
      entity_grid_score = 0
      for label, entities in entity_grid.items():
          if len(entities) > 1:
              entity_grid_score += 1
      entity_grid_score /= len(entity_grid)

      return entity_grid_score

    except:
      return None

def calculate_lexical_similarity(summary, original_text):
    # Convert summary and original text to lists of strings
    documents = [summary, original_text]

    # Vectorize documents
    vectorizer = CountVectorizer().fit_transform(documents)

    # Calculate cosine similarity
    cosine_similarity = (vectorizer * vectorizer.T).toarray()[0, 1]

    return cosine_similarity

def calculate_jaccard_index(summary, original_text):
    # Convert summary and original text to sets of words
    summary_words = set(summary.split())
    original_words = set(original_text.split())

    # Calculate Jaccard Index
    jaccard_index = len(summary_words.intersection(original_words)) / len(summary_words.union(original_words))

    return jaccard_index

def calculate_grammar_score(summary):
    matches = tool.check(summary)
    grammar_score = len(matches) / len(summary.split())

    return grammar_score

Downloading LanguageTool 5.7: 100%|██████████| 225M/225M [00:02<00:00, 91.6MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpknco66ee.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://www.languagetool.org/download/LanguageTool-5.7.zip to /root/.cache/language_tool_python.


## Evaluation

In [None]:
start = 6000
end = 9000

import time

data = []

for idx, article in wikihow_clean[start:end].iterrows():
    start_time = time.time()
    original_text = article["article"]
    reference_summary = article["summary"]

    # Create summaries

    baseline_summary = baseline(original_text)
    abstractive_summary = abstractive_summarization(original_text)
    extractive_summary = extractive_summarization(original_text)
    # hybrid_summary = hybrid_summarization(original_text)
    # hybrid_importance_summary = hybrid_summarization_important_sentences(original_text)
    graph_summary = graph_based_summarization(original_text)
    ensemble_summary = ensemble_summarization(extractive_summary, abstractive_summary)
    # iterative_summary = iterative_abstractive_summarization(original_text)


    summaries = [
        ("0_Baseline", baseline_summary),
        ("Abstractive", abstractive_summary),
        ("Extractive", extractive_summary),
        # ("Hybrid two-step", hybrid_summary),
        # ("Hybrid importance", hybrid_importance_summary),
        ("Graph", graph_summary),
        ("Ensemble", ensemble_summary)
        # ("Iterative", iterative_summary)

    ]

    # Get metrics
    for summary_type, summary in summaries:
        row = {
            "Article Index": idx,
            "Summary Type": summary_type,
            "Summary": summary,
            "F1 Accuracy (ROUGE1)": calculate_rouge_scores(summary, reference_summary)["rouge1"].fmeasure,
            "F1 Accuracy (ROUGE2)": calculate_rouge_scores(summary, reference_summary)["rouge2"].fmeasure,
            "F1 Accuracy (ROUGEL)": calculate_rouge_scores(summary, reference_summary)["rougeL"].fmeasure,
            "Readability (Flesch)": readability_flesch_score(summary),
            "Entity Grid Score": calculate_entity_grid_score(summary),
            "Lexical Similarity": calculate_lexical_similarity(summary, original_text),
            "(Diversity) Jaccard Index": calculate_jaccard_index(summary, original_text),
            # "Grammar Score": calculate_grammar_score(summary),
            "Compression Rate": compression_rate(summary, original_text),
            "Density": density(summary),
            "Coverage": coverage(summary, original_text)
        }
        data.append(row)

    elapsed_time = time.time() - start_time
    # Print progress
    print(f"Processed article {idx+1} of {len(wikihow_clean)}, time taken: {elapsed_time:.2f} seconds")

df = pd.DataFrame(data)

filename = f"wikiHow_{start}_{end}.csv"
results_path = "/content/drive/MyDrive/HunterNLPProject/results/wikiHow/"
df.to_csv(results_path+filename, index=True)

Processed article 6178 of 181925, time taken: 5.40 seconds
Processed article 6179 of 181925, time taken: 5.17 seconds
Processed article 6180 of 181925, time taken: 5.97 seconds
Processed article 6181 of 181925, time taken: 4.85 seconds
Processed article 6182 of 181925, time taken: 4.80 seconds
Processed article 6183 of 181925, time taken: 4.52 seconds
Processed article 6184 of 181925, time taken: 5.89 seconds
Processed article 6185 of 181925, time taken: 4.63 seconds
Processed article 6186 of 181925, time taken: 3.56 seconds
Processed article 6187 of 181925, time taken: 3.97 seconds
Processed article 6188 of 181925, time taken: 4.29 seconds
Processed article 6189 of 181925, time taken: 5.08 seconds
Processed article 6190 of 181925, time taken: 5.43 seconds
Processed article 6191 of 181925, time taken: 5.38 seconds
Processed article 6192 of 181925, time taken: 4.72 seconds
Processed article 6193 of 181925, time taken: 4.66 seconds
Processed article 6194 of 181925, time taken: 4.99 secon

In [None]:
start = 19000
end = 24000

import time

data = []

for idx, article in wikihow_clean[start:end].iterrows():
    start_time = time.time()
    original_text = article["article"]
    reference_summary = article["summary"]

    # Create summaries

    baseline_summary = baseline(original_text)
    abstractive_summary = abstractive_summarization(original_text)
    extractive_summary = extractive_summarization(original_text)
    # hybrid_summary = hybrid_summarization(original_text)
    # hybrid_importance_summary = hybrid_summarization_important_sentences(original_text)
    graph_summary = graph_based_summarization(original_text)
    ensemble_summary = ensemble_summarization(extractive_summary, abstractive_summary)
    # iterative_summary = iterative_abstractive_summarization(original_text)


    summaries = [
        ("0_Baseline", baseline_summary),
        ("Abstractive", abstractive_summary),
        ("Extractive", extractive_summary),
        # ("Hybrid two-step", hybrid_summary),
        # ("Hybrid importance", hybrid_importance_summary),
        ("Graph", graph_summary),
        ("Ensemble", ensemble_summary)
        # ("Iterative", iterative_summary)

    ]

    # Get metrics
    for summary_type, summary in summaries:
        row = {
            "Article Index": idx,
            "Summary Type": summary_type,
            "Summary": summary,
            "F1 Accuracy (ROUGE1)": calculate_rouge_scores(summary, reference_summary)["rouge1"].fmeasure,
            "F1 Accuracy (ROUGE2)": calculate_rouge_scores(summary, reference_summary)["rouge2"].fmeasure,
            "F1 Accuracy (ROUGEL)": calculate_rouge_scores(summary, reference_summary)["rougeL"].fmeasure,
            "Readability (Flesch)": readability_flesch_score(summary),
            "Entity Grid Score": calculate_entity_grid_score(summary),
            "Lexical Similarity": calculate_lexical_similarity(summary, original_text),
            "(Diversity) Jaccard Index": calculate_jaccard_index(summary, original_text),
            # "Grammar Score": calculate_grammar_score(summary),
            "Compression Rate": compression_rate(summary, original_text),
            "Density": density(summary),
            "Coverage": coverage(summary, original_text)
        }
        data.append(row)

    elapsed_time = time.time() - start_time
    # Print progress
    print(f"Processed article {idx+1} of {len(wikihow_clean)}, time taken: {elapsed_time:.2f} seconds")

df = pd.DataFrame(data)

filename = f"wikiHow_{start}_{end}.csv"
results_path = "/content/drive/MyDrive/HunterNLPProject/results/wikiHow/"
df.to_csv(results_path+filename, index=True)

Processed article 20205 of 181925, time taken: 3.83 seconds
Processed article 20206 of 181925, time taken: 3.74 seconds
Processed article 20207 of 181925, time taken: 5.17 seconds
Processed article 20208 of 181925, time taken: 4.10 seconds
Processed article 20209 of 181925, time taken: 4.15 seconds
Processed article 20210 of 181925, time taken: 4.80 seconds
Processed article 20211 of 181925, time taken: 4.14 seconds
Processed article 20212 of 181925, time taken: 4.18 seconds
Processed article 20213 of 181925, time taken: 4.38 seconds
Processed article 20214 of 181925, time taken: 4.39 seconds
Processed article 20215 of 181925, time taken: 4.05 seconds
Processed article 20216 of 181925, time taken: 4.54 seconds
Processed article 20217 of 181925, time taken: 4.39 seconds
Processed article 20219 of 181925, time taken: 3.93 seconds
Processed article 20226 of 181925, time taken: 4.15 seconds
Processed article 20227 of 181925, time taken: 4.04 seconds
Processed article 20228 of 181925, time 

In [None]:
# Group the data by "Summary Type" and calculate the mean of the metrics
mean_metrics = df.groupby("Summary Type").mean()

# Reset index to move "Summary Type" back to a column
mean_metrics.reset_index(inplace=True)

# Display the aggregated metrics
mean_metrics.drop(['Article Index'], axis=1, inplace=True)

  mean_metrics = df.groupby("Summary Type").mean()


In [None]:
mean_metrics

Unnamed: 0,Summary Type,F1 Accuracy (ROUGE1),F1 Accuracy (ROUGE2),F1 Accuracy (ROUGEL),Readability (Flesch),Entity Grid Score,Lexical Similarity,(Diversity) Jaccard Index,Compression Rate,Density,Coverage
0,0_Baseline,0.245825,0.054783,0.148418,70.673658,0.231968,658.1386,0.326455,0.291346,0.748898,0.33229
1,Abstractive,0.211512,0.039599,0.143695,59.8347,0.176298,336.5644,0.088323,0.125883,0.698974,0.094705
2,Ensemble,0.243374,0.053935,0.144323,74.951594,0.269535,707.5506,0.202641,0.284642,0.665305,0.229204
3,Extractive,0.251151,0.053215,0.158106,77.38387,0.256723,401.1494,0.185857,0.165585,0.793607,0.194588
4,Graph,0.225913,0.046244,0.152456,61.245102,0.181518,387.8766,0.093904,0.126074,0.68615,0.100135
