# Extractive text summarisation - test framework

First let's import all relevant modules

In [1]:
import sys
sys.path.append("/Users/frankkelly/Dropbox/Projects-new/sahaj_text_summarisation_clean/gradio-screen/")

import os
from pathlib import Path
import pandas as pd
import swifter

In [2]:

PROJECT_PATH = Path("./").absolute().parent
PROJECT_PATH

PosixPath('/Users/frankkelly/Dropbox/Projects-new/sahaj_text_summarisation_clean')

In [3]:
# %load_ext autoreload
# %autoreload 2
# %load_ext lab_black

In [4]:
iskaggle = os.environ.get("KAGGLE_KERNEL_RUN_TYPE", "")
if iskaggle:
    path = Path("../pariza/bbc-news-summary")
else:
    full_path = Path("pariza/bbc-news-summary")
    owner_slug = str(full_path).split("/")[0]
    dataset_slug = str(full_path).split("/")[1]
    print(owner_slug, dataset_slug)
    path = Path("../"+ dataset_slug)
    if not path.exists():
        import zipfile, kaggle

        kaggle.api.datasets_download(owner_slug=owner_slug, dataset_slug=dataset_slug)

pariza bbc-news-summary


In [5]:

from rouge import Rouge 

hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to he    lp students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of storie s you     saw on cnn student news"

reference = "this page includes the show transcript use the transcript to help students with reading comprehension and     vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teac    her or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests     students ' knowledge of even ts in the news"

rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)

In [6]:
scores

[{'rouge-1': {'r': 0.42857142857142855,
   'p': 0.5833333333333334,
   'f': 0.49411764217577864},
  'rouge-2': {'r': 0.18571428571428572,
   'p': 0.3170731707317073,
   'f': 0.23423422957552154},
  'rouge-l': {'r': 0.3877551020408163,
   'p': 0.5277777777777778,
   'f': 0.44705881864636676}}]

In [7]:
# Load in the BBC news summaries - from datasets/bbc-news-summary/
data_path = Path(PROJECT_PATH, "bbc-news-summary/BBC News Summary")

data_path


PosixPath('/Users/frankkelly/Dropbox/Projects-new/sahaj_text_summarisation_clean/bbc-news-summary/BBC News Summary')

In [8]:
from dataclasses import dataclass, field
from typing import List

@dataclass()
class BBCNewsDataReader:
    base_folder: str
    exclusion: list = field(default_factory=list)
    
    @property
    def news_articles_folder(self):
        return self.base_folder / 'News Articles'
    
    @property
    def summaries_folder(self):
        return self.base_folder / 'Summaries'
    
    @property
    def categories(self):
        exclusion_folders = lambda x: x not in [".DS_Store"] + self.exclusion
        return filter(exclusion_folders, os.listdir(self.news_articles_folder))
    
    def to_df(self):
        df = pd.DataFrame(columns=['article', 'summary', 'category', 'filename'])
        for article_folder, summary_folder in self.__category_folders():
            category = article_folder.split('/')[-1]
            for filename in os.listdir(article_folder):
                if os.path.isfile(f'{article_folder}/{filename}'):
                    try:
                        article = self.__read_file(f'{article_folder}/{filename}')
                        summary = self.__read_file(f'{summary_folder}/{filename}')
                        # TODO: replace append with concat
                        df = df.append({'article': article, 'summary': summary, 'category': category, 'filename': filename}, ignore_index=True)
                    except UnicodeDecodeError:
                        pass
        return df
                
            
    def __category_folders(self):
        return [
            (f'{self.news_articles_folder}/{category}', f'{self.summaries_folder}/{category}') for category in self.categories
        ]
    
    
    def __read_file(self, filepath):
        with open(filepath) as file:
            return file.read()

In [9]:
data = BBCNewsDataReader(
        base_folder=data_path,
        # exclusion=['entertainment', 'tech', 'sport', 'politics'] # remove these to read all data
    ).to_df()

In [10]:
data.head()

Unnamed: 0,article,summary,category,filename
0,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment,289.txt
1,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...,entertainment,262.txt
2,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment,276.txt
3,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...,entertainment,060.txt
4,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin...",entertainment,074.txt


In [19]:
# Load in the models

from src.text_rank_summarizer import summarize as summarize_extractive
from src.transformer_summarization import summarize_abstractive

# Get the summaries from the datasets and models

In [13]:
import time

def get_extractive_summary(article):
    try:
        start_time = time.time()
        summary = summarize_extractive(article)
        # print(f"Extractive summary took {time.time() - start_time} seconds")
        return (summary, time.time() - start_time)
    except Exception as e:
        print(e)
        return (None, None)


In [14]:
data["extractive_summary"], data["extractive_time_taken"] = zip(*data["article"].swifter.apply(get_extractive_summary))



Pandas Apply: 100%|██████████| 2224/2224 [03:12<00:00, 11.58it/s]


In [16]:
data[["extractive_summary", "extractive_time_taken"]].head()

Unnamed: 0,extractive_summary,extractive_time_taken
0,Musicians to tackle US red tape\n\nMusicians' ...,0.180434
1,The group were born when Mullen put an appeal ...,0.23331
2,"On Tuesday, Doherty and his three bandmates we...",0.091577
3,Snicket tops US box office chart\n\nThe film a...,0.041918
4,Soderbergh returns to direct the hit sequel wh...,0.066866


In [17]:
import datetime

if not os.path.exists(PROJECT_PATH / "output"):
    os.makedirs(PROJECT_PATH / "output")

def write_to_file(df):
    todays_date_and_time_string = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    filename = PROJECT_PATH / f"output/bbc_news_summaries_{todays_date_and_time_string}.csv"

    df.to_csv(filename, index=False)
    print(f"Wrote {filename}")

write_to_file(data)

Wrote /Users/frankkelly/Dropbox/Projects-new/sahaj_text_summarisation_clean/output/bbc_news_summaries_2022-06-27_09-11-18.csv


In [20]:
def get_abstractive_summary(article):
    try:
        start_time = time.time()
        summary = summarize_abstractive(article)
        return summary, time.time() - start_time
    except Exception as e:
        print(e)
        return None, None


In [21]:

data["abstractive_summary"], data["abstractive_time_taken"] = zip(*data["article"].swifter.apply(get_abstractive_summary))

Pandas Apply:   0%|          | 0/2224 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Pandas Apply:   0%|          | 2/2224 [00:02<42:11,  1.14s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Pandas Apply:   0%|          | 3/2224 [00:06<1:23:48,  2.26s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Pandas Apply:   0%|          | 4/2224 [00:08<1:23:12,  2.25s/it]The attention m

index out of range in self


Pandas Apply:  17%|█▋        | 385/2224 [17:02<1:03:29,  2.07s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Pandas Apply:  17%|█▋        | 386/2224 [17:05<1:04:41,  2.11s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Pandas Apply:  17%|█▋        | 387/2224 [17:07<1:09:48,  2.28s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Pandas Apply:  17%|█▋        | 388/2224 [17:10<1:12:34,  2.37

In [None]:
write_to_file(data)

In [55]:
data.article.describe()

count                                                  2224
unique                                                 2126
top       Rings of steel combat net attacks\n\nGambling ...
freq                                                      2
Name: article, dtype: object

In [56]:
data[["article", "summary"]].describe()

Unnamed: 0,article,summary
count,2224,2224
unique,2126,2080
top,Rings of steel combat net attacks\n\nGambling ...,"Christian Harris, partnership manager of mobil..."
freq,2,2


In [57]:
data[["article", "summary"]].isna().sum()

article    0
summary    0
dtype: int64

In [53]:
data["abstractive_summary"].head()

0    
1    
2    
3    
4    
Name: abstractive_summary, dtype: object

In [None]:
data["extractive_rouge-l-f1"] = data.swifter.apply(lambda row: rouge.get_scores(row.extractive_summary, row.summary)[0]['rouge-l']['f'], axis=1)

In [None]:
data["abstractive_rouge-l-f1"] = data.swifter.apply(lambda row: rouge.get_scores(row.abstractive_summary, row.summary)[0]['rouge-l']['f'], axis=1)

In [None]:
data["extractive_rouge-1-f1"] = data.swifter.apply(lambda row: rouge.get_scores(row.extractive_summary, row.summary)[0]['rouge-1']['f'], axis=1)

In [None]:
data["abstractive_rouge-1-1"] = data.swifter.apply(lambda row: rouge.get_scores(row.abstractive_summary, row.summary)[0]['rouge-1']['f'], axis=1)

In [None]:
data["extractive_rouge-2-f1"] = data.swifter.apply(lambda row: rouge.get_scores(row.extractive_summary, row.summary)[0]['rouge-2']['f'], axis=1)

In [None]:
data["abstractive_rouge-2-f1"] = data.swifter.apply(lambda row: rouge.get_scores(row.abstractive_summary, row.summary)[0]['rouge-2']['f'], axis=1)

In [None]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu


def calculate_sentence_bleu(reference: str, candidate: str):
    """Calculate the sentence BLEU score."""
    if candidate is not None:
        return sentence_bleu([reference.split()], candidate.split())
    else:
        return 0


def calculate_corpus_bleu(reference: str, candidate: str):
    """Calculate the corpus BLEU score."""
    if candidate is not None:
        return corpus_bleu([reference.split()], candidate.split())
    else:
        return 0


def calculate_n_gram_bleu(reference: str, candidate: str, n: int):
    """Calculate the n-gram BLEU score."""
    if candidate is not None:
        if n == 1:
            return sentence_bleu(
                [reference.split()], candidate.split(), weights=(1, 0, 0, 0)
            )
        elif n == 2:
            return sentence_bleu(
                [reference.split()], candidate.split(), weights=(0, 1, 0, 0)
            )
        elif n == 3:
            return sentence_bleu(
                [reference.split()], candidate.split(), weights=(0, 0, 1, 0)
            )
        elif n == 4:
            return sentence_bleu(
                [reference.split()], candidate.split(), weights=(0, 0, 0, 1)
            )
    else:
        return 0


In [None]:
if not os.path.exists(PROJECT_PATH / "output"):
    os.makedirs(PROJECT_PATH / "output")

data.to_csv(PROJECT_PATH / "output/bbc_news_summaries.csv", index=False)