# <p style="text-align: center;">Methods Of Extractive Text Summarization</p>

## <p style="text-align: center;">Preparation</p>
---

In [2]:
# Installing libraries.
#%pip install rouge_score

# Imports.
import pandas as pd
import numpy as np
import string
import re
import operator
import random
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.decomposition import NMF
import nltk
from nltk.probability import FreqDist
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from rouge_score import rouge_scorer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wjsci\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wjsci\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Importing datasets.
test = pd.read_csv('test.csv')
test.to_pickle("./test.pkl")

## <p style="text-align: center;">Summarization Parameters</p>
---
Sentence count refers to the amount of sentences that each method will select for the purpose of extractive text summarization. Increasing this number will generate longer summaries. The evaluation of each method involves the ROUGE-1 standard, which compares unigrams.

In [4]:
# Summary parameters.
max_sentence_count = 2
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

## <p style="text-align: center;">Pre-Processing</p>
---
The following function details the text cleaning process used on sentence strings before they are scored. It involves the removal of stopwords and punctuation, the use of a porter stemmer, and the decapitalization of every character. 

In [5]:
def clean(data):
    # setting stop words.
    stops = set(stopwords.words('english'))
    # creating stemmer.
    porter = PorterStemmer()
    # remove punctuation.
    for punc in string.punctuation:
        data = data.replace(punc, '')
    data = data.replace('‘', '')
    data = data.replace('’', '')
    data = data.replace('“', '')
    # remove numbers
    data = re.sub(r'\d+','NUM',data)
    # remove non ascii characters.
    chars = set(string.printable)
    filter(lambda x: x in chars, data)
    # convert to lower case.
    data = data.lower()
    # tokenizing.
    tokens = word_tokenize(data)
    # removing stop words.
    filteredTokens = []
    for token in tokens:
        if token not in stops:
            filteredTokens.append(token)
    # stemming.
    for i in range (len(filteredTokens)):
        # stem.
        filteredTokens[i] = porter.stem(filteredTokens[i])
    # recreating string.
    message = ' '
    message = message.join(filteredTokens)
    # returning.
    return message

## <p style="text-align: center;">Baseline</p>

### <p style="text-align: center;">Random Selection</p>
---

This method represents a baseline with which to compare the following methods to. It selects K random sentences from each article to represent the summary.

In [308]:
def randomized():    
    # Evaluation values.
    stats = ['r1', 'r2', 'rL']
    avg_precision = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_recall = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_fmeasure = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    # Summarize every article.
    for a, article in enumerate(test.loc[0:9999, "article"]):
        # Counter.
        if (a % 1000 == 0 and a > 0):
            print(f'{a} articles summarized.')
            for stat in stats:
                print(f'{stat}')
                print(f'Average Precision: {avg_precision[stat]/(a)}')
                print(f'Average Recall: {avg_recall[stat]/(a)}')
                print(f'Average FMeasure: {avg_fmeasure[stat]/(a)}')
        # Initial blank summary and sentence scores.
        summary = []
        # Tokenize the article by sentences.
        sentences = sent_tokenize(article)
        # Setting summary sentence count.
        sentence_count = min(max_sentence_count, len(sentences))
        # Getting random sentences.
        sentence_ids = random.sample(range(0, len(sentences)), sentence_count)
        for sentence_id in sentence_ids:
            summary.append(sentences[sentence_id])
        candidate = ' '.join(summary)
        reference = test.loc[a,"highlights"]
        # Scoring.
        scores = scorer.score(reference, candidate)
        for stat, key in zip(stats, scores):
            avg_precision[stat] += scores[key][0]
            avg_recall[stat] += scores[key][1]
            avg_fmeasure[stat] += scores[key][2]
    for stat in stats:
        avg_precision[stat] /= (a+1)
        avg_recall[stat] /= (a+1)
        avg_fmeasure[stat] /= (a+1)
    print("\n=====")
    print("Final scores.")
    for stat in stats:
        print(f'{stat}')
        print(f'Average Precision: {avg_precision[stat]}')
        print(f'Average Recall: {avg_recall[stat]}')
        print(f'Average FMeasure: {avg_fmeasure[stat]}')

# Run.
randomized()

1000 articles summarized.
r1
Average Precision: 0.2941267710810191
Average Recall: 0.24796226056831208
Average FMeasure: 0.2529957733977247
r2
Average Precision: 0.0822413232214134
Average Recall: 0.07022443574955406
Average FMeasure: 0.07096243276214713
rL
Average Precision: 0.1904044432022451
Average Recall: 0.158891988998144
Average FMeasure: 0.1622936651470126
2000 articles summarized.
r1
Average Precision: 0.29702421189954664
Average Recall: 0.24886382943219473
Average FMeasure: 0.2556097980660546
r2
Average Precision: 0.08370922583913025
Average Recall: 0.07098957503701919
Average FMeasure: 0.07256688903585656
rL
Average Precision: 0.19197777325288998
Average Recall: 0.15931205610033664
Average FMeasure: 0.16400349400039482
3000 articles summarized.
r1
Average Precision: 0.29712555664453094
Average Recall: 0.24750256935270923
Average FMeasure: 0.2549456326387416
r2
Average Precision: 0.08273229840576247
Average Recall: 0.06946926882140182
Average FMeasure: 0.07119002700035389
rL


## <p style="text-align: center;">Method 1</p>

### <p style="text-align: center;">TF-IDF Combination</p>
---

This method involves gathering the TF-IDF scores of each word in the vocabulary of an article in relation to each document of the article. In this case, each sentence is considered its own "document" within the article. Once the TF-IDF matrix of size |D| x |V| is calculated, the word scores for each sentence/document are summed together in order to obtain a final per sentence scoring. This does not occur per word token in each sentence, but rather involves summing the |V| values for each word in the vocabulary per document. In this way, sentences that contain more unusual words will tend to score higher and be selected for summarization. Blank sentences are removed prior to TFIDF embedding.

In [310]:
def tfidfcombo():    
    # Evaluation values.
    stats = ['r1', 'r2', 'rL']
    avg_precision = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_recall = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_fmeasure = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    # Vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=False)
    # Summarize every article.
    for a, article in enumerate(test.loc[0:9999, "article"]):
        # Counter.
        if (a % 1000 == 0 and a > 0):
            print(f'{a} articles summarized.')
            for stat in stats:
                print(f'{stat}')
                print(f'Average Precision: {avg_precision[stat]/(a)}')
                print(f'Average Recall: {avg_recall[stat]/(a)}')
                print(f'Average FMeasure: {avg_fmeasure[stat]/(a)}')
        # Initial blank summary and sentence scores.
        summary = []
        id2score = {}
        # Tokenize the article by sentences.
        sentences = sent_tokenize(article)
        final = sent_tokenize(article)
        # For storing indices of blank sentences.
        blanks = []
        # Clean each sentence.
        for i, sentence in enumerate(sentences):
            sentences[i] = clean(sentence)
            if sentences[i] == "" or sentences[i] == " ":
                blanks.append(i)
        # Remove blank sentences.
        for i in range(len(blanks) - 1, -1, -1):
            sentences.pop(blanks[i])
            final.pop(blanks[i])
        # Setting summary sentence count.
        sentence_count = min(max_sentence_count, len(sentences))
        # Calculate TF-IDF scores.
        tf_idf = vectorizer.fit_transform(sentences).todense()
        # Sum each sentence's |V| TF-IDF values.
        for i in range(tf_idf.shape[0]):
            score = 0
            for j in range(tf_idf.shape[1]):
                score += tf_idf[i,j]
            id2score[i] = score
        # Selecting K highest-scoring sentences for summarization.
        for k in range(sentence_count):
            sentence_id = max(id2score.items(), key=operator.itemgetter(1))[0]
            id2score.pop(sentence_id)
            summary.append(final[sentence_id])
        candidate = ' '.join(summary)
        reference = test.loc[a,"highlights"]
        # Scoring.
        scores = scorer.score(reference, candidate)
        for stat, key in zip(stats, scores):
            avg_precision[stat] += scores[key][0]
            avg_recall[stat] += scores[key][1]
            avg_fmeasure[stat] += scores[key][2]
    for stat in stats:
        avg_precision[stat] /= (a+1)
        avg_recall[stat] /= (a+1)
        avg_fmeasure[stat] /= (a+1)
    print("\n=====")
    print("Final scores.")
    for stat in stats:
        print(f'{stat}')
        print(f'Average Precision: {avg_precision[stat]}')
        print(f'Average Recall: {avg_recall[stat]}')
        print(f'Average FMeasure: {avg_fmeasure[stat]}')

# Run.
tfidfcombo()
        


1000 articles summarized.
r1
Average Precision: 0.2599599147054455
Average Recall: 0.39867837836593795
Average FMeasure: 0.30270692852086983
r2
Average Precision: 0.08623954812325758
Average Recall: 0.12764525932259554
Average FMeasure: 0.09870898742057657
rL
Average Precision: 0.15768006827309142
Average Recall: 0.2430350554041168
Average FMeasure: 0.18361663105652826
2000 articles summarized.
r1
Average Precision: 0.26066740456421605
Average Recall: 0.39651511283743296
Average FMeasure: 0.30277339566404765
r2
Average Precision: 0.08505318410980937
Average Recall: 0.1254232644690386
Average FMeasure: 0.0972175336357825
rL
Average Precision: 0.15649400887661932
Average Recall: 0.23994061952108134
Average FMeasure: 0.18205907764620052
3000 articles summarized.
r1
Average Precision: 0.26153421045458375
Average Recall: 0.3967156816556744
Average FMeasure: 0.3036787167525011
r2
Average Precision: 0.08506058077837579
Average Recall: 0.12521071299751027
Average FMeasure: 0.09733461378926134


## <p style="text-align: center;">Method 2</p>

### <p style="text-align: center;">TF-IDF Summation</p>
---

This method involves gathering the TF-IDF scores of each word in the vocabulary of an article in relation to each document of the article. In this case, each sentence is considered its own "document" within the article. Once the TF-IDF matrix of size |D| x |V| is calculated, the word scores for each word in each sentence/document are summed together in order to obtain a final per sentence scoring. This is a slightly different approach compared with method one, because we sum scores for each word token in a sentence rather than summing the |V| TF-IDF scores in the TF-IDF matrix per sentence. This has the downside of immediately giving longer sentences a more positive value, skewing recall and precision. Blank sentences are removed prior to TFIDF embedding.

In [311]:
def tfidfsum():    
    # Evaluation values.
    stats = ['r1', 'r2', 'rL']
    avg_precision = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_recall = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_fmeasure = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    # Vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=False)
    # Summarize every article.
    for a, article in enumerate(test.loc[0:9999, "article"]):
        # Counter.
        if (a % 1000 == 0 and a > 0):
            print(f'{a} articles summarized.')
            for stat in stats:
                print(f'{stat}')
                print(f'Average Precision: {avg_precision[stat]/(a)}')
                print(f'Average Recall: {avg_recall[stat]/(a)}')
                print(f'Average FMeasure: {avg_fmeasure[stat]/(a)}')
        # Initial blank summary and sentence scores.
        summary = []
        id2score = {}
        # Tokenize the article by sentences.
        sentences = sent_tokenize(article)
        final = sent_tokenize(article)
        # For storing indices of blank sentences.
        blanks = []
        # Clean each sentence.
        for i, sentence in enumerate(sentences):
            sentences[i] = clean(sentence)
            if sentences[i] == "" or sentences[i] == " ":
                blanks.append(i)
        # Remove blank sentences.
        for i in range(len(blanks) - 1, -1, -1):
            sentences.pop(blanks[i])
            final.pop(blanks[i])
        # Get average sentence length.
        avg_length = 0
        for sentence in sentences:
            avg_length += len(word_tokenize(sentence))
        # Setting summary sentence count.
        sentence_count = min(max_sentence_count, len(sentences))
        # Calculate TF-IDF scores
        tf_idf = vectorizer.fit_transform(sentences).todense()
        # Get feature names (words).
        feature_names = vectorizer.get_feature_names_out()
        # Sum each word's TF-IDF values per sentence, then average across sentence length.
        for i, sentence in enumerate(sentences):
            score = 0
            words = word_tokenize(sentence)
            length = len(words)
            for word in words:
                score += tf_idf[i, np.where(feature_names == word)]
            # Apply final penalty.
            penalty = 0
            #penalty = abs((min(length, avg_length) - avg_length) / avg_length)**1.5
            #penalty = abs((length - avg_length) / avg_length)**2
            id2score[i] = (score - penalty)
        # Selecting K highest-scoring sentences for summarization.
        for k in range(sentence_count):
            sentence_id = max(id2score.items(), key=operator.itemgetter(1))[0]
            id2score.pop(sentence_id)
            summary.append(final[sentence_id])
        candidate = ' '.join(summary)
        reference = test.loc[a,"highlights"]
        # Scoring.
        scores = scorer.score(reference, candidate)
        for stat, key in zip(stats, scores):
            avg_precision[stat] += scores[key][0]
            avg_recall[stat] += scores[key][1]
            avg_fmeasure[stat] += scores[key][2]
    for stat in stats:
        avg_precision[stat] /= (a+1)
        avg_recall[stat] /= (a+1)
        avg_fmeasure[stat] /= (a+1)
    print("\n=====")
    print("Final scores.")
    for stat in stats:
        print(f'{stat}')
        print(f'Average Precision: {avg_precision[stat]}')
        print(f'Average Recall: {avg_recall[stat]}')
        print(f'Average FMeasure: {avg_fmeasure[stat]}')

# Run.
tfidfsum()
        


1000 articles summarized.
r1
Average Precision: 0.2533201965235825
Average Recall: 0.38492604092016025
Average FMeasure: 0.2934175761285802
r2
Average Precision: 0.08267410308629339
Average Recall: 0.12138264846699072
Average FMeasure: 0.09423580309800486
rL
Average Precision: 0.15389022111327263
Average Recall: 0.23475141396068475
Average FMeasure: 0.17818600657196224
2000 articles summarized.
r1
Average Precision: 0.2526791277968233
Average Recall: 0.38075564808044876
Average FMeasure: 0.29209448847152564
r2
Average Precision: 0.08123949457998794
Average Recall: 0.11873973914031216
Average FMeasure: 0.09249167584541106
rL
Average Precision: 0.15315458473822477
Average Recall: 0.23213265548157241
Average FMeasure: 0.177166862287821
3000 articles summarized.
r1
Average Precision: 0.25352806507869363
Average Recall: 0.38064877830578986
Average FMeasure: 0.2928056882861603
r2
Average Precision: 0.0812244901699128
Average Recall: 0.11840221488613908
Average FMeasure: 0.0925048665570376
rL

## <p style="text-align: center;">Method 3</p>

### <p style="text-align: center;">TF-IDF Averaging</p>
---

This method involves gathering the TF-IDF scores of each word in the vocabulary of an article in relation to each document of the article. In this case, each sentence is considered its own "document" within the article. Once the TF-IDF matrix of size |D| x |V| is calculated, the word scores for each word in each sentence/document are summed together and then averaged in order to obtain a final per sentence scoring. This is a slightly different approach compared with method one, because we sum scores for each word token in a sentence rather than summing the |V| TF-IDF scores in the TF-IDF matrix per sentence. The averaging helps to avoid the bias of choosing longer sentences, while a penalty term helps to avoid choosing sentences that are too short. Blank sentences are removed prior to TFIDF embedding.

In [312]:
def tfidfavg():    
    # Evaluation values.
    stats = ['r1', 'r2', 'rL']
    avg_precision = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_recall = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_fmeasure = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    # Vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=False)
    # Summarize every article.
    for a, article in enumerate(test.loc[0:9999, "article"]):
        # Counter.
        if (a % 1000 == 0 and a > 0):
            print(f'{a} articles summarized.')
            for stat in stats:
                print(f'{stat}')
                print(f'Average Precision: {avg_precision[stat]/(a)}')
                print(f'Average Recall: {avg_recall[stat]/(a)}')
                print(f'Average FMeasure: {avg_fmeasure[stat]/(a)}')
        # Initial blank summary and sentence scores.
        summary = []
        id2score = {}
        # Tokenize the article by sentences.
        sentences = sent_tokenize(article)
        final = sent_tokenize(article)
        # For storing indices of blank sentences.
        blanks = []
        # Clean each sentence.
        for i, sentence in enumerate(sentences):
            sentences[i] = clean(sentence)
            if sentences[i] == "" or sentences[i] == " ":
                blanks.append(i)
        # Remove blank sentences.
        for i in range(len(blanks) - 1, -1, -1):
            sentences.pop(blanks[i])
            final.pop(blanks[i])
        # Get average sentence length.
        avg_length = 0
        for sentence in sentences:
            avg_length += len(word_tokenize(sentence))
        avg_length /= len(sentences)
        # Setting summary sentence count.
        sentence_count = min(max_sentence_count, len(sentences))
        # Calculate TF-IDF scores
        tf_idf = vectorizer.fit_transform(sentences).todense()
        # Get feature names (words).
        feature_names = vectorizer.get_feature_names_out()
        # Sum each word's TF-IDF values per sentence, then average across sentence length.
        for i, sentence in enumerate(sentences):
            score = 0
            words = word_tokenize(sentence)
            length = len(words)
            for word in words:
                score += tf_idf[i, np.where(feature_names == word)]
            score /= length
            # Apply final penalty.
            #penalty = 0
            penalty = abs((min(length, avg_length) - avg_length) / avg_length)**1.5
            #penalty = abs((length - avg_length) / avg_length)**2
            id2score[i] = (score - penalty)
        # Selecting K highest-scoring sentences for summarization.
        for k in range(sentence_count):
            sentence_id = max(id2score.items(), key=operator.itemgetter(1))[0]
            id2score.pop(sentence_id)
            summary.append(final[sentence_id])
        candidate = ' '.join(summary)
        reference = test.loc[a,"highlights"]
        # Scoring.
        scores = scorer.score(reference, candidate)
        for stat, key in zip(stats, scores):
            avg_precision[stat] += scores[key][0]
            avg_recall[stat] += scores[key][1]
            avg_fmeasure[stat] += scores[key][2]
    for stat in stats:
        avg_precision[stat] /= (a+1)
        avg_recall[stat] /= (a+1)
        avg_fmeasure[stat] /= (a+1)
    print("\n=====")
    print("Final scores.")
    for stat in stats:
        print(f'{stat}')
        print(f'Average Precision: {avg_precision[stat]}')
        print(f'Average Recall: {avg_recall[stat]}')
        print(f'Average FMeasure: {avg_fmeasure[stat]}')

# Run.
tfidfavg()

1000 articles summarized.
r1
Average Precision: 0.2786637482263132
Average Recall: 0.249010544689199
Average FMeasure: 0.2512942160039919
r2
Average Precision: 0.07556382490606885
Average Recall: 0.06686577850004738
Average FMeasure: 0.06735130835812841
rL
Average Precision: 0.1767646588599449
Average Recall: 0.15837511430074977
Average FMeasure: 0.1594186813898363
2000 articles summarized.
r1
Average Precision: 0.27774485723797276
Average Recall: 0.24665597390406174
Average FMeasure: 0.25047471670146304
r2
Average Precision: 0.07344743846525444
Average Recall: 0.06486092064009288
Average FMeasure: 0.06572787572578308
rL
Average Precision: 0.17560982502932637
Average Recall: 0.1567312224855122
Average FMeasure: 0.15860384789610785
3000 articles summarized.
r1
Average Precision: 0.27676853346914804
Average Recall: 0.24499243469021323
Average FMeasure: 0.24940950485988556
r2
Average Precision: 0.07280643386814516
Average Recall: 0.06389101716041289
Average FMeasure: 0.06504667560682234
r

## <p style="text-align: center;">Method 4</p>

### <p style="text-align: center;">TF-IDF Non-negative Matrix Factorization Summation</p>
---

This method involves gathering the TF-IDF scores of each word in the vocabulary of an article in relation to each document of the article. Each sentence is considered its own "document" within the article. Once the TF-IDF matrix of size |D| x |V| is calculated, non-negative matrix factorization can be used to reduce the matrix into two matrices of lesser dimensionality of size |D| x F and F x |V| where F is the number of latent features present in the article. Each of the |D| rows in the |D| x F matrix can be summed together in order to obtain a final per sentence scoring. In this way, sentences that generalize well over the core features of the article will be selected for summarization. Blank sentences are removed prior to TFIDF embedding.

In [313]:
def nnmf():    
    # Evaluation values.
    stats = ['r1', 'r2', 'rL']
    avg_precision = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_recall = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_fmeasure = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    # Vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=False)
    # Summarize every article.
    for a, article in enumerate(test.loc[0:9999, "article"]):
        # Counter.
        if (a % 1000 == 0 and a > 0):
            print(f'{a} articles summarized.')
            for stat in stats:
                print(f'{stat}')
                print(f'Average Precision: {avg_precision[stat]/(a)}')
                print(f'Average Recall: {avg_recall[stat]/(a)}')
                print(f'Average FMeasure: {avg_fmeasure[stat]/(a)}')
        # Initial blank summary.
        summary = []
        id2score = {}
        # Tokenize the article by sentences.
        sentences = sent_tokenize(article)
        final = sent_tokenize(article)
        # For storing indices of blank sentences.
        blanks = []
        # Clean each sentence.
        for i, sentence in enumerate(sentences):
            sentences[i] = clean(sentence)
            if sentences[i] == "" or sentences[i] == " ":
                blanks.append(i)
        # Remove blank sentences.
        for i in range(len(blanks) - 1, -1, -1):
            sentences.pop(blanks[i])
            final.pop(blanks[i])
        # Setting summary sentence count.
        sentence_count = min(max_sentence_count, len(sentences))
        # Calculate TF-IDF scores.
        tf_idf = vectorizer.fit_transform(sentences)
        # Non-negative matrix factorization.
        nnmf = NMF(n_components=12, init='random', random_state=0, max_iter=2000)
        factors = nnmf.fit_transform(tf_idf)
        # Sum each sentence's factor values.
        for i in range(factors.shape[0]):
            score = 0
            for j in range(factors.shape[1]):
                score += factors[i,j]
            id2score[i] = score
        # Selecting K highest-scoring sentences for summarization.
        for k in range(sentence_count):
            sentence_id = max(id2score.items(), key=operator.itemgetter(1))[0]
            id2score.pop(sentence_id)
            summary.append(final[sentence_id])
        candidate = ' '.join(summary)
        reference = test.loc[a,"highlights"]
        # Scoring.
        scores = scorer.score(reference, candidate)
        for stat, key in zip(stats, scores):
            avg_precision[stat] += scores[key][0]
            avg_recall[stat] += scores[key][1]
            avg_fmeasure[stat] += scores[key][2]
    for stat in stats:
        avg_precision[stat] /= (a+1)
        avg_recall[stat] /= (a+1)
        avg_fmeasure[stat] /= (a+1)
    print("\n=====")
    print("Final scores.")
    for stat in stats:
        print(f'{stat}')
        print(f'Average Precision: {avg_precision[stat]}')
        print(f'Average Recall: {avg_recall[stat]}')
        print(f'Average FMeasure: {avg_fmeasure[stat]}')
    
nnmf()

1000 articles summarized.
r1
Average Precision: 0.3109971302293935
Average Recall: 0.294051461080157
Average FMeasure: 0.28537794048078113
r2
Average Precision: 0.10469410700970007
Average Recall: 0.09816019647448429
Average FMeasure: 0.09568954089746394
rL
Average Precision: 0.2061267193940587
Average Recall: 0.19324685419277968
Average FMeasure: 0.187871721475997




2000 articles summarized.
r1
Average Precision: 0.30582607105851406
Average Recall: 0.28777285664917884
Average FMeasure: 0.28064585492356864
r2
Average Precision: 0.0986452138136795
Average Recall: 0.09302416553759871
Average FMeasure: 0.09056332576675309
rL
Average Precision: 0.20120788833075468
Average Recall: 0.18887893273117612
Average FMeasure: 0.1839384800908994
3000 articles summarized.
r1
Average Precision: 0.3068565226864782
Average Recall: 0.28657027609628977
Average FMeasure: 0.2807813193872062
r2
Average Precision: 0.09925040543238371
Average Recall: 0.093110199053022
Average FMeasure: 0.09092292044806086
rL
Average Precision: 0.20212777018592507
Average Recall: 0.18803017774216838
Average FMeasure: 0.18404669663971535
4000 articles summarized.
r1
Average Precision: 0.3043886530215791
Average Recall: 0.28737351843261555
Average FMeasure: 0.280177819925041
r2
Average Precision: 0.09736584470009543
Average Recall: 0.09245708452654036
Average FMeasure: 0.08986440207245577
rL


## <p style="text-align: center;">Method 5</p>

### <p style="text-align: center;">TF-IDF K-Means Clustering</p>
---

This method involves gathering the TF-IDF scores of each word in the vocabulary of an article in relation to each document of the article. Each sentence is considered its own "document" within the article. Once the TF-IDF matrix of size |D| x |V| is calculated, each sentence can be thought of as a vector of size |V| where each cell is the TF-IDF value of a word in that sentence. These vectors can be used to obtain cosine similarities and cluster sentences into K groups, where K is the number of sentences desired in the final article summarization. The sentence closest to the center of each of these clusters will be chosen for the summary. In this way, sentences will avoid overlap in their textual and semantic similarities, instead covering a larger swath of the original article's meaning. Blank sentences are removed prior to TFIDF embedding.

In [314]:
def tfidfkmeans():  
    # Evaluation values.
    stats = ['r1', 'r2', 'rL']
    avg_precision = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_recall = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_fmeasure = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    # Vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=False)
    # Summarize every article.
    for a, article in enumerate(test.loc[0:9999, "article"]):
        # Counter.
        if (a % 1000 == 0 and a > 0):
            print(f'{a} articles summarized.')
            for stat in stats:
                print(f'{stat}')
                print(f'Average Precision: {avg_precision[stat]/(a)}')
                print(f'Average Recall: {avg_recall[stat]/(a)}')
                print(f'Average FMeasure: {avg_fmeasure[stat]/(a)}')
        # Initial blank summary.
        summary = []
        # Tokenize the article by sentences.
        sentences = sent_tokenize(article)
        final = sent_tokenize(article)
        # For storing indices of blank sentences.
        blanks = []
        # Clean each sentence.
        for i, sentence in enumerate(sentences):
            sentences[i] = clean(sentence)
            if sentences[i] == "" or sentences[i] == " ":
                blanks.append(i)
        # Remove blank sentences.
        for i in range(len(blanks) - 1, -1, -1):
            sentences.pop(blanks[i])
            final.pop(blanks[i])
        # Setting summary sentence count.
        sentence_count = min(max_sentence_count, len(sentences))
        # Calculate TF-IDF scores.
        tf_idf = vectorizer.fit_transform(sentences)
        # K-Means clustering.
        kmeans = KMeans(n_clusters=sentence_count, random_state=0)
        # Compute K clusters.
        sentence_ids = []
        kmeans.fit(tf_idf)
        # For each of K sets of points, one per cluster, compute the closest to cluster center.
        for i in range(kmeans.n_clusters):
            tf_idf_clustered = tf_idf[kmeans.labels_ == i]
            sentence_id, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, tf_idf_clustered)
            sentence_ids.append(sentence_id[i])
        # Find actual sentence ids from original article.
        for i in range(len(sentence_ids)):
            count = sentence_ids[i]
            sentence_ids[i] = [j for j, n in enumerate(kmeans.labels_) if n == i][count]
        # Selecting sentences for summarization.
        for sentence_id in sentence_ids:
            summary.append(final[sentence_id])
        candidate = ' '.join(summary)
        reference = test.loc[a,"highlights"]
        # Scoring.
        scores = scorer.score(reference, candidate)
        for stat, key in zip(stats, scores):
            avg_precision[stat] += scores[key][0]
            avg_recall[stat] += scores[key][1]
            avg_fmeasure[stat] += scores[key][2]
    for stat in stats:
        avg_precision[stat] /= (a+1)
        avg_recall[stat] /= (a+1)
        avg_fmeasure[stat] /= (a+1)
    print("\n=====")
    print("Final scores.")
    for stat in stats:
        print(f'{stat}')
        print(f'Average Precision: {avg_precision[stat]}')
        print(f'Average Recall: {avg_recall[stat]}')
        print(f'Average FMeasure: {avg_fmeasure[stat]}')
    
tfidfkmeans()

1000 articles summarized.
r1
Average Precision: 0.3756438151973616
Average Recall: 0.3495646051299048
Average FMeasure: 0.3430318528861402
r2
Average Precision: 0.14584689192474393
Average Recall: 0.1333401863744245
Average FMeasure: 0.13150890416070668
rL
Average Precision: 0.24063836895979043
Average Recall: 0.2227905409449432
Average FMeasure: 0.21849746867633482
2000 articles summarized.
r1
Average Precision: 0.37560320477686737
Average Recall: 0.34741529917141856
Average FMeasure: 0.34286658504353257
r2
Average Precision: 0.14306554016733325
Average Recall: 0.13096963878768192
Average FMeasure: 0.1295046671922906
rL
Average Precision: 0.23950772936772302
Average Recall: 0.22077739878055283
Average FMeasure: 0.21756647027569642
3000 articles summarized.
r1
Average Precision: 0.3768218116547009
Average Recall: 0.34700420182878466
Average FMeasure: 0.3428320971945409
r2
Average Precision: 0.14303867735135747
Average Recall: 0.13063095824483087
Average FMeasure: 0.1292741123206082
rL


## <p style="text-align: center;">Method 6</p>

### <p style="text-align: center;">TF-IDF Non-negative Matrix Factorization K-Means Clustering</p>
---

This method involves gathering the TF-IDF scores of each word in the vocabulary of an article in relation to each document of the article. Each sentence is considered its own "document" within the article. Once the TF-IDF matrix of size |D| x |V| is calculated, non-negative matrix factorization can be used to reduce the matrix into two matrices of lesser dimensionality of size |D| x F and F x |V| where F is the number of latent features present in the article. The |D| x F matrix can be used to obtain cosine similarities and cluster sentences into K groups, where K is the number of sentences desired in the final article summarization. For each cluster, a member of that cluster closest to the center of the cluster will be chosen for the summary. In this way, sentences will avoid overlap in their textual and semantic similarities, instead covering a larger swath of the original article's meaning. Blank sentences are removed prior to TFIDF embedding.

In [6]:
def nnmfkmeans():  
    # Evaluation values.
    stats = ['r1', 'r2', 'rL']
    avg_precision = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_recall = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    avg_fmeasure = {
        'r1' : 0,
        'r2' : 0,
        'rL' : 0
    }
    # Vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, lowercase=False)
    # Summarize every article.
    for a, article in enumerate(test.loc[0:9999, "article"]):
        # Counter.
        if (a % 1000 == 0 and a > 0):
            print(f'{a} articles summarized.')
            for stat in stats:
                print(f'{stat}')
                print(f'Average Precision: {avg_precision[stat]/(a)}')
                print(f'Average Recall: {avg_recall[stat]/(a)}')
                print(f'Average FMeasure: {avg_fmeasure[stat]/(a)}')
        # Initial blank summary.
        summary = []
        # Tokenize the article by sentences.
        sentences = sent_tokenize(article)
        final = sent_tokenize(article)
        # For storing indices of blank sentences.
        blanks = []
        # Clean each sentence.
        for i, sentence in enumerate(sentences):
            sentences[i] = clean(sentence)
            if sentences[i] == "" or sentences[i] == " ":
                blanks.append(i)
        # Remove blank sentences.
        for i in range(len(blanks) - 1, -1, -1):
            sentences.pop(blanks[i])
            final.pop(blanks[i])
        # Setting summary sentence count.
        sentence_count = min(max_sentence_count, len(sentences))
        # Calculate TF-IDF scores.
        tf_idf = vectorizer.fit_transform(sentences)
        # Non-negative matrix factorization.
        nnmf = NMF(n_components=10, init='random', random_state=0, max_iter=2000)
        factors = nnmf.fit_transform(tf_idf)
        # K-Means clustering.
        kmeans = KMeans(n_clusters=sentence_count, random_state=0)
        # Compute K clusters.
        sentence_ids = []
        kmeans.fit(factors)
        # For each of K sets of points, one per cluster, compute the closest to cluster center.
        for i in range(kmeans.n_clusters):
            factors_clustered = factors[kmeans.labels_ == i]
            sentence_id, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, factors_clustered)
            sentence_ids.append(sentence_id[i])
        # Find actual sentence ids from original article.
        for i in range(len(sentence_ids)):
            count = sentence_ids[i]
            sentence_ids[i] = [j for j, n in enumerate(kmeans.labels_) if n == i][count]
        # Selecting sentences for summarization.
        for sentence_id in sentence_ids:
            summary.append(final[sentence_id])
        candidate = ' '.join(summary)
        reference = test.loc[a,"highlights"]
        # Scoring.
        scores = scorer.score(reference, candidate)
        for stat, key in zip(stats, scores):
            avg_precision[stat] += scores[key][0]
            avg_recall[stat] += scores[key][1]
            avg_fmeasure[stat] += scores[key][2]
    for stat in stats:
        avg_precision[stat] /= (a+1)
        avg_recall[stat] /= (a+1)
        avg_fmeasure[stat] /= (a+1)
    print("\n=====")
    print("Final scores.")
    for stat in stats:
        print(f'{stat}')
        print(f'Average Precision: {avg_precision[stat]}')
        print(f'Average Recall: {avg_recall[stat]}')
        print(f'Average FMeasure: {avg_fmeasure[stat]}')
    
nnmfkmeans()

1000 articles summarized.
r1
Average Precision: 0.301569889181544
Average Recall: 0.2595424970759193
Average FMeasure: 0.26284177220883653
r2
Average Precision: 0.08901698102077771
Average Recall: 0.07542069399586723
Average FMeasure: 0.07667078708739174
rL
Average Precision: 0.19581709298510488
Average Recall: 0.1674488494349567
Average FMeasure: 0.16957997437452516
2000 articles summarized.
r1
Average Precision: 0.30016572852206375
Average Recall: 0.25312418961119976
Average FMeasure: 0.2588922889823209
r2
Average Precision: 0.08583047950309097
Average Recall: 0.07193066178444159
Average FMeasure: 0.07349417734347045
rL
Average Precision: 0.19440044864685865
Average Recall: 0.16275301714098828
Average FMeasure: 0.16652453321361554
3000 articles summarized.
r1
Average Precision: 0.3014261497655132
Average Recall: 0.2538877219842601
Average FMeasure: 0.260040776504014
r2
Average Precision: 0.08625405721676663
Average Recall: 0.0725493852396443
Average FMeasure: 0.07418869736946586
rL
A