# LIN 353C Final Project: Extractive Text Summarization #

Our goal for this project is to perform extractive text summarization through various methods, from the most basic to more complicated methods. For this project, we will use the CNN/Daily Mail dataset.

### 01. Importing libraries + downloading packages

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install rouge
###
import nltk
nltk.download('punkt')
nltk.download('stopwords')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import os
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from rouge import FilesRouge
import gensim.downloader as gensim_api
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

import math
from collections import Counter
import string
import re
from collections import OrderedDict

### 02. Downloading + processing data ###

In [None]:
# turn data into a dataframe and save the first 50 articles
test_data = pd.read_csv('/content/drive/My Drive/test.csv', encoding = 'utf-8')
df = test_data.iloc[:50]

# we don't need the id column
del df['id']

for i in range(50):
  # replace (CNN) tag from articles
  if '(CNN)' in df.at[i, 'article']:
    df.at[i, 'article'] = df.at[i, 'article'].replace('(CNN)', '')

# create a list containing each highlight
highlights_lst = []
for highlight in df['highlights']:
  highlight = highlight.replace('\n', '')
  highlights_lst.append(highlight)

# create a file for the highlights dataframe -- needed for rouge evaluation
hl_df = pd.DataFrame({'highlights': highlights_lst})
hl_df.to_csv('highlights.csv')

df.head()

# turing articles_lst into a dictionary of form {article_name: {word: count,...}}--- i'm not sure about this either
# article_freq_dict = {}
# inter_dict = {}
# for i in range(len(articles_lst)):
#     name = 'article' + str(i)
#     for sent in articles_lst[i]:
#         article_freq_dict[name] = Counter(sent)

Unnamed: 0,article,highlights
0,Ever noticed how plane seats appear to be gett...,Experts question if packed out planes are put...
1,A drunk teenage boy had to be rescued by secur...,Drunk teenage boy climbed into lion enclosure ...
2,Dougie Freedman is on the verge of agreeing a ...,Nottingham Forest are close to extending Dougi...
3,Liverpool target Neto is also wanted by PSG an...,Fiorentina goalkeeper Neto has been linked wit...
4,Bruce Jenner will break his silence in a two-h...,"Tell-all interview with the reality TV star, 6..."


In [None]:
# text processing and cleaning
tokenized_articles = []
tokenized_sentences = []
stop_words = set(stopwords.words('english'))

for article in df['article']:
  paragraph = []
  # sentence tokenize the article
  sentences = nltk.tokenize.sent_tokenize(article)
  tokenized_sentences.append(sentences)

  for sentence in sentences:
    # lowercase everything
    sentence = sentence.lower()
    # remove everything that is not a letter
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    # word tokenize each sentence
    words = nltk.tokenize.word_tokenize(sentence)

    # remove stopwords
    clean_sentence = []
    for word in words:
      if word not in stop_words:
        clean_sentence.append(word)

    # paragraph is a list of lists containing each sentence in the article
    paragraph.append(clean_sentence)


  tokenized_articles.append(paragraph)

# example article below
print(tokenized_articles[0])

[['ever', 'noticed', 'plane', 'seats', 'appear', 'getting', 'smaller', 'smaller'], ['increasing', 'numbers', 'people', 'taking', 'skies', 'experts', 'questioning', 'packed', 'planes', 'putting', 'passengers', 'risk'], ['say', 'shrinking', 'space', 'aeroplanes', 'uncomfortable', 'putting', 'health', 'safety', 'danger'], ['squabbling', 'arm', 'rest', 'shrinking', 'space', 'planes', 'putting', 'health', 'safety', 'danger'], ['week', 'u', 'consumer', 'advisory', 'group', 'set', 'department', 'transportation', 'said', 'public', 'hearing', 'government', 'happy', 'set', 'standards', 'animals', 'flying', 'planes', 'stipulate', 'minimum', 'amount', 'space', 'humans'], ['world', 'animals', 'rights', 'space', 'food', 'humans', 'said', 'charlie', 'leocha', 'consumer', 'representative', 'committee'], ['time', 'dot', 'faa', 'take', 'stand', 'humane', 'treatment', 'passengers'], ['could', 'crowding', 'planes', 'lead', 'serious', 'issues', 'fighting', 'space', 'overhead', 'lockers', 'crashing', 'elbow

In [None]:
print(len(tokenized_articles))

50


### 03. Text summarization###

#### First-sentence method

In [None]:
# in this section, fs means first-sentence

# create a list for each summary
fs_lst = []

for article in df['article']:
  # tokenize to get the first sentence
  new = nltk.tokenize.sent_tokenize(article)
  summary = new[0]
  fs_lst.append(summary)

# create a dataframe for summary
# and create a file since we will need a file to evaluate using rouge
fs_df = pd.DataFrame({'summary': fs_lst})
fs_df.to_csv('first_sentence.csv')

#### TextRank using word embeddings

In [None]:
# we use the GloVe embeddings trained on Wikipedia 2014 + Gigaword 5
# extract 100 dimension vectors
space = gensim_api.load("glove-wiki-gigaword-100")



In [None]:
import warnings
warnings.filterwarnings("ignore")
# now we get the vector representation of each sentence using word embeddings
# we will get the word embeddings for each word in each sentence, then take the avg of each sentence
from numpy.linalg import norm
article_scores = []

for article in tokenized_articles:
  sentence_vectors = []

  for sentence in article:
    # we create a zero vector of dim 100
    sentence_vector = np.zeros((100,))

    for word in sentence:
      # get the word embedding from gensim, skip if it doesn't exist
      try:
        vector = space[word]
      except:
        continue
      # add all vectors from each sentence
      sentence_vector += vector
      # divide by sentence size to get avg
      sentence_vector /= len(sentence)

    # append each sentence's avg vector to a list
    sentence_vectors.append(sentence_vector)

  dim = len(article)
  matrix = np.zeros([dim, dim])

  # now calculate the cosine similarity between each sentence vector
  # to form a similarity matrix
  for i in range(dim):
    for j in range(dim):
      if i != j:
        matrix[i][j] = cosine_similarity(sentence_vectors[i].reshape(1, 100), sentence_vectors[j].reshape(1, 100))

  # turn similarity matrix into a pagerank graph using networkx
  graph = nx.from_numpy_array(matrix)
  scores = nx.pagerank_numpy(graph)

  # get the sentence number for top 5 sentences with the highest scores
  indexes = list(dict(sorted(scores.items(), key = lambda x: x[1])))[:3]
  article_scores.append(indexes)

print(article_scores[0])
assert len(article_scores) == 50

[5, 7, 8]


In [None]:
# get summaries of top 5 sentences using the scores we got for each sentence
embeddings_summaries = []

for n in range(50):
  summary = ''
  for idx in article_scores[n]:
    summary += tokenized_sentences[n][idx]
    summary += ' '
  embeddings_summaries.append(summary)

# create a dataframe for summary
# and create a file since we will need a file to evaluate using rouge
# we = word embeddings

we_df = pd.DataFrame({'summary': embeddings_summaries})
we_df.to_csv('word_embeddings.csv')
print(we_df.head())

                                             summary
0  'In a world where animals have more rights to ...
1  A police spokesman said: 'He has been cautione...
2  That has not prevented Forest's ownership maki...
3  Liverpool target Neto is also wanted by PSG an...
4  'Bruce had silicone breast implants put in a f...


#### TF-IDF(Article)

In [None]:
# going from tokenized_articles, which is list of articles which is list of setnence which is list of word tokens,
# to frequency dictionary for each article:
article_freq_dict = {}

for i in range(len(tokenized_articles)):
    obj = Counter()
    name = 'article' + str(i)
    flat_list = []
    for sentence in tokenized_articles[i]:
        flat_list += sentence
        dict.update(obj, Counter(flat_list))
    article_freq_dict[name] = obj

# article_freq_dict looks like: {'article0': Counter{'the': 20, .......}}

In [None]:
# my tf-idf function:

def get_tf_idf(article, word, freq_dict):
    tf = math.log(freq_dict[article][word]+1)
    count = 0
    for i in range(len(freq_dict)):
        if word in  article_freq_dict['article'+str(i)]:
            count+= 1
    idf = math.log(len(freq_dict)/count)
    return tf*idf

# this returns a value

In [None]:
# my scoring function

def tf_idf_scorer(article, full_text, freq_dict):
    scores = {}
    article_text = full_text[int(article[-1])]
    for i in range(len(article_text)):
        scores["sentence"+str(i)] = 0
        for k in range(len(article_text[i])):
            scores["sentence"+str(i)] += get_tf_idf(article, article_text[i][k], freq_dict)
    return scores

# this returns a dictionary of form: {'sentence0': tf-df value, ....}

In [None]:
# this returns the sentences with the highest scores

def summarizer(article, fully_tokenized, sentences, freq_dict, number):
    scores = tf_idf_scorer(article, fully_tokenized, freq_dict)
    ordered_scores = sorted(scores.items(), key = lambda x:x[1], reverse = True)
    article_text = fully_tokenized[int(article[-1])]
    inter_summary = []

    inter_summary += (ordered_scores[:number])

    indexes = []
    for tup in inter_summary:
        if tup[0][-2:].isdigit():
            indexes.append(int(tup[0][-2:]))
        else:
            indexes.append(int(tup[0][-1:]))

    summary_list = []
    for index in indexes:
        summary_list.append(sentences[int(article[-1])][index])

    summary = ''
    for sentence in summary_list:
        for word in sentence:
            summary += word



    return summary

In [None]:
print(tokenized_sentences[0])
print(tokenized_articles[0])

['Ever noticed how plane seats appear to be getting smaller and smaller?', 'With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk.', "They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger.", 'More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger?', "This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans.", "'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.", "'It is time that the DOT and FAA take a stand for humane treatment of passengers.'", 'But could crowding on planes lead to more serious issues t

In [None]:
tf_idf_summaries = []
for i in range(50):
  tf_idf_summaries.append((summarizer('article'+str(i), tokenized_articles, tokenized_sentences, article_freq_dict, 5)))

tf_idf_df = pd.DataFrame({'summary': tf_idf_summaries})
tf_idf_df.to_csv('tf_idf.csv')
print(tf_idf_df.head())

                                             summary
0  While United Airlines has 30 inches of space, ...
1  Next level drunk: Intoxicated Rahul Kumar, 17,...
2  Freedman has stabilised Forest since he replac...
3  Liverpool target Neto is also wanted by PSG an...
4  Speaking out: Bruce Jenner, pictured on 'Keepi...


In [None]:
print(summarizer('article0', tokenized_articles, tokenized_sentences, article_freq_dict, 5))

While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches.British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31.But these tests are conducted using planes with 31 inches between each row of seats, a standard which on some airlines has decreased, reported the Detroit News.Many economy seats on United Airlines have 30 inches of room, while some airlines offer as little as 28 inches .This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans.


### ALT TF-IDF (SENTENCES)

In [None]:
alt_freq_dict = {}
inter_dict = {}

for i in range(len(tokenized_articles)):
    name = 'article' + str(i)
    inter_dict = {}
    for k in range(len(tokenized_articles[i])):
        inter_dict['sentence'+str(k)] = Counter(tokenized_articles[i][k])

    alt_freq_dict[name] = inter_dict

In [None]:
def alt_get_tf_idf(article, sentence, word, fully_tokenized, freq_dict):
    tf = math.log(freq_dict[article][sentence][word]+1)
    count = 0
    for sent in fully_tokenized[int(article[-1])]:
        if word in sent:
            count+= 1
    idf = math.log(len(fully_tokenized[int(article[-1])])/count)
    return tf*idf

In [None]:
def alt_tf_idf_scorer(article, fully_tokenized, freq_dict):
    scores = {}
    article_text = fully_tokenized[int(article[-1])]
    for i in range(len(article_text)):
        sentence = 'sentence'+str(i)
        scores[sentence] = 0
        for k in range(len(article_text[i])):
            scores[sentence] += alt_get_tf_idf(article, sentence, article_text[i][k], fully_tokenized, freq_dict)
    return scores

In [None]:
def alt_summarizer(article, fully_tokenized, sentences, freq_dict, number):
    scores = alt_tf_idf_scorer(article, fully_tokenized, freq_dict)
    ordered_scores = sorted(scores.items(), key = lambda x:x[1], reverse = True)
    article_text = fully_tokenized[int(article[-1])]
    inter_summary = []

    inter_summary += (ordered_scores[:number])

    indexes = []
    for tup in inter_summary:
        if tup[0][-2:].isdigit():
            indexes.append(int(tup[0][-2:]))
        else:
            indexes.append(int(tup[0][-1:]))

    summary_list = []
    for index in indexes:
        summary_list.append(sentences[int(article[-1])][index])

    summary = ''
    for sentence in summary_list:
        for word in sentence:
            summary += word



    return summary

In [None]:
print(alt_summarizer('article0', tokenized_articles, tokenized_sentences, alt_freq_dict, 3 ))

This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans.While United Airlines has 30 inches of space, Gulf Air economy seats have between 29 and 32 inches, Air Asia offers 29 inches and Spirit Airlines offers just 28 inches.British Airways has a seat pitch of 31 inches, while easyJet has 29 inches, Thomson's short haul seat pitch is 28 inches, and Virgin Atlantic's is 30-31.


In [None]:
alt_tf_idf_summaries = []
for i in range(50):
  alt_tf_idf_summaries.append((alt_summarizer('article'+str(i), tokenized_articles, tokenized_sentences, alt_freq_dict, 5)))

alt_tf_idf_df = pd.DataFrame({'summary': alt_tf_idf_summaries})
alt_tf_idf_df.to_csv('alt_tf_idf.csv')
print(alt_tf_idf_df.head())

KeyError: ignored

### 05. Evaluation for all algorithms

For our evaluation, we will use the ROUGE metric. *more info about rouge* We will use the Python library rouge to calculate the average score across all summaries.

##### First-sentence evaluation

In [None]:
files_rouge = FilesRouge()

ref_path = '/content/highlights.csv'
hyp_path = '/content/first_sentence.csv'
scores = files_rouge.get_scores(hyp_path, ref_path, avg=True)
print(scores)

{'rouge-1': {'r': 0.20554431682706864, 'p': 0.34040600685658673, 'f': 0.24848531754958594}, 'rouge-2': {'r': 0.07532355654981401, 'p': 0.12417890238162313, 'f': 0.09104541805334229}, 'rouge-l': {'r': 0.18619969323731397, 'p': 0.3095018892499339, 'f': 0.22542566250795346}}


##### Word embeddings evaluation

In [None]:
ref_path = '/content/highlights.csv'
hyp_path = '/content/word_embeddings.csv'
scores = files_rouge.get_scores(hyp_path, ref_path, avg = True)
print(scores)

{'rouge-1': {'r': 0.3122615692327619, 'p': 0.24340894037853636, 'f': 0.2621363264064126}, 'rouge-2': {'r': 0.09991188814894493, 'p': 0.07951737109637602, 'f': 0.08397359671211209}, 'rouge-l': {'r': 0.27641919476518373, 'p': 0.21590890958049447, 'f': 0.23226379623461724}}


##### TF-IDF evaluation

In [None]:
ref_path = '/content/highlights.csv'
hyp_path = '/content/tf_idf.csv'
scores = files_rouge.get_scores(hyp_path, ref_path, avg = True)
print(scores)

{'rouge-1': {'r': 0.23472882254313587, 'p': 0.1078323712775791, 'f': 0.14366650448554197}, 'rouge-2': {'r': 0.04632396033636655, 'p': 0.015411353810105565, 'f': 0.022531829820967745}, 'rouge-l': {'r': 0.220868608704233, 'p': 0.1016772326445643, 'f': 0.13540798067271909}}


### ALT TF-IDF (SENTENCES) Evaluation

In [None]:
ref_path = '/content/highlights.csv'
hyp_path = '/content/alt_tf_idf.csv'
scores = files_rouge.get_scores(hyp_path, ref_path, avg = True)
print(scores)

{'rouge-1': {'r': 0.24114747784123974, 'p': 0.10481348896749221, 'f': 0.14209125191410993}, 'rouge-2': {'r': 0.05148674904159248, 'p': 0.01609219449530112, 'f': 0.023945419244644838}, 'rouge-l': {'r': 0.22727144413772205, 'p': 0.09938519854765543, 'f': 0.1345063764191946}}
