In [1]:
# Rouge score generation library
!pip install rouge
import rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
# Imports
import nltk
from nltk.corpus import stopwords
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity


from tqdm import tqdm 

import pandas as pd
import numpy as np

import json

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Downloads
# The 'stopwords' corpus is read by StripStopWords through stopwords.words('english').
# NOTE(review): no visible cell calls an NLTK tokenizer, so 'punkt' may be a
# leftover - confirm before removing it.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive

Mounted at /content/drive
/content/drive/MyDrive


# Dataset manipulation

In [5]:
# Open and load data from google drive
%%time

# Local file
filename = './wikihow_trimmed.json'
with open(filename, 'r', encoding = 'utf-8') as f:
    data = json.load(f)

CPU times: user 4.6 s, sys: 3.33 s, total: 7.93 s
Wall time: 12.5 s


In [6]:
# Using textrank on the test dataset
# Build the frame from the 'test' split and discard column 1 in the same
# expression, leaving column 0 (iterated sample by sample further down).
dataset = pd.DataFrame(data['test']).drop(columns = 1)

In [7]:
# Smallest len() of any entry in column 0 of the dataset.
# Stored under its own name: assigning to `min` would shadow the builtin min()
# for every later cell of the notebook.
# default = 0 covers an empty dataset, for which min() would otherwise raise.
MinSentences = min((len(sample) for sample in dataset[0]), default = 0)
print("Minimum sentence within the dataset:", MinSentences)

Minimum sentence within the dataset: 3


# Generation

In [8]:
def ReformatSample(sentences):
  '''
  Clean every sentence of one sample.

  Each sentence has every character outside a-z / A-Z replaced by a space, is
  lower-cased, split on whitespace and handed to StripStopWords, which returns
  the remaining words joined by single spaces.

  Input: Iterable of sentence strings (one sample)
  Output: List of cleaned sentence strings, in the same order as the input
  '''
  # dtype = object keeps the .str accessor usable even for an empty sample.
  cleaned = pd.Series(sentences, dtype = object).str.replace("[^a-zA-Z]", " ", regex = True).str.lower()

  # str.split() without arguments already drops leading, trailing and repeated
  # whitespace, so no separate space-collapsing pass is required before
  # tokenising.
  return [StripStopWords(sentence.split()) for sentence in cleaned]
  
def StripStopWords(sentence):
  '''
  Remove English stop words from one tokenised sentence.

  Input: List of words (ReformatSample passes the result of str.split())
  Output: Single string made of the remaining words joined by one space
  '''
  # Build the stop-word set on the first call only and keep it on the function
  # object, so later calls skip the corpus lookup and membership tests use a
  # set instead of scanning a list.
  StopWords = getattr(StripStopWords, '_StopWords', None)
  if StopWords is None:
    StopWords = frozenset(stopwords.words('english'))
    StripStopWords._StopWords = StopWords

  # Remove stop words
  return " ".join([word for word in sentence if word not in StopWords])

In [9]:
# Import glove word2vec
# NOTE(review): glove2word2vec is not called in any visible cell; the GloVe file
# is parsed by hand in the following cell - confirm before removing this import.
from gensim.scripts.glove2word2vec import glove2word2vec
# Local file
# Relative path, so it resolves against the current working directory.
GloveFileName = './glove.6B.100d.txt'

In [10]:
# Extract word vectors
%%time 

WordEmbeddings = {}
f = open(GloveFileName, encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:], dtype = 'float32')
    WordEmbeddings[word] = coeffs
f.close()

CPU times: user 10.3 s, sys: 430 ms, total: 10.7 s
Wall time: 11.7 s


In [11]:
# Length of a single embedding vector, read from the entry for 'the'.
WordShape = WordEmbeddings['the'].shape[0]
WordShape

100

In [12]:
def SentenceVector(sentences, WordShape, Embeddings = None):
  '''
  Build one vector per sentence by averaging the embeddings of its words.

  Inputs: List of sentence strings (words separated by whitespace)
          Length of a single word vector
          Optional word -> vector mapping; the global WordEmbeddings is used
          when it is omitted
  Output: List holding one array of length WordShape per sentence
  '''
  if Embeddings is None:
    Embeddings = WordEmbeddings

  SentenceVecs = []

  for sentence in sentences:
      words = sentence.split()
      if words:
          # Words missing from the mapping contribute a zero vector. The 0.0001
          # in the denominator is kept so results match earlier runs.
          WordVec = sum([Embeddings.get(word, np.zeros((WordShape,))) for word in words]) / (len(words) + 0.0001)
      else:
          # Empty or whitespace-only sentence: return an array here too, never
          # the plain number that summing an empty list would give.
          WordVec = np.zeros((WordShape,))

      SentenceVecs.append(WordVec)

  return SentenceVecs

In [13]:
def SimMatrix(sentences, SentenceVector, WordShape):
  '''
  Create the similarity matrix of a sample from its sentence vectors.

  Inputs: List of article sentences (only its length is used)
          Sentence vectors of the article sentences, in the same order
          Length of a single sentence vector
  Output: Square float matrix with one row / column per sentence holding the
          pairwise cosine similarities; the diagonal is zero
  '''
  Count = len(sentences)

  # Initialize blank similarity matrix of size Count x Count
  SimilarityMatrix = np.zeros([Count, Count])
  if Count == 0:
    return SimilarityMatrix

  # One row per sentence vector; reshape also checks every vector has WordShape entries.
  Stacked = np.asarray(SentenceVector[:Count], dtype = np.float64).reshape(Count, WordShape)

  # A single call returns every pairwise similarity at once instead of one
  # cosine_similarity call per (i, j) pair.
  SimilarityMatrix[:] = cosine_similarity(Stacked)

  # A sentence is not compared with itself.
  np.fill_diagonal(SimilarityMatrix, 0)

  return SimilarityMatrix

In [14]:
def GenerateSummary(sample, SummaryVariable = 10, WordShape = 100):
  '''
  Generate a TextRank summary of a sample article given as a list of sentences.

  The sentences are cleaned, turned into vectors, compared pairwise and ranked
  with PageRank. The highest-scoring cleaned sentences are joined with ", ",
  the first one is capitalized and a final "." is appended.

  Inputs: Sample article consisting of a list of sentences
          Number of sentences to keep (an exact count, capped at the number of
          sentences in the sample)
          Length of a single word vector
  Output: Summary string; "." when no sentence is selected
  '''
  # Nothing to rank: same text the joining step yields for zero sentences.
  if len(sample) == 0:
    return "."

  # Retrieve cleaned sentences of sample
  sentences = ReformatSample(sample)

  # Retrieve sentence vectors built from the word vectors
  SentenceVecs = SentenceVector(sentences, WordShape)

  # Retrieve similarity matrix
  SimilarityMatrix = SimMatrix(sentences, SentenceVecs, WordShape)

  # Rank each sentence
  NXGraph = nx.from_numpy_array(SimilarityMatrix)
  # pagerank_numpy was removed in networkx 3.0. Use it where it still exists so
  # existing environments keep their scores, otherwise fall back to
  # nx.pagerank (scores may differ slightly between the two).
  PageRank = getattr(nx, 'pagerank_numpy', nx.pagerank)
  scores = PageRank(NXGraph)
  RankedSentences = sorted(((scores[index], sentence) for index, sentence in enumerate(sentences)), reverse = True)

  # Clamp the requested count to what is available (and to zero from below, so
  # a negative request selects nothing).
  SummaryLength = max(0, min(SummaryVariable, len(sentences)))

  # NOTE(review): the summary is assembled from the cleaned sentences
  # (lower-cased, stop words removed), not from the original text in `sample`.
  Chosen = [sentence for _, sentence in RankedSentences[:SummaryLength]]
  if Chosen:
    Chosen[0] = Chosen[0].capitalize()

  return ", ".join(Chosen) + "."

In [15]:
# Summarize every sample of the dataset.
# The vector length comes from WordShape instead of being repeated as a literal,
# so it always matches the embeddings that were actually loaded.
Summaries = []
for sample in tqdm(dataset[0]):
  Summaries.append(GenerateSummary(sample, SummaryVariable = 10, WordShape = WordShape))

100%|██████████| 5502/5502 [32:04<00:00,  2.86it/s]


# Evaluation

In [16]:
# Ground truth
GTdataset = pd.DataFrame(data['test'])

# For every row, column 1 holds one flag per sentence of column 0. Sentences
# flagged with 1 are kept in order, the first is capitalized, and they are
# joined with ", ". A row without any flagged sentence gives "".
GT = []
for _, sample in GTdataset.iterrows():
  chosen = [sample[0][position] for position, flag in enumerate(sample[1]) if flag == 1]
  if chosen:
    chosen[0] = chosen[0].capitalize()
  GT.append(", ".join(chosen))

In [17]:
def Evaluation(index, GroundTruth, Predicted):
  '''
  Score one predicted summary against its ground-truth summary with ROUGE.

  Inputs: Position of the sample in both lists
          List of ground-truth (reference) summaries
          List of predicted (hypothesis) summaries
  Output: Tuple of the ROUGE-1, ROUGE-2 and ROUGE-L F-scores
  '''
  RougeScore = rouge.Rouge()
  # get_scores takes the hypothesis first and the reference second.
  scores = RougeScore.get_scores(Predicted[index], GroundTruth[index], avg = True)

  return scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']

def EvaluationAvg(GroundTruth, Predicted):
  '''
  Print the mean ROUGE-1, ROUGE-2 and ROUGE-L F-scores over all samples,
  followed by the mean over all three score lists taken together.

  Inputs: List of ground-truth summaries
          List of predicted summaries, indexed in step with the ground truth
  Output: None; the four averages are printed rounded to two decimals
  '''
  Rouge1Scores, Rouge2Scores, RougeLScores = [], [], []

  # One Evaluation call per position of the ground-truth list.
  for position in range(len(GroundTruth)):
    unigram, bigram, longest = Evaluation(position, GroundTruth, Predicted)
    Rouge1Scores.append(unigram)
    Rouge2Scores.append(bigram)
    RougeLScores.append(longest)

  Rouge1Mean = round(np.mean(Rouge1Scores), 2)
  print(f"Rouge-1 Score: {Rouge1Mean}")
  Rouge2Mean = round(np.mean(Rouge2Scores), 2)
  print(f"Rouge-2 Score: {Rouge2Mean}")
  RougeLMean = round(np.mean(RougeLScores), 2)
  print(f"Rouge-l Score: {RougeLMean}")
  OverallMean = round(np.mean([Rouge1Scores, Rouge2Scores, RougeLScores]), 2)
  print(f"Average Rouge: {OverallMean}")

In [18]:
# Print the averaged ROUGE F-scores of the generated summaries against the ground truth.
EvaluationAvg(GT, Summaries)

Rouge-1 Score: 0.23
Rouge-2 Score: 0.05
Rouge-l Score: 0.22
Average Rouge: 0.17


# Individual sample of execution

In [19]:
%%time
# Clean the sentences of one sample (entry 1075 of column 0) for the
# step-by-step walkthrough in the cells below.

sentences = ReformatSample(dataset[0][1075])

CPU times: user 17.7 ms, sys: 4.22 ms, total: 21.9 ms
Wall time: 21.4 ms


In [20]:
# Second name bound to the same list object; no copy is made here.
orisentences = sentences

In [21]:
# One vector per cleaned sentence of the sample.
SentenceVecs = SentenceVector(sentences, WordShape)

In [22]:
# Initialize the similarity matrix
# Square and zero-filled, with one row and one column per sentence.
SimilarityMatrix = np.zeros([len(orisentences), len(orisentences)])
SimilarityMatrix.shape

(76, 76)

In [23]:
# Display the cleaned sentences of the sample.
sentences

['tricky learn new pronunciation familiar letters remember additional letters becoming confident malay alphabet first step able count malay',
 'alphabet pronunciation important role malay want learn thoroughly possible attempting master malay words',
 'letters malay alphabet pronounced father b pronounced bay c pronounced ch chay pronounced day e pronounced elephant f pronounced fine g pronounced gold h pronounced house',
 'pronounced ee meat j pronounced job k pronounced kitchen l pronounced life pronounced man',
 'n pronounced nice pronounced olive p pronounced pool q pronounced kiss r pronounced rice pronounced smile pronounced time u pronounced oo mood v pronounced f free w pronounced wind',
 'x pronounced wax pronounced year z pronounced zulu ng pronounced hanging ny prounced ana kh pronounced bach sy pronounced shield nng pronounced bingo possible speak native malay speaker simply listen talk',
 'help detect way intonates speech general rhythm malay language',
 'help learn speak 

In [24]:
# Fill the matrix with the pairwise cosine similarities by reusing SimMatrix
# instead of repeating its loop here with a hard-coded vector length.
SimilarityMatrix = SimMatrix(orisentences, SentenceVecs, WordShape)

print(SimilarityMatrix.shape)
SimilarityMatrix

(76, 76)


array([[ 0.        ,  0.93819743,  0.71694201, ...,  0.02591483,
         0.05465572,  0.41040376],
       [ 0.93819743,  0.        ,  0.66353124, ..., -0.00560175,
         0.04859041,  0.37325883],
       [ 0.71694201,  0.66353124,  0.        , ...,  0.10681148,
         0.15491981,  0.34874249],
       ...,
       [ 0.02591483, -0.00560175,  0.10681148, ...,  0.        ,
         0.53405246,  0.38106482],
       [ 0.05465572,  0.04859041,  0.15491981, ...,  0.53405246,
         0.        ,  0.39142645],
       [ 0.41040376,  0.37325883,  0.34874249, ...,  0.38106482,
         0.39142645,  0.        ]])

In [25]:
NXGraph = nx.from_numpy_array(SimilarityMatrix)
# pagerank_numpy was removed in networkx 3.0. Use it where it still exists,
# otherwise fall back to nx.pagerank (scores may differ slightly between the two).
scores = getattr(nx, 'pagerank_numpy', nx.pagerank)(NXGraph)

In [26]:
# Pair every sentence with its score and sort with the highest score first
# (tuples are compared, so equal scores fall back to comparing the sentence text).
RankedSentences = sorted(((scores[index], sentence) for index, sentence in enumerate(orisentences)), reverse = True)

In [27]:
SummaryLength = 10

# Generate summary
# Slicing keeps this safe for a sample with fewer than SummaryLength sentences,
# where indexing RankedSentences[i] directly would raise IndexError.
TopSentences = [sentence for _, sentence in RankedSentences[:SummaryLength]]
if TopSentences:
  TopSentences[0] = TopSentences[0].capitalize()

GeneratedSummary = ", ".join(TopSentences) + "."
print(GeneratedSummary)

Sepuluh pronounced seh poo loo, tiga pronounced tee guh, empat pronounced um paht, dua pronounced doo uhh, lima pronounced lee muh, sembilan pronounced sem bee lan, enam pronounced uhh nom, satu pronounced sat, tujuh pronounced jew, pronounced ee meat j pronounced job k pronounced kitchen l pronounced life pronounced man.
