In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from rouge import Rouge

Process the dataset

In [2]:
# Notebook-wide pandas display settings.
# The column width used to be set twice (100, then 20); only the last value
# took effect, so the single effective value is kept here.
# NOTE: max_rows=None removes the row limit, so displaying a whole frame prints
# every row -- always preview with .head() on this dataset.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.expand_frame_repr', False)

In [3]:
# Read wikihowAll.csv into a frame and report how many rows it holds.
wikihowAll_data = pd.read_csv("wikihowAll.csv")
print("The dataset's size is ", wikihowAll_data['headline'].shape[0])

The dataset's size is  215365


In [4]:
# Preview the first four rows of the raw frame (headline / title / text).
wikihowAll_data.head(4)

Unnamed: 0,headline,title,text
0,\nKeep related s...,How to Be an Org...,If you're a pho...
1,\nCreate a sketc...,How to Create a ...,See the image f...
2,\nGet a bachelor...,How to Be a Visu...,It is possible ...
3,\nStart with som...,How to Become an...,The best art in...


In [5]:
# Discard rows missing a headline, title or text, then renumber the rows so
# that index labels coincide with positions.
wikihowAll_data_clean = (
    wikihowAll_data
    .dropna(subset=['headline', 'title', 'text'])
    .reset_index(drop=True)
)
# Collect the labels of articles whose text has fewer than five sentences.
index = [
    row_label
    for row_label, article in enumerate(wikihowAll_data_clean['text'])
    if len(nltk.sent_tokenize(article)) < 5
]
# Remove those short articles and any exact duplicate rows, then renumber again.
wikihowAll_data_clean = (
    wikihowAll_data_clean
    .drop(index)
    .drop_duplicates()
    .reset_index(drop=True)
)
print("The dataset's size is ", len(wikihowAll_data_clean['headline']))

The dataset's size is  185892


In [6]:
# Sequential (unshuffled) 70 / 15 / 15 split; the test part takes whatever is
# left after rounding the train and dev sizes down.
num_rows = len(wikihowAll_data_clean)

train_rows = int(num_rows * 0.70)
dev_rows = int(num_rows * 0.15)
test_rows = num_rows - train_rows - dev_rows

# Each part gets a fresh 0-based index.
train_data = wikihowAll_data_clean.iloc[:train_rows].reset_index(drop=True)
dev_data = wikihowAll_data_clean.iloc[train_rows:train_rows + dev_rows].reset_index(drop=True)
test_data = wikihowAll_data_clean.iloc[train_rows + dev_rows:].reset_index(drop=True)

In [7]:
# Number of articles held out for testing.
len(test_data)

27885

Lead-N baseline

In [8]:
def lead_N_summary(text):
    """Build a summary from the opening sentence of every paragraph.

    Paragraphs are the chunks of ``text`` separated by a blank line. The first
    sentence of each non-empty paragraph is kept, and the kept sentences are
    joined with single spaces. When no paragraph yields a sentence at all, the
    first three lines of ``text`` are joined instead.
    """
    tokenized_paragraphs = (nltk.sent_tokenize(chunk) for chunk in text.split("\n\n"))
    leading = [parts[0] for parts in tokenized_paragraphs if parts]

    if not leading:
        # Fallback: no sentence was found anywhere, so use the first lines.
        leading = text.split("\n")[:3]

    return " ".join(leading)

In [28]:
# One Lead-N summary per test article, in the same order as test_data.
lead_N_summarization = [lead_N_summary(article) for article in test_data['text']]

In [29]:
# Gold summaries: the 'headline' column of the test split, as a plain list in
# row order so it lines up index-for-index with the generated summaries.
reference_summarization = test_data['headline'].tolist()
len(reference_summarization)

27885

In [30]:
# Score the Lead-N summaries against the headlines. avg=True asks for a single
# set of scores aggregated over all pairs instead of one entry per pair.
# NOTE(review): generated summaries are passed first and references second --
# confirm this matches the argument order expected by rouge's get_scores.
rouge = Rouge()
Lead_N_scores = rouge.get_scores(lead_N_summarization, reference_summarization, avg=True)
Lead_N_scores

{'rouge-1': {'f': 0.25997837365051885,
  'p': 0.222851777015474,
  'r': 0.42541135936903135},
 'rouge-2': {'f': 0.06437508050099851,
  'p': 0.05451236379524374,
  'r': 0.10827056042030156},
 'rouge-l': {'f': 0.22753192699108923,
  'p': 0.18790073392745946,
  'r': 0.36854726963371465}}

TextRank baseline

In [12]:
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np

# English stop-word list; TextRank_summary filters these words out of every
# sentence before embedding it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally -- confirm it is downloaded in the target environment.
stop_words = stopwords.words('english')

In [13]:
# Load the GloVe vectors into a {word: float vector} dictionary.
glove_file = "glove.6B.50d.txt"
word_embeddings = {}
with open(glove_file, 'r', encoding='utf-8') as glove_handle:
    for raw_line in glove_handle:
        fields = raw_line.strip().split()
        # The first field is the token; the remaining fields are its vector components.
        word_embeddings[fields[0]] = np.asarray(fields[1:]).astype("float")

In [14]:
# Sanity check: every vector loaded from glove.6B.50d.txt should be 50-dimensional.
word_embeddings['the'].shape

(50,)

In [22]:
def TextRank_summary(text, num_sentence):
    """Summarise ``text`` with its ``num_sentence`` highest-ranked sentences.

    Every sentence is reduced to lower-case letters, stripped of stop words and
    represented by the mean of its word vectors from ``word_embeddings``
    (unknown words count as zero vectors; a sentence with no remaining words
    is a zero vector). The sentences form a graph whose edge weights are the
    pairwise cosine similarities, and PageRank scores the nodes.

    Args:
        text: The document to summarise.
        num_sentence: Maximum number of sentences to keep. When the document
            has fewer sentences, all of them are returned.

    Returns:
        The selected original sentences joined by single spaces, highest score
        first. An empty string when ``text`` contains no sentence or
        ``num_sentence`` is not positive.

    Raises:
        networkx.PowerIterationFailedConvergence: if PageRank does not
            converge within 500 iterations.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Nothing to rank.
        return ""

    # Read the vector size from the loaded embeddings (50 for glove.6B.50d)
    # rather than hard-coding it, so other embedding sizes work unchanged.
    embedding_dim = len(next(iter(word_embeddings.values()), np.zeros(50)))
    zero_vector = np.zeros((embedding_dim,))
    stop_word_set = set(stop_words)  # constant-time membership tests

    sentence_vectors = []
    for sentence in sentences:
        # Keep letters only (drops punctuation and digits), then lower-case.
        letters_only = re.sub('[^a-zA-Z]', ' ', sentence).lower()

        # Tokenise once and drop the stop words; the surviving tokens are used
        # directly instead of being joined and tokenised a second time.
        tokens = [word for word in nltk.word_tokenize(letters_only) if word not in stop_word_set]

        if tokens:
            sentence_vector = np.mean([word_embeddings.get(word, zero_vector) for word in tokens], axis=0)
        else:
            sentence_vector = zero_vector
        sentence_vectors.append(sentence_vector)

    # All pairwise cosine similarities in one call; a sentence is not linked
    # to itself, so the diagonal is cleared.
    similarity_mat = cosine_similarity(np.vstack(sentence_vectors))
    np.fill_diagonal(similarity_mat, 0.0)

    sentence_graph = nx.from_numpy_array(similarity_mat)
    sentence_scores = nx.pagerank(sentence_graph, max_iter=500, tol=1e-5, nstart={node: 1.0 for node in sentence_graph.nodes()})

    # Highest score first. The sort is stable, so sentences with equal scores
    # keep their document order instead of being ordered by their text.
    ranked_positions = sorted(range(len(sentences)), key=lambda position: sentence_scores[position], reverse=True)

    # Never ask for more sentences than exist. The earlier len - 1 cap dropped
    # one sentence too many and returned nothing for a one-sentence text.
    keep = max(0, min(num_sentence, len(ranked_positions)))
    summary = " ".join(sentences[position] for position in ranked_positions[:keep])

    return summary


In [None]:
# PageRank's power iteration can fail to converge for some documents. Such a
# document cannot simply be skipped: the summaries must stay aligned one-to-one
# with reference_summarization for ROUGE scoring. Its leading sentences are
# used as the summary instead, so one failure no longer aborts the whole run.
TextRank_summarization = []
num_sentence = 3
for text in test_data['text']:
    try:
        summary = TextRank_summary(text, num_sentence)
    except nx.PowerIterationFailedConvergence:
        summary = " ".join(nltk.sent_tokenize(text)[:num_sentence])
    TextRank_summarization.append(summary)
len(TextRank_summarization)
        

In [26]:
# Score the TextRank summaries against the same headlines used for the Lead-N
# baseline; avg=True asks for scores aggregated over all pairs.
# NOTE(review): both lists must have the same length and order -- confirm
# TextRank_summarization holds one entry per row of test_data.
rouge = Rouge()
TextRank_scores = rouge.get_scores(TextRank_summarization, reference_summarization, avg=True)
TextRank_scores

{'rouge-1': {'f': 0.2370048915839522,
  'p': 0.22073572879888995,
  'r': 0.3434142197247943},
 'rouge-2': {'f': 0.0551547797639333,
  'p': 0.05056655469616582,
  'r': 0.0831455867890583},
 'rouge-l': {'f': 0.21716238793240872,
  'p': 0.18952708684209132,
  'r': 0.32194096409460754}}