# Extractive Text Summarization using the TextRank algorithm

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import re
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [2]:
# Load the article table. The file is decoded as windows-1252.
df = pd.read_csv(
    "tennis_articles.csv",
    encoding="windows-1252",
)

df

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP) — Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...
5,6,Rafael Nadal: World No 1 ARRIVES for Paris Mas...,Nadal has not played tennis since he was force...,https://www.express.co.uk/sport/tennis/1037119...
6,7,"TENNIS.COM PODCAST: POINT DEFENSE, RANKING DRO...","Tennis giveth, and tennis taketh away. The end...",http://www.tennis.com/pro-game/2018/10/tennisc...
7,8,Tennis journalist’s heartbreaking insight on T...,I PLAYED golf last week with Todd Reid. He pic...,https://www.foxsports.com.au/tennis/tennis-jou...


In [3]:
# Inspect the text of the article stored under row label 7.
df.loc[7, 'article_text']

'I PLAYED golf last week with Todd Reid. He picked me up at 5.30am, half an hour early because he couldn’t sleep. Or hadn’t slept, to be specific. Not because he’d been out on a bender or anything — those days were in the past. The former Wimbledon junior champion was full of hope, excited about getting his life back together after a troubled few years and a touch-and-go battle with pancreatitis. “I’m pleased with that,” he said after grinding out an eight-over-par front nine at the not-so-royal Northbridge club in Sydney and smashing down an egg- and-bacon roll at the halfway house. To most players of his rare sporting gifts, such a modest return would be unacceptable. To Reid the 15-marker, just being up and at ‘em was enough; a few bogeys and one well-made par — broomstick putter and all — vindication for his recent decision to renew his membership at nearby Bankstown. Exhausted after spending half his round deep in the bushes searching for my ball, as well as those of two other gol

## Text pre-processing

In [4]:
# tokenize into sentences
#
# The column is overwritten in place, so this cell guards on the value type:
# strings are split into a list of sentences, anything else (e.g. the lists
# produced by a previous run of this cell) is passed through unchanged.
# Without the guard, re-running the cell hands lists to sent_tokenize.

df['article_text'] = df['article_text'].apply(
    lambda text: sent_tokenize(text) if isinstance(text, str) else text
)

df

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",[Maria Sharapova has basically no friends as t...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"[BASEL, Switzerland (AP) — Roger Federer advan...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,[Roger Federer has revealed that organisers of...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,[Kei Nishikori will try to end his long losing...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"[Federer, 37, first broke through on tour over...",https://www.express.co.uk/sport/tennis/1036101...
5,6,Rafael Nadal: World No 1 ARRIVES for Paris Mas...,[Nadal has not played tennis since he was forc...,https://www.express.co.uk/sport/tennis/1037119...
6,7,"TENNIS.COM PODCAST: POINT DEFENSE, RANKING DRO...","[Tennis giveth, and tennis taketh away., The e...",http://www.tennis.com/pro-game/2018/10/tennisc...
7,8,Tennis journalist’s heartbreaking insight on T...,"[I PLAYED golf last week with Todd Reid., He p...",https://www.foxsports.com.au/tennis/tennis-jou...


In [5]:
# Flatten the per-article sentence lists into a single Series (one sentence
# per row) and replace the repeated article index with a fresh 0..n-1 index.

sentences = df['article_text'].explode().reset_index(drop=True)

sentences

0      Maria Sharapova has basically no friends as te...
1      The Russian player has no problems in openly s...
2            I think everyone knows this is my job here.
3      When I'm on the courts or when I'm on the cour...
4      So I'm not the one to strike up a conversation...
                             ...                        
125    A big fan and believer in the enigmatic Canber...
126    “Full effort he wouldn’t fail,” Reid said of K...
127    “Full effort Nick could live out his tennis li...
128    “Time will tell for Nick, but he’s still young...
129                        Tragically, his time ran out.
Name: article_text, Length: 130, dtype: object

In [6]:
# Show one full sentence (row label 128) without the truncation of the Series repr.
print(sentences.loc[128])

“Time will tell for Nick, but he’s still young.” Todd Reid was still young, too.


In [7]:
# English stop words with every ASCII apostrophe removed, so that an entry
# such as "you're" becomes "youre".
stop = [word.replace("'", "") for word in stopwords.words('english')]

print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', '

In [8]:
def pre_process(text, stop_words=None):
    """Normalize one sentence into a space-separated string of content words.

    Steps, in order:
      1. commas and periods become spaces, then whitespace runs are collapsed;
      2. every character other than an ASCII letter or a space is deleted;
      3. the text is lower-cased;
      4. words found in the stop-word collection are dropped.

    Parameters
    ----------
    text : str
        The raw sentence.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``stop`` list is used,
        which keeps existing ``pre_process(text)`` calls working unchanged.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if none remain).
    """
    if stop_words is None:
        stop_words = stop
    # Hash-based lookup instead of scanning a list once per word.
    blocked = frozenset(stop_words)

    # Turn ',' and '.' into spaces first so words on either side are not
    # glued together when punctuation is deleted below.
    text = re.sub(r'[,.]', ' ', text)
    text = re.sub(r'[\s]+', ' ', text)

    # remove punctuations, numbers and special characters
    # NOTE(review): hyphens are deleted rather than replaced by a space, so
    # "touch-and-go" becomes "touchandgo" - confirm this is intended.
    text = re.sub(r'[^a-zA-Z ]', '', text)

    # make alphabets lowercase
    text = text.lower()

    # remove stopwords; split() also discards any leftover repeated spaces
    return ' '.join(word for word in text.split() if word not in blocked)

# Clean every sentence element-wise; the result keeps the index of `sentences`.
proc_sentences = sentences.map(pre_process)

proc_sentences

0      maria sharapova basically friends tennis playe...
1      russian player problems openly speaking recent...
2                               think everyone knows job
3      im courts im court playing im competitor want ...
4      im one strike conversation weather know next m...
                             ...                        
125    big fan believer enigmatic canberran reid want...
126                   full effort fail reid said kyrgios
127    full effort nick could live tennis like tomas ...
128    time tell nick hes still young todd reid still...
129                                  tragically time ran
Name: article_text, Length: 130, dtype: object

## Pre-trained Wikipedia 2014 + Gigaword 5 GloVe vectors

In [9]:
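# One-time setup (uncomment to run): download the GloVe 6B vectors and extract
# them into glove.6B/, the directory the loading cell below reads from.
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove*.zip -d glove.6B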

In [10]:
# extract word embeddings
#
# Each line of the file is parsed as whitespace-separated fields: the first
# field is the word, the remaining fields are its vector coefficients.

word_embeddings = {}

# The context manager closes the file even if parsing a line raises.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

len(word_embeddings)

400000

In [11]:
# Spot-check: display the vector stored for the word 'fan' (None if the word is absent).
word_embeddings.get('fan')

array([ 0.17379  ,  0.25769  ,  0.024536 , -0.92853  , -0.13053  ,
        0.92387  ,  0.25318  , -0.63233  ,  0.23499  ,  0.0075372,
       -0.77793  , -0.31312  ,  0.058652 , -0.67262  , -0.11301  ,
       -0.46862  ,  0.33277  , -0.12727  ,  0.053753 ,  0.25048  ,
        0.20814  ,  0.16397  , -0.79137  , -0.1584   ,  0.87201  ,
        0.027197 ,  0.031539 ,  0.063152 ,  0.6791   ,  0.33141  ,
       -0.32604  ,  0.88533  ,  0.8044   ,  0.86361  ,  0.11556  ,
       -0.031355 , -0.012902 ,  0.56023  , -0.20579  , -0.33475  ,
        0.040393 , -0.30971  ,  0.24828  , -0.58003  ,  0.83202  ,
       -0.51174  , -0.45455  , -0.28548  ,  0.94183  , -0.66331  ,
        0.4223   , -0.28905  , -0.10244  ,  0.37231  ,  0.53467  ,
       -1.5937   ,  0.15855  ,  0.52149  ,  0.95929  ,  0.7983   ,
        0.20485  ,  0.71883  , -0.78628  ,  0.59022  ,  0.61297  ,
       -0.054023 ,  0.67273  , -0.063905 , -0.02016  ,  0.44583  ,
        0.47405  , -0.12135  ,  0.27213  , -0.52859  , -0.6310

## Vector Representation of Sentences

In [12]:
# Build one 100-element vector per pre-processed sentence: add up the vectors
# of its words and divide by (word count + 0.001).
sent_vectors = []

for sentence in proc_sentences:
    if len(sentence) == 0:
        # Nothing survived pre-processing: represent the sentence by zeros.
        vec = np.zeros((100,))
    else:
        tokens = sentence.split()
        # Words missing from word_embeddings contribute a zero vector but are
        # still counted in the denominator.
        total = sum(word_embeddings.get(tok, np.zeros((100,))) for tok in tokens)
        vec = total / (len(tokens) + 0.001)

    sent_vectors.append(vec)

## Similarity Matrix

In [13]:
# initialize similarity matrix

n = len(proc_sentences)
M = np.zeros([n, n])

# Use cosine similarity between every pair of sentence vectors. All pairs are
# computed in one call on the stacked (n, dim) matrix instead of n*n separate
# single-pair calls, and no vector length is hard-coded.

if n > 0:
    M[:] = cosine_similarity(np.vstack(sent_vectors))
    # A sentence is not compared with itself: keep the diagonal at 0.
    np.fill_diagonal(M, 0.0)

In [14]:
# Display the sentence-to-sentence similarity matrix.
M

array([[0.        , 0.64378345, 0.5915699 , ..., 0.83445835, 0.6756162 ,
        0.56470358],
       [0.64378345, 0.        , 0.83267677, ..., 0.75498432, 0.83289951,
        0.72113991],
       [0.5915699 , 0.83267677, 0.        , ..., 0.69285244, 0.83229935,
        0.66373771],
       ...,
       [0.83445835, 0.75498432, 0.69285244, ..., 0.        , 0.77146745,
        0.67270464],
       [0.6756162 , 0.83289951, 0.83229935, ..., 0.77146745, 0.        ,
        0.74560148],
       [0.56470358, 0.72113991, 0.66373771, ..., 0.67270464, 0.74560148,
        0.        ]])

## Creating the graph and applying the PageRank algorithm

In [15]:
# Nodes of the graph represent the sentences and the edges carry the similarity scores between the sentences.
# On this graph apply the PageRank algorithm to get sentence rankings.

# Build the graph from the similarity matrix M.
nx_graph = nx.from_numpy_array(M)
# `scores` is indexed by node below (scores[i]); alpha=0.8 is passed as the PageRank damping parameter.
scores = nx.pagerank(nx_graph, alpha=0.8)

## Summary Extraction

In [16]:
# Pair every original (unprocessed) sentence with its PageRank score and sort
# from highest to lowest score.
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

# Extract top 10 sentences as the summary.
# Slicing (rather than indexing 0..9) prints fewer lines instead of raising
# IndexError when there are fewer than 10 sentences.
for _, sentence in ranked_sentences[:10]:
    print(sentence)

“I was on a nice trajectorythen,” Reid recalled.“If I hadn’t got sick, I think I could have started pushing towards the second week at the slams and then who knows.” Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
“I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.
Speaking at the Swiss Indoors tournament where he will play in Sunday’s final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opte