# Import libraries

In [1]:
import numpy as np
import pandas as pd
import nltk
# Fetch the Punkt tokenizer data that nltk's sentence tokenizer needs.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load data in Colab

In [2]:
# Colab-specific upload step; this cell only works inside Google Colab.
# NOTE(review): presumably `uploaded` maps each uploaded filename to its raw
# content (later cells call .keys() and len(uploaded[name])) — confirm against
# the Colab documentation.
from google.colab import files
uploaded = files.upload()

Saving tennis_articles.csv to tennis_articles.csv


## Check uploaded data

In [3]:
# Report every uploaded file together with its size. `file_name` stays bound
# to the last uploaded name once the loop has finished.
for file_name, payload in uploaded.items():
  size_in_bytes = len(payload)
  print(f'Uploaded file : {file_name} with size {size_in_bytes} bytes')

Uploaded file : tennis_articles.csv with size 17253 bytes


## Read data

In [4]:
# Take the filename straight from `uploaded` (last uploaded file) rather than
# depending on a loop variable left behind by an earlier display-only cell,
# so this cell still works when that cell is skipped or removed.
csv_name = list(uploaded.keys())[-1]
# NOTE(review): 'unicode_escape' also interprets backslash sequences inside the
# text — confirm the CSV really needs it rather than e.g. 'latin-1' / 'cp1252'.
df = pd.read_csv(csv_name, encoding='unicode_escape')
df.head()

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP)  Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [17]:
# Show the full text of the article stored under row label 0.
df.loc[0, 'article_text']

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same

In [6]:
from nltk.tokenize import sent_tokenize

In [7]:
# Split every article into sentences and gather them in one flat list,
# preserving article order and sentence order within each article.
sentences = [
  sentence
  for article in df['article_text']
  for sentence in sent_tokenize(article)
]
print(sentences[:5])

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.', "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.", 'I think everyone knows this is my job here.', "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.", "So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match."]


## Download GloVe word embeddings

In [8]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2022-10-18 11:16:33--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-10-18 11:16:33--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-10-18 11:16:33--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

## Extract word vectors

In [9]:
# Build a {word: vector} lookup from the GloVe text file. On each line the
# first whitespace-separated token is the word and the remaining tokens are
# its coefficients, stored as a float32 array.
word_embeddings = {}
# The context manager guarantees the file handle is closed after parsing,
# even if a malformed line raises part-way through.
with open('glove.6B.100d.txt', encoding='utf-8') as glove_file:
  for line in glove_file:
    values = line.split()
    word = values[0]
    coef = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coef
len(word_embeddings)

400000

## Text preprocessing

In [37]:
# Replace everything except ASCII letters with a space, collapse whitespace
# runs, then lowercase. regex=True is passed explicitly: pandas >= 2.0 treats
# the pattern as a literal string by default, which would silently leave the
# text uncleaned. Raw strings avoid the invalid "\s" escape-sequence warning.
clean_sentences = (
  pd.Series(sentences)
  .str.replace(r"[^a-zA-Z]", " ", regex=True)
  .str.replace(r"\s+", " ", regex=True)
  .str.lower()
  .tolist()
)

# Remove stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
# Set copy for constant-time membership checks inside the loop.
stop_word_set = set(stop_words)
clean_text = []
for sent in clean_sentences:
  kept_words = [word for word in sent.split() if word not in stop_word_set]
  # A sentence made up only of stopwords becomes the empty string.
  clean_text.append(" ".join(kept_words))
print(clean_text)

['maria sharapova basically friends tennis players wta tour', 'russian player problems openly speaking recent interview said really hide feelings much', 'think everyone knows job', 'courts court playing competitor want beat every single person whether locker room across net', 'one strike conversation weather know next minutes go try win tennis match', 'pretty competitive girl', 'say hellos sending players flowers well', 'uhm really friendly close many players', 'lot friends away courts', 'said really close lot players something strategic', 'different men tour women tour', '', 'think sport mean friends everyone categorized tennis player going get along tennis players', 'think every person different interests', 'friends completely different jobs interests met different parts life', 'think everyone thinks tennis players greatest friends', 'ultimately tennis small part', 'many things interested', 'also read maria sharapova reveals tennis keeps motivated', 'basel switzerland ap roger federe

  
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Create sentence vectors

In [19]:
# One vector per cleaned sentence: the sum of its word vectors divided by
# (word count + 0.001). Words missing from the embedding table contribute a
# 100-dimensional zero vector; an empty sentence maps to the zero vector.
sentence_vectors = []

for cleaned in clean_text:
  if cleaned:
    tokens = cleaned.split()
    token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
    vector = sum(token_vectors) / (len(tokens) + 0.001)
  else:
    vector = np.zeros((100,))
  sentence_vectors.append(vector)

## Similarity matrix

In [22]:
# Square sentence-by-sentence matrix, initialised to zero. Both dimensions are
# taken from the same list so the matrix is square by construction.
n_sentences = len(sentences)
sim_matrix = np.zeros((n_sentences, n_sentences))

## Cosine similarity

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Pairwise cosine similarity between all sentence vectors, computed with one
# vectorized call on the stacked (n_sentences, 100) matrix instead of one
# scikit-learn call per (i, j) pair.
# The guard keeps the pre-allocated empty matrix when there are no sentences
# (np.vstack rejects an empty list).
if sentence_vectors:
  sim_matrix = cosine_similarity(np.vstack(sentence_vectors))
  # A sentence's similarity with itself is not used: keep the diagonal at zero.
  np.fill_diagonal(sim_matrix, 0.0)


In [25]:
# networkx is imported here, at its first use in the notebook.
import networkx as nx
# Build a graph from the similarity matrix.
# NOTE(review): presumably nodes are the row/column indices and edge weights
# are the matrix entries — confirm against the networkx documentation.
nx_graph = nx.from_numpy_array(sim_matrix)
# PageRank result; the following cell looks scores up by sentence position.
scores = nx.pagerank(nx_graph)


## Summary extraction

In [26]:
# Number of sentences to include in the extracted summary.
TOP_N = 5
# Pair every sentence with its PageRank score and sort best-first
# (ties fall back to comparing the sentence text, descending).
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
# Slicing instead of indexing range(5) avoids an IndexError when the input
# yields fewer than TOP_N sentences.
for _, sentence in ranked_sentences[:TOP_N]:
  print(sentence)