In [1]:
# Standard library first, then third-party packages.
import re

import nltk
import numpy as np
import pandas as pd

# Fetch the 'punkt' tokenizer package (only needs to run once per environment).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sunnyyang/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Load the articles table from the local CSV file.
# NOTE(review): the 'unicode_escape' codec also interprets backslash sequences
# found in the file. Article text shown in later outputs is missing apostrophes
# and dashes, so the file may actually be Windows-1252 encoded -- confirm
# against the data file before switching the encoding.
df = pd.read_csv(
    "data/tennis_articles.csv",
    encoding="unicode_escape",
)

In [6]:
# Display the loaded articles table to check its columns and rows.
df

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP)  Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...
5,6,Rafael Nadal: World No 1 ARRIVES for Paris Mas...,Nadal has not played tennis since he was force...,https://www.express.co.uk/sport/tennis/1037119...
6,7,"TENNIS.COM PODCAST: POINT DEFENSE, RANKING DRO...","Tennis giveth, and tennis taketh away. The end...",http://www.tennis.com/pro-game/2018/10/tennisc...
7,8,Tennis journalists heartbreaking insight on T...,I PLAYED golf last week with Todd Reid. He pic...,https://www.foxsports.com.au/tennis/tennis-jou...


In [7]:
# Peek at the text of the article stored under row label 0.
df.loc[0, 'article_text']

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same

In [9]:
from nltk.tokenize import sent_tokenize

# Split every article into sentences and collect them in one flat list,
# keeping the articles (and the sentences within each article) in order.
sentences = [
    sentence
    for article in df['article_text']
    for sentence in sent_tokenize(article)
]

In [11]:
# Total number of sentences extracted from all articles.
len(sentences)

130

In [13]:
# Replace every character that is not an ASCII letter with a space, then
# lowercase the result.
# regex=True must be passed explicitly: since pandas 2.0 the default is
# regex=False, which would search for the literal text "[^a-zA-Z]" and leave
# punctuation and digits in place.
clean_sentences = (
    pd.Series(sentences)
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .str.lower()
    .tolist()
)

In [15]:
# Fetch the NLTK 'stopwords' package (needed before stopwords.words() can be used).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sunnyyang/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
from nltk.corpus import stopwords
# English stopword list; read as a notebook-level global by remove_stopwords().
stop_words = stopwords.words('english')

In [17]:
def remove_stopwords(sen, stop_list=None):
    """Join the tokens of ``sen`` that are not stopwords into one string.

    Args:
        sen: Iterable of word tokens (e.g. the result of ``str.split()``).
        stop_list: Optional iterable of words to drop. When omitted, the
            notebook-level ``stop_words`` list is used, exactly as before.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list once per token.
    blocked = set(stop_words if stop_list is None else stop_list)
    return " ".join(token for token in sen if token not in blocked)

In [18]:
# Drop stopwords from each cleaned sentence; tokens are obtained by splitting
# on whitespace and re-joined by remove_stopwords().
clean_sentences = [
    remove_stopwords(sentence.split())
    for sentence in clean_sentences
]

In [19]:
# Build a {word: vector} lookup from the GloVe text file.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('data/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        # First whitespace-separated field is the word; the rest are its
        # coefficients, stored as a float32 array.
        values = line.split()
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

In [24]:
# One 100-dimensional vector per cleaned sentence.
sentence_vectors = []

for sentence in clean_sentences:
    if len(sentence) == 0:
        # Nothing left after cleaning: use an all-zero vector.
        vector = np.zeros((100,))
    else:
        words = sentence.split()
        # Words missing from the embedding table contribute a zero vector.
        word_vectors = [word_embeddings.get(w, np.zeros((100,))) for w in words]
        # Sum divided by (word count + 0.001), i.e. a slightly damped average.
        vector = sum(word_vectors) / (len(words) + 0.001)

    sentence_vectors.append(vector)

In [27]:
# Square matrix of zeros with one row and one column per sentence; it is
# filled with pairwise similarities afterwards.
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Fill sim_mat with the cosine similarity of every pair of sentence vectors.
# A single call on the stacked (n_sentences, 100) matrix replaces one
# scikit-learn call per (i, j) pair.
if sentence_vectors:
    sim_mat[:] = cosine_similarity(np.vstack(sentence_vectors))

    # A sentence is not compared with itself: the diagonal stays zero, as it
    # did when the nested loop skipped i == j.
    np.fill_diagonal(sim_mat, 0.0)
    

In [32]:
# Inspect the filled similarity matrix (zero diagonal, symmetric values expected).
sim_mat

array([[0.        , 0.64378333, 0.5915699 , ..., 0.83445835, 0.67481112,
        0.56470358],
       [0.64378333, 0.        , 0.83267683, ..., 0.75498432, 0.8464365 ,
        0.72113991],
       [0.5915699 , 0.83267683, 0.        , ..., 0.6928525 , 0.83097279,
        0.66373777],
       ...,
       [0.83445835, 0.75498432, 0.6928525 , ..., 0.        , 0.78325516,
        0.67270464],
       [0.67481112, 0.8464365 , 0.83097279, ..., 0.78325516, 0.        ,
        0.75144422],
       [0.56470358, 0.72113991, 0.66373777, ..., 0.67270464, 0.75144422,
        0.        ]])

In [33]:
import networkx as nx

In [34]:
# Build a networkx graph from the similarity matrix, which is passed as the
# adjacency matrix (one node per sentence).
nx_graph = nx.from_numpy_array(sim_mat)

In [37]:
# Run PageRank on the sentence graph; the result maps node index -> score.
scores = nx.pagerank(nx_graph)

In [38]:
# Inspect the PageRank score assigned to each sentence index.
scores

{0: 0.007346221090050093,
 1: 0.007831539913977895,
 2: 0.007188734007130622,
 3: 0.00820147673730496,
 4: 0.008452754199868451,
 5: 0.006900753545749467,
 6: 0.007457164730002992,
 7: 0.0077519974660682466,
 8: 0.007573027579152764,
 9: 0.00790787040590474,
 10: 0.007548487198523171,
 11: 0.00116144201139666,
 12: 0.008092102851545645,
 13: 0.007405939530541618,
 14: 0.007497863333497912,
 15: 0.00771883135135532,
 16: 0.007830369888036122,
 17: 0.007165460007728705,
 18: 0.007497010554996814,
 19: 0.007309036865282253,
 20: 0.007624419927443686,
 21: 0.008048153183624654,
 22: 0.008106134367330204,
 23: 0.006742385779599381,
 24: 0.007457504934892267,
 25: 0.008193202871220438,
 26: 0.007779816052041085,
 27: 0.006086319949642691,
 28: 0.007512104336266764,
 29: 0.00813956571526167,
 30: 0.008285806531440799,
 31: 0.008323071559607285,
 32: 0.0084467531291671,
 33: 0.008277174979045392,
 34: 0.006622587283472015,
 35: 0.008354391561425204,
 36: 0.008332115014628775,
 37: 0.0069976592

In [39]:
# Pair each sentence with its PageRank score, then order the pairs from the
# highest score to the lowest.
scored_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
ranked_sentences = sorted(scored_sentences, reverse=True)

In [41]:
# Print the ten highest-ranked sentences as the summary. Slicing (rather than
# indexing 0..9) avoids an IndexError when fewer than ten sentences exist.
for _, sentence in ranked_sentences[:10]:
    print(sentence)

I was on a nice trajectorythen, Reid recalled.If I hadnt got sick, I think I could have started pushing towards the second week at the slams and then who knows. Duringa comeback attempt some five years later, Reid added Bernard Tomic and 2018 US Open Federer slayer John Millman to his list of career scalps.
Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.
So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.
Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.
Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in London next month.
Exhau