In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Load the Dataset
col_to_use = ['article_id','article_title','article_text']
df = pd.read_csv("tennis_articles.csv",usecols=col_to_use)
df

Unnamed: 0,article_id,article_title,article_text
0,1,"I do not have friends in�tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP) � Roger Federer advanc..."
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ..."
5,6,Rafael Nadal: World No 1 ARRIVES for Paris Mas...,Nadal has not played tennis since he was force...
6,7,"TENNIS.COM PODCAST: POINT DEFENSE, RANKING DRO...","Tennis giveth, and tennis taketh away. The end..."
7,8,Tennis journalist�s heartbreaking insight on T...,I PLAYED golf last week with Todd Reid. He pic...


In [3]:
df.head()

Unnamed: 0,article_id,article_title,article_text
0,1,"I do not have friends in�tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP) � Roger Federer advanc..."
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ..."


In [4]:
df.shape

(8, 3)

### Text Summarisation

In [5]:
text = df['article_text'][0]
text

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same

In [6]:
len(text)

1629

In [7]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

In [8]:
tokens = [token.text for token in doc]
tokens

['Maria',
 'Sharapova',
 'has',
 'basically',
 'no',
 'friends',
 'as',
 'tennis',
 'players',
 'on',
 'the',
 'WTA',
 'Tour',
 '.',
 'The',
 'Russian',
 'player',
 'has',
 'no',
 'problems',
 'in',
 'openly',
 'speaking',
 'about',
 'it',
 'and',
 'in',
 'a',
 'recent',
 'interview',
 'she',
 'said',
 ':',
 "'",
 'I',
 'do',
 "n't",
 'really',
 'hide',
 'any',
 'feelings',
 'too',
 'much',
 '.',
 'I',
 'think',
 'everyone',
 'knows',
 'this',
 'is',
 'my',
 'job',
 'here',
 '.',
 'When',
 'I',
 "'m",
 'on',
 'the',
 'courts',
 'or',
 'when',
 'I',
 "'m",
 'on',
 'the',
 'court',
 'playing',
 ',',
 'I',
 "'m",
 'a',
 'competitor',
 'and',
 'I',
 'want',
 'to',
 'beat',
 'every',
 'single',
 'person',
 'whether',
 'they',
 "'re",
 'in',
 'the',
 'locker',
 'room',
 'or',
 'across',
 'the',
 'net',
 '.',
 'So',
 'I',
 "'m",
 'not',
 'the',
 'one',
 'to',
 'strike',
 'up',
 'a',
 'conversation',
 'about',
 'the',
 'weather',
 'and',
 'know',
 'that',
 'in',
 'the',
 'next',
 'few',
 'minu

In [9]:
punctuation = punctuation + '\n'
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

### Text Cleaning

In [10]:
word_freq = {}

stop_words = list(STOP_WORDS)

for word in doc:
    if word.text.lower() not in stop_words:
        if word.text.lower() not in punctuation:
            if word.text not in word_freq.keys():
                word_freq[word.text] = 1
            else:
                word_freq[word.text] += 1
                
print(word_freq)

{'Maria': 2, 'Sharapova': 2, 'basically': 1, 'friends': 5, 'tennis': 7, 'players': 6, 'WTA': 1, 'Tour': 1, 'Russian': 1, 'player': 2, 'problems': 1, 'openly': 1, 'speaking': 1, 'recent': 1, 'interview': 1, 'said': 2, 'hide': 1, 'feelings': 1, 'think': 4, 'knows': 1, 'job': 1, 'courts': 2, 'court': 1, 'playing': 1, 'competitor': 1, 'want': 1, 'beat': 1, 'single': 1, 'person': 2, 'locker': 1, 'room': 1, 'net': 1, 'strike': 1, 'conversation': 1, 'weather': 1, 'know': 1, 'minutes': 1, 'try': 1, 'win': 1, 'match': 1, 'pretty': 1, 'competitive': 1, 'girl': 1, 'hellos': 1, 'sending': 1, 'flowers': 1, 'Uhm': 1, 'friendly': 1, 'close': 2, 'lot': 2, 'away': 1, 'strategic': 1, 'different': 4, 'men': 1, 'tour': 2, 'women': 1, 'sport': 1, 'mean': 1, 'categorized': 1, 'going': 1, 'interests': 2, 'completely': 1, 'jobs': 1, 'met': 1, 'parts': 1, 'life': 1, 'thinks': 1, 'greatest': 1, 'ultimately': 1, 'small': 1, 'things': 1, 'interested': 1, 'READ': 1, 'reveals': 1, 'keeps': 1, 'motivated': 1}


In [11]:
max_freq = max(word_freq.values())

In [12]:
for word in word_freq.keys():
    word_freq[word] = word_freq[word] / max_freq

In [13]:
print(word_freq)

{'Maria': 0.2857142857142857, 'Sharapova': 0.2857142857142857, 'basically': 0.14285714285714285, 'friends': 0.7142857142857143, 'tennis': 1.0, 'players': 0.8571428571428571, 'WTA': 0.14285714285714285, 'Tour': 0.14285714285714285, 'Russian': 0.14285714285714285, 'player': 0.2857142857142857, 'problems': 0.14285714285714285, 'openly': 0.14285714285714285, 'speaking': 0.14285714285714285, 'recent': 0.14285714285714285, 'interview': 0.14285714285714285, 'said': 0.2857142857142857, 'hide': 0.14285714285714285, 'feelings': 0.14285714285714285, 'think': 0.5714285714285714, 'knows': 0.14285714285714285, 'job': 0.14285714285714285, 'courts': 0.2857142857142857, 'court': 0.14285714285714285, 'playing': 0.14285714285714285, 'competitor': 0.14285714285714285, 'want': 0.14285714285714285, 'beat': 0.14285714285714285, 'single': 0.14285714285714285, 'person': 0.2857142857142857, 'locker': 0.14285714285714285, 'room': 0.14285714285714285, 'net': 0.14285714285714285, 'strike': 0.14285714285714285, 'co

### Sentence Tokenisation

In [14]:
sent_tokens = [sent for sent in doc.sents]
sent_tokens[:5]

[Maria Sharapova has basically no friends as tennis players on the WTA Tour.,
 The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.,
 I think everyone knows this is my job here.,
 When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.,
 So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.]

In [15]:
sent_score = {}

for sent in sent_tokens:
    for word in sent:
        if word.text.lower() in word_freq.keys():
            if sent not in sent_score.keys():
                sent_score[sent] = word_freq[word.text.lower()]
            else:
                sent_score[sent] += word_freq[word.text.lower()]


In [16]:
print(sent_score)

{Maria Sharapova has basically no friends as tennis players on the WTA Tour.: 3.0, The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.: 1.5714285714285712, I think everyone knows this is my job here.: 0.857142857142857, When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.: 1.8571428571428568, So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.: 2.142857142857143, I'm a pretty competitive girl.: 0.42857142857142855, I say my hellos, but I'm not sending any players flowers as well.: 1.2857142857142856, Uhm, I'm not really friendly or close to many players.: 1.2857142857142856, I have not a lot of friends away from the courts.': 1.4285714285714284, When she said she is not really close to a lot

### select 30% sentences with maximum score

In [17]:
from heapq import nlargest

In [18]:
np.ceil(len(sent_score) *0.3)

6.0

### Getting The Summary

In [19]:
summary = nlargest(n = 6,iterable=sent_score, key=sent_score.get)
print(summary)

[I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players., I think everyone just thinks because we're tennis players we should be the greatest of friends., Maria Sharapova has basically no friends as tennis players on the WTA Tour., I have friends that have completely different jobs and interests, and I've met them in very different parts of my life., So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match., When she said she is not really close to a lot of players, is that something strategic that she is doing?]


In [20]:
final_summary = " ".join([word.text for word in summary])
print(final_summary)


I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players. I think everyone just thinks because we're tennis players we should be the greatest of friends. Maria Sharapova has basically no friends as tennis players on the WTA Tour. I have friends that have completely different jobs and interests, and I've met them in very different parts of my life. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. When she said she is not really close to a lot of players, is that something strategic that she is doing?


In [21]:
print("the percentage reduction of summary is :",1-len(final_summary)/len(text))

the percentage reduction of summary is : 0.5383670963781462


so guys as you can see that we have drastically reduced the summary 

### so guys lets automate these for rest of texts

In [22]:
def text_summarisation(df,title,text_summary):
    print(i, df[title][i])
    text = df[text_summary][i]
    import spacy
    
    from spacy.lang.en.stop_words import STOP_WORDS
    stop_words = list(STOP_WORDS)
    from string import punctuation
    punctuation = punctuation + '\n'
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    return doc

In [23]:
def text_cleaning(doc):
    word_freq = {}
    
    for word in doc:
        if word.text.lower() not in stop_words:
            if word.text.lower() not in punctuation:
                if word.text not in word_freq.keys():
                    word_freq[word.text] = 1
                else:
                    word_freq[word.text] += 1

    max_freq = max(word_freq.values())
    for word in word_freq.keys():
        word_freq[word] = word_freq[word] / max_freq
    
    return word_freq

In [24]:
def sentence_tokenisation(doc,word_freq):
    sent_tokens = [sent for sent in doc.sents]
    sent_score = {}

    for sent in sent_tokens:
        for word in sent:
            if word.text.lower() in word_freq.keys():
                if sent not in sent_score.keys():
                    sent_score[sent] = word_freq[word.text.lower()]
                else:
                    sent_score[sent] += word_freq[word.text.lower()]
    return sent_score

In [25]:
def percentage_summary(percent,sent_score):
    from heapq import nlargest
    n_value = int(np.ceil(len(sent_score) *int(percent)/100))
    summary = nlargest(n = n_value,iterable=sent_score, key=sent_score.get)
    final_summary = " ".join([word.text for word in summary])
    print(final_summary)

In [26]:
for i in range(len(df['article_text'])):
    text_summarisation(df,'article_title','article_text')
    text_cleaning(doc)
    sentence_tokenisation(doc,word_freq)
    percentage_summary(30,sent_score)
    print()

0 I do not have friends in�tennis, says Maria Sharapova
I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players. I think everyone just thinks because we're tennis players we should be the greatest of friends. Maria Sharapova has basically no friends as tennis players on the WTA Tour. I have friends that have completely different jobs and interests, and I've met them in very different parts of my life. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. When she said she is not really close to a lot of players, is that something strategic that she is doing?

1 Federer defeats Medvedev to advance to 14th Swiss Indoors final
I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're cat