In [1]:
import re, pdb, sys, math
from collections import defaultdict

In [2]:
class Graph:
    """Directed, weighted graph of sentence vertices.

    ``Vertices`` holds the vertex objects and ``Edges`` holds edge objects
    exposing ``Vertex1`` (source), ``Vertex2`` (target) and ``Weight``.
    """

    def __init__(self):
        self.Vertices = []
        self.Edges = []

    def getRankedVertices(self):
        """Return ``(vertex, score)`` pairs sorted by descending score.

        A vertex's score is the sum of the weights of the edges whose
        ``Vertex1`` is that vertex. Every vertex in ``Vertices`` is included,
        with a score of 0.0 when it has no outgoing edge, so a graph with a
        single vertex still yields a ranking. Ties keep vertex order because
        ``sorted`` is stable.
        """
        # Seed all known vertices first so isolated ones are not dropped.
        scores = dict.fromkeys(self.Vertices, 0.0)
        for e in self.Edges:
            scores[e.Vertex1] = scores.get(e.Vertex1, 0.0) + e.Weight
        return sorted(scores.items(), key=lambda x: x[1], reverse=True)

In [3]:
class Vertex:
    """Graph node that carries one sentence."""

    def __init__(self):
        # Payload of the node; starts unset and is assigned by the code that builds the graph.
        self.Sentence = None

In [4]:
class Edge:
    """Weighted connection from ``Vertex1`` to ``Vertex2``."""

    def __init__(self):
        # Both endpoints start unset; the weight starts at zero.
        self.Vertex1 = self.Vertex2 = None
        self.Weight = 0

In [5]:
class WordType:
    """Integer tags for token categories.

    Content=0, Function=1, ContentPunctuation=2, FunctionPunctuation=3.
    """
    Content, Function, ContentPunctuation, FunctionPunctuation = range(4)

In [6]:
class Word:
    """A single token: its text and its type tag."""

    def __init__(self):
        # Both fields start as the empty string until the token is tagged.
        self.Text = self.Type = ''

In [7]:
class Sentence:
    """An ordered list of tagged word tokens forming one sentence.

    Each item of ``Words`` is expected to expose ``Text`` and ``Type``.
    """

    def __init__(self):
        self.Words = []

    def getFullSentence(self):
        """Return the sentence text: all token texts joined, then stripped."""
        return ''.join(w.Text for w in self.Words).strip()

    def getReducedSentence(self):
        """Return the sentence rebuilt from its content words only.

        Content words are joined by single spaces, the first one gets an
        upper-cased initial character, and the final token of the sentence is
        appended when it is sentence-ending (content) punctuation.

        The stored words are not modified. An empty sentence, or one without
        any content word, yields an empty string.
        """
        if not self.Words:
            return ''
        # Materialise a list: filter() is a one-shot iterator on Python 3 and
        # supports neither len() nor indexing.
        contentTexts = [w.Text for w in self.Words if w.Type == WordType.Content]
        if not contentTexts:
            return ''
        # Capitalise on the copy so the underlying Word keeps its original text.
        contentTexts[0] = contentTexts[0][:1].upper() + contentTexts[0][1:]
        reduced = ' '.join(contentTexts)
        sentenceEnd = self.Words[-1]
        # Only a real terminator is re-attached; a trailing stop word or space
        # must not be glued onto the last content word.
        if sentenceEnd.Type == WordType.ContentPunctuation:
            reduced += sentenceEnd.Text
        return reduced



In [8]:
class Paragraph:
    """Container for the sentences of one paragraph."""

    def __init__(self):
        # Filled in by the code that splits a line of text into sentences.
        self.Sentences = []

In [9]:
class Reduction:
    """Extractive summariser that ranks sentences by shared content words.

    Text is split into paragraphs (one per non-blank line), sentences and
    tagged tokens. Every ordered pair of sentences gets an edge weighted by
    their content-word overlap, sentences are ranked by total outgoing
    weight, and the top fraction is returned in original document order.
    """
    functionPunctuation = ' ,-'
    contentPunctuation = '.?!\n'
    punctuationCharacters = functionPunctuation+contentPunctuation
    sentenceEndCharacters = '.?!'
    # Sentences with fewer content words than this never share weight.
    minContentWords = 4

    def isContentPunctuation(self, text):
        """Return True when ``text`` is exactly one content-punctuation character."""
        # The length check matters: '' is "in" every string.
        return len(text) == 1 and text in self.contentPunctuation

    def isFunctionPunctuation(self, text):
        """Return True when ``text`` is exactly one function-punctuation character."""
        return len(text) == 1 and text in self.functionPunctuation

    def isFunction(self, text, stopWords):
        """Return True when ``text`` equals one of ``stopWords``, ignoring case."""
        lowered = text.lower()
        return any(lowered == w.lower() for w in stopWords)

    def tag(self, sampleWords, stopWords):
        """Wrap each token in a ``Word`` tagged with its ``WordType``.

        Punctuation checks take precedence over the stop-word check; anything
        that matches none of them is tagged as content.
        """
        # Lower-case the stop list once instead of once per token.
        stopSet = frozenset(w.lower() for w in stopWords)
        taggedWords = []
        for w in sampleWords:
            tw = Word()
            tw.Text = w
            if self.isContentPunctuation(w):
                tw.Type = WordType.ContentPunctuation
            elif self.isFunctionPunctuation(w):
                tw.Type = WordType.FunctionPunctuation
            elif w.lower() in stopSet:
                tw.Type = WordType.Function
            else:
                tw.Type = WordType.Content
            taggedWords.append(tw)
        return taggedWords

    def tokenize(self, text):
        """Split ``text`` on punctuation characters, keeping them as tokens.

        Returns a list of non-empty strings.
        """
        # Escape the characters so '-' can never be read as a range operator
        # inside the character class.
        pattern = '([{0}])'.format(re.escape(self.punctuationCharacters))
        return [w for w in re.split(pattern, text) if w != '']

    def getWords(self, sentenceText, stopWords):
        """Tokenize ``sentenceText`` and return its tagged ``Word`` list."""
        return self.tag(self.tokenize(sentenceText), stopWords)

    def getSentences(self, line, stopWords):
        """Split ``line`` into ``Sentence`` objects at '.', '?' and '!'.

        Each sentence keeps the terminator that directly follows its text.
        Segments that are blank (for example between the dots of '...') are
        dropped together with their terminator, so later sentences are not
        paired with the wrong punctuation. A trailing segment without a
        terminator becomes a sentence without one.
        """
        pattern = '([{0}])'.format(re.escape(self.sentenceEndCharacters))
        # With a capture group re.split alternates text, terminator, text, ...
        parts = re.split(pattern, line)
        texts = parts[0::2]
        ends = parts[1::2] + ['']  # the last text segment has no terminator
        sentences = []
        for t, end in zip(texts, ends):
            if t.strip() == '':
                continue
            sentence = Sentence()
            sentence.Words = self.getWords(t + end, stopWords)
            sentences.append(sentence)
        return sentences

    def getParagraphs(self, lines, stopWords):
        """Return one ``Paragraph`` per entry of ``lines``."""
        paragraphs = []
        for line in lines:
            paragraph = Paragraph()
            paragraph.Sentences = self.getSentences(line, stopWords)
            paragraphs.append(paragraph)
        return paragraphs

    def findWeight(self, sentence1, sentence2):
        """Return the similarity weight between two sentences.

        The weight is the number of case-insensitive matching pairs of
        content words, divided by ``log(n1) + log(n2)`` where ``n1`` and
        ``n2`` are the content-word counts. It is 0 when either sentence has
        fewer than ``minContentWords`` content words.
        """
        words1 = [w.Text.lower() for w in sentence1.Words
                  if w.Type == WordType.Content]
        words2 = [w.Text.lower() for w in sentence2.Words
                  if w.Type == WordType.Content]
        if len(words1) < self.minContentWords or len(words2) < self.minContentWords:
            return 0
        # Count occurrences in sentence2 once; summing the counts over the
        # words of sentence1 equals the number of matching pairs.
        counts = defaultdict(int)
        for text in words2:
            counts[text] += 1
        weight = sum(counts.get(text, 0) for text in words1)
        # log(0) is undefined, so empty sentences contribute nothing.
        norm = sum(math.log(n) for n in (len(words1), len(words2)) if n > 0)
        if norm == 0:
            return 0
        return weight / float(norm)

    def buildGraph(self, sentences):
        """Build a graph with one vertex per sentence and an edge for every
        ordered pair of distinct vertices, weighted by ``findWeight``."""
        g = Graph()
        for s in sentences:
            v = Vertex()
            v.Sentence = s
            g.Vertices.append(v)
        for i in g.Vertices:
            for j in g.Vertices:
                if i is not j:
                    e = Edge()
                    e.Vertex1 = i
                    e.Vertex2 = j
                    e.Weight = self.findWeight(i.Sentence, j.Sentence)
                    g.Edges.append(e)
        return g

    def sentenceRank(self, paragraphs):
        """Return the ranked vertices for all sentences of ``paragraphs``."""
        sentences = [s for p in paragraphs for s in p.Sentences]
        return self.buildGraph(sentences).getRankedVertices()

    def reduce(self, text, reductionRatio, stopWordsFile='stopWords.txt'):
        """Summarise ``text`` and return the kept sentences as a list of strings.

        Args:
            text: the document; every non-blank line is one paragraph.
            reductionRatio: fraction of the ranked sentences to keep; the
                count is truncated towards zero and clamped to the number of
                ranked sentences.
            stopWordsFile: path of a file with one stop word per line.

        Returns:
            The full text of the selected sentences, in document order.
        """
        with open(stopWordsFile) as handle:
            stopWords = handle.read().splitlines()

        lines = text.splitlines()
        print("lines", lines)
        # A list rather than a filter object, so the debug print below shows
        # the actual lines.
        contentLines = [line for line in lines if line.strip() != '']
        print("contentLines", contentLines)

        paragraphs = self.getParagraphs(contentLines, stopWords)
        print("paragraphs", paragraphs)

        rankedSentences = self.sentenceRank(paragraphs)

        # Clamp so that a ratio above 1 (or below 0) cannot index past the ranking.
        keep = math.trunc(len(rankedSentences) * reductionRatio)
        keep = max(0, min(keep, len(rankedSentences)))
        selected = set(id(vertex.Sentence) for vertex, _ in rankedSentences[:keep])

        # Walking the sentences in document order restores the original ordering.
        reducedText = []
        for p in paragraphs:
            for s in p.Sentences:
                if id(s) in selected:
                    reducedText.append(s.getFullSentence())
        return reducedText

In [13]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution
import re

[nltk_data] Downloading package punkt to /home/anuradha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# Load the article dataset from a CSV file in the working directory.
df = pd.read_csv("tennis_articles_v4.csv")

In [15]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [17]:
import string
# Build the summariser defined in the earlier cells.
reduction = Reduction()

# Summarise only the first article of the dataset.
text = df['article_text'][0]
# Ratio passed to reduce(); it scales how many ranked sentences are kept.
reduction_ratio = 0.1

# NOTE: reduce() opens 'stopWords.txt' by relative path, so that file has to
# exist in the working directory.
reduced_text = reduction.reduce(text, reduction_ratio)

print(reduced_text)


lines ["Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in th

In [18]:
# Show the original article text for comparison with the summary.
print(text)

Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same s

In [19]:
# Show the list of sentences returned by reduce().
print(reduced_text)

["I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players."]
