# TF-IDF

In [1]:
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

In [2]:
bowA = docA.split(" ")
bowB = docB.split(" ")

In [3]:
bowB

['The', 'dog', 'sat', 'on', 'my', 'bed']

In [4]:
wordSet = set(bowA).union(set(bowB))

In [5]:
wordSet

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

In [6]:
wordDictA = dict.fromkeys(wordSet, 0) 
wordDictB = dict.fromkeys(wordSet, 0)

In [7]:
wordDictA

{'cat': 0, 'on': 0, 'face': 0, 'dog': 0, 'The': 0, 'sat': 0, 'my': 0, 'bed': 0}

In [8]:
# Tally how often each token occurs in its own document. Every vocabulary
# word was pre-seeded with 0, so a plain increment is sufficient.
for counts, tokens in ((wordDictA, bowA), (wordDictB, bowB)):
    for word in tokens:
        counts[word] += 1

In [9]:
wordDictA

{'cat': 1, 'on': 1, 'face': 1, 'dog': 0, 'The': 1, 'sat': 1, 'my': 1, 'bed': 0}

In [10]:
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,cat,on,face,dog,The,sat,my,bed
0,1,1,1,0,1,1,1,0
1,0,1,0,1,1,1,1,1


In [11]:
def computeTF(wordDict, bow):
    """Return the term frequency of every vocabulary word for one document.

    Args:
        wordDict: Mapping of word -> raw count of that word in the document.
        bow: The document's token list; its length is the normalising total.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(bow)``
        as a float. An empty ``bow`` yields 0.0 for every word instead of
        raising ``ZeroDivisionError``.
    """
    bowCount = len(bow)
    if bowCount == 0:
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [12]:
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

In [13]:
tfBowA

{'cat': 0.16666666666666666,
 'on': 0.16666666666666666,
 'face': 0.16666666666666666,
 'dog': 0.0,
 'The': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'my': 0.16666666666666666,
 'bed': 0.0}

In [14]:
tfBowB

{'cat': 0.0,
 'on': 0.16666666666666666,
 'face': 0.0,
 'dog': 0.16666666666666666,
 'The': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'my': 0.16666666666666666,
 'bed': 0.16666666666666666}

In [15]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: List of dicts, one per document, mapping word -> count.
            A word counts as present in a document when its count is > 0.

    Returns:
        Dict mapping each word seen as a key in any document to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents containing the word. Words whose count is
        0 in every document get 0.0. An empty ``docList`` returns ``{}``.
    """
    import math

    N = len(docList)

    # Document frequency. The vocabulary is the union of all documents' keys,
    # so a word missing from the first document does not raise KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, df in docFreq.items():
        # df == 0 would divide by zero; give such words a neutral weight.
        idfDict[word] = math.log10(N / float(df)) if df else 0.0

    return idfDict

In [16]:
idfs = computeIDF([wordDictA, wordDictB])

In [17]:
def computeTFIDF(tfBow, idfs):
    """Weight each term frequency by the matching inverse document frequency.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> IDF; must contain every key of ``tfBow``.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: tf * idfs[term] for term, tf in tfBow.items()}

In [18]:
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [19]:
import pandas as pd
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,cat,on,face,dog,The,sat,my,bed
0,0.050172,0.0,0.050172,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.050172,0.0,0.0,0.0,0.050172


In [None]:
# NOTE(review): this statement previously ended right after `import`, which is
# a SyntaxError. TfidfVectorizer is presumably what was intended; it lives in
# sklearn.feature_extraction.text, not sklearn.feature_selection -- confirm.
from sklearn.feature_extraction.text import TfidfVectorizer

# Sentence embeddings

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from typing import List

In [None]:
class Word:
    """A token paired with its embedding vector.

    The constructor stores its two arguments unchanged as the attributes
    ``text`` (the token string) and ``vector`` (its embedding).
    """

    def __init__(self, text, vector):
        self.text = text
        self.vector = vector

    def __str__(self):
        """Return ``"<text> : <vector>"``."""
        return self.text + ' : ' + str(self.vector)

    def __repr__(self):
        """Mirror ``__str__`` so words stay readable inside containers."""
        return self.__str__()


In [None]:
class Sentence:
    """An ordered collection of word objects that make up one sentence."""

    def __init__(self, word_list):
        self.word_list = word_list

    def len(self):
        """Return how many words the sentence holds."""
        words = self.word_list
        return len(words)

    def __str__(self):
        """Return the words' ``text`` attributes joined by single spaces."""
        return ' '.join(word.text for word in self.word_list)

    def __repr__(self):
        """Use the same text as ``__str__``."""
        return str(self)


In [None]:
def get_word_frequency(word_text):
    """Return the estimated frequency of ``word_text``.

    Placeholder: the argument is ignored and the same constant is returned
    for every word.
    """
    placeholder_frequency = 0.0001
    return placeholder_frequency

In [None]:
def sentence_to_vec(sentence_list, embedding_size, a):
    """Build sentence embeddings with smooth inverse frequency (SIF) weights.

    Each sentence vector is the SIF-weighted average of its word vectors,
    after which the projection onto the first principal component of all
    sentence vectors is subtracted (``vs - u * (u . vs)``).

    Args:
        sentence_list: Sequence of objects exposing ``len()`` and a
            ``word_list`` whose items have ``text`` and ``vector``.
        embedding_size: Length of every word vector.
        a: SIF smoothing constant; a word's weight is ``a / (a + freq)``
            with ``freq`` taken from ``get_word_frequency``.

    Returns:
        A list with one NumPy vector of length ``embedding_size`` per input
        sentence, in input order. An empty ``sentence_list`` returns ``[]``.
    """
    if not sentence_list:
        return []

    sentence_set = []
    for sentence in sentence_list:
        vs = np.zeros(embedding_size)
        for word in sentence.word_list:
            a_value = a / (a + get_word_frequency(word.text))  # SIF weight
            vs = np.add(vs, np.multiply(a_value, word.vector))

        sentence_length = sentence.len()
        if sentence_length:
            # An empty sentence keeps the zero vector instead of producing
            # NaNs from a division by zero.
            vs = np.divide(vs, sentence_length)  # weighted average
        sentence_set.append(vs)

    # First principal component of the sentence vectors.
    # NOTE(review): sklearn's PCA centres the data before decomposition --
    # confirm that this is the intended "common component".
    pca = PCA()
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]

    # Remove the common component: vs - u uT vs == vs - u * (u . vs).
    # An element-wise u * u * vs is NOT the same thing, since it rescales
    # each coordinate rather than projecting onto u.
    return [np.subtract(vs, np.multiply(u, np.dot(u, vs))) for vs in sentence_set]

# TextRank

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt') # one time execution
import re

In [None]:
df = pd.read_csv('/Users/zn-nlp/Documents/tennis_articles_v4.csv', encoding='utf-8')

In [None]:
df


In [None]:
# One list of sentences per article; the result is still nested here.
sentences = [sent_tokenize(article) for article in df['article_text']]

In [None]:
sentences = [y for x in sentences for y in x]

In [None]:
# remove punctuation, numbers and special characters
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text as-is.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# lowercase every sentence
clean_sentences = [s.lower() for s in clean_sentences]

In [None]:
nltk.download('stopwords')# one time execution

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [None]:
def remove_stopwords(sen):
    """Join the tokens of ``sen`` that are not stop words with single spaces.

    ``sen`` is an iterable of tokens; membership is tested against the
    module-level ``stop_words`` collection.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [None]:
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [None]:
# download pretrained GloVe word embeddings
! wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
! unzip glove*.zip

In [None]:
# Extract word vectors: each line is "<word> <coef_1> ... <coef_n>"
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('/Users/zn-nlp/Documents/glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word to index; skip it.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [None]:
# Average the word vectors of every cleaned sentence; words without an
# embedding contribute a zero vector, and an empty sentence maps to zeros.
sentence_vectors = []
for i in clean_sentences:
    tokens = i.split()
    if not i:
        v = np.zeros((50,))
    else:
        token_vecs = [word_embeddings.get(w, np.zeros((50,))) for w in tokens]
        # The +0.001 keeps the denominator strictly positive.
        v = sum(token_vecs) / (len(tokens) + 0.001)
    sentence_vectors.append(v)

In [None]:
len(sentence_vectors)


In [None]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fill the similarity matrix with one vectorised call instead of one
# cosine_similarity call per sentence pair.
if len(sentences) > 0:
    # Assigning into the existing array keeps sim_mat's shape and dtype.
    sim_mat[:, :] = cosine_similarity(np.asarray(sentence_vectors))
    # A sentence is not compared with itself, so the diagonal stays 0.
    np.fill_diagonal(sim_mat, 0.0)

In [None]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [None]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [None]:
# Specify number of sentences to form the summary
sn = 3

# Generate summary. Slicing copes with fewer than sn ranked sentences,
# where indexing would raise IndexError.
for _, sentence in ranked_sentences[:sn]:
    print(sentence)