In [1]:
from Shapley import *
from TextRank import *
from utils import *

# import libraries
from collections import Counter
from itertools import combinations
from math import sqrt
import matplotlib.pyplot as plt
import networkx as nx
from nltk import word_tokenize, sent_tokenize, FreqDist,pos_tag
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import RegexpTokenizer
from operator import itemgetter
import re
%matplotlib inline

In [2]:
# Head word of every WordNet noun synset: the part of the synset name
# that precedes the first '.'. Used as a membership test for nouns.
NOUNS = set(
    synset.name().split('.', 1)[0]
    for synset in wn.all_synsets('n')
)

In [23]:
class Document():
    '''
    The master class for our Document Summarization module.
    Incorporates all features related to Document

    Attributes:
        document:  the raw text passed to the constructor
        sents:     sentences of the document, in document order
        sents_id:  mapping sentence text -> sentence index
        word_freq: frequency distribution over the cleaned document
        graph:     sentence similarity graph (None until construct_graph)
        adj_mat:   symmetric matrix of pairwise sentence weights
                   (None until construct_graph)
        params:    tunable settings; 'thresh' is handed to draw_graph
    '''

    def __init__(self, document):
        self.document = document
        self.sents = sent_tokenize(self.document)
        # Reverse lookup used by callers to turn a sentence back into its
        # index. If the same sentence text occurs twice, the last
        # occurrence wins.
        self.sents_id = {sent: idx for idx, sent in enumerate(self.sents)}
        self.word_freq = FreqDist(clean(self.document))
        self.graph = None
        self.adj_mat = None
        self.params = {'thresh': 0.3}

    def __str__(self):
        return self.document

    def statistical_sim(self, sent1, sent2):
        '''
        Statistical similarity between sentences
        based on the cosine method

        sent1, sent2: iterables of tokens
        Returns: float (the cosine similarity b/w sent1 and sent2),
                 0.0 when either sentence has no tokens
        '''
        counts1 = Counter(sent1)
        counts2 = Counter(sent2)

        # Only tokens present in both sentences contribute to the dot product
        shared = set(counts1) & set(counts2)
        numerator = sum(counts1[tok] * counts2[tok] for tok in shared)

        norm1 = sqrt(sum(count ** 2 for count in counts1.values()))
        norm2 = sqrt(sum(count ** 2 for count in counts2.values()))
        denominator = norm1 * norm2

        if not denominator:
            return 0.0

        return float(numerator) / denominator

    def semantic_sim(self, sent1, sent2):
        '''
        A semantic similarity score between two sentences
        based on WordNet

        Only tokens found in NOUNS are compared. The pairwise
        semantic_score values are summed and normalised by the total
        number of nouns in both sentences.

        sent1, sent2: iterables of tokens
        Returns: float (the semantic similarity measure),
                 0.0 when neither sentence contains a noun
        '''
        nouns1 = [word for word in sent1 if word in NOUNS]
        nouns2 = [word for word in sent2 if word in NOUNS]

        total_nouns = len(nouns1) + len(nouns2)
        if not total_nouns:
            # Nothing to compare. Returning a large sentinel here would make
            # noun-free sentence pairs the strongest edges in the graph.
            return 0.0

        score = sum(semantic_score(t1, t2) for t1 in nouns1 for t2 in nouns2)
        # float() keeps this a true division on Python 2 as well
        return float(score) / total_nouns

    def construct_graph(self):
        '''
        Constructs the sentence similarity graph

        Every pair of sentences gets the weight
        statistical_sim + semantic_sim (computed on the cleaned sentences).
        Sets self.graph (built by draw_graph from the weighted edges and
        params['thresh']) and self.adj_mat (symmetric, zero diagonal).
        '''
        size = len(self.sents)
        adj_mat = [[0] * size for _ in range(size)]
        connected = []
        # Iterate over index pairs so that repeated sentences keep their own
        # row/column in adj_mat instead of collapsing onto one sents_id entry.
        for i, j in combinations(range(size), 2):
            first, second = self.sents[i], self.sents[j]
            cpair = clean(first), clean(second)
            weight = self.statistical_sim(*cpair) + \
                     self.semantic_sim(*cpair)
            connected.append((first, second, weight))
            adj_mat[i][j] = weight
            adj_mat[j][i] = weight
        self.graph = draw_graph(connected, self.params['thresh'])
        self.adj_mat = adj_mat

In [4]:
# Sample input: a short lower-cased news text, passed to Document below
# as the demo document.
doc = '''
burning tires tear gas and clashes with riot police the ugly scenes that come with workers strikes are all too familiar in france a country constantly trying to balance its culture of workers rights with a more efficient economy.
such scenes are being played out across the country friday as unions have called for workers to step up protests that have for the past week crippled parts of france.
employees of oil refineries nuclear power plants and some public transportation have left one in three gas stations dry forcing vehicles to search for well stocked stations and causing long lines at the pump.
people are now hoarding gas worried that it may be some time until supply levels are back to normal.
the workers are protesting a labor reform bill put forward by the government that will make it easier for companies to hire and fire employees.
the governments argument is that the strict laws that make french workers among the best protected in the world leave companies in a difficult position where they cant take on new staff.
french prime minister manuel valls told local media on thursday that he might be willing to modify some of the proposals giving hope to french people that the protests and fuel shortages may soon stop.
but workers unions friday responded with a call to step up rallies and blockades demanding a complete withdrawal of the bill.
we call for the continuation and intensification of protests a group of unions behind the protests said in a statement.
the governments violent words its contempt for the social movement and its refusal to withdraw this bill reinforces our commitment it said.
'''

In [5]:
# Build the sentence graph for the sample text and rank its sentences
# with weighted TextRank.
multi_doc = Document(doc)
multi_doc.construct_graph()

textrank_score = textrank_weighted(multi_doc.graph)

# Each entry pairs a sentence with its score; report the sentence by its
# index in the document. The parenthesised single-argument print behaves
# the same on Python 2 and Python 3.
for sents, score in textrank_score:
    print("node = %3d\t score = %f" % (multi_doc.sents_id[sents], score))

node =   0	 score = 1.208138
node =   6	 score = 1.199243
node =   2	 score = 1.160498
node =   1	 score = 1.125947
node =   5	 score = 1.092980
node =   4	 score = 1.049301
node =   9	 score = 0.848015
node =   7	 score = 0.837494
node =   3	 score = 0.807110
node =   8	 score = 0.669530


In [6]:
# Rank the same sentences by Shapley value, using the pairwise weight
# matrix produced by construct_graph().
shapley_graph = Graph(multi_doc.adj_mat)

# NOTE(review): the meaning of the two 100s is defined by Graph.shapley in
# the Shapley module - confirm there before tuning.
shapley_rank = shapley_graph.shapley(100, 100)

# The parenthesised single-argument print behaves the same on Python 2
# and Python 3.
for key, value in shapley_rank:
    print("node = %3d\t Shapley value = %f" % (key, value))

node =   0	 Shapley value = 1.016700
node =   1	 Shapley value = 0.960900
node =   2	 Shapley value = 0.919000
node =   3	 Shapley value = 0.874800
node =   5	 Shapley value = 0.872300
node =   4	 Shapley value = 0.863400
node =   6	 Shapley value = 0.821700
node =   7	 Shapley value = 0.751700
node =   9	 Shapley value = 0.724800
node =   8	 Shapley value = 0.702900


In [7]:
# Show the two rankings side by side as lists of sentence indices.
# Each print call takes a single argument, so the output is the same on
# Python 2 and Python 3.
print('Top-k sents Id by TextRank')
print([multi_doc.sents_id[sents] for sents, score in textrank_score])

print('\nTop-k sentence Id by Shapley')
print(shapley_graph.top_k(len(multi_doc.sents)))

Top-k sents Id by TextRank
[0, 6, 2, 1, 5, 4, 9, 7, 3, 8]

Top-k sentence Id by Shapley
[0, 1, 2, 3, 5, 4, 6, 7, 9, 8]


## Applying to the DUC Dataset

In [8]:
import glob

In [36]:
def build_summary(ranked_sents, max_words=200):
    '''
    Concatenates ranked sentences into a summary string.

    Each sentence is appended with a leading space. After every append the
    summary is split on single spaces, and appending stops once that count
    exceeds max_words (so the sentence that crosses the limit is kept).

    ranked_sents: iterable of sentence strings, best first
    max_words:    stop once the space-split length exceeds this value
    Returns: str (empty when ranked_sents is empty)
    '''
    summary = ''
    for sent in ranked_sents:
        summary = summary + ' ' + sent
        if len(summary.split(' ')) > max_words:
            break
    return summary


# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    wait_for_user = raw_input
except NameError:
    wait_for_user = input

# sorted(): glob returns paths in arbitrary order; sorting makes runs repeatable.
# The loop variable is named `path` so it does not overwrite the sample
# text stored in `doc`.
for path in sorted(glob.glob('../multi/multi.a.*')):
    print('Operating on filename: {}'.format(path))
    # e.g. '../multi/multi.a.001.txt' -> '001'
    name = path.split('.')[-2]
    # Only the read needs the file open
    with open(path, 'r') as fp:
        document = fp.read()
    ### Applying text-rank algorithm
    multi_doc = Document(document)
    multi_doc.construct_graph()
    textrank_score = textrank_weighted(multi_doc.graph)
    textrank = build_summary(sents for sents, score in textrank_score)
    ### Applying Shapley Value Theorem Algorithm
    shapley_graph = Graph(multi_doc.adj_mat)
    shapley_rank = shapley_graph.shapley(100, 100)
    # Shapley keys are sentence indices, so index the sentence list directly
    shapley = build_summary(multi_doc.sents[key] for key, value in shapley_rank)
    ### Saving data to the corresponding files
    with open('../textrank/summ.{}.txt'.format(name), 'w') as fp:
        fp.write(textrank)
    with open('../shap/summ.{}.txt'.format(name), 'w') as fp:
        fp.write(shapley)
    ### Waiting for the user input
    wait_for_user('Press any key to continue....')

Operating on filename: ../multi/multi.a.001.txt
Press any key to continue....a
Operating on filename: ../multi/multi.a.002.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.003.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.004.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.005.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.006.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.007.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.008.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.009.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.010.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.011.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.012.txt
Press any key to continue....
Operating on filename: ../multi/multi.a.013.txt
Press any key t

In [42]:
def cleaner(file_handle):
    '''
    Rewrites each file in place with one sentence per line.

    file_handle: iterable of file paths. Every file is read in full, split
                 into sentences with sent_tokenize, and overwritten so that
                 each stripped sentence sits on its own line.
    Returns: None
    '''
    for path in file_handle:
        with open(path, 'r') as src:
            sentences = sent_tokenize(src.read())
        stripped = [sentence.strip() for sentence in sentences]
        with open(path, 'w') as dst:
            dst.writelines(sentence + '\n' for sentence in stripped)

In [44]:
# Normalise the generated summaries and the reference files to one
# sentence per line.
for pattern in ('../textrank/*', '../shap/*', '../models/*'):
    cleaner(glob.glob(pattern))