#### Lab 7 
#### ROUGE Metrics

In [1]:
!pip install fuzzywuzzy



In [2]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np
from fuzzywuzzy import fuzz 
import ipywidgets as widgets
import pprint
from ipywidgets import interact, interact_manual
import re
# Location of the CSV file that is read into `df` with pd.read_csv below.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
__PATH__ = "/Users/ts/Downloads/data.csv"



In [3]:
# Load the semicolon-separated file at __PATH__; the first line supplies the column names.
df = pd.read_csv(__PATH__,sep=";",header=0)

In [4]:
# Display the first rows of the loaded frame.
df.head()

Unnamed: 0,id,updatedDate,publishedDate,title,summary,authors,category,metaData,downloadLink,filePath
0,http://arxiv.org/abs/1407.6950v1,2014-07-24T16:56:39Z,2014-07-24T16:56:39Z,"How,whenAndHowMuchACardDeckIsWellShuffled.pdf",The Thesis Consider The Mixing Of Few 3 4 ...,Benjamin Isac Fargion,cs.DM,"Italian Thesis In Engeenering Computer, 26 Feb...",http://arxiv.org/pdf/1407.6950v1.pdf,"./files/How,whenAndHowMuchACardDeckIsWellShuff..."
1,http://arxiv.org/abs/0907.0618v1,2009-07-03T12:35:10Z,2009-07-03T12:35:10Z,QuantumIsometryGroups.pdf,This Thesis Contains The Formulation And Com...,Jyotishman Bhowmick,math.OA,Thesis,http://arxiv.org/pdf/0907.0618v1.pdf,./files/QuantumIsometryGroups.pdf
2,http://arxiv.org/abs/1806.09601v2,2018-07-14T17:06:27Z,2018-06-25T17:55:59Z,ComputationAndBoundingOfFolkmanNumbers.pdf,Phd Thesis Under The Supervision Of Professo...,Aleksandar Bikov,math.CO,PhD Thesis,http://arxiv.org/pdf/1806.09601v2.pdf,./files/ComputationAndBoundingOfFolkmanNumbers...
3,http://arxiv.org/abs/1905.03014v1,2019-05-08T11:47:34Z,2019-05-08T11:47:34Z,OnChurch'sThesisInCubicalAssemblies.pdf,"We Show That Church's Thesis, The Axiom Stat...","Andrew Swan, Taichi Uemura,",math.LO,0,http://arxiv.org/pdf/1905.03014v1.pdf,./files/OnChurch'sThesisInCubicalAssemblies.pdf
4,http://arxiv.org/abs/1901.04911v1,2019-01-15T16:24:07Z,2019-01-15T16:24:07Z,UnconstrainedChurchTuringThesisCannotPossiblyB...,The Church Turing Thesis Asserts That If A P...,Yuri Gurevich,cs.LO,0,http://arxiv.org/pdf/1901.04911v1.pdf,./files/UnconstrainedChurchTuringThesisCannotP...


#### Preprocessing the titles into lists of tokens

In [5]:
# Build the English stop-word set once. The original re-read the NLTK list
# (and scanned it linearly) for every single token of every title.
_ENGLISH_STOPWORDS = set(stopwords.words('english'))


def _title_tokens(filename):
    """Turn a CamelCase ``*.pdf`` file name into a tuple of lower-cased tokens.

    The ``.pdf`` substring is removed, then tokens are exactly the regex
    matches of one uppercase letter followed by one or more lowercase
    letters. Matches are lower-cased and English stop words are dropped.

    Args:
        filename: Value of the ``title`` column (a string).

    Returns:
        tuple: Remaining tokens in their original order.
    """
    words = re.findall('([A-Z]{1}[a-z]+)', filename.replace('.pdf', ''))
    lowered = (word.lower() for word in words)
    return tuple(word for word in lowered if word not in _ENGLISH_STOPWORDS)


# One token tuple per row of df['title'] (tuples are hashable, so they can be
# used as dictionary keys later on).
titles = list(df['title'].apply(_title_tokens))

In [6]:
# Map every title (token tuple) to a {word: [WordNet synsets of that word]} dict.
# Titles with identical token tuples share a single entry.
res = {}
for title in titles:
    res[title] = {word: list(wn.synsets(word)) for word in title}

#### Top ten closest articles with fuzzy metrics of titles

In [7]:
def extract_hypernyms_level_1(token):
    """Collect the direct hypernyms of every WordNet synset of ``token``.

    Args:
        token: Word passed to ``wn.synsets``.

    Returns:
        set: Union of ``synset.hypernyms()`` over all synsets returned for
        ``token``; empty when no synsets are returned.
    """
    hypernyms1 = set()
    # Query WordNet once and iterate over the result; the original repeated
    # the wn.synsets(token) lookup for every index of the loop.
    for synset in wn.synsets(token):
        hypernyms1.update(synset.hypernyms())
    return hypernyms1

def extract_hypernyms_level_2(token):
    """Collect the hypernyms of the first-level hypernyms of ``token``.

    Args:
        token: Word forwarded to ``extract_hypernyms_level_1``.

    Returns:
        set: Union of ``hypernyms()`` over every first-level hypernym.
    """
    return {
        grandparent
        for parent in extract_hypernyms_level_1(token)
        for grandparent in parent.hypernyms()
    }

def common_hypernyms(a,b):
    """Return how many elements of set ``a`` also occur in ``b``."""
    shared = a.intersection(b)
    return len(shared)


In [10]:
def calculate_fscore(a,b):
    """Compute the overlap of two sets and the F-score derived from it.

    Precision is the overlap size divided by ``len(a)``, recall is the
    overlap size divided by ``len(b)``; the score is their harmonic mean.

    Args:
        a: First set.
        b: Second set (any iterable accepted by ``set.intersection``
            that also supports ``len``).

    Returns:
        tuple: ``(intersection, fscore)``; ``fscore`` is the integer ``0``
        when the intersection is empty.
    """
    shared = a.intersection(b)
    if len(shared) == 0:
        # Nothing in common: skip the divisions entirely.
        return shared, 0

    overlap = len(shared)
    prec = overlap / len(a)
    recall = overlap / len(b)
    return shared, 2 * prec * recall / (prec + recall)


def distance(a,b):
    """Distance between two token collections based on exact and hypernym overlap.

    Steps:
      1. ``calculate_fscore`` gives the exact-match F-score of the two sets.
      2. The shared tokens are removed from both sides. If either side is
         then empty, the result is ``1 - fscore``.
      3. Otherwise every remaining (word_a, word_b) pair adds 0.66 to a
         penalty when the words share a first-level hypernym, or 0.33 when
         they only share a hypernym involving the second level.
      4. The penalty is divided by the remaining set sizes, combined into an
         F-score-like value, and the result is
         ``1 - (2 * fscore + fscore_penalty) / 3``.

    NOTE(review): the penalty is summed over every pair but divided only by
    ``len(a)`` / ``len(b)``, so ``fscore_penalty`` can exceed 1 and the
    returned value can drop below 0. The metric is left unchanged here.

    Args:
        a: Iterable of hashable tokens.
        b: Iterable of hashable tokens.

    Returns:
        Number: the distance as described above.
    """
    a = set(a)
    b = set(b)
    intersection, fscore = calculate_fscore(a, b)
    a = a - intersection
    b = b - intersection
    if len(a) == 0 or len(b) == 0:
        return 1 - fscore

    # Per-call memo tables: each word's hypernym sets are looked up once
    # instead of once per (word_a, word_b) pair as in the original loop.
    level_1 = {}
    level_2 = {}

    def hypernyms_1(word):
        if word not in level_1:
            level_1[word] = extract_hypernyms_level_1(word)
        return level_1[word]

    def hypernyms_2(word):
        if word not in level_2:
            level_2[word] = extract_hypernyms_level_2(word)
        return level_2[word]

    penalty = 0
    for word_a in a:
        hyp_a_1 = hypernyms_1(word_a)
        for word_b in b:
            hyp_b_1 = hypernyms_1(word_b)
            if common_hypernyms(hyp_a_1, hyp_b_1) > 0:
                # Shared direct hypernym: larger contribution.
                penalty = penalty + 0.66
                continue

            # Second level is only needed when level one has nothing in common.
            hyp_a_2 = hypernyms_2(word_a)
            hyp_b_2 = hypernyms_2(word_b)
            number_of_2level_hyp = (
                common_hypernyms(hyp_a_2, hyp_b_2)
                + common_hypernyms(hyp_a_2, hyp_b_1)
                + common_hypernyms(hyp_a_1, hyp_b_2)
            )
            if number_of_2level_hyp > 0:
                penalty = penalty + 0.33

    if penalty > 0:
        recall_hyp = penalty / len(a)
        prec_hyp = penalty / len(b)
        fscore_penalty = 2 * (recall_hyp * prec_hyp) / (prec_hyp + recall_hyp)
    else:
        fscore_penalty = 0

    # Exact overlap is weighted twice as much as hypernym overlap.
    return 1 - (2 * fscore + fscore_penalty) / 3

In [11]:
# Pairwise distance matrix over all titles: dist[i, j] is the distance between
# the token tuples of buff[i] and buff[j]. Pairs are visited in row-major
# order and a progress marker is printed every 10000 pairs.
buff = list(res.items())
dist = np.zeros((len(buff), len(buff)))
epoch = 0
for epoch, (lli, rri) in enumerate(np.ndindex(*dist.shape), start=1):
    dist[lli, rri] = distance(buff[lli][0], buff[rri][0])
    if epoch % 10000 == 0:
        print('#', epoch)

# 10000
# 20000
# 30000
# 40000
# 50000
# 60000
# 70000
# 80000
# 90000
# 100000
# 110000
# 120000
# 130000
# 140000
# 150000
# 160000
# 170000
# 180000
# 190000
# 200000
# 210000
# 220000
# 230000
# 240000
# 250000
# 260000
# 270000
# 280000
# 290000
# 300000
# 310000
# 320000
# 330000
# 340000
# 350000
# 360000
# 370000
# 380000
# 390000
# 400000
# 410000
# 420000
# 430000
# 440000
# 450000
# 460000
# 470000
# 480000
# 490000
# 500000
# 510000
# 520000
# 530000
# 540000
# 550000
# 560000
# 570000
# 580000
# 590000
# 600000
# 610000
# 620000
# 630000
# 640000
# 650000
# 660000
# 670000
# 680000
# 690000
# 700000
# 710000
# 720000
# 730000
# 740000
# 750000
# 760000
# 770000
# 780000
# 790000
# 800000
# 810000
# 820000
# 830000
# 840000
# 850000
# 860000
# 870000
# 880000
# 890000
# 900000
# 910000
# 920000
# 930000
# 940000
# 950000
# 960000
# 970000
# 980000
# 990000


In [12]:
@interact(ind=(0,len(buff)-1,1))
def h(ind=0):
    """Print the selected title and its ten closest titles according to ``dist``.

    Args:
        ind: Row index into ``buff`` / ``dist`` chosen with the slider.
    """
    print(' '.join(buff[ind][0]))
    # The original sliced argsort()[1:11], which assumes the selected title
    # always sorts first. Ties at the minimum (or any smaller value) break
    # that assumption, so sort stably and drop `ind` explicitly instead.
    ranked = dist[ind].argsort(kind='stable')
    closest = [buff[i][0] for i in ranked if i != ind][:10]
    pprint.PrettyPrinter(indent=4).pprint(closest)


interactive(children=(IntSlider(value=0, description='ind', max=995), Output()), _dom_classes=('widget-interac…

In [13]:
@interact(ind=(0,len(buff)-1,1))
def hypernyms(ind=0):
    """Print the selected title and the per-word mapping stored next to it in ``buff``.

    Args:
        ind: Index into ``buff`` chosen with the slider.
    """
    title_tokens, per_word = buff[ind]
    print(' '.join(title_tokens))
    pprint.pprint(per_word, indent=4)

interactive(children=(IntSlider(value=0, description='ind', max=995), Output()), _dom_classes=('widget-interac…