# TD1

### Imports

In [1]:
import pandas as pd
import pke
from rouge import Rouge
from os import listdir
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Defining constants
# pos and grammar for Position Rank
pos = {'NOUN', 'PROPN', 'ADJ'}
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# Extractors
position_rank_extractor = pke.unsupervised.PositionRank()
single_rank_extractor = pke.unsupervised.SingleRank()
text_rank_extractor = pke.unsupervised.TextRank()

rouge = Rouge()

# Scores
all_scores_pr = []
all_scores_sr = []

In [3]:
def extract_keyphrases(extractor, doc, grammar=None, text_rank=None):
   # load the content of the document
   extractor.load_document(input=doc, language='en', normalization=None)

   # select the noun phrases up to 3 words as keyphrase candidates
   if grammar is not None:
      extractor.candidate_selection(grammar=grammar, maximum_word_number=3)
   else:
      extractor.candidate_selection()

   # weight the candidates using the sum of their word's scores that are
   # computed using random walk biaised with the position of the words
   # in the document. In the graph, nodes are words (nouns and
   # adjectives only) that are connected if they occur in a window of
   # 10 words.
   if text_rank:
      extractor.candidate_weighting(window=10, pos=pos, top_percent=0.33)
   else:
      extractor.candidate_weighting(window=10, pos=pos)

   # get the 10-highest scored candidates as keyphrases
   keyphrases = extractor.get_n_best(n=10)

   # compute rouge scores
   scores = rouge.get_scores(keyphrases[0][0], doc)

   return scores

In [4]:
def mean_all_scores(all_scores):
    mean_r_1, mean_p_1, mean_f_1 = 0, 0, 0
    mean_r_2, mean_p_2, mean_f_2 = 0, 0, 0
    mean_r_l, mean_p_l, mean_f_l = 0, 0, 0

    total_scores = len(all_scores)

    for scores in all_scores:
        mean_r_1 += scores['rouge-1']['r']
        mean_p_1 += scores['rouge-1']['p']
        mean_f_1 += scores['rouge-1']['f']

        mean_r_2 += scores['rouge-2']['r']
        mean_p_2 += scores['rouge-2']['p']
        mean_f_2 += scores['rouge-2']['f']

        mean_r_l += scores['rouge-l']['r']
        mean_p_l += scores['rouge-l']['p']
        mean_f_l += scores['rouge-l']['f']

    mean_r_1 /= total_scores
    mean_p_1 /= total_scores
    mean_f_1 /= total_scores

    mean_r_2 /= total_scores
    mean_p_2 /= total_scores
    mean_f_2 /= total_scores

    mean_r_l /= total_scores
    mean_p_l /= total_scores
    mean_f_l /= total_scores

    return mean_r_1, mean_p_1, mean_f_1, mean_r_2, mean_p_2, mean_f_2, mean_r_l, mean_p_l, mean_f_l

In [5]:
def get_scores(limitSize):
    all_scores_pr, all_scores_sr, all_scores_tr  = [], [], []

    dir = "Inspec/docsutf8/"
    directory = [dir+f for f in listdir(dir)][:limitSize]

    for i in directory:
        try:
            with open(i) as inspec_file:
                doc = inspec_file.read()
            print(f"Processing file {i}", end='\r')
        except:
            continue

        scores_pr = extract_keyphrases(position_rank_extractor, doc, grammar)
        scores_sr = extract_keyphrases(single_rank_extractor, doc)
        scores_tr = extract_keyphrases(text_rank_extractor, doc, text_rank=True)
        
        if scores_pr != 0:
            all_scores_pr.append(scores_pr[0])
        if scores_sr != 0:
            all_scores_sr.append(scores_sr[0])
        if scores_tr != 0:
            all_scores_tr.append(scores_tr[0])

    return all_scores_pr, all_scores_sr, all_scores_tr

In [6]:
def print_scores(scores):
  mean_r_1, mean_p_1, mean_f_1, mean_r_2, mean_p_2, mean_f_2, mean_r_l, mean_p_l, mean_f_l = mean_all_scores(scores)

  print("Mean_r_1:", mean_r_1)
  print("Mean_p_1:", mean_p_1)
  print("Mean_f_1:", mean_f_1)

  print("Mean_r_2:", mean_r_2)
  print("Mean_p_2:", mean_p_2)
  print("Mean_f_2:", mean_f_2)

  print("Mean_r_l:", mean_r_l)
  print("Mean_p_l:", mean_p_l)
  print("Mean_f_l:", mean_f_l)

In [7]:
all_scores_pr, all_scores_sr, all_scores_tr = get_scores(100)

Processing file Inspec/docsutf8/1053.txt



Processing file Inspec/docsutf8/1735.txt



Processing file Inspec/docsutf8/1721.txt



Processing file Inspec/docsutf8/1047.txt



Processing file Inspec/docsutf8/2200.txt



Processing file Inspec/docsutf8/1709.txt



Processing file Inspec/docsutf8/289.txt



Processing file Inspec/docsutf8/1090.txt



Processing file Inspec/docsutf8/262.txt



Processing file Inspec/docsutf8/276.txt



Processing file Inspec/docsutf8/1084.txt



Processing file Inspec/docsutf8/538.txt



Processing file Inspec/docsutf8/1912.txt



Processing file Inspec/docsutf8/1906.txt



Processing file Inspec/docsutf8/1537.txt



Processing file Inspec/docsutf8/1251.txt



Processing file Inspec/docsutf8/909.txt



Processing file Inspec/docsutf8/1245.txt



Processing file Inspec/docsutf8/1523.txt



Processing file Inspec/docsutf8/2002.txt



Processing file Inspec/docsutf8/921.txt



Processing file Inspec/docsutf8/935.txt



Processing file Inspec/docsutf8/1279.txt



Processing file Inspec/docsutf8/2016.txt



Processing file Inspec/docsutf8/1292.txt



Processing file Inspec/docsutf8/706.txt



Processing file Inspec/docsutf8/712.txt



Processing file Inspec/docsutf8/1286.txt



Processing file Inspec/docsutf8/1443.txt



Processing file Inspec/docsutf8/29.txt



Processing file Inspec/docsutf8/1325.txt



Processing file Inspec/docsutf8/869.txt



Processing file Inspec/docsutf8/1331.txt



Processing file Inspec/docsutf8/1457.txt



Processing file Inspec/docsutf8/2176.txt



Processing file Inspec/docsutf8/15.txt



Processing file Inspec/docsutf8/1319.txt



Processing file Inspec/docsutf8/855.txt



Processing file Inspec/docsutf8/699.txt



Processing file Inspec/docsutf8/841.txt



Processing file Inspec/docsutf8/2162.txt



Processing file Inspec/docsutf8/1480.txt



Processing file Inspec/docsutf8/114.txt



Processing file Inspec/docsutf8/2189.txt



Processing file Inspec/docsutf8/100.txt



Processing file Inspec/docsutf8/1494.txt



Processing file Inspec/docsutf8/128.txt



Processing file Inspec/docsutf8/896.txt



Processing file Inspec/docsutf8/882.txt



Processing file Inspec/docsutf8/1127.txt



Processing file Inspec/docsutf8/1641.txt



Processing file Inspec/docsutf8/1899.txt



Processing file Inspec/docsutf8/1655.txt



Processing file Inspec/docsutf8/1133.txt



Processing file Inspec/docsutf8/1669.txt



Processing file Inspec/docsutf8/316.txt



Processing file Inspec/docsutf8/1682.txt



Processing file Inspec/docsutf8/1696.txt



Processing file Inspec/docsutf8/302.txt



Processing file Inspec/docsutf8/1866.txt



Processing file Inspec/docsutf8/1872.txt



Processing file Inspec/docsutf8/1873.txt



Processing file Inspec/docsutf8/1867.txt



Processing file Inspec/docsutf8/303.txt



Processing file Inspec/docsutf8/1697.txt



Processing file Inspec/docsutf8/1683.txt



Processing file Inspec/docsutf8/317.txt



Processing file Inspec/docsutf8/1668.txt



Processing file Inspec/docsutf8/1654.txt



Processing file Inspec/docsutf8/1132.txt



Processing file Inspec/docsutf8/1126.txt



Processing file Inspec/docsutf8/1898.txt



Processing file Inspec/docsutf8/1640.txt



Processing file Inspec/docsutf8/883.txt



Processing file Inspec/docsutf8/129.txt



Processing file Inspec/docsutf8/897.txt



Processing file Inspec/docsutf8/1495.txt



Processing file Inspec/docsutf8/2188.txt



Processing file Inspec/docsutf8/115.txt



Processing file Inspec/docsutf8/1481.txt



Processing file Inspec/docsutf8/673.txt



Processing file Inspec/docsutf8/840.txt



Processing file Inspec/docsutf8/698.txt



Processing file Inspec/docsutf8/2163.txt



Processing file Inspec/docsutf8/2177.txt



Processing file Inspec/docsutf8/854.txt



Processing file Inspec/docsutf8/1318.txt



Processing file Inspec/docsutf8/14.txt



Processing file Inspec/docsutf8/1330.txt



Processing file Inspec/docsutf8/1456.txt



Processing file Inspec/docsutf8/1442.txt



Processing file Inspec/docsutf8/868.txt



Processing file Inspec/docsutf8/1324.txt



Processing file Inspec/docsutf8/28.txt



Processing file Inspec/docsutf8/1287.txt



Processing file Inspec/docsutf8/713.txt



Processing file Inspec/docsutf8/707.txt



Processing file Inspec/docsutf8/1293.txt



Processing file Inspec/docsutf8/1278.txt



Processing file Inspec/docsutf8/934.txt



## 1. Position Rank

In [8]:
print_scores(all_scores_pr)

Mean_r_1: 0.033937302699837475
Mean_p_1: 0.8383333333333333
Mean_f_1: 0.06473987120564263
Mean_r_2: 0.011970391219890485
Mean_p_2: 0.63
Mean_f_2: 0.023372050642025913
Mean_r_l: 0.03360076423829902
Mean_p_l: 0.8299999999999998
Mean_f_l: 0.06409295762539571


## 2. Single Rank

In [9]:
print_scores(all_scores_sr)

Mean_r_1: 0.04146125153142978
Mean_p_1: 0.8483333333333336
Mean_f_1: 0.0783200384985309
Mean_r_2: 0.01798686206406106
Mean_p_2: 0.7283333333333333
Mean_f_2: 0.0348125409052189
Mean_r_l: 0.04146125153142978
Mean_p_l: 0.8483333333333336
Mean_f_l: 0.0783200384985309


## 3. Text Rank

In [10]:
print_scores(all_scores_tr)

Mean_r_1: 0.03242645378398926
Mean_p_1: 0.8416666666666667
Mean_f_1: 0.06200805257246616
Mean_r_2: 0.012227851350563715
Mean_p_2: 0.6766666666666667
Mean_f_2: 0.02390274002436832
Mean_r_l: 0.03242645378398926
Mean_p_l: 0.8416666666666667
Mean_f_l: 0.06200805257246616
