# TD1

### Imports

In [14]:
import pandas as pd
import pke
from rouge import Rouge
import spacy
nlp = spacy.load("en_core_web_sm")

In [15]:
# Defining constants
# pos and grammar for Position Rank
pos = {'NOUN', 'PROPN', 'ADJ'}
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# Extractors
position_rank_extractor = pke.unsupervised.PositionRank()
single_rank_extractor = pke.unsupervised.SingleRank()

rouge = Rouge()

# Scores
all_scores_pr = []
all_scores_sr = []

In [16]:
def extract_keyphrases(extractor, doc, grammar=None):
    # load the content of the document
    extractor.load_document(input=doc, language='en', normalization=None)

    # select the noun phrases up to 3 words as keyphrase candidates
    if grammar is not None:
        extractor.candidate_selection(grammar=grammar, maximum_word_number=3)
    else:
        extractor.candidate_selection()

    # weight the candidates using the sum of their word's scores that are
    # computed using random walk biaised with the position of the words
    # in the document. In the graph, nodes are words (nouns and
    # adjectives only) that are connected if they occur in a window of
    # 10 words.
    extractor.candidate_weighting(window=10, pos=pos)

    # get the 10-highest scored candidates as keyphrases
    keyphrases = extractor.get_n_best(n=10)

    # compute rouge scores
    scores = rouge.get_scores(keyphrases[0][0], doc)

    return scores

In [17]:
def mean_all_scores(all_scores):
    mean_r_1, mean_p_1, mean_f_1 = 0, 0, 0
    mean_r_2, mean_p_2, mean_f_2 = 0, 0, 0
    mean_r_l, mean_p_l, mean_f_l = 0, 0, 0

    total_scores = len(all_scores)

    for scores in all_scores:
        mean_r_1 += scores['rouge-1']['r']
        mean_p_1 += scores['rouge-1']['p']
        mean_f_1 += scores['rouge-1']['f']

        mean_r_2 += scores['rouge-2']['r']
        mean_p_2 += scores['rouge-2']['p']
        mean_f_2 += scores['rouge-2']['f']

        mean_r_l += scores['rouge-l']['r']
        mean_p_l += scores['rouge-l']['p']
        mean_f_l += scores['rouge-l']['f']

    mean_r_1 /= total_scores
    mean_p_1 /= total_scores
    mean_f_1 /= total_scores

    mean_r_2 /= total_scores
    mean_p_2 /= total_scores
    mean_f_2 /= total_scores

    mean_r_l /= total_scores
    mean_p_l /= total_scores
    mean_f_l /= total_scores

    return mean_r_1, mean_p_1, mean_f_1, mean_r_2, mean_p_2, mean_f_2, mean_r_l, mean_p_l, mean_f_l

In [18]:
for i in range(2, 100):
    try:
        with open(f'./Inspec/docsutf8/{i}.txt') as inspec_file:
            doc = inspec_file.read()
        print(f"Processing file {i}.txt", end='\r')
    except:
        continue

    position_rank_scores = extract_keyphrases(position_rank_extractor, doc, grammar=grammar)
    single_rank_scores = extract_keyphrases(single_rank_extractor, doc)

    all_scores_pr.append(position_rank_scores[0])
    all_scores_sr.append(position_rank_scores[0])

Processing file 99.txt

## 1. Position Rank

In [19]:
mean_r_1, mean_p_1, mean_f_1, mean_r_2, mean_p_2, mean_f_2, mean_r_l, mean_p_l, mean_f_l = mean_all_scores(all_scores_pr)

print("Mean_r_1:", mean_r_1)
print("Mean_p_1:", mean_p_1)
print("Mean_f_1:", mean_f_1 , "\n")

print("Mean_r_2:", mean_r_2)
print("Mean_p_2:", mean_p_2)
print("Mean_f_2:", mean_f_2, "\n")

print("Mean_r_l:", mean_r_l)
print("Mean_p_l:", mean_p_l)
print("Mean_f_l:", mean_f_l)

Mean_r_1: 0.03428516174368198
Mean_p_1: 0.8452380952380952
Mean_f_1: 0.0650689596969027 

Mean_r_2: 0.013238319316660786
Mean_p_2: 0.7261904761904762
Mean_f_2: 0.025741417478775546 

Mean_r_l: 0.03403928462140754
Mean_p_l: 0.8373015873015874
Mean_f_l: 0.06459200609494908


## 2. Single Rank

In [20]:
mean_r_1, mean_p_1, mean_f_1, mean_r_2, mean_p_2, mean_f_2, mean_r_l, mean_p_l, mean_f_l = mean_all_scores(all_scores_sr)

print("Mean_r_1:", mean_r_1)
print("Mean_p_1:", mean_p_1)
print("Mean_f_1:", mean_f_1, "\n")

print("Mean_r_2:", mean_r_2)
print("Mean_p_2:", mean_p_2)
print("Mean_f_2:", mean_f_2, "\n")

print("Mean_r_l:", mean_r_l)
print("Mean_p_l:", mean_p_l)
print("Mean_f_l:", mean_f_l)

Mean_r_1: 0.03428516174368198
Mean_p_1: 0.8452380952380952
Mean_f_1: 0.0650689596969027 

Mean_r_2: 0.013238319316660786
Mean_p_2: 0.7261904761904762
Mean_f_2: 0.025741417478775546 

Mean_r_l: 0.03403928462140754
Mean_p_l: 0.8373015873015874
Mean_f_l: 0.06459200609494908
