# TD1

### Imports

In [1]:
import pandas as pd
import pke
from rouge import Rouge
from os import listdir
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Defining constants
# pos and grammar for Position Rank
pos = {'NOUN', 'PROPN', 'ADJ'}
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# Extractors
position_rank_extractor = pke.unsupervised.PositionRank()
single_rank_extractor = pke.unsupervised.SingleRank()
text_rank_extractor = pke.unsupervised.TextRank()

rouge = Rouge()

In [3]:
def extract_keyphrases(extractor, doc, grammar=None, text_rank=None):
   # load the content of the document
   extractor.load_document(input=doc, language='en', normalization=None)

   # select the noun phrases up to 3 words as keyphrase candidates
   if grammar is not None:
      extractor.candidate_selection(grammar=grammar, maximum_word_number=3)
   else:
      extractor.candidate_selection()

   # weight the candidates using the sum of their word's scores that are
   # computed using random walk biaised with the position of the words
   # in the document. In the graph, nodes are words (nouns and
   # adjectives only) that are connected if they occur in a window of
   # 10 words.
   if text_rank:
      extractor.candidate_weighting(window=10, pos=pos, top_percent=0.33)
   else:
      extractor.candidate_weighting(window=10, pos=pos)

   # get the 10-highest scored candidates as keyphrases
   keyphrases = extractor.get_n_best(n=10)

   # compute rouge scores
   scores = rouge.get_scores(keyphrases[0][0], doc)

   return scores

In [4]:
def mean_all_scores(all_scores):
    mean_r_1, mean_p_1, mean_f_1 = 0, 0, 0
    mean_r_2, mean_p_2, mean_f_2 = 0, 0, 0
    mean_r_l, mean_p_l, mean_f_l = 0, 0, 0

    total_scores = len(all_scores)

    for scores in all_scores:
        mean_r_1 += scores['rouge-1']['r']
        mean_p_1 += scores['rouge-1']['p']
        mean_f_1 += scores['rouge-1']['f']

        mean_r_2 += scores['rouge-2']['r']
        mean_p_2 += scores['rouge-2']['p']
        mean_f_2 += scores['rouge-2']['f']

        mean_r_l += scores['rouge-l']['r']
        mean_p_l += scores['rouge-l']['p']
        mean_f_l += scores['rouge-l']['f']

    mean_r_1 /= total_scores
    mean_p_1 /= total_scores
    mean_f_1 /= total_scores

    mean_r_2 /= total_scores
    mean_p_2 /= total_scores
    mean_f_2 /= total_scores

    mean_r_l /= total_scores
    mean_p_l /= total_scores
    mean_f_l /= total_scores

    return mean_r_1, mean_p_1, mean_f_1, mean_r_2, mean_p_2, mean_f_2, mean_r_l, mean_p_l, mean_f_l

In [5]:
def get_scores(limitSize):
    all_scores_pr, all_scores_sr, all_scores_tr  = [], [], []

    dir = "Inspec/docsutf8/"
    directory = [dir+f for f in listdir(dir)][:limitSize]

    for i in directory:
        try:
            with open(i) as inspec_file:
                doc = inspec_file.read()
            print(f"Processing file {i}", end='\r')
        except:
            continue

        scores_pr = extract_keyphrases(position_rank_extractor, doc, grammar)
        scores_sr = extract_keyphrases(single_rank_extractor, doc)
        scores_tr = extract_keyphrases(text_rank_extractor, doc, text_rank=True)
        
        if scores_pr != 0:
            all_scores_pr.append(scores_pr[0])
        if scores_sr != 0:
            all_scores_sr.append(scores_sr[0])
        if scores_tr != 0:
            all_scores_tr.append(scores_tr[0])

    return all_scores_pr, all_scores_sr, all_scores_tr

In [6]:
def print_scores(scores):
  mean_r_1, mean_p_1, mean_f_1, mean_r_2, mean_p_2, mean_f_2, mean_r_l, mean_p_l, mean_f_l = mean_all_scores(scores)

  print("Mean_r_1:", mean_r_1)
  print("Mean_p_1:", mean_p_1)
  print("Mean_f_1:", mean_f_1)

  print("Mean_r_2:", mean_r_2)
  print("Mean_p_2:", mean_p_2)
  print("Mean_f_2:", mean_f_2)

  print("Mean_r_l:", mean_r_l)
  print("Mean_p_l:", mean_p_l)
  print("Mean_f_l:", mean_f_l)

In [None]:
all_scores_pr, all_scores_sr, all_scores_tr = get_scores(100)

## 1. Position Rank

Position Rank extracts keyphrases by determining the importance of a word based on its position in the document.

It's an unsupervised algorithm that is decomposed like this :
1. Calculates the Term Frequency of a word(TF)
2. Adjusts the term frequency based on the length of the document (Document Length Normalization)
3. Assigns scores to words based on their positions within sentences. **Words in the beggining and end of sentences have higher scores.** (Sentence Position Score)
4. Combines the term frequency and sentence position scores to determine the overall importance of each word (Sentence Salience Score)
5. Extracts words that have the highest salience scores (Keyphrase Extraction)

In [8]:
print_scores(all_scores_pr)

Mean_r_1: 0.033937302699837475
Mean_p_1: 0.8383333333333333
Mean_f_1: 0.06473987120564263
Mean_r_2: 0.011970391219890485
Mean_p_2: 0.63
Mean_f_2: 0.023372050642025913
Mean_r_l: 0.03360076423829902
Mean_p_l: 0.8299999999999998
Mean_f_l: 0.06409295762539571


## 2. Single Rank

In [9]:
print_scores(all_scores_sr)

Mean_r_1: 0.04146125153142978
Mean_p_1: 0.8483333333333336
Mean_f_1: 0.0783200384985309
Mean_r_2: 0.01798686206406106
Mean_p_2: 0.7283333333333333
Mean_f_2: 0.0348125409052189
Mean_r_l: 0.04146125153142978
Mean_p_l: 0.8483333333333336
Mean_f_l: 0.0783200384985309


## 3. TextRank
TextRank is an algorithm that identifies keywords by assessing their significance within a connected graph. It functions by analyzing the relationships between words or phrases to determine their importance in the context of the overall text.
Here is he algorithm:
* Tokenization and part of speech tagging
* Reducing the number of words based on a syntactic filter (in our case we keep only noons propositions and adjectives)
* With all the remainig words are added to the graph and an edge is craeted for every words that co-occur in a window of N words (in our case, N=10)

At this point we have an undirected unweigth graph.

* Then a initial value of 1 is set for every vertice
* Finally a modify version of the PageRank algorithm is run to upgrade the vertice score.
The main idea behind this algorithm is to give more importance to a word which is linked by many others. Moreover a link to word which is linked by many other is more important than a link to word which is linked to only one word. This is the same algorithm used to rankes web pages. The only difference is that we also use a weight to each wich corresponds to the co-occurence score
* After that, we keep only a a third of our vertices which corresponds to the vertices which have the highest score. 
* A post processing is done on the remainng vertices and if two words appears next to each other in the document a multi-word keyword is created.

In [10]:
print_scores(all_scores_tr)

Mean_r_1: 0.03242645378398926
Mean_p_1: 0.8416666666666667
Mean_f_1: 0.06200805257246616
Mean_r_2: 0.012227851350563715
Mean_p_2: 0.6766666666666667
Mean_f_2: 0.02390274002436832
Mean_r_l: 0.03242645378398926
Mean_p_l: 0.8416666666666667
Mean_f_l: 0.06200805257246616


## Conclusion

**Which algorithm got the best RED score?**

From the 3 keyphrase extraction algorithms (Position Rank, Single Rank and Text Rank), Single Rank has the best ROUGE score.



**How would you represent each document and its respective extracted key phrases in the form of a knowledge graph? What vocabulary would you use?**

Low-level : each document would have a graph with nodes representing the extracted keyphrases.

High-level : every documents would be represented as 1 node and would be linked by their predominant keyphrase extracted. 

We can use the vocabulary of the extracted keyphrases of all the documents.