# Position Rank Keyphrase Extraction

The goal of this notebook is to look at a few keyphrases extracted by PositionRank and compare them with the annotated labels of each document.

In [11]:
import pandas as pd
import pke
from rouge import Rouge
from os import listdir
import spacy
nlp = spacy.load("en_core_web_sm")
import time

In [7]:
# Part-of-speech tags meant for the `pos` argument of extract_keyphrases.
# NOTE(review): get_keyphrases does not forward `pos`, so this set is unused
# by the cells visible here — confirm whether it should be passed along.
pos = {'NOUN', 'PROPN', 'ADJ'}
# Candidate pattern: zero or more adjectives followed by one or more
# nouns / proper nouns.
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# Extractors
# One PositionRank instance; get_keyphrases passes it to extract_keyphrases
# for every document.
position_rank_extractor = pke.unsupervised.PositionRank()

In [8]:
def extract_keyphrases(extractor, doc, index, grammar=None, text_rank=None, pos=None):
    """Run one extractor over one document and report its top keyphrase.

    Args:
        extractor: Object exposing ``load_document``, ``candidate_selection``,
            ``candidate_weighting`` and ``get_n_best``.
        doc: Document content, handed to ``load_document`` as ``input``.
        index: Identifier used only as the prefix of the returned string.
        grammar: Optional candidate grammar. When given, candidates are
            selected with it and ``maximum_word_number=3``; otherwise
            ``candidate_selection`` is called without arguments.
        text_rank: When truthy, ``top_percent=0.33`` is additionally passed
            to ``candidate_weighting``.
        pos: Forwarded unchanged to ``candidate_weighting``.

    Returns:
        ``"<index> : <phrase>"`` for the first phrase returned by
        ``get_n_best(n=10)``, ``"<index> : None"`` when that call returns
        nothing, or ``"Error processing <index>: <message>"`` when any step
        raises an ``Exception``.
    """
    try:
        extractor.load_document(input=doc, language='en', normalization=None)

        # Restrict candidates only when the caller supplied a grammar.
        selection_args = {}
        if grammar is not None:
            selection_args = {"grammar": grammar, "maximum_word_number": 3}
        extractor.candidate_selection(**selection_args)

        # Words are weighted with a 10-word window; the TextRank-style call
        # additionally carries top_percent.
        weighting_args = {"window": 10, "pos": pos}
        if text_rank:
            weighting_args["top_percent"] = 0.33
        extractor.candidate_weighting(**weighting_args)

        # Ask for the ten best candidates but report only the first one.
        ranked = extractor.get_n_best(n=10)
        top_phrase = ranked[0][0] if ranked else None
        return f"{index} : {top_phrase}"
    except Exception as err:
        # Best-effort: a failing document yields an error string, not a crash.
        return f"Error processing {index}: {str(err)}"

In [9]:
def get_keyphrases(limitSize, docs_dir="Inspec/docsutf8"):
    """Extract the top keyphrase of documents ``0.txt`` .. ``<limitSize-1>.txt``.

    Args:
        limitSize: Number of consecutive document ids, starting at 0, to try.
        docs_dir: Directory containing the ``<id>.txt`` files. Defaults to
            the corpus location used by this notebook.

    Returns:
        A list with one string per readable document, in ascending id order,
        as produced by ``extract_keyphrases`` using the module-level
        ``position_rank_extractor`` and ``grammar``. Ids whose file is
        missing, unreadable or not valid UTF-8 are skipped.
    """
    all_keyphrases = []

    for i in range(limitSize):
        try:
            # Read as UTF-8 explicitly so decoding does not depend on the
            # platform's default encoding.
            with open(f"{docs_dir}/{i}.txt", encoding="utf-8") as inspec_file:
                doc = inspec_file.read()
        except (OSError, UnicodeError):
            # Skip ids that cannot be read, but let KeyboardInterrupt and
            # programming errors propagate instead of swallowing them.
            continue

        print(f"Processing file {i}.txt", end='\r')
        all_keyphrases.append(
            extract_keyphrases(position_rank_extractor, doc, i, grammar)
        )

    return all_keyphrases

In [10]:
# Try document ids 0-11; ids that get_keyphrases cannot read are skipped,
# which is why the output below lists 8 documents rather than 12.
all_keyphrases = get_keyphrases(12)

print(all_keyphrases)

['2 : wavelength services', '3 : new cash', '4 : cheap company stock', '6 : regulatory compliance efforts', '7 : new anti-spam act', '8 : global crossing', '9 : competitive capabilities', '11 : cognitive social capital']


Overall, the extracted keyphrases do not correspond to the annotated keys: only 2 of the 8 documents (ids 2 and 9) have a correct keyphrase.

Here are the keys : 

[ 
 2 : wavelength services
 3 : telecom
 4 : telecom industry
 6 : SBC Communications
 7 : Sprint
 8 : Hutchison Telecommunications
 9 : competitive capabilities
 11 : innovation
]