#### Read the data. Loading it returns a dictionary with three keys: id, body and keyphrases. The values correspond to 2000 scientific articles. The code is tested under Python 3.

In [1]:
import json

In [2]:
# Load the corpus. The context manager closes the file handle as soon as
# parsing finishes (the previous bare open() left it open for the session).
with open('data_body_keyphrases.json', encoding='utf-8') as f:
    # data is a dict
    data = json.load(f)

In [3]:
# Show the top-level keys of the loaded dictionary.
data.keys()

dict_keys(['id', 'body', 'keyphrases'])

In [4]:
# Values stored under the 'id' key.
docids = data['id']
# Last expression of the cell, so the notebook displays the count.
len(docids)

2000

In [5]:
# Values stored under the 'body' key; these are the texts the extractor runs on.
docs = data['body']
# Last expression of the cell, so the notebook displays the count.
len(docs)

2000

In [6]:
# Values stored under the 'keyphrases' key; used later as the gold standard.
keyphrases = data['keyphrases']
# Last expression of the cell, so the notebook displays the count.
len(keyphrases)

2000

In [7]:
# Keep only the first three articles and their keyphrases. Every cell that
# follows (extraction and evaluation) works on this subset, not on the full
# data set.
docs = docs[:3]
keyphrases = keyphrases[:3]

### Part B task: Develop your unsupervised keyword extraction model(s) here:

## TextRank Algorithm

In [8]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
import numpy as np
import math
import string


[nltk_data] Downloading package stopwords to C:\Users\Varsha
[nltk_data]     Vijayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Text Preprocessing

In [None]:
# One list of extracted key phrases per article, in the same order as `docs`.
key_phrases_list = []
wordnet_lemmatizer = WordNetLemmatizer()

# Lookup tables that do not depend on the article; built once instead of once
# per loop iteration.
_PRINTABLE = frozenset(string.printable)
_ADJECTIVE_TAGS = frozenset(['JJ', 'JJR', 'JJS'])
_WANTED_POS = frozenset(['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBG', 'FW'])
# NLTK English stop words plus every single punctuation character.
_GENERAL_STOPWORDS = frozenset(stopwords.words('english')) | frozenset(string.punctuation)


def clean(text):
    """Lowercase *text*, remove URLs and drop characters outside string.printable."""
    text = text.lower()
    # url removal
    text = re.sub(r"http\S+", "", text)
    return "".join(ch for ch in text if ch in _PRINTABLE)


def _lemmatize(tokens):
    """POS-tag *tokens* and lemmatize each one (adjectives as 'a', the rest as nouns)."""
    lemmas = []
    for token, tag in nltk.pos_tag(tokens):
        if tag in _ADJECTIVE_TAGS:
            lemmas.append(str(wordnet_lemmatizer.lemmatize(token, pos="a")))
        else:
            # default POS = noun
            lemmas.append(str(wordnet_lemmatizer.lemmatize(token)))
    return lemmas


def _article_stopwords(lemmatized_text):
    """Return the stop-word set for one article.

    A lemma is a stop word when the tagger gives it a tag outside
    ``_WANTED_POS`` anywhere in the article, or when it is a general English
    stop word or a punctuation character.
    """
    unwanted = {
        token
        for token, tag in nltk.pos_tag(lemmatized_text)
        if tag not in _WANTED_POS
    }
    return unwanted | _GENERAL_STOPWORDS


def _build_graph(processed_text, word_index, window_size):
    """Build the weighted co-occurrence matrix for *processed_text*.

    Every window of ``window_size`` consecutive tokens is visited once. For
    each ordered pair of distinct words in the window, the positions of their
    first occurrence inside the window are taken; a position pair contributes
    ``1 / distance`` to the edge the first time it is seen.

    The window range includes the final window (the earlier
    ``range(0, len - window_size)`` skipped it, so the last token never got an
    edge) and texts shorter than ``window_size`` are treated as one window.
    """
    vocab_len = len(word_index)
    weighted_edge = np.zeros((vocab_len, vocab_len), dtype=np.float32)
    covered_cooccurrences = set()

    last_start = max(len(processed_text) - window_size, 0)
    for window_start in range(last_start + 1):
        window = processed_text[window_start:window_start + window_size]

        # Absolute position of the first occurrence of each word in the window.
        first_position = {}
        for offset, word in enumerate(window):
            first_position.setdefault(word, window_start + offset)

        for word_a, pos_a in first_position.items():
            for word_b, pos_b in first_position.items():
                if word_a == word_b or (pos_a, pos_b) in covered_cooccurrences:
                    continue
                covered_cooccurrences.add((pos_a, pos_b))
                weighted_edge[word_index[word_a], word_index[word_b]] += 1 / abs(pos_a - pos_b)

    return weighted_edge


def _score_words(weighted_edge, d=0.85, max_iterations=50, threshold=0.0001):
    """Iterate the TextRank update until convergence and return the word scores.

    score[i] = (1-d) + d * sum_j((weighted_edge[i][j] / inout[j]) * score[j])
    over the vertices j connected to i. Scores are updated in place, so later
    words in an iteration already see the new scores of earlier ones.
    Iteration stops after ``max_iterations`` rounds or once the summed
    absolute change is at most ``threshold``.
    """
    vocab_len = weighted_edge.shape[0]
    score = np.ones(vocab_len, dtype=np.float32)

    # Total edge weight attached to each vertex.
    inout = weighted_edge.sum(axis=1)
    # Column j divided by inout[j]; columns of isolated vertices stay zero.
    transition = np.divide(
        weighted_edge,
        inout,
        out=np.zeros_like(weighted_edge),
        where=inout != 0,
    )

    for _ in range(max_iterations):
        prev_score = np.copy(score)
        for i in range(vocab_len):
            score[i] = (1 - d) + d * np.dot(transition[i], score)
        if np.sum(np.fabs(prev_score - score)) <= threshold:  # convergence condition
            break

    return score


def _candidate_phrases(lemmatized_text, stopwords_plus):
    """Split the lemma stream on stop words and return the unique candidate phrases.

    Phrases are tuples of words in order of first appearance. A single-word
    phrase is dropped when the same word also occurs inside a multi-word
    phrase.
    """
    phrases = []
    current = []
    for word in lemmatized_text:
        if word in stopwords_plus:
            if current:
                phrases.append(tuple(current))
            current = []
        else:
            current.append(word)
    # Flush a phrase that runs to the very end of the text.
    if current:
        phrases.append(tuple(current))

    unique_phrases = list(dict.fromkeys(phrases))
    words_in_multiword = {
        word for phrase in unique_phrases if len(phrase) > 1 for word in phrase
    }
    return [
        phrase
        for phrase in unique_phrases
        if len(phrase) > 1 or phrase[0] not in words_in_multiword
    ]


def _extract_key_phrases(article, keywords_num=10, window_size=3):
    """Return up to ``keywords_num`` key phrases of *article*, best first."""
    # Text preprocessing: clean, tokenize, lemmatize.
    lemmatized_text = _lemmatize(word_tokenize(clean(article)))

    # Remove words whose POS tag is not wanted, general stop words and punctuation.
    stopwords_plus = _article_stopwords(lemmatized_text)
    processed_text = [word for word in lemmatized_text if word not in stopwords_plus]

    # Unique vocabulary in first-occurrence order, so the result does not
    # depend on the interpreter's string hash seed.
    vocabulary = list(dict.fromkeys(processed_text))
    word_index = {word: position for position, word in enumerate(vocabulary)}

    score = _score_words(_build_graph(processed_text, word_index, window_size))

    # A phrase scores the sum of the scores of its words.
    keywords = []
    phrase_scores = []
    for phrase in _candidate_phrases(lemmatized_text, stopwords_plus):
        keywords.append(" ".join(phrase))
        phrase_scores.append(sum(score[word_index[word]] for word in phrase))

    sorted_index = np.flip(np.argsort(phrase_scores), 0)
    # Slicing tolerates articles that yield fewer than keywords_num phrases.
    return [keywords[position] for position in sorted_index[:keywords_num]]


# iterate over every article in the data
for article in docs:
    # The key phrases of each article are appended to the final list, which is
    # later evaluated against the author-assigned key phrases.
    key_phrases_list.append(_extract_key_phrases(article))

### Do evaluation for your model here:


Please use the following function to get the F score of your model.

In [None]:
'''
The following piece of code can help you do the evaluations, 
i.e. get Fscore for your predictions.
'''
def compare(golden_words, system_words):
    """Return the token-level F score of *system_words* against *golden_words*.

    Each element of both arguments is converted with ``str``, lowercased and
    split on whitespace; the resulting tokens are pooled per side. A system
    token counts as a match when it occurs among the golden tokens.

    Args:
        golden_words: sized collection of reference phrases.
        system_words: sized collection of predicted phrases.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or 0 when either
        side is empty, yields no tokens, or nothing matches.
    """
    if not (len(system_words) > 0 and len(golden_words) > 0):
        return 0

    golden_tokens = [
        token for phrase in golden_words for token in str(phrase).lower().split()
    ]
    system_tokens = [
        token for phrase in system_words for token in str(phrase).lower().split()
    ]
    # Phrases such as "" contribute no tokens; without this guard the
    # divisions below would raise ZeroDivisionError.
    if not golden_tokens or not system_tokens:
        return 0

    golden_lookup = set(golden_tokens)
    # NOTE(review): every system token is counted, duplicates included, while
    # recall divides by the golden token count, so recall can exceed 1 when
    # the prediction repeats a matching word. Kept as is so scores stay
    # comparable with earlier runs.
    matches = sum(1 for token in system_tokens if token in golden_lookup)
    if matches == 0:
        return 0

    recall = matches / len(golden_tokens)
    precision = matches / len(system_tokens)
    return 2 * precision * recall / (precision + recall)
            
def get_Fscore(golden_list, system_list):
    """Return the mean per-document F score over two parallel lists.

    Args:
        golden_list: one collection of reference phrases per document.
        system_list: one collection of predicted phrases per document, in the
            same order as ``golden_list``.

    Returns:
        The average of ``compare(golden, system)`` over all documents. When
        the lists differ in length a warning is printed and 0.0 is returned;
        0.0 is also returned when there are no documents at all.
    """
    num_docs = len(golden_list)
    if num_docs != len(system_list):
        print('Make sure the number of documents is correct !')
        return 0.0
    # Nothing to average; avoids dividing by zero.
    if num_docs == 0:
        return 0.0

    score_sum = sum(
        compare(golden, system) for golden, system in zip(golden_list, system_list)
    )
    return score_sum / num_docs
    

In [None]:
# author-assigned key phrases (the gold standard)
golden_words = keyphrases

# key phrases generated by the TextRank algorithm above
system_words = key_phrases_list
# Print the F score averaged over the documents.
print(get_Fscore(golden_words, system_words ))