In [1]:
import pip
import itertools
import nltk
import re
import string
import requests
import json
import pke
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import traceback
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from keybert import KeyBERT
# pip3 install git+https://github.com/boudinfl/pke.git
# pip3 install flashtext
# pip3 install - -upgrade spacy == 2.2.4
# pip3 install keyBERT


# Fetch the NLTK resources used below: stop-word lists, WordNet and the
# punkt sentence tokenizer models.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

class MethodSelector:
    """Helper routines shared by the keyword extractors: keyword budget,
       sentence splitting, keyword-to-sentence mapping and POS tagging."""

    @classmethod
    def get_keyword_limit(cls, no_of_words, user_limit=0):
        """Estimates the number of keywords required
           @input: (no_of_words)-Number of words in the input text
           @input: (user_limit)-Number of keywords requested by the caller; 0 (default) means "estimate it"
           @Hyperparameters: ()-None
           @Output: (limit)-user_limit when it is positive, otherwise round(no_of_words/15)"""
        # Always hand back a number: callers pass the value to get_n_best()
        # and some multiply it, so a None result would break them.
        if user_limit and user_limit > 0:
            return user_limit
        return round(no_of_words/15)

    @classmethod
    def tokenize_sentences(cls, text):
        """Converts long passages into sentence array
           @input: (text)-Block of continous text on a single topic
           @Hyperparameters: ()-None
           @Output: (sentences)-Sentence Array; sentences of 20 characters or fewer are dropped"""
        sentences = sent_tokenize(text)
        # The length filter is applied before stripping surrounding whitespace.
        sentences = [sentence.strip() for sentence in sentences if len(sentence) > 20]
        return sentences

    @classmethod
    def get_sentences_for_keyword(cls,keywords, sentences):
        """Maps Keywords with their sentences
           @input: (keywords)-Iterable of ('keyword','partofspeech') entries; only the keyword part is used
           @input: (sentences)-Sentence Array
           @Hyperparameters: ()-None
           @Output: (keyword_sentences)-Dictionary {keyword: [sentences containing it, longest first]}"""
        keyword_processor = KeywordProcessor()
        keyword_sentences = {}
        for word in keywords:
            keyword_sentences[word[0]] = []
            keyword_processor.add_keyword(word[0])
        for sentence in sentences:
            keywords_found = keyword_processor.extract_keywords(sentence)
            # dict.fromkeys removes repeats while keeping first-seen order, so a
            # sentence mentioning the same keyword twice is recorded only once.
            for key in dict.fromkeys(keywords_found):
                keyword_sentences[key].append(sentence)

        for key in keyword_sentences:
            keyword_sentences[key] = sorted(keyword_sentences[key], key=len, reverse=True)
        return keyword_sentences

    @classmethod
    def get_pos_for_keywords(cls,keywords1):
        """Generates POS for Non-POS based keywords
           @input: (keywords1)-Dictionary {keyword: weightage}
           @Hyperparameters: ()-None
           @Output: (out)-Dictionary with the syntax {('keyword','partofspeech'):weightage};
                    keywords whose tag does not start with N, V or J are left out"""
        # Tags starting with N / V / J map to the short codes n / v / a.
        tag_prefix_to_pos = {'N': 'n', 'V': 'v', 'J': 'a'}
        # Each dictionary key is handed to the tagger as one token.
        tagged_keywords = nltk.pos_tag(list(keywords1))
        new_keyword = {}
        for keyword, tag in tagged_keywords:
            short_pos = tag_prefix_to_pos.get(tag[0])
            if short_pos is not None:
                new_keyword[keyword, short_pos] = keywords1[keyword]

        return new_keyword
    

    

class KeywordNPOS:
    """ Sample Access Example
        from KeyowrdAndPOSExtraction import KeywordNPOS 
        dictionary = KeywordNPOS.functionname(arguments)

        Every extractor returns a dictionary {('keyword','partofspeech'):weightage}.
        When extraction raises, the traceback is printed and an empty result is used."""

    # Parts of speech are processed in a fixed order so results are built reproducibly
    # (iterating a set of strings can change order between interpreter runs).
    _POS_ORDER = ('NOUN', 'VERB', 'ADJ')
    # Universal POS tag -> short code used in the result keys.
    _CONVERT_POS = {'VERB': 'v', 'NOUN': 'n', 'ADJ': 'a'}
    # Bracket placeholder tokens excluded alongside punctuation and stop words.
    _BRACKET_TOKENS = ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']

    @classmethod
    def _build_stoplist(cls):
        """Returns punctuation characters + bracket tokens + English stop words as one list."""
        stoplist = list(string.punctuation)
        stoplist += cls._BRACKET_TOKENS
        stoplist += stopwords.words('english')
        return stoplist

    @classmethod
    def get_keywords_MultipartiteRank(cls, text,user_limit=0):
        """Extracts keywords from input text using Graph based MultipartiteRank algorithm
           @input: (text)-Block of continous text on a single topic 
           @input: (user_limit)-Number of keywords per part of speech; 0 lets MethodSelector estimate it
           @Hyperparameters: ()-None
           @Output: (out)-Dictionary with the syntax {('keyword','partofspeech'):weightage}"""
        out = {}
        try:
            # Selecting the extractor and loading the input text in it
            extractor = pke.unsupervised.MultipartiteRank()
            extractor.load_document(input=text)

            stoplist = cls._build_stoplist()
            keyword_limit = MethodSelector.get_keyword_limit(len(text.split()),user_limit)

            # One selection/weighting pass per part of speech
            for partsofspeech in cls._POS_ORDER:
                extractor.candidate_selection(
                    pos=partsofspeech, stoplist=stoplist)
                extractor.candidate_weighting(
                    alpha=1.1, threshold=0.75, method='average')
                keyphrases = extractor.get_n_best(n=keyword_limit)

                for val in keyphrases:
                    out[(val[0], cls._CONVERT_POS[partsofspeech])] = val[1]

        except Exception:
            # Best effort: report the problem and return nothing rather than crash the caller
            out = {}
            traceback.print_exc()

        return out

    @classmethod
    def get_keywords_PositionRank(cls, text,user_limit=0):
        """Extracts keywords from input text using Graph based PositionRank algorithm
            @input: (text)-Block of continous text on a single topic 
            @input: (user_limit)-Number of keywords per part of speech; 0 lets MethodSelector estimate it
            @Hyperparameters: ()-None
            @Output: (out)-Dictionary with the syntax {('keyword','partofspeech'):weightage}"""

        out = {}
        try:
            # define the grammar for selecting the keyphrase candidates
            grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

            # 1. create a PositionRank extractor.
            extractor = pke.unsupervised.PositionRank()

            # 2. load the content of the document.
            extractor.load_document(
                input=text, language='en', normalization=None)

            # 3. select the noun phrases up to 3 words as keyphrase candidates.
            extractor.candidate_selection(
                grammar=grammar, maximum_word_number=3)

            keyword_limit = MethodSelector.get_keyword_limit(len(text.split()),user_limit)

            # 4. weight the candidates once per part of speech, using a window
            #    of 10 words and passing the current part of speech as `pos`.
            for partsofspeech in cls._POS_ORDER:
                extractor.candidate_weighting(window=10, pos=partsofspeech)
                # 5. get the keyword_limit highest scored candidates as keyphrases
                keyphrases = extractor.get_n_best(n=keyword_limit)
                for val in keyphrases:
                    out[(val[0], cls._CONVERT_POS[partsofspeech])] = val[1]
        except Exception:
            out = {}
            traceback.print_exc()

        return out

    @classmethod
    def get_keywords_SingleRank(cls, text,user_limit=0):
        """Extracts keywords from input text using Graph based SingleRank algorithm
           @input: (text)-Block of continous text on a single topic 
           @input: (user_limit)-Number of keywords per part of speech; 0 lets MethodSelector estimate it
           @Hyperparameters: ()-None
           @Output: (out)-Dictionary with the syntax {('keyword','partofspeech'):weightage}"""
        out = {}
        try:
            # Selecting the extractor and loading the input text in it
            extractor = pke.unsupervised.SingleRank()
            extractor.load_document(input=text)

            keyword_limit = MethodSelector.get_keyword_limit(len(text.split()),user_limit)

            for partsofspeech in cls._POS_ORDER:
                extractor.candidate_selection(pos=partsofspeech)
                extractor.candidate_weighting(window=10, pos=partsofspeech)
                try:
                    keyphrases = extractor.get_n_best(n=keyword_limit)
                except ZeroDivisionError:
                    # Skip this part of speech and carry on with the next one
                    continue

                for val in keyphrases:
                    out[(val[0], cls._CONVERT_POS[partsofspeech])] = val[1]

        except Exception:
            out = {}
            traceback.print_exc()

        return out

    @classmethod
    def get_keywords_TextRank(cls, text,user_limit=0):
        """Extracts keywords from input text using Graph based TextRank algorithm
           @input: (text)-Block of continous text on a single topic 
           @input: (user_limit)-Number of keywords per part of speech; 0 lets MethodSelector estimate it
           @Hyperparameters: ()-None
           @Output: (out)-Dictionary with the syntax {('keyword','partofspeech'):weightage}"""
        out = {}
        try:
            # Selecting the extractor and loading the input text in it
            extractor = pke.unsupervised.TextRank()
            extractor.load_document(input=text)

            keyword_limit = MethodSelector.get_keyword_limit(len(text.split()),user_limit)

            # No explicit candidate_selection step here; weighting is called
            # directly with top_percent=1 for each part of speech.
            for partsofspeech in cls._POS_ORDER:
                extractor.candidate_weighting(
                    window=3, pos=partsofspeech, top_percent=1)
                try:
                    keyphrases = extractor.get_n_best(n=keyword_limit)
                except ZeroDivisionError:
                    # Skip this part of speech and carry on with the next one
                    continue

                for val in keyphrases:
                    out[(val[0], cls._CONVERT_POS[partsofspeech])] = val[1]

        except Exception:
            out = {}
            traceback.print_exc()

        return out

    @classmethod
    def get_keywords_YAKE(cls, text,user_limit=0):
        """Extracts keywords from input text using the YAKE extractor (pke.unsupervised.YAKE)
            @input: (text)-Block of continous text on a single topic 
            @input: (user_limit)-Base keyword count (twice this many are requested); 0 lets MethodSelector estimate it
            @Hyperparameters: ()-None
            @Output: (out)-Dictionary with the syntax {('keyword','partofspeech'):weightage},
                     where the part of speech is assigned by MethodSelector.get_pos_for_keywords"""
        out = {}
        try:
            # Selecting the extractor and loading the input text in it
            extractor = pke.unsupervised.YAKE()
            extractor.load_document(
                input=text, language='en', normalization=None)

            # Punctuation, bracket tokens and stop words all belong in the
            # stoplist; the entries must be appended, not overwritten.
            stoplist = cls._build_stoplist()

            keyword_limit = MethodSelector.get_keyword_limit(len(text.split()),user_limit)

            extractor.candidate_selection(n=3, stoplist=stoplist)
            extractor.candidate_weighting(
                window=3, stoplist=stoplist, use_stems=False)
            # Twice the limit is requested because the POS step below can drop keywords
            keyphrases = extractor.get_n_best(n=keyword_limit*2, threshold=0.8)

            for val in keyphrases:
                out[val[0]] = val[1]

        except Exception:
            out = {}
            traceback.print_exc()

        return MethodSelector.get_pos_for_keywords(out)

    @classmethod
    def get_keywords_KeyBERT(cls, text,user_limit=0):
        """Extracts keywords from input text using KeyBERT with the 'distilbert-base-nli-mean-tokens' model
            @input: (text)-Block of continous text on a single topic 
            @input: (user_limit)-Base keyword count (twice this many are requested); 0 lets MethodSelector estimate it
            @Hyperparameters: ()-None
            @Output: (out)-Dictionary with the syntax {('keyword','partofspeech'):weightage},
                     where the part of speech is assigned by MethodSelector.get_pos_for_keywords"""
        out = {}
        try:
            kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')

            keyword_limit = MethodSelector.get_keyword_limit(len(text.split()),user_limit)

            # Uni- and bi-gram keyphrases, English stop words removed
            keywords = kw_extractor.extract_keywords(
                text, stop_words='english', top_n=keyword_limit*2, keyphrase_ngram_range=(1, 2))

            for val in keywords:
                out[val[0]] = val[1]

        except Exception:
            out = {}
            traceback.print_exc()

        return MethodSelector.get_pos_for_keywords(out)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\16692\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\16692\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\16692\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#!pip3 install --upgrade spacy==2.2.3 --user

In [3]:
# Sample input passage (volcanoes and plate boundaries) used by the cells below.
# The trailing expression displays its word count, the figure the keyword
# extractors pass to MethodSelector.get_keyword_limit.
text = """There is a lot of volcanic activity at divergent plate boundaries in the oceans. For example, many undersea volcanoes are found along the Mid-Atlantic Ridge. This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean. As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust. Molten rock, called magma, erupts through these cracks onto Earth’s surface. At the surface, the molten rock is called lava. It cools and hardens, forming rock. Divergent plate boundaries also occur in the continental crust. Volcanoes form at these boundaries, but less often than in ocean crust. That’s because continental crust is thicker than oceanic crust. This makes it more difficult for molten rock to push up through the crust. Many volcanoes form along convergent plate boundaries where one tectonic plate is pulled down beneath another at a subduction zone. The leading edge of the plate melts as it is pulled into the mantle, forming magma that erupts as volcanoes. When a line of volcanoes forms along a subduction zone, they make up a volcanic arc. The edges of the Pacific plate are long subduction zones lined with volcanoes. This is why the Pacific rim is called the “Pacific Ring of Fire.”"""
len(text.split())

215

In [4]:
# Extract YAKE keywords, split the passage into sentences, then list the
# sentences in which each keyword occurs.
keywords = KeywordNPOS.get_keywords_YAKE(text)
print(keywords, end="\n\n")
sentences = MethodSelector.tokenize_sentences(text)
print(sentences, end="\n\n")
result  = MethodSelector.get_sentences_for_keyword(keywords,sentences)
for found_keyword, found_sentences in result.items():
    print(f"{found_keyword}:  {found_sentences}")
    

{('atlantic ridge', 'n'): 0.04599435489729826, ('divergent plate', 'n'): 0.05051811913746707, ('plate', 'n'): 0.05675656522364285, ('divergent plate boundary', 'a'): 0.0681994202941495, ('crust', 'n'): 0.0818394401636323, ('volcanoes', 'n'): 0.08644397137081854, ('divergent', 'a'): 0.09255724673191565, ('plate boundaries', 'n'): 0.09885746497827541, ('atlantic ocean', 'v'): 0.1026581477748343, ('atlantic', 'a'): 0.10929097903133549, ('molten rock', 'n'): 0.11286214022530992, ('rock', 'n'): 0.12285379179720222, ('boundaries', 'n'): 0.12594121245326878, ('pacific', 'v'): 0.1275454360674287, ('volcanic activity', 'n'): 0.1601944782681947, ('molten', 'n'): 0.17157472445232513, ('called', 'v'): 0.17436246924346388, ('subduction', 'n'): 0.19820853169223906, ('mid', 'n'): 0.20116891760408584, ('ridge', 'n'): 0.20116891760408584, ('oceans', 'n'): 0.20903296229448917, ('pacific plate', 'v'): 0.2102683729058276, ('continental crust', 'a'): 0.2177600078479168, ('pacific ring', 'n'): 0.21816147022

In [5]:
# YAKE keywords for the sample text; get_keywords_YAKE returns a dictionary
# keyed by (keyword, part-of-speech) tuples with the keyword weight as value.
keywords1 = KeywordNPOS.get_keywords_YAKE(text)


def get_pos_forKeywords(keywords1):
    """Attach PositionRank part-of-speech labels to already weighted keywords.

    PositionRank is run on the notebook-level `text`; each (phrase, pos) it
    yields is kept when the phrase is a key of `keywords1` and the phrase has
    not already been stored as a noun ('n') or adjective ('a').

    NOTE(review): phrases are looked up as plain strings, so `keywords1` is
    expected to be a {keyword: weight} mapping - confirm against the caller.

    Returns a dictionary {(phrase, pos): keywords1[phrase]}.
    """
    labelled = {}
    for phrase, pos_code in KeywordNPOS.get_keywords_PositionRank(text):
        if phrase not in keywords1:
            continue
        if (phrase, 'n') in labelled or (phrase, 'a') in labelled:
            continue
        labelled[(phrase, pos_code)] = keywords1[phrase]
    return labelled

In [6]:
keywords1

{('atlantic ridge', 'n'): 0.04599435489729826,
 ('divergent plate', 'n'): 0.05051811913746707,
 ('plate', 'n'): 0.05675656522364285,
 ('divergent plate boundary', 'a'): 0.0681994202941495,
 ('crust', 'n'): 0.0818394401636323,
 ('volcanoes', 'n'): 0.08644397137081854,
 ('divergent', 'a'): 0.09255724673191565,
 ('plate boundaries', 'n'): 0.09885746497827541,
 ('atlantic ocean', 'v'): 0.1026581477748343,
 ('atlantic', 'a'): 0.10929097903133549,
 ('molten rock', 'n'): 0.11286214022530992,
 ('rock', 'n'): 0.12285379179720222,
 ('boundaries', 'n'): 0.12594121245326878,
 ('pacific', 'v'): 0.1275454360674287,
 ('volcanic activity', 'n'): 0.1601944782681947,
 ('molten', 'n'): 0.17157472445232513,
 ('called', 'v'): 0.17436246924346388,
 ('subduction', 'n'): 0.19820853169223906,
 ('mid', 'n'): 0.20116891760408584,
 ('ridge', 'n'): 0.20116891760408584,
 ('oceans', 'n'): 0.20903296229448917,
 ('pacific plate', 'v'): 0.2102683729058276,
 ('continental crust', 'a'): 0.2177600078479168,
 ('pacific rin

In [7]:
def get_pos_for_keywords(keywords1):
    """Tag each keyword with a coarse part of speech.

    `keywords1` maps keyword -> weight. Its keys are passed to
    `nltk.pos_tag`; tags starting with N, V or J become 'n', 'v' or 'a'
    and any other tag drops the keyword.

    Returns a dictionary {(keyword, pos): weight}.
    """
    prefix_to_pos = {'N': 'n', 'V': 'v', 'J': 'a'}
    labelled = {}
    for token, tag in nltk.pos_tag(keywords1):
        short_pos = prefix_to_pos.get(tag[0])
        if short_pos is None:
            continue
        labelled[token, short_pos] = keywords1[token]
    return labelled

In [8]:
# Map every keyword in keywords1 to the sentences that contain it and print
# one keyword per paragraph.
sentences = MethodSelector.tokenize_sentences(text)
keyword_sentences = MethodSelector.get_sentences_for_keyword(keywords1,sentences)
for mapped_keyword, mapped_sentences in keyword_sentences.items():
    print(f"{mapped_keyword} : {mapped_sentences} \n")

atlantic ridge : ['For example, many undersea volcanoes are found along the Mid-Atlantic Ridge.'] 

divergent plate : ['There is a lot of volcanic activity at divergent plate boundaries in the oceans.', 'Divergent plate boundaries also occur in the continental crust.'] 

plate : ['Many volcanoes form along convergent plate boundaries where one tectonic plate is pulled down beneath another at a subduction zone.', 'The leading edge of the plate melts as it is pulled into the mantle, forming magma that erupts as volcanoes.'] 

divergent plate boundary : ['As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust.', 'This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean.'] 

crust : ['As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust.', 'This makes it more difficult for molten rock to push up through th

### Final classes for generating distractors for a given keyword

In [9]:
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet as wn
#!pip install sense2vec==1.0.3
from sense2vec import Sense2Vec

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\16692\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
class DistractorSupport:
    """This is a helper class that provides support functions for DistractorGeneration Class"""

    # Grandparent-level candidates are only added while fewer than this many were found
    _MIN_DISTRACTORS = 10

    @staticmethod
    def _add_hyponym_names(synset, skip_name, weight, distractor):
        """Adds the first lemma name of every hyponym of `synset` to `distractor`
           (title-cased, underscores replaced by spaces) with the given weight.
           The lemma equal to `skip_name` (lowercase, underscore-separated) is left
           out, and names already present keep their existing weight."""
        for item in synset.hyponyms():
            name = item.lemmas()[0].name()
            # Compare in lemma form so multi-word and capitalised keywords are recognised
            if name.lower() == skip_name:
                continue
            display_name = " ".join(w.capitalize() for w in name.replace("_", " ").split())
            if display_name not in distractor:
                distractor[display_name] = weight

    @classmethod
    def wordnet_distractor_list(cls,distractor_dict,library_weight,grand_weight):
        """Generates the Wordnet Based distractor list
           @input: (distractor_dict)- A dictionary keyed by (keyword, synset, context) for which we need to find distractors
           @Hyperparameters: (library_weight,grand_weight)-Weight of library and correction for grand child
           @Output: (distractor_dict)-The same dictionary, each value replaced by {distractor: weight}"""

        for element in distractor_dict:
            distractor = {}
            # Lemma names are underscore-separated, so normalise the keyword the same way
            skip_name = element[0].lower().replace(" ", "_")
            hypernym = element[1].hypernyms()
            # Reset for every keyword so a synset without a parent cannot reuse
            # the grandparent of a previous keyword (or hit an unbound name).
            grand_hypernym = []

            # Distractor generation using parents: siblings of the keyword
            if len(hypernym) != 0:
                grand_hypernym = hypernym[0].hypernyms()
                cls._add_hyponym_names(hypernym[0], skip_name, library_weight, distractor)

            # Distractor generation using grand-parents: children of the parent's siblings
            if len(distractor) < cls._MIN_DISTRACTORS and len(grand_hypernym) != 0:
                for chypernym in grand_hypernym[0].hyponyms():
                    cls._add_hyponym_names(
                        chypernym, skip_name, library_weight*grand_weight, distractor)

            # Replacing the value of an existing key is safe while iterating
            distractor_dict[element] = distractor
        return distractor_dict

    @classmethod
    def sense2vec_distractor_pruning(cls,dis):
        """Pruning distractor list of the sense2vec library
           @input: (dis)- Dictionary {keyword: {distractor: weight}} possibly holding distractors too similar to the keyword
           @Hyperparameters: ()-None
           @Output: (dis)-The same dictionary; a distractor is removed when a stemmed word of the
                    keyword is contained in it, or when it is contained in such a stem"""
        ps = PorterStemmer()
        for keyword, candidates in dis.items():
            stems = [ps.stem(part) for part in keyword.split()]
            # A set, because one distractor can match several stems and must be removed only once
            too_similar = {
                candidate
                for candidate in candidates
                if any(stem in candidate or candidate in stem for stem in stems)
            }
            for candidate in too_similar:
                candidates.pop(candidate, None)
        return dis

    @classmethod
    def get_setenence_cosine_similarity(cls,X,Y):
        """ Generates the cosine similarity for two senteces
            @input: (X,Y)- Two sentences
            @Hyperparamters: ()= None
            @Output: (cosine)- Cosine similarity of the two sets of stemmed, non-stop-word tokens;
                     0 when either sentence has no such token"""
        ps = PorterStemmer()
        # Punctuation, bracket tokens and English stop words are ignored
        stoplist = set(string.punctuation)
        stoplist.update(['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-'])
        stoplist.update(stopwords.words('english'))

        # Lower-case before the stop-word test so capitalised stop words
        # (for example at the start of a sentence) are removed too
        X_set = {ps.stem(w) for w in word_tokenize(X) if w.lower() not in stoplist}
        Y_set = {ps.stem(w) for w in word_tokenize(Y) if w.lower() not in stoplist}

        if not X_set or not Y_set:
            return 0

        # For binary vectors: dot product = shared tokens, squared norm = set size
        return len(X_set & Y_set) / float((len(X_set) * len(Y_set)) ** 0.5)
    

class DistractorGeneration:
    """Single Responsibility of Overgenerating an exhaustive list of distractors"""

    # Seconds to wait for a ConceptNet response before giving up
    _REQUEST_TIMEOUT_SECONDS = 30

    @classmethod
    def get_wordnet_distractor(cls,keywords,keyword_sentences,library_weight = 1,grand_weight = 0.8,context_text = None):
        """Returns a list of distractors generated using WordNet Library
            @input: (keyword,keyword_sentences)- Set of keywords and their corresponding sentences
            @input: (context_text)- Passage used to pick the best matching WordNet sense; when omitted,
                    the notebook-level `text` variable is used if it exists (empty string otherwise)
            @hyperparamter: (library_weight,grand_weight)- Weights to adjust the weightage of the library
            @Output: (distractor_list)- Dictionary keyed by (keyword, synset, joined sentences) with
                     {distractor: weight} as value; keywords without a synset or sentences are skipped"""
        if context_text is None:
            # Backward compatibility: callers that do not pass a passage keep
            # using the module-level `text`.
            context_text = globals().get("text", "")

        distractor_dict = {}
        for element in keywords:
            word = element[0].lower()

            # Try the keyword's own part of speech first, then noun, verb, adjective
            syns = wn.synsets(word,element[1])
            for fallback_pos in ('n', 'v', 'a'):
                if syns != []:
                    break
                syns = wn.synsets(word,fallback_pos)

            new_syn = None
            if len(syns) > 1:
                # Keep the sense whose definition is most similar to the passage.
                # Walking the list backwards with "<=" lets earlier senses win ties.
                cosine = 0
                for syn in syns[::-1]:
                    intial_cosine = DistractorSupport.get_setenence_cosine_similarity(
                        syn.definition(),context_text)
                    if cosine <= intial_cosine:
                        cosine = intial_cosine
                        new_syn = syn
            elif len(syns) == 1:
                new_syn = syns[0]

            sentences_for_keyword = keyword_sentences.get(element[0], [])
            if new_syn and sentences_for_keyword:
                Y = " ".join(sentences_for_keyword)
                distractor_dict[(element[0],new_syn,Y)] = 1

        return DistractorSupport.wordnet_distractor_list(distractor_dict,library_weight,grand_weight)

    @classmethod
    def get_distractors_conceptnet(cls,keywords,keyword_sentences,library_weight = 1):
        """Returns a list of distractors generated using ConceptNet Library
            @input: (keyword,keyword_sentences)- Set of keywords and their corresponding sentences
                    (keyword_sentences is accepted for signature parity and not used)
            @hyperparamter: (library_weight)- Weights to adjust the weightage of the library
            @Output: (distractor_list)- Dictionary {keyword with underscores: {distractor: library_weight}}"""
        distractor_list = {}

        for element in keywords:
            distractor = {}
            original_word = element[0].lower()
            word = original_word
            if (len(word.split())>0):
                word = word.replace(" ","_")

            # Step 1: what is the keyword a part of?
            url = "http://api.conceptnet.io/query?node=/c/en/%s/%s&rel=/r/PartOf&start=/c/en/%s&limit=5"%(word,element[1],word)
            obj = requests.get(url, timeout=cls._REQUEST_TIMEOUT_SECONDS).json()

            for whole_edge in obj['edges']:
                link = whole_edge['end']['term']

                # Step 2: the other parts of that whole become distractors
                url2 = "http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=10"%(link,link)
                obj2 = requests.get(url2, timeout=cls._REQUEST_TIMEOUT_SECONDS).json()
                for part_edge in obj2['edges']:
                    word2 = part_edge['start']['label']
                    # Skip duplicates and labels that contain the keyword itself
                    if word2 not in distractor and original_word not in word2.lower():
                        distractor[word2] = library_weight
            distractor_list[word] = distractor

        return distractor_list

    @classmethod
    def sense2vec_get_words(cls,keywords,keyword_sentences,library_weight = 1):
        """Returns a list of distractors generated using Sense2vec Library
            @input: (keyword,keyword_sentences)- Set of keywords and their corresponding sentences
                    (keyword_sentences is accepted for signature parity and not used)
            @hyperparamter: (library_weight)- Weights to adjust the weightage of the library
            @Output: (distractor_list)- Dictionary {keyword: {distractor: similarity*library_weight}},
                     pruned by DistractorSupport.sense2vec_distractor_pruning"""

        s2v = Sense2Vec().from_disk('s2v_old')
        distractor_list = {}
        for element in keywords:
            output = {}
            # sense2vec keys use underscores; the readable form is kept for comparison
            word = element[0].lower().replace(" ", "_")
            readable_word = word.replace("_", " ")
            sense = s2v.get_best_sense(word)

            # Ask for 10 neighbours; if the lookup raises, retry with 8, 6, 4
            most_similar = []
            distractor_limit = 10
            while distractor_limit > 2:
                try:
                    most_similar = s2v.most_similar(sense, n=distractor_limit)
                    break
                except Exception:
                    distractor_limit -= 2

            for each_word in most_similar:
                # Entries look like "some_phrase|TAG": keep the phrase, drop the tag
                append_word = each_word[0].split("|")[0].replace("_", " ").lower()
                if append_word != readable_word:
                    output[append_word] = each_word[1] * library_weight

            distractor_list[element[0]] = output
        return DistractorSupport.sense2vec_distractor_pruning(distractor_list)


In [11]:
# WordNet distractors for every keyword, printed one entry per block
k = DistractorGeneration.get_wordnet_distractor(keywords,keyword_sentences)

for wordnet_key, wordnet_distractors in k.items():
    print(f"{wordnet_key}:{wordnet_distractors} \n\n")

('plate', Synset('plate.n.06'), 'Many volcanoes form along convergent plate boundaries where one tectonic plate is pulled down beneath another at a subduction zone. The leading edge of the plate melts as it is pulled into the mantle, forming magma that erupts as volcanoes.'):{'Horst': 1, 'Corn Snow': 0.8, 'Crud': 0.8, 'Bed': 0.8, 'Cambium': 0.8, 'Dermis': 0.8, 'Epidermis': 0.8, 'Horizon': 0.8, 'Seam': 0.8, 'Stratum Corneum': 0.8, 'Stratum Germinativum': 0.8, 'Stratum Granulosum': 0.8, 'Stratum Lucidum': 0.8, 'Substrate': 0.8, 'Superstrate': 0.8, 'Wall': 0.8, 'Floor': 0.8} 


('crust', Synset('crust.n.01'), 'As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust. This makes it more difficult for molten rock to push up through the crust. Volcanoes form at these boundaries, but less often than in ocean crust. That’s because continental crust is thicker than oceanic crust.'):{'Asthenosphere': 1, 'Cell Wall': 1, 'Chromo

In [12]:

def get_distractors_conceptnet(keywords,keyword_sentences,library_weight = 1,grand_weight = 0.8):
    """Collect ConceptNet distractors for each (keyword, pos) entry.

    For every keyword the ConceptNet API is asked what the keyword is a part
    of; the other parts of each such whole become distractors with weight
    `library_weight`. Labels already collected, or containing the keyword,
    are skipped. `keyword_sentences` and `grand_weight` are accepted but not
    used here.

    Returns {keyword with spaces replaced by underscores: {label: weight}}.
    """
    part_of_query = "http://api.conceptnet.io/query?node=/c/en/%s/%s&rel=/r/PartOf&start=/c/en/%s&limit=5"
    parts_query = "http://api.conceptnet.io/query?node=%s&rel=/r/PartOf&end=%s&limit=10"

    collected = {}
    for entry in keywords:
        keyword = entry[0].lower()
        node = keyword.replace(" ", "_") if len(keyword.split()) > 0 else keyword
        candidates = {}

        wholes = requests.get(part_of_query % (node, entry[1], node)).json()
        for whole_edge in wholes['edges']:
            whole_term = whole_edge['end']['term']
            parts = requests.get(parts_query % (whole_term, whole_term)).json()
            for part_edge in parts['edges']:
                label = part_edge['start']['label']
                if label in candidates or keyword in label.lower():
                    continue
                candidates[label] = library_weight

        collected[node] = candidates

    return collected

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
   


In [14]:
# Stand-alone walkthrough of the WordNet distractor search for a single word.
library_weight = 1     # weight of distractors found among the parent's hyponyms
grand_weight = 0.8     # multiplier for distractors reached through the grandparent
distractor={}

temp_distractors = []
word= "cricket"
syns = wn.synsets(word,'n')
syn = syns[1]          # second noun synset returned for the word
orig_word = word
if len(word.split())>0:
    # Lemma names are underscore-separated, so compare in that form
    word = word.replace(" ","_")
hypernym = syn.hypernyms()
print(hypernym)
# Start empty so the grandparent step below is skipped (instead of failing on
# an undefined name) when the synset has no hypernym.
grand_hypernym = []
if len(hypernym) != 0:
    grand_hypernym = hypernym[0].hypernyms()
    print(grand_hypernym)
    # Siblings: other hyponyms of the parent synset
    for item in hypernym[0].hyponyms():
        name = item.lemmas()[0].name()
        if name.lower() == word.lower():
            continue
        name = name.replace("_"," ")
        name = " ".join(w.capitalize() for w in name.split())
        if name not in distractor:
            distractor[name] = library_weight
# Fewer than 10 siblings: widen the search to the children of the parent's siblings
if len(distractor) < 10 and len(grand_hypernym) != 0:
    for chypernym in grand_hypernym[0].hyponyms():
        for item in chypernym.hyponyms():
            name = item.lemmas()[0].name()
            if name.lower() == word.lower():
                continue
            name = name.replace("_"," ")
            name = " ".join(w.capitalize() for w in name.split())
            if name not in distractor:
                distractor[name] = library_weight*grand_weight
        
print(distractor)  

[Synset('field_game.n.01')]
[Synset('outdoor_game.n.01')]
{'Ball Game': 1, 'Field Hockey': 1, 'Football': 1, 'Hurling': 1, 'Lacrosse': 1, 'Polo': 1, 'Pushball': 1, 'Ultimate Frisbee': 1, 'Clock Golf': 0.8, 'Match Play': 0.8, 'Medal Play': 0.8, 'Miniature Golf': 0.8, 'Professional Golf': 0.8, 'Round Of Golf': 0.8}


In [15]:
from sense2vec import Sense2Vec

In [16]:
!pip list

Package                            Version
---------------------------------- -------------------
absl-py                            0.11.0
admin                              0.0.1
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.12
anaconda-project                   0.8.3
appdirs                            1.4.4
argcomplete                        1.10.0
argh                               0.26.2
asn1crypto                         1.3.0
astroid                            2.3.3
astropy                            4.0.1.post1
astunparse                         1.6.3
atomicwrites                       1.4.0
attrs                              20.3.0
audioread                          2.1.9
autopep8                           1.5.1
Babel                              2.8.0
backcall                           0.2.0
backports.functools-lru-cache      1.6.1
backports.shutil-get-terminal-size 1.0.0
backports.tempfile           

In [17]:
def sense2vec_get_words(keywords,keyword_sentences,library_weight = 1,grand_weight = 0.8,s2v=None):
    """Suggests distractors for every keyword from its sense2vec nearest neighbours
       @input: (keywords)-Iterable of indexable items; item[0] is the keyword text
       @input: (keyword_sentences)-Not read by this function, kept for caller compatibility
       @Hyperparameters: (library_weight)-Multiplier applied to each similarity score
       @Hyperparameters: (grand_weight)-Not read by this function, kept for caller compatibility
       @Hyperparameters: (s2v)-Optional already-loaded sense2vec object; when None the
                         vectors are loaded from the 's2v_old' directory
       @Output: (distractor_list)-Dictionary {keyword: {distractor: weighted similarity}}"""
    if s2v is None:
        # Loading from disk is the expensive step; callers that process several
        # batches can load once and pass the object in.
        s2v = Sense2Vec().from_disk('s2v_old')
    distractor_list = {}
    for element in keywords:
        keyword = element[0]
        word = keyword.lower().replace(" ", "_")
        sense = s2v.get_best_sense(word)

        most_similar = []
        if sense is not None:
            # Ask for 10 neighbours and back off in steps of 2 when the
            # lookup fails; below 4 give up and leave the list empty.
            distractor_limit = 10
            while distractor_limit > 2:
                try:
                    most_similar = s2v.most_similar(sense, n=distractor_limit)
                    break
                except Exception:
                    # Deliberately best-effort, but no longer swallows
                    # KeyboardInterrupt / SystemExit like a bare except.
                    distractor_limit -= 2

        # Neighbours are normalised to space-separated lower case, so the
        # keyword has to be compared in that same form; comparing against the
        # underscore form let multi-word keywords become their own distractor.
        plain_word = word.replace("_", " ")
        output = {}
        for each_word in most_similar:
            append_word = each_word[0].split("|")[0].replace("_", " ").lower()
            if append_word != plain_word:
                output[append_word] = each_word[1] * library_weight

        distractor_list[keyword] = output
    return distractor_list


google-auth-oauthlib               0.4.2
google-pasta                       0.2.0
greenlet                           0.4.16
grpcio                             1.33.2
h5py                               2.10.0
HeapDict                           1.0.1
html2text                          2020.1.16
html5lib                           1.1
idna                               2.10
imageio                            2.9.0
imagesize                          1.2.0
IMAPClient                         2.1.0
importlib-metadata                 1.7.0
iniconfig                          1.1.1
intervaltree                       3.0.2
ipykernel                          5.3.2
ipympl                             0.6.3
ipython                            7.16.1
ipython-genutils                   0.2.0
ipywidgets                         7.6.3
isort                              4.3.21
itsdangerous                       1.1.0
jdcal                              1.4.1
jedi                               0.17.1
Jinja2   

In [18]:
# Show the keyword dictionary, then build sense2vec distractors for it.
# `keywords` and `keyword_sentences` are not defined in this cell, so they
# must already be in the kernel namespace when it runs.
print(keywords)
d = sense2vec_get_words(keywords,keyword_sentences)

{('atlantic ridge', 'n'): 0.04599435489729826, ('divergent plate', 'n'): 0.05051811913746707, ('plate', 'n'): 0.05675656522364285, ('divergent plate boundary', 'a'): 0.0681994202941495, ('crust', 'n'): 0.0818394401636323, ('volcanoes', 'n'): 0.08644397137081854, ('divergent', 'a'): 0.09255724673191565, ('plate boundaries', 'n'): 0.09885746497827541, ('atlantic ocean', 'v'): 0.1026581477748343, ('atlantic', 'a'): 0.10929097903133549, ('molten rock', 'n'): 0.11286214022530992, ('rock', 'n'): 0.12285379179720222, ('boundaries', 'n'): 0.12594121245326878, ('pacific', 'v'): 0.1275454360674287, ('volcanic activity', 'n'): 0.1601944782681947, ('molten', 'n'): 0.17157472445232513, ('called', 'v'): 0.17436246924346388, ('subduction', 'n'): 0.19820853169223906, ('mid', 'n'): 0.20116891760408584, ('ridge', 'n'): 0.20116891760408584, ('oceans', 'n'): 0.20903296229448917, ('pacific plate', 'v'): 0.2102683729058276, ('continental crust', 'a'): 0.2177600078479168, ('pacific ring', 'n'): 0.21816147022

In [19]:
# Manual single-word run of the same sense2vec lookup that
# sense2vec_get_words performs, leaving the neighbours in `most_similar`.
output = {}
word = 'crust'
word = word.replace(" ", "_")
s2v = Sense2Vec().from_disk('s2v_old')
sense = s2v.get_best_sense(word)
distractor_limit=10
most_similar = []
# Ask for 10 neighbours and back off in steps of 2 when the lookup fails.
while(distractor_limit>2):
    try:
        most_similar = s2v.most_similar(sense, n=distractor_limit)
        break
    except Exception:
        # Still best-effort, but a bare except would also swallow
        # KeyboardInterrupt and make the cell impossible to interrupt.
        distractor_limit -= 2

In [20]:
import spacy
import neuralcoref
# Load the spaCy pipeline named "en_core_web_sm"; the package must be installed in the kernel's environment.
nlp = spacy.load("en_core_web_sm")



In [None]:
# Register the neuralcoref component only once: adding it again under the
# same pipe name makes spaCy raise, which broke re-running this cell.
if not nlp.has_pipe("neuralcoref"):
    neuralcoref.add_to_pipe(nlp)
# `text` is not defined in this cell and must already be in the kernel namespace.
doc = nlp(text)
resolved_text = doc._.coref_resolved

In [None]:
#!pip install neuralcoref