In [7]:
import re
import csv
import nltk
from collections import Counter
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [8]:
def pre_processing(sentence):
    """Run the full clean-up pipeline on a raw sentence.

    The steps are applied in this order: punctuation removal, tokenization
    with lemmatization, then stop-word filtering.

    Args:
        sentence: the raw text to process.

    Returns:
        The value returned by ``remove_stopwords`` for the tokenized sentence.
    """
    without_punctuation = remove_punctuation(sentence)
    tokens = tokenize_sentence(without_punctuation)
    return remove_stopwords(tokens)

def remove_punctuation(sentence):
    """Strip punctuation from a sentence.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted.

    Args:
        sentence: the text to clean.

    Returns:
        The sentence without the removed characters.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', sentence)

def remove_stopwords(words_list, stopwords_path="stop_words_FULL.txt"):
    """Lowercase the given words and drop those listed in the stop-word file.

    Args:
        words_list: iterable of word strings.
        stopwords_path: path of a text file holding one stop word per line.
            Defaults to the file this notebook has always read.

    Returns:
        A list, in the original order, of the lowercased words whose
        lowercase form does not appear in the stop-word file.
    """
    # The context manager closes the file even if reading fails, and the set
    # makes each membership test constant-time instead of a scan of a list.
    with open(stopwords_path, "r") as stopwords_file:
        stopwords = {line.replace('\n', '') for line in stopwords_file}

    lowered_words = (word.lower() for word in words_list)
    return [word for word in lowered_words if word not in stopwords]

def tokenize_sentence(sentence):
    """Tokenize a sentence and lemmatize its nouns, verbs, adverbs and adjectives.

    Each token is POS-tagged; the first two characters of the tag select the
    WordNet part of speech used for lemmatization. Tokens whose tag does not
    start with NN, VB, RB or JJ are left out of the result.

    Args:
        sentence: the text to tokenize.

    Returns:
        The list of lemmas of the retained tokens, in sentence order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, pos_tag in nltk.pos_tag(word_tokenize(sentence)):
        tag_prefix = pos_tag[:2]
        if tag_prefix == "NN":
            wordnet_pos = wn.NOUN
        elif tag_prefix == "VB":
            wordnet_pos = wn.VERB
        elif tag_prefix == "RB":
            wordnet_pos = wn.ADV
        elif tag_prefix == "JJ":
            wordnet_pos = wn.ADJ
        else:
            # Any other part of speech is discarded.
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmas

def get_rows(readCSV):
    """Return the first row produced by ``readCSV``.

    Only one item is consumed, so when ``readCSV`` is an iterator (such as a
    ``csv.reader``) the remaining rows are still available to the caller.

    Args:
        readCSV: an iterable of rows.

    Returns:
        The first row, or None when the iterable yields nothing.
    """
    return next(iter(readCSV), None)

def get_signature(sense):
    """Build the signature of a sense from its definition and its examples.

    The words of the tokenized definition come first, followed by the words
    of each tokenized example. Duplicates are kept.

    NOTE(review): the text goes through ``tokenize_sentence`` only, so no
    stop-word removal or lowercasing is applied here. Confirm this is intended.

    Args:
        sense: an object exposing ``definition()`` and ``examples()``.

    Returns:
        The list of words from the definition and all the examples.
    """
    signature = list(tokenize_sentence(sense.definition()))
    for example in sense.examples():
        signature.extend(tokenize_sentence(example))
    return signature

Takes the .csv definitions file as input and outputs a dictionary in which each word ('Courage', 'Paper', 'Apprehension', 'Sharpener') is associated with a single list containing the pre-processed words of all its definitions.

Returns a dict with the terms as keys and, as values, the list of words found in each term's definitions.

In [9]:
def get_definitions(file):
    """Group the pre-processed definition words of a CSV file by term.

    The first CSV row is the header: its first cell is skipped and every
    other cell names a term. In each later row the first cell is skipped as
    well, and the cell in a given column is a definition of the term named in
    the same column of the header. Empty cells are ignored.

    Args:
        file: an open text file (or any iterable of lines) with
            comma-separated values.

    Returns:
        A dict mapping each term to one list holding the pre-processed words
        of all its definitions, in file order. The dict is empty when the
        file has no rows at all.

    Raises:
        IndexError: if a row has a non-empty cell in a column that the
            header does not cover.
    """
    readCSV = csv.reader(file, delimiter=',')
    header = get_rows(readCSV)
    if header is None:
        # Empty file: there is no header, hence no term to collect words for.
        return {}
    words = header[1:]

    definitions_words = {}
    for row in readCSV:
        for column, definition in enumerate(row[1:]):
            if not definition:
                continue  # nothing was written in this cell
            definitions_words.setdefault(words[column], []).extend(pre_processing(definition))
    return definitions_words

Returns the 5 most important (most frequent) terms that appear in the bag of words built from a concept's definitions (the `definitions_word` argument), as a list of (term, frequency) pairs sorted by decreasing frequency.

In [10]:
genus_num = 5  # the number of most important terms to extract for a concept.
def get_genus(definitions_word, top_n=None):
    """Return the most frequent words of a bag of words with their counts.

    Args:
        definitions_word: iterable of words (repetitions are what is counted).
        top_n: how many words to return; when None, the module-level
            ``genus_num`` is used.

    Returns:
        A list of ``(word, count)`` tuples sorted by decreasing count and
        limited to ``top_n`` entries. Words with the same count keep the
        order in which they first appear in ``definitions_word``.
    """
    if top_n is None:
        top_n = genus_num
    # Counter.most_common does the counting, the sorting and the truncation.
    return Counter(definitions_word).most_common(top_n)

For each genus, pick one candidate: collect the hyponyms of all the synsets associated with the genus term.
Then return the hyponym synset that comes closest to the definitions of the concept word, i.e. the one whose signature (gloss + examples) has the largest overlap with the definitions.

In [11]:
def get_candidates(genus_list, definitions_word):
    """Pick, for every genus, the hyponym that best matches the definitions.

    For each genus term the hyponyms of all its synsets are gathered, and the
    one whose signature shares the most distinct words with
    ``definitions_word`` is selected. A genus without any hyponym produces no
    candidate.

    Args:
        genus_list: sequence of items whose first element is the genus term.
        definitions_word: iterable with the words of the concept's definitions.

    Returns:
        A list of ``(genus_term, hyponym_synset)`` tuples, in genus order.
    """
    definition_vocabulary = set(definitions_word)

    def overlap_with_definitions(hyponym):
        # Number of distinct signature words also found in the definitions.
        return len(definition_vocabulary.intersection(get_signature(hyponym)))

    candidates = []
    for genus in genus_list:
        term = genus[0]
        # No pos argument is passed to wn.synsets, so the synsets are not
        # restricted to nouns.
        hyponyms = [hyponym for synset in wn.synsets(term) for hyponym in synset.hyponyms()]
        if not hyponyms:
            continue
        # max() returns the first hyponym among those with the highest
        # overlap, so the first hyponym wins when no signature overlaps.
        candidates.append((term, max(hyponyms, key=overlap_with_definitions)))
    return candidates

In [12]:
# Read the definitions file, then print a report for every concept in it:
# its WordNet synsets, its most frequent definition words (genus list) and
# the best hyponym candidate found for each genus.
# newline='' is how the csv module asks files to be opened, so that newlines
# embedded in quoted fields are handed to the reader unchanged.
with open('definizioni.csv', newline='') as file:
    # Create a dict with terms as keys and the list of words within the term's definitions as values
    definitions_words = get_definitions(file)

# The file is fully read at this point, so the report runs after it is closed.
for key in definitions_words:
    print("-----------------------")
    print("Concept: ",key)
    print("Wordnet synsets: ", wn.synsets(key))
    print()
    print("Genus list (with frequency): ")
    genus_list = get_genus(definitions_words[key])
    print(genus_list)
    print("\nCandidates:")
    print(get_candidates(genus_list, definitions_words[key]))

-----------------------
Concept:  Courage
Wordnet synsets:  [Synset('courage.n.01')]

Genus list (with frequency): 
[('ability', 18), ('fear', 17), ('face', 9), ('situation', 7), ('scar', 5)]

Candidates:
[('ability', Synset('physical_ability.n.01')), ('fear', Synset('stage_fright.n.01')), ('face', Synset('take_the_bull_by_the_horns.v.01')), ('situation', Synset('crowding.n.01')), ('scar', Synset('keloid.n.01'))]
-----------------------
Concept:  Paper
Wordnet synsets:  [Synset('paper.n.01'), Synset('composition.n.08'), Synset('newspaper.n.01'), Synset('paper.n.04'), Synset('paper.n.05'), Synset('newspaper.n.02'), Synset('newspaper.n.03'), Synset('paper.v.01'), Synset('wallpaper.v.01')]

Genus list (with frequency): 
[('material', 23), ('write', 18), ('cellulose', 7), ('wood', 6), ('tree', 5)]

Candidates:
[('material', Synset('composite_material.n.01')), ('write', Synset('handwrite.v.01')), ('cellulose', Synset('pulp.n.03')), ('wood', Synset('balsa.n.01')), ('tree', Synset('poon.n.02'