In [1]:
import re
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn

In [2]:
def pre_processing(sentence):
    """
    Turn a raw sentence into a cleaned list of words.

    The sentence goes through three steps, in this order: punctuation
    removal, tokenization with lemmatization, stop-word removal.

    :param sentence: raw text of the sentence
    :return: whatever remove_stopwords returns for the lemmatized tokens
    """
    without_punctuation = remove_punctuation(sentence)
    lemmas = tokenize_sentence(without_punctuation)
    return remove_stopwords(lemmas)

# Remove punctuation from a sentence (a string, not a list of words)
def remove_punctuation(sentence):
    """
    Delete every character that is neither a word character nor whitespace.

    Underscores are kept because the regex class ``\\w`` includes them.

    :param sentence: input string
    :return: the string with the non-word, non-space characters removed
    """
    punctuation_pattern = re.compile(r'[^\w\s]')
    return punctuation_pattern.sub('', sentence)

# Remove stopwords from a list of words
def remove_stopwords(words_list, stopwords_path="stop_words_FULL.txt"):
    """
    Lower-case the given words and drop those listed in the stop-word file.

    The stop-word file is read with one entry per line. Entries are used as
    written (they are not lower-cased), while every input word is lower-cased
    before the comparison.

    :param words_list: iterable of word strings
    :param stopwords_path: path of the stop-word file; defaults to the
        "stop_words_FULL.txt" file in the current working directory
    :return: list of the lower-cased words that are not stop words,
        in their original order
    """
    # `with` guarantees the file is closed even if reading fails; a set gives
    # constant-time membership tests instead of scanning a list per word.
    with open(stopwords_path, "r") as stopwords_file:
        stopwords = {line.replace('\n', '') for line in stopwords_file}

    lowered_words = (word.lower() for word in words_list)
    return [word for word in lowered_words if word not in stopwords]

# Tokenize the input sentence and also lemmatize its words
def tokenize_sentence(sentence):
    """
    Tokenize a sentence, POS-tag it and lemmatize nouns, verbs, adverbs and adjectives.

    Only tokens whose tag starts with "NN", "VB", "RB" or "JJ" are kept; each
    one is lemmatized with the matching WordNet part of speech. Tokens with
    any other tag are dropped from the result.

    :param sentence: input string
    :return: list of lemmas, in sentence order
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(word_tokenize(sentence)):
        tag_prefix = tag[:2]
        # Choose the WordNet part of speech from the first two tag letters.
        if tag_prefix == "NN":
            wordnet_pos = wn.NOUN
        elif tag_prefix == "VB":
            wordnet_pos = wn.VERB
        elif tag_prefix == "RB":
            wordnet_pos = wn.ADV
        elif tag_prefix == "JJ":
            wordnet_pos = wn.ADJ
        else:
            # Any other tag: the token is not part of the output.
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
    return lemmas

def get_words(readCSV):
    """
    Return the first row produced by ``readCSV``.

    Only one item is consumed, so a reader passed in can still be iterated
    afterwards to get the remaining rows.

    :param readCSV: iterable of rows (e.g. a csv.reader)
    :return: the first row, or None when the iterable yields nothing
    """
    return next(iter(readCSV), None)

def get_signature(sense):
    """
    Collect the tokenized words of a sense's definition and of all its examples.

    :param sense: object exposing ``definition()`` and ``examples()``
        (presumably a WordNet synset -- confirm against the caller)
    :return: list with the definition's tokens first, followed by the tokens
        of each example in order
    """
    # Start from the tokenized definition...
    signature = list(tokenize_sentence(sense.definition()))
    # ...then merge in the tokens of every example sentence.
    for example in sense.examples():
        signature.extend(tokenize_sentence(example))
    return signature

Prende in input il file delle definizioni .csv
e restituisce in output un dizionario in cui ad ogni
parola ('Courage', 'Paper', 'Apprehension', 'Sharpener')
è associato un set di definizioni.

Le definizioni, prima di essere inserite nel dizionario,
vengono pre-processate. In definitiva avremo una lista di liste
per parola, in cui ogni lista rappresenta una definizione in forma di bag-of-words.

In [3]:
def get_definitions_words(csvfile):
    """
    Build a dictionary of pre-processed definitions for every word in the CSV.

    The first row is the header: its first cell is discarded and the remaining
    cells are the words. In every following row the first cell is skipped
    (presumably a row label -- confirm against the data file) and each
    non-empty cell is pre-processed and stored under the word of its column.

    :param csvfile: open text file (or any iterable of lines) with
        comma-separated content
    :return: a dictionary as {word: [definition1, ...]} where each definition
        is the value returned by pre_processing; an empty dictionary when the
        CSV has no rows at all
    :raises IndexError: if a data row has a non-empty cell in a column that
        has no header
    """
    readCSV = csv.reader(csvfile, delimiter=',')

    # get list of words to analyze; an empty file has no header row
    header = get_words(readCSV)
    if header is None:
        return {}
    words = header[1:]

    definitions_words = dict()

    for row in readCSV:
        # row[1:] lines the cells up with `words` (first column skipped)
        for index, definition in enumerate(row[1:]):
            # empty cells mean "no definition given" and are ignored
            if definition:
                definitions_words.setdefault(words[index], []).append(
                    pre_processing(definition)
                )
    return definitions_words

For each word, calculate the average cosine similarity score between its definitions.

In [4]:
def compute_results(definitions_words):
    """
    Compute the average cosine similarity of the definitions of every word.

    :param definitions_words: dictionary mapping each word to its definitions
    :return: dictionary mapping each word to the value returned by
        compute_average_cosine_similarity for that word's definitions
    """
    return {
        word: compute_average_cosine_similarity(definitions_words[word])
        for word in definitions_words.keys()
    }

Calculate the cosine similarity between two binary vectors that encode the presence or absence of each word (one component for every word appearing in definition 1 or definition 2).

In [5]:
def cosine_similarity(def1, def2):
    """
    Cosine similarity between the binary bag-of-words vectors of two definitions.

    Each definition is treated as the set of its distinct words, so for
    presence/absence vectors the score reduces to
    ``|A & B| / sqrt(|A| * |B|)``.

    :param def1: words of definition 1 (iterable of hashable items)
    :param def2: words of definition 2 (iterable of hashable items)
    :return: float in [0, 1]; 0.0 when either definition has no words
    """
    words_def1 = set(def1)
    words_def2 = set(def2)

    # An empty definition has a zero vector: the cosine is undefined, and the
    # division below would raise ZeroDivisionError. Report "no similarity".
    if not words_def1 or not words_def2:
        return 0.0

    # Dot product of two 0/1 vectors = number of words present in both.
    shared_words = len(words_def1 & words_def2)
    # Norm of a 0/1 vector = sqrt(number of ones) = sqrt(number of distinct words).
    cosine_score = shared_words / float((len(words_def1) * len(words_def2)) ** 0.5)
    return cosine_score


Compute average cosine similarity between all definitions of a concept.

In [6]:
def compute_average_cosine_similarity(definitions):
    """
    Average the cosine similarity over all ordered pairs of differing definitions.

    :param definitions: all definitions of a concept (each one a list of words)
    :return: average cosine similarity; 0.0 when there is no pair to compare
        (fewer than two definitions, or all definitions equal)
    """
    total_similarity = 0
    pair_count = 0
    for def1 in definitions:
        for def2 in definitions:
            # Skips a definition paired with itself. NOTE(review): the check is
            # by value, so two separate but identical definitions are skipped
            # as well -- confirm this is intended.
            if def1 != def2:
                total_similarity += cosine_similarity(def1, def2)
                pair_count += 1

    # Without this guard a concept with no comparable pair raised
    # ZeroDivisionError.
    if pair_count == 0:
        return 0.0
    return total_similarity / pair_count

In [7]:
# Read the definitions file. newline='' is what the csv module documentation
# asks for on files handed to csv.reader, so that newlines embedded in quoted
# fields are interpreted correctly.
with open('definizioni.csv', newline='') as csvfile:
    definitions_words = get_definitions_words(csvfile)

# The file is fully consumed above, so scoring can run after it is closed.
results = compute_results(definitions_words)
print(results)

{'Courage': 0.21054727554969985, 'Paper': 0.29258850377799267, 'Apprehension': 0.0830330313557733, 'Sharpener': 0.3863878711824424}
