In [1]:
import re
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn

In [2]:
def pre_processing(sentence):
    """Turn a raw sentence into a list of lower-cased, lemmatized words.

    The stages run in this order: punctuation removal, tokenization with
    lemmatization, stopword removal.
    """
    without_punctuation = remove_punctuation(sentence)
    lemmas = tokenize_sentence(without_punctuation)
    return remove_stopwords(lemmas)

# Remove punctuation from a sentence (a string, not a list of words)
def remove_punctuation(sentence):
    """Return `sentence` with every character that is neither a word
    character nor whitespace deleted."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', sentence)

# Remove stopwords from a list of words
def remove_stopwords(words_list, stopwords_path="stop_words_FULL.txt"):
    """Lower-case every word and drop the ones listed in the stopword file.

    :param words_list: iterable of word strings
    :param stopwords_path: text file with one stopword per line; defaults to
                           the file this notebook has always used
    :return: list of the lower-cased words that are not stopwords, in the
             original order
    :raises OSError: if the stopword file cannot be opened

    The stopword file is read again on every call. Its entries are compared
    as written (no case folding) against the lower-cased input words.
    """
    # `with` guarantees the file is closed even if reading fails midway.
    with open(stopwords_path, "r") as stopwords_file:
        # A set gives constant-time membership tests in the filter below.
        stopwords = {line.replace('\n', '') for line in stopwords_file}

    lowered_words = (word.lower() for word in words_list)
    return [word for word in lowered_words if word not in stopwords]

# Tokenize the input sentence and also lemmatize its words
def tokenize_sentence(sentence):
    """Tokenize `sentence`, POS-tag the tokens and lemmatize the kept ones.

    Only tokens whose tag starts with "NN", "VB", "RB" or "JJ" are kept; they
    are lemmatized as noun, verb, adverb or adjective respectively. Tokens
    with any other tag are dropped.

    :param sentence: text to tokenize
    :return: list of lemmas, in token order
    """
    # First two characters of the POS tag -> name of the WordNet POS constant.
    # The constant is looked up on `wn` only when a token actually needs it.
    pos_name_by_prefix = {"NN": "NOUN", "VB": "VERB", "RB": "ADV", "JJ": "ADJ"}

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in nltk.pos_tag(word_tokenize(sentence)):
        pos_name = pos_name_by_prefix.get(tag[:2])
        if pos_name is None:
            continue  # not a noun, verb, adverb or adjective
        lemmas.append(lemmatizer.lemmatize(token, pos=getattr(wn, pos_name)))
    return lemmas

# return the first row of the .csv file
def get_rows(readCSV):
    """Return the first item produced by iterating `readCSV`, or None when
    the iteration yields nothing. Only that one item is consumed."""
    return next(iter(readCSV), None)

# Union of the pre-processed words of the definitions and terms from the examples in WN for a sense.
def get_signature(sense):
    """Collect the tokenized words of a sense's definition and examples.

    :param sense: object exposing definition() and examples()
    :return: list with the definition's tokens first, followed by the tokens
             of each example in order (duplicates are kept)
    """
    # Start from the definition's tokens ...
    signature = list(tokenize_sentence(sense.definition()))
    # ... then add the tokens of every usage example.
    for example in sense.examples():
        signature.extend(tokenize_sentence(example))
    return signature

Takes the definitions .csv file as input and returns a dictionary in which each word ('Courage', 'Paper', 'Apprehension', 'Sharpener') is associated with a list of definitions.

Definitions are pre-processed before being entered into the dictionary.

The output contains, for each word, a list of lists in which each inner list represents one definition as a bag of words (BoW).

In [3]:
def get_definitions(file):
    """
    Parse the definitions CSV into pre-processed definitions grouped by word.

    The first row is the header: its first cell is skipped and each following
    cell names a word. In every later row the first cell is skipped too, and
    the cell in column i is a definition of the word in header column i.
    Empty cells are ignored, and so are cells that have no header word above
    them.

    :param file: open text file (or any iterable of CSV lines)
    :return: a dictionary as {word: [definition1_words, ...]} where each
             definition is the list returned by pre_processing; an empty
             dictionary if the file has no rows
    """
    readCSV = csv.reader(file, delimiter=',')

    # get list of words to analyze (header row minus its first cell)
    header = next(readCSV, None)
    if header is None:
        # Empty file: nothing to parse.
        return {}
    words = header[1:]

    definitions_words = dict()

    for row in readCSV:
        # zip pairs each definition cell with its header word and stops at the
        # shorter of the two, so ragged rows cannot index past the header.
        for word, definition in zip(words, row[1:]):
            # check if the definition is empty
            if definition:
                definitions_words.setdefault(word, []).append(pre_processing(definition))
    return definitions_words

Calculate cosine similarity scores between a word's definitions.

The similarity is calculated between two binary vectors that encode the presence or absence of each word occurring in definition 1 or definition 2.

In [4]:
def cosine_sim(def1, def2):
    """
    Cosine similarity between two definitions seen as binary word vectors.

    Each definition is reduced to the set of its distinct words. With binary
    presence/absence vectors the dot product equals the number of shared
    words and each vector's squared norm equals its number of distinct words,
    so the score is |A & B| / sqrt(|A| * |B|).

    :param def1: words of definition 1 (iterable of hashable items)
    :param def2: words of definition 2 (iterable of hashable items)
    :return: similarity as a float between 0.0 and 1.0; 0.0 when either
             definition has no words
    """
    words1 = set(def1)
    words2 = set(def2)

    # An empty definition has a zero vector; report "no similarity" instead
    # of dividing by zero.
    if not words1 or not words2:
        return 0.0

    shared = len(words1 & words2)
    return shared / float((len(words1) * len(words2)) ** 0.5)


Compute the results by averaging the similarity over all pairs of definitions.

In [5]:
def compute_results(definitions_words):
    """
    Average pairwise cosine similarity of the definitions of each word.

    Every unordered pair of definitions at different positions is scored
    once with cosine_sim. Pairs are chosen by position, not by value, so two
    separate definitions that happen to contain the same words still count
    as a pair.

    :param definitions_words: dictionary as {word: [definition1_words, ...]}
    :return: dictionary as {word: average similarity}; the average is 0.0
             for a word with fewer than two definitions (no pair to compare)
    """
    results = dict()
    for word, definitions in definitions_words.items():
        total_similarity = 0.0
        pair_count = 0
        for position, def1 in enumerate(definitions):
            # Only look forward: similarity is symmetric, so (a, b) and
            # (b, a) would contribute the same value twice.
            for def2 in definitions[position + 1:]:
                total_similarity += cosine_sim(def1, def2)
                pair_count += 1

        # Guard against division by zero when there is nothing to compare.
        results[word] = total_similarity / pair_count if pair_count else 0.0

    return results

Find the words which appear in at least 50 per cent of the definitions.

In [6]:
def most_frequent_words(definitions, threshold=0.5):
    """
    Return the words that occur in at least a given share of the definitions.

    A word is counted once per definition that contains it, however many
    times it is repeated inside that definition.

    :param definitions: list of definitions, each a list of words
    :param threshold: minimum fraction of definitions a word must appear in
                      (default 0.5, i.e. at least 50 per cent)
    :return: list of qualifying words, ordered by first appearance across
             the definitions; empty list when there are no definitions
    """
    # count in how many definitions each word appears
    freq_words = dict()
    for definition in definitions:
        # dict.fromkeys removes repeats within one definition while keeping
        # first-seen order, which makes the result order reproducible.
        for word in dict.fromkeys(definition):
            freq_words[word] = freq_words.get(word, 0) + 1

    # keep only the words that reach the required share of definitions
    min_count = threshold * len(definitions)
    return [word for word, count in freq_words.items() if count >= min_count]

Compute average cosine similarity between all definitions of a concept.

In [7]:
# The file only needs to stay open while the definitions are being parsed.
with open('definizioni.csv') as csvfile:
    definitions_words = get_definitions(csvfile)

# Average pairwise similarity of the definitions, per word.
results = compute_results(definitions_words)
print("Average definition's cosine similarity: ")
print(results)

# Words shared by at least half of the definitions, per word.
print('\nMost frequent words (at least in 50% of definitions) : ')
print([
    (key, most_frequent_words(definitions_words[key]))
    for key in ['Courage', 'Paper', 'Apprehension', 'Sharpener']
])

Average definition's cosine similarity: 
{'Courage': 0.21054727554969985, 'Paper': 0.29258850377799267, 'Apprehension': 0.0830330313557733, 'Sharpener': 0.3863878711824424}

Most frequent words (at least in 50% of definitions) : 
[('Courage', ['fear', 'ability']), ('Paper', ['material', 'write']), ('Apprehension', []), ('Sharpener', ['pencil', 'sharpen', 'tool'])]
