In [11]:
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [12]:
def pre_processing(sentence):
    """Turn a raw sentence into a list of lowercase, stopword-free lemmas.

    The sentence goes through three steps, in this order:
    punctuation removal, tokenization with lemmatization, stopword removal.

    Args:
        sentence: the text to clean, as a single string.

    Returns:
        Whatever remove_stopwords returns for the lemmatized tokens.
    """
    without_punctuation = remove_punctuation(sentence)
    lemmas = tokenize_sentence(without_punctuation)
    return remove_stopwords(lemmas)

def remove_punctuation(sentence):
    """Return `sentence` with every punctuation character deleted.

    A character is dropped when it is neither a word character (``\\w``:
    letters, digits, underscore) nor whitespace (``\\s``).

    Args:
        sentence: the input text, as a single string.

    Returns:
        The same string without the matched characters.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', sentence)

def remove_stopwords(words_list, stopwords_path="stop_words_FULL.txt"):
    """Lowercase the given words and drop the ones listed in the stopword file.

    Args:
        words_list: iterable of word strings.
        stopwords_path: text file holding one stopword per line. Defaults to
            "stop_words_FULL.txt", resolved against the current working
            directory.

    Returns:
        A list with the lowercase form of every word whose lowercase form is
        not in the stopword file, in the input order.

    Raises:
        OSError: if the stopword file cannot be opened.
    """
    # `with` closes the file even when reading fails; a set makes each
    # membership test constant-time instead of a scan over the whole list.
    with open(stopwords_path, "r") as stopwords_file:
        stopwords = {line.replace('\n', '') for line in stopwords_file}

    lowered_words = (word.lower() for word in words_list)
    return [word for word in lowered_words if word not in stopwords]

def tokenize_sentence(sentence):
    """Tokenize a sentence and lemmatize its nouns, verbs, adverbs and adjectives.

    Each token is POS-tagged with nltk.pos_tag. Only tokens whose tag starts
    with "NN", "VB", "RB" or "JJ" are kept; every other token is discarded.
    Kept tokens are lemmatized with the matching WordNet part of speech.

    Args:
        sentence: the text to process, as a single string.

    Returns:
        The list of lemmas, in sentence order.
    """
    # Tag prefix -> name of the WordNet POS constant on `wn`. The constant is
    # looked up lazily so `wn` is only touched when a token actually matches.
    pos_constant_by_prefix = {"NN": "NOUN", "VB": "VERB", "RB": "ADV", "JJ": "ADJ"}

    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, pos_label in nltk.pos_tag(word_tokenize(sentence)):
        constant_name = pos_constant_by_prefix.get(pos_label[:2])
        if constant_name is None:
            continue
        lemmas.append(lemmatizer.lemmatize(token, pos=getattr(wn, constant_name)))
    return lemmas

def get_signature(sense):
    """Collect the lemmas of a WordNet sense's definition and usage examples.

    The result is a list, not a set: lemmas from the definition come first,
    followed by those of each example in order, and duplicates are kept.
    Text goes through tokenize_sentence only, so stopwords are not removed.

    Args:
        sense: an object exposing definition() and examples().

    Returns:
        The concatenated list of lemmas.
    """
    signature = list(tokenize_sentence(sense.definition()))
    for example in sense.examples():
        signature.extend(tokenize_sentence(example))
    return signature

#### First exercise
Compute the average definition length for each WordNet part of speech (nouns, verbs, adjectives and adverbs).

In [13]:
from statistics import mean

def avg_len_section_definitons():
    """Print the average definition length for each WordNet part of speech.

    For each of 'n', 'v', 'a' and 'r', every synset definition is split on
    single spaces and the mean number of resulting pieces is computed. The
    (pos, mean) pairs are printed as one list; nothing is returned.
    """
    averages = []
    for section in ('n', 'v', 'a', 'r'):
        definition_sizes = [
            len(synset.definition().split(" "))
            for synset in wn.all_synsets(section)
        ]
        averages.append((section, mean(definition_sizes)))

    print("\n", averages, "\n")


In [14]:
# Run the first exercise: the function prints a list of (pos, mean length) pairs.
avg_len_section_definitons()


 [('n', 11.470035925226815), ('v', 6.146655044672042), ('a', 7.238433575677462), ('r', 5.028169014084507)] 



#### Second exercise
How the definition length varies along the hypernym path that connects a given synset to its root.

In [15]:
def all_hypernym_paths(word):
    """Print and collect definition lengths along one hypernym path per synset.

    For every synset of `word`, only the first path returned by
    hypernym_paths() is used. Each synset on that path is paired with the
    number of whitespace-separated words in its definition.

    Args:
        word: the lemma to look up in WordNet.

    Returns:
        A list with one entry per synset of `word`; each entry is a list of
        (synset, definition_word_count) tuples in path order.
    """
    annotated_paths = []
    for synset in wn.synsets(word):
        first_path = synset.hypernym_paths()[0]
        annotated = [(node, len(node.definition().split())) for node in first_path]

        print(annotated)
        print()
        annotated_paths.append(annotated)

    return annotated_paths

In [16]:
# Second exercise: for each concept, print one hypernym path per synset,
# annotated with the definition length of every synset on the path.
for word in ['Courage', 'Paper', 'Apprehension', 'Sharpener']: 
    print("\n------------------------\n")
    print("Concept: ",word)
    all_hypernym_paths(word)
    


------------------------

Concept:  Courage
[(Synset('entity.n.01'), 17), (Synset('abstraction.n.06'), 11), (Synset('attribute.n.02'), 9), (Synset('trait.n.01'), 7), (Synset('character.n.03'), 18), (Synset('spirit.n.03'), 9), (Synset('courage.n.01'), 15)]


------------------------

Concept:  Paper
[(Synset('entity.n.01'), 17), (Synset('physical_entity.n.01'), 6), (Synset('matter.n.03'), 7), (Synset('substance.n.01'), 11), (Synset('material.n.01'), 12), (Synset('paper.n.01'), 15)]

[(Synset('entity.n.01'), 17), (Synset('abstraction.n.06'), 11), (Synset('communication.n.02'), 12), (Synset('written_communication.n.01'), 10), (Synset('writing.n.02'), 24), (Synset('essay.n.01'), 6), (Synset('composition.n.08'), 8)]

[(Synset('entity.n.01'), 17), (Synset('physical_entity.n.01'), 6), (Synset('object.n.01'), 12), (Synset('whole.n.02'), 11), (Synset('artifact.n.01'), 7), (Synset('instrumentality.n.03'), 13), (Synset('medium.n.01'), 9), (Synset('print_media.n.01'), 6), (Synset('press.n.02'), 1

#### Third exercise
Distance from the root for each synset of the concept and for the words that appear in its definition.

In [17]:
def calculate_distance_root(synset):
    """Return the length of the shortest hypernym path of `synset`.

    The length is the number of entries in the path, so both ends are
    counted: a path that contains only the synset itself scores 1.

    Args:
        synset: an object exposing hypernym_paths(), which returns a
            sequence of paths.

    Returns:
        The smallest path length.

    Raises:
        ValueError: if hypernym_paths() returns no path at all.
    """
    path_lengths = map(len, synset.hypernym_paths())
    return min(path_lengths)

def distance_root(word):
    """Map each synset of `word` to root distances for it and its definition words.

    For every synset of `word` the result holds a dict with:
      * `word` itself -> the synset's distance from the root
        (see calculate_distance_root);
      * each pre-processed word of the synset's definition -> the smallest
        root distance among that word's own synsets.

    Definition words that have no WordNet synset are left out. Because both
    kinds of entry share one dict, a definition word equal to `word` (same
    case) overwrites the concept's own entry.

    Args:
        word: the lemma to look up in WordNet.

    Returns:
        dict of synset -> {word_or_definition_word: distance}.
    """
    output = dict()

    for syn in wn.synsets(word):
        distances = {word: calculate_distance_root(syn)}

        for def_word in pre_processing(syn.definition()):
            # Each candidate distance is computed once; default=None covers
            # definition words that WordNet does not know.
            closest = min(
                (calculate_distance_root(def_syn) for def_syn in wn.synsets(def_word)),
                default=None,
            )
            if closest is not None:
                distances[def_word] = closest

        output[syn] = distances

    return output

In [18]:
# Third exercise: for each concept, print the root distance of every synset
# and of the words found in that synset's definition.
for word in ['Courage', 'Paper', 'Apprehension', 'Sharpener']: 
    print("\n------------------------\n")
    print("Concept: ",word)
    print(distance_root(word))


------------------------

Concept:  Courage
{Synset('courage.n.01'): {'Courage': 7, 'quality': 1, 'spirit': 5, 'enable': 2, 'face': 1, 'danger': 4, 'pain': 4, 'fear': 1}}

------------------------

Concept:  Paper
{Synset('paper.n.01'): {'Paper': 6, 'material': 1, 'cellulose': 9, 'pulp': 3, 'derive': 1, 'wood': 6, 'rag': 2, 'grass': 2}, Synset('composition.n.08'): {'Paper': 7, 'essay': 2, 'write': 1, 'assignment': 6}, Synset('newspaper.n.01'): {'Paper': 10, 'daily': 1, 'weekly': 1, 'publication': 4, 'folded': 1, 'sheet': 2, 'news': 6, 'article': 5, 'advertisement': 6}, Synset('paper.n.04'): {'Paper': 8, 'medium': 1, 'write': 1, 'communication': 3}, Synset('paper.n.05'): {'Paper': 9, 'scholarly': 1, 'article': 5, 'describe': 1, 'result': 2, 'observation': 7, 'hypothesis': 6}, Synset('newspaper.n.02'): {'Paper': 10, 'business': 5, 'firm': 1, 'publish': 2, 'newspaper': 7}, Synset('newspaper.n.03'): {'Paper': 8, 'physical': 1, 'object': 2, 'product': 6, 'newspaper': 7, 'publisher': 7}, Sy

#### Fourth exercise
Compute similarity scores between the concept's definition and the definitions of its hypernyms and hyponyms. We use Sentence Transformers.

In [19]:
from sentence_transformers import SentenceTransformer
from scipy.spatial import distance

'''
from nltk.translate.bleu_score import sentence_bleu
from bleu import multi_list_bleu
from rouge import Rouge

'''

_SENTENCE_MODEL_NAME = 'distilbert-base-nli-mean-tokens'
# Loaded models, keyed by name, so repeated calls do not reload the weights.
_sentence_model_cache = {}


def _get_sentence_model():
    """Return the default SentenceTransformer, loading it only on first use."""
    if _SENTENCE_MODEL_NAME not in _sentence_model_cache:
        _sentence_model_cache[_SENTENCE_MODEL_NAME] = SentenceTransformer(_SENTENCE_MODEL_NAME)
    return _sentence_model_cache[_SENTENCE_MODEL_NAME]


def _average_definition_similarity(model, concept_definition, related_synsets):
    """Average cosine similarity between one definition and those of related synsets.

    Returns None when `related_synsets` is empty, so the caller can skip the
    report line instead of dividing by zero.
    """
    if len(related_synsets) == 0:
        return None

    total_similarity = 0
    for related in related_synsets:
        embeddings = model.encode([concept_definition, related.definition()])
        # distance.cosine is a distance; 1 - distance gives the similarity.
        total_similarity += 1 - distance.cosine(embeddings[0], embeddings[1])
    return total_similarity / len(related_synsets)


def definition_overlap(word, model=None):
    """Print how similar each synset's definition is to its neighbours' definitions.

    For every synset of `word`, the definition is printed, followed by the
    average cosine similarity between its sentence embedding and those of
    the definitions of its direct hypernyms, then of its direct hyponyms.
    An average is printed only when the synset has at least one such
    neighbour. Nothing is returned.

    Args:
        word: the lemma to look up in WordNet.
        model: optional object with an encode(list_of_sentences) method
            returning one embedding per sentence. When omitted, the
            'distilbert-base-nli-mean-tokens' SentenceTransformer is used
            and kept in memory for later calls.
    """
    if model is None:
        model = _get_sentence_model()

    for syn in wn.synsets(word):
        concept_definition = syn.definition()

        print ("\n\nDefinition of", syn,  "=", concept_definition)
        print ()

        hypernym_similarity = _average_definition_similarity(
            model, concept_definition, syn.hypernyms())
        if hypernym_similarity is not None:
            print ("Average similarity for hypernyms:",  hypernym_similarity)

        print ()

        hyponym_similarity = _average_definition_similarity(
            model, concept_definition, syn.hyponyms())
        if hyponym_similarity is not None:
            print ("Average similarity for hyponyms:",  hyponym_similarity)

In [20]:
# Fourth exercise: definition_overlap prints its own report and returns
# nothing, so it is called directly; wrapping it in print() only appended a
# stray "None" after each concept.
for word in ['Courage', 'Paper', 'Apprehension', 'Sharpener']:
    print("\n------------------------\n")
    print("Concept: ",word)
    definition_overlap(word)


------------------------

Concept:  Courage


Definition of Synset('courage.n.01') = a quality of spirit that enables you to face danger or pain without showing fear

Average similarity for hypernyms: 0.559572160243988

Average similarity for hyponyms: 0.6555851783071246
None

------------------------

Concept:  Paper


Definition of Synset('paper.n.01') = a material made of cellulose pulp derived mainly from wood or rags or certain grasses

Average similarity for hypernyms: 0.418277770280838

Average similarity for hyponyms: 0.5440274962159091


Definition of Synset('composition.n.08') = an essay (especially one written as an assignment)

Average similarity for hypernyms: 0.6943868398666382

Average similarity for hyponyms: 0.613116443157196


Definition of Synset('newspaper.n.01') = a daily or weekly publication on folded sheets; contains news and articles and advertisements

Average similarity for hypernyms: 0.7504271268844604

Average similarity for hyponyms: 0.6325632035732269


