In [20]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords as sw
from itertools import chain
import re

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrianvazquezbarrera/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Ejercicio 1

In [28]:
def simplified_lesk(context_sentence, ambiguous_word):
    
    stopwords = sw.words('english')
    max_overlaps = -1; 
    lesk_sense = None
    
    context_sentence = [x for x in context_sentence.split() if x not in stopwords]
        
    for ss in wn.synsets(ambiguous_word):
            
        lesk_dictionary = []

        # Includes definition.
        lesk_dictionary += [x for x in ss.definition().split() if x not in stopwords]
        
        # Includes lemma_names.
        lesk_dictionary += [x for x in ss.lemma_names() if x not in stopwords]
        
        lesk_dictionary = [re.sub(r'\[.*?\]|\(.*?\)|\W+', ' ', x).replace('_', ' ') for x in lesk_dictionary]
        
        overlaps = set(lesk_dictionary).intersection(set(context_sentence))
                        
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
                
    return lesk_sense

In [29]:
frases=[("Yesterday I went to the bank to withdraw the money and the credit card did not work",'bank'),
       ("The river overflowed the bank.",'bank'),
       ("The van pulled up outside the bank and three masked men got out.",'bank'),
       ("The boy leapt from the bank into the cold water.",'bank'),
       ("I went fishing for some sea bass.",'bass'),
       ("The bass line of the song is too weak.",'bass'),
       ("I can hear bass frequency sound.",'bass'),
       ("He likes to eat grilled bass fish.",'bass')]

for frase, palabra in frases:
    print(f"Frase: {frase} \nPalabra: {palabra}")
    print(f"Desambiguación:")
    print(simplified_lesk(frase, palabra).definition())
    print(10*"---")



Frase: Yesterday I went to the bank to withdraw the money and the credit card did not work 
Palabra: bank
Desambiguación:
a financial institution that accepts deposits and channels the money into lending activities
------------------------------
Frase: The river overflowed the bank. 
Palabra: bank
Desambiguación:
sloping land (especially the slope beside a body of water)
------------------------------
Frase: The van pulled up outside the bank and three masked men got out. 
Palabra: bank
Desambiguación:
a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
------------------------------
Frase: The boy leapt from the bank into the cold water. 
Palabra: bank
Desambiguación:
sloping land (especially the slope beside a body of water)
------------------------------
Frase: I went fishing for some sea bass. 
Palabra: bass
Desambiguación:
the lowest part of the musical range
------------------------------
Frase: The bas

## Ejercicio 2

In [30]:
import nltk
import gensim
from nltk.data import find
import math
import re

nltk.download('word2vec_sample') 


# Cargar el modelo de embeding pre-entrenados del NLTK
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)


[nltk_data] Downloading package word2vec_sample to
[nltk_data]     /Users/adrianvazquezbarrera/nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!


In [31]:
def cosine_similarity(A, B):
    
    numerator = 0
    denominatorA = 0
    denominatorB = 0
    
    if (len(A) != len(B)):
        raise Exception("Dimension does not match")
    
    for i in range(len(A)):
        numerator += A[i] * B[i]
        denominatorA += A[i]**2
        denominatorB += B[i]**2
    
    return numerator / (math.sqrt(denominatorA)) * (math.sqrt(denominatorB))

In [32]:
def get_name(wn_word):
    return (wn_word.name().split('.')[0])

In [33]:
def get_similar_words(word, related_words, depth = 20, alpha = 0.5):
    
    words = []
    rel = []
    
    for w in related_words:
        try:
            model[w]
            rel.append(w)
        except:
            continue
                           
    for w, d in model.most_similar(positive=rel, topn=depth):
        try:
            if cosine_similarity( model[word], model[w] ) >= alpha:
                #print(cosine_similarity( model[word], model[w] ))
                words.append(w)
        except:
            continue
         
    return words


def get_antonyms_words(word, related_words, depth=20, alpha = 0.5):
    
    words = []
    rel = []
    
    for w in related_words:
        try:
            model[w]
            rel.append(w)
        except:
            continue
                
    related_words = rel
            
    for w, d in model.most_similar(negative=related_words, topn = depth):
        
        if cosine_similarity( model[word], model[w]) <= alpha:
            #print(cosine_similarity( model[word], model[w] ))
            words.append(w)
            
    return words
    

In [47]:
def embeddings_lesk(context_sentence, ambiguous_word, depth = 40, alpha = 0.5):
    
    stopwords = sw.words('english')
    max_overlaps = -1; 
    lesk_sense = None
    
    context_sentence = [x for x in context_sentence.split() if x not in stopwords]
    
    for ss in wn.synsets(ambiguous_word):
            
        lesk_dictionary = []

        # Includes definition.
        lesk_dictionary += [x for x in ss.definition().split() if x not in stopwords]
        
        # Includes lemma_names.
        lesk_dictionary += [x for x in ss.lemma_names() if x not in stopwords]
        
        lesk_dictionary = [re.sub(r'\[.*?\]|\(.*?\)|\W+', ' ', x).replace('_', ' ') for x in lesk_dictionary]
        
        l = []
        for m in lesk_dictionary:
            if " " in m:
                l += m.split(" ")
            else:
                l.append(m)
                                    
        lesk_dictionary = l
        
        lesk_dictionary += get_similar_words(get_name(ss), l, depth=depth, alpha=alpha)
        lesk_dictionary = [re.sub(r'\[.*?\]|\(.*?\)|\W+', ' ', x).replace('_', ' ') for x in lesk_dictionary]

        overlaps = set(lesk_dictionary).intersection(context_sentence)
        
        
        if len(overlaps) > max_overlaps:
            lesk_sense = ss
            max_overlaps = len(overlaps)
                
    return lesk_sense

In [48]:
for frase, palabra in frases:
    print(f"Frase: {frase} \nPalabra: {palabra}")
    print(f"Desambiguación:")
    print(embeddings_lesk(frase, palabra, depth=30, alpha=0.6).definition())

    print(10*"---")

Frase: Yesterday I went to the bank to withdraw the money and the credit card did not work 
Palabra: bank
Desambiguación:
a financial institution that accepts deposits and channels the money into lending activities
------------------------------
Frase: The river overflowed the bank. 
Palabra: bank
Desambiguación:
sloping land (especially the slope beside a body of water)
------------------------------
Frase: The van pulled up outside the bank and three masked men got out. 
Palabra: bank
Desambiguación:
a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
------------------------------
Frase: The boy leapt from the bank into the cold water. 
Palabra: bank
Desambiguación:
sloping land (especially the slope beside a body of water)
------------------------------
Frase: I went fishing for some sea bass. 
Palabra: bass
Desambiguación:
the lean flesh of a saltwater fish of the family Serranidae
----------------------