### **EJERCICIO 1**

Implementar, usando NLTK y Python, el algoritmo de Lesk simplificado para desambiguar el sentido de las palabras (WSD). La función recibirá una palabra y una frase que la contenga y decidirá el mejor sentido para esa palabra. Las frases serán en inglés y se deberán eliminar las ‘stopwords’ de la frase, de la glosa y de los ejemplos de cada sentido.

In [None]:
import nltk
# Fetch the NLTK data packages this cell relies on.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords as sw

# English stopword list from the NLTK stopwords corpus; the functions below
# test lemmas against it to discard function words.
sw_english=sw.words('english')


def get_signature(synset):
    """Build the set of content lemmas that describes a WordNet sense.

    The signature is drawn from the synset's usage examples and from its
    definition (gloss). Each token is reduced with ``wn.morphy``; tokens for
    which morphy returns ``None`` and lemmas listed in ``sw_english`` are
    left out.

    Args:
        synset: Object exposing ``examples()`` and ``definition()``.

    Returns:
        A set of lemma strings.
    """
    texts = list(synset.examples())
    texts.append(synset.definition())

    lemmas = (
        wn.morphy(token)
        for text in texts
        for token in nltk.word_tokenize(text)
    )
    return {
        lemma
        for lemma in lemmas
        if lemma is not None and lemma not in sw_english
    }


def lesk_algorithm(frase, palabra):
    """Choose the WordNet sense of ``palabra`` that best fits ``frase``.

    Simplified Lesk algorithm: the context is the set of non-stopword lemmas
    of the sentence, and each candidate sense is scored by how many of those
    lemmas also appear in its signature (see ``get_signature``).

    Args:
        frase: Sentence that contains the word to disambiguate.
        palabra: Word whose sense is being decided.

    Returns:
        The synset with the largest overlap. If no sense overlaps with the
        context, the first synset returned by ``wn.synsets``. ``None`` when
        WordNet has no synsets for ``palabra``.
    """
    sentidos_palabra = wn.synsets(palabra)
    if not sentidos_palabra:
        # Nothing to choose from; indexing [0] below would raise IndexError.
        return None

    contexto = set()
    for token in nltk.word_tokenize(frase):
        # Lemmatise each token once instead of once per filter condition.
        lemma = wn.morphy(token)
        if lemma is not None and lemma not in sw_english:
            contexto.add(lemma)

    # Default to the first listed sense; it is only replaced by a sense with
    # a strictly larger overlap, so ties keep the earlier sense.
    mejor_sentido = sentidos_palabra[0]
    max_overlap = 0
    for sentido in sentidos_palabra:
        overlap = len(contexto & get_signature(sentido))
        if overlap > max_overlap:
            max_overlap = overlap
            mejor_sentido = sentido
    return mejor_sentido

# Demo run: disambiguate "bank" in a sentence about withdrawing money.
print(
    lesk_algorithm(
        frase="Yesterday I went to the bank to withdraw the money and the credit card did not work",
        palabra='bank',
    )
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Synset('depository_financial_institution.n.01')


### **EJERCICIO 2**

Implementar un algoritmo similar para la desambiguación semántica utilizando Word Embeddings y una medida de similitud semántica como la similitud coseno.

In [None]:
import nltk
# Fetch the NLTK data packages used by this exercise.
nltk.download('word2vec_sample')
nltk.download('stopwords')
nltk.download('punkt')

# Word embeddings
import gensim
from nltk.data import find
import numpy as np

# Load the pre-trained embedding model shipped with NLTK: locate the sample
# file in the NLTK data directory and read it as a text-format
# (binary=False) word2vec file.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Package word2vec_sample is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from nltk.corpus import stopwords as sw

# English stopword list from the NLTK stopwords corpus.
# NOTE(review): the functions below also use `wn`, which this cell does not
# import; it is only imported in the Exercise 1 cell.
sw_english=sw.words('english')


def get_context_embedding(sentence):
    """Return the summed embedding of the content words of ``sentence``.

    Each token is lemmatised with ``wn.morphy``. Tokens for which morphy
    returns ``None``, lemmas listed in ``sw_english`` and lemmas missing from
    the embedding vocabulary are skipped; the vectors of the remaining
    lemmas are added together.

    Args:
        sentence: Text to embed.

    Returns:
        The element-wise sum of the word vectors, or a zero vector of
        ``model.vector_size`` components when no word could be embedded.
    """
    vectors = []
    for token in nltk.word_tokenize(sentence):
        lemma = wn.morphy(token)
        if lemma is None or lemma in sw_english:
            continue
        # Indexing the model with an unknown word raises KeyError instead of
        # returning None, so vocabulary membership has to be tested first.
        if lemma in model:
            vectors.append(model[lemma])

    if not vectors:
        # sum([]) would yield the int 0, which is not a usable vector.
        return np.zeros(model.vector_size)
    return sum(vectors)

def get_signature(synset):
    """Collect the content lemmas found in a synset's examples and gloss.

    Every token of each usage example and of the definition is passed
    through ``wn.morphy``. Tokens for which morphy returns ``None`` and
    lemmas present in ``sw_english`` are discarded.

    Args:
        synset: Object exposing ``examples()`` and ``definition()``.

    Returns:
        A set with the remaining lemma strings.
    """
    signature = set()
    for text in [*synset.examples(), synset.definition()]:
        for token in nltk.word_tokenize(text):
            lemma = wn.morphy(token)
            if lemma is None or lemma in sw_english:
                continue
            signature.add(lemma)
    return signature

def lesk_algorithm_embeddings(frase, palabra):
    """Choose the WordNet sense of ``palabra`` whose signature is closest to ``frase``.

    Embedding-based variant of simplified Lesk: the sentence is represented
    by the vector returned by ``get_context_embedding`` and each sense by the
    sum of the embeddings of its signature words (see ``get_signature``).
    Senses are scored by the cosine similarity between both vectors.

    Args:
        frase: Sentence that contains the word to disambiguate.
        palabra: Word whose sense is being decided.

    Returns:
        The synset with the highest cosine similarity above 0. If no sense
        scores above 0, or the context has no usable embedding, the first
        synset returned by ``wn.synsets``. ``None`` when WordNet has no
        synsets for ``palabra``.
    """
    sentidos_palabra = wn.synsets(palabra)
    if not sentidos_palabra:
        # Nothing to choose from; indexing [0] below would raise IndexError.
        return None

    mejor_sentido = sentidos_palabra[0]
    contexto = get_context_embedding(frase)
    norma_contexto = np.linalg.norm(contexto)
    if norma_contexto == 0:
        # No context word could be embedded: cosine similarity is undefined,
        # so keep the default sense instead of dividing by zero.
        return mejor_sentido

    max_overlap = 0
    for sentido in sentidos_palabra:
        # Only signature words present in the embedding vocabulary count.
        vectores = [model[w] for w in get_signature(sentido) if w in model]
        if not vectores:
            continue
        firma = sum(vectores)
        norma_firma = np.linalg.norm(firma)
        if norma_firma == 0:
            continue
        overlap = np.dot(contexto, firma) / (norma_contexto * norma_firma)
        if overlap > max_overlap:
            max_overlap = overlap
            mejor_sentido = sentido

    return mejor_sentido

# (sentence, target word) pairs. The trailing comment on each entry is the
# sense shown for it in the recorded output of this cell.
frases = [
    ("Yesterday I went to the bank to withdraw the money and the credit card did not work", 'bank'),  # depository_financial_institution.n.01
    ("The river overflowed the bank.", 'bank'),  # bank.n.01
    ("The van pulled up outside the bank and three masked men got out.", 'bank'),  # bank.n.06
    ("The boy leapt from the bank into the cold water.", 'bank'),  # bank.n.01
    ("I went fishing for some sea bass.", 'bass'),  # bass.n.08
    ("The bass line of the song is too weak.", 'bass'),  # bass.s.01
]

# Print the sense chosen for every pair.
for frase in frases:
    print(lesk_algorithm_embeddings(*frase))


Synset('depository_financial_institution.n.01')
Synset('bank.n.01')
Synset('bank.n.06')
Synset('bank.n.01')
Synset('bass.n.08')
Synset('bass.s.01')
