## Simplified Lesk Algorithm: WordNet Definitions for Similarity Comparison ##
From http://pydoc.net/Python/nltk/3.0.0b2/nltk.wsd/# 


In [2]:
# Natural Language Toolkit: Word Sense Disambiguation Algorithms
#
# Author: Liling Tan <alvations@gmail.com>
#
# Copyright (C) 2001-2014 NLTK Project
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from nltk.corpus import wordnet as wn

In [45]:
def compare_overlaps_greedy(context, synsets_signatures, pos=None):
    """
    Pick the candidate sense whose signature shares the most words with the context.

    Signature and context are compared as sets, so a repeated word counts once.
    Candidates are visited in the iteration order of ``synsets_signatures``; on
    a tie, the first candidate that reached the highest count is kept. A
    candidate with no overlapping word is never selected.

    :param context: Iterable of tokens from the sentence where the ambiguous
        word occurs. It is materialised into a set once, so a one-shot
        iterable (e.g. a generator) is also accepted.
    :param synsets_signatures: Mapping from each candidate sense to its
        signature, i.e. an iterable of words that 'signifies' that sense.
        Each key must provide ``pos()`` and ``definition()`` methods.
    :param pos: Optional Part-of-Speech (POS) tag. When given, candidates whose
        ``str(sense.pos())`` differs from it are skipped.
    :return: A ``(sense, sense.definition())`` tuple for the sense with the
        highest overlap, or ``None`` when no candidate overlaps the context.
    """
    # Build the context set once: it is loop-invariant, and re-iterating
    # ``context`` per candidate would exhaust a one-shot iterable after the
    # first candidate.
    context_words = set(context)

    best_sense = None
    best_count = 0
    for sense, signature in synsets_signatures.items():
        if pos and str(sense.pos()) != pos:  # Skips different POS.
            continue
        count = len(context_words.intersection(signature))
        # Strict '>' keeps the earliest candidate on ties and rejects zero overlap.
        if count > best_count:
            best_sense, best_count = sense, count

    if best_sense is None:
        return None
    return (best_sense, best_sense.definition())

In [37]:
def lesk(context_sentence, ambiguous_word, pos=None, dictionary=None):
    """
    Disambiguate a word by overlap between its context and sense definitions.

    This is a simplified take on the Lesk algorithm (1986): every candidate
    sense gets a signature (a list of words), and the sense whose signature
    overlaps most with the context sentence wins. See http://goo.gl/8TB15w

    Example, with the output recorded in this notebook:

        >>> from nltk import word_tokenize
        >>> sent = word_tokenize("I went to the bank to deposit money.")
        >>> lesk(sent, "bank", "n")
        (Synset('savings_bank.n.02'), 'a container (usually with a slot in the top) for keeping money at home')

    :param context_sentence: The tokens of the sentence where the ambiguous word occurs.
    :param ambiguous_word: The ambiguous word that requires WSD.
    :param pos: A specified Part-of-Speech (POS); senses with another POS are skipped.
    :param dictionary: Optional mapping from each candidate sense to its
        signature words. When omitted or empty, it is built from the
        whitespace-split WordNet definition of every synset of ``ambiguous_word``.
    :return: Whatever ``compare_overlaps_greedy`` returns for the chosen
        sense: a ``(synset, definition)`` tuple, or ``None`` if nothing overlaps.
    """
    signatures = dictionary
    if not signatures:
        # Fall back to WordNet glosses, tokenised on whitespace only.
        signatures = {
            sense: sense.definition().split()
            for sense in wn.synsets(ambiguous_word)
        }
    return compare_overlaps_greedy(context_sentence, signatures, pos)


## Try out the code on an example: ##

In [36]:
from nltk.tokenize import word_tokenize

# Tokenise one example sentence, then disambiguate two of its words,
# each paired with the POS tag it should be resolved under.
sentence = word_tokenize("I went to the bank to deposit money.")
for target_word, target_pos in (("bank", "n"), ("deposit", "v")):
    print(lesk(sentence, target_word, target_pos))

(Synset('savings_bank.n.02'), 'a container (usually with a slot in the top) for keeping money at home')
(Synset('deposit.v.02'), 'put into a bank account')


##Exercise: modify this code to print out the chosen definition in addition to the chosen synset, to help determine whether the correct sense was selected.##

In [16]:
## See the code change in the function definition above

##Exercise: try out other sentences containing bank to see how well the algorithm works.##

In [29]:
# Each entry is (sentence, noun to disambiguate); every target is looked up as a noun.
noun_examples = (
    ("Andy is on his laptop computer.", "computer"),
    ("A roundabout is better than a traffic signal.", "signal"),
    ("Something is a amiss in Apple's patents for the flux capicitor.", "flux"),
)
for example_text, target_noun in noun_examples:
    print(lesk(word_tokenize(example_text), target_noun, "n"))

None
(Synset('signal.n.01'), 'any nonverbal action or gesture that encodes a message')
(Synset('flux_density.n.01'), '(physics) the number of changes in energy flow across a given surface per unit area')


##Exercise: try out other ambiguous words and contextualizing sentences.##

In [47]:
# Each entry is (sentence, target word, POS tag) and is printed in this order.
word_examples = (
    ("What is the meaning of life?", "life", "n"),
    ("What is the meaning of life?", "meaning", "n"),
    ("Did you phone him earlier today?", "phone", "v"),
    ("Did you get the new phone?", "phone", "n"),
    ("Did you phone him on the telephone earlier today?", "phone", "v"),
    ("A newspaper reporter will be phoning you later.", "phoning", "v"),
)
for example_text, target_word, target_pos in word_examples:
    print(lesk(word_tokenize(example_text), target_word, target_pos))

(Synset('biography.n.01'), "an account of the series of events making up a person's life")
(Synset('meaning.n.02'), 'the idea that is intended')
None
(Synset('earphone.n.01'), 'electro-acoustic transducer for converting electric signals into sounds; it is held over or inserted into the ear')
(Synset('call.v.03'), 'get or try to get into communication (with someone) by telephone')
None


##Exercise: (optional, for later) What happens with ties in the number of overlaps? ##

In [43]:
# Re-run the "meaning" example to look at how ties in overlap count are resolved.
question_tokens = word_tokenize("What is the meaning of life?")
print(lesk(question_tokens, "meaning", "n"))


 tie!  2
Synset('meaning.n.01') Synset('meaning.n.02')
(Synset('meaning.n.02'), 'the idea that is intended')
