In [1]:
from pprint import pprint

In [2]:
import nltk
# Download the NLTK resources used in this notebook: the WordNet corpus,
# the stop-word lists and the 'punkt' tokenizer data.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import wordnet as wn  # to get the synsets
from nltk.corpus import stopwords  # common stop words in English
from nltk.tokenize import word_tokenize  # to tokenize the sentences
from nltk.wsd import lesk # for word sense disambiguation

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**Word sense disambiguation**

A word with multiple senses gives rise to ambiguity.

WSD identifies which sense of the word is intended in a given context.

Applications include improving the relevance of search engines, anaphora resolution, coherence, and inference.

Ref: https://en.wikipedia.org/wiki/Word-sense_disambiguation

**LESK algorithm**

Given an ambiguous word and the context in which it occurs, Lesk compares the context sentence with the definition of each candidate Synset and returns the Synset whose definition has the highest number of words overlapping with the context.

Original paper: https://dl.acm.org/doi/pdf/10.1145/318723.318728

Ref: https://web.stanford.edu/~jurafsky/slp3/slides/Chapter18.wsd.pdf

In [8]:
# Two example sentences that use the word 'rock' in different contexts:
# sent_1 talks about minerals, sent_2 talks about a rock band.
sent_1 = "A rock is classified according to characteristics such as mineral and chemical composition."
sent_2 = "Queen are a British rock band formed in London in 1970."

In [9]:
# For example, given the word 'rock', WordNet gives several different synsets for it.
# For each synset, we can look up its definition in WordNet.
print("\nDefinition of each synset of the word 'rock'")
for syn in wn.synsets('rock'):
  print(syn, '\t\t', syn.definition())


Definition of each synset of the word 'rock'
Synset('rock.n.01') 		 a lump or mass of hard consolidated mineral matter
Synset('rock.n.02') 		 material consisting of the aggregate of minerals like those making up the Earth's crust
Synset('rock.n.03') 		 United States gynecologist and devout Catholic who conducted the first clinical trials of the oral contraceptive pill (1890-1984)
Synset('rock.n.04') 		 (figurative) someone who is strong and stable and dependable; ; --Gospel According to Matthew
Synset('rock_candy.n.01') 		 hard bright-colored stick candy (typically flavored with peppermint)
Synset('rock_'n'_roll.n.01') 		 a genre of popular music originating in the 1950s; a blend of black rhythm-and-blues with white country-and-western
Synset('rock.n.07') 		 pitching dangerously to one side
Synset('rock.v.01') 		 move back and forth or sideways
Synset('rock.v.02') 		 cause to move back and forth


In [5]:
# Show NLTK's list of common English stop words, then report how many there are.
# The list is loaded once and reused for both the listing and the count.
english_stop_words = stopwords.words('english')
pprint(english_stop_words)

print("\nThere are %d stop words in the list" % len(english_stop_words))

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [10]:
def remove_stop_words(tokens, stop_words=None):
  """
  Remove stop words from a list of tokens.

  Matching is case-insensitive, so capitalised stop words (for example a
  sentence-initial "A" or "The") are removed as well. Tokens that are kept
  retain their original casing and order.

  Args:
    tokens: iterable of token strings, e.g. the output of word_tokenize.
    stop_words: optional iterable of words to drop. When omitted, the
      English list from the NLTK stopwords corpus is used.

  Returns:
    A new list with the tokens that are not stop words.
  """
  if stop_words is None:
    # Default list comes from the NLTK stopwords corpus (not from WordNet)
    stop_words = stopwords.words('english')
  # Lower-cased set: constant-time lookups and case-insensitive comparison
  stop_word_set = {word.lower() for word in stop_words}
  return [token for token in tokens if token.lower() not in stop_word_set]

## Without stop word removal

In [19]:
# Giving the POS: disambiguate 'rock' in sent_1 using all tokens (stop words kept),
# restricting the candidate senses to nouns ('n').
# NOTE(review): the recorded result is rock.n.04, whose definition shares the stop
# words 'is' and 'and' with sent_1 -- presumably that overlap drives the choice; confirm.
syn = lesk(word_tokenize(sent_1), 'rock', 'n')
print(syn, '\n', syn.definition())

Synset('rock.n.04') 
 (figurative) someone who is strong and stable and dependable; ; --Gospel According to Matthew


In [8]:
# Giving the POS: disambiguate 'rock' in sent_2 using all tokens (stop words kept),
# restricting the candidate senses to nouns ('n').
syn = lesk(word_tokenize(sent_2), 'rock', 'n')
print(syn, '\n', syn.definition())

Synset('rock_'n'_roll.n.01') 
 a genre of popular music originating in the 1950s; a blend of black rhythm-and-blues with white country-and-western


In [9]:
# w/o giving the POS: disambiguate 'rock' in sent_1 using all tokens (stop words kept).
# No POS is passed, and the recorded result is a verb sense (rock.v.02).
syn = lesk(word_tokenize(sent_1), 'rock')
print(syn, '\n', syn.definition())

Synset('rock.v.02') 
 cause to move back and forth


In [10]:
# w/o giving the POS: disambiguate 'rock' in sent_2 using all tokens (stop words kept),
# without restricting the part of speech.
syn = lesk(word_tokenize(sent_2), 'rock')
print(syn, '\n', syn.definition())

Synset('rock_'n'_roll.n.01') 
 a genre of popular music originating in the 1950s; a blend of black rhythm-and-blues with white country-and-western


## With stop word removal

In [20]:
# Giving POS: disambiguate 'rock' in sent_1 after filtering the tokens through
# remove_stop_words, restricting the candidate senses to nouns ('n').
syn = lesk(remove_stop_words(word_tokenize(sent_1)), 'rock', 'n')
print(syn, '\n', syn.definition())

Synset('rock.n.01') 
 a lump or mass of hard consolidated mineral matter


In [12]:
# Giving POS: disambiguate 'rock' in sent_2 after filtering the tokens through
# remove_stop_words, restricting the candidate senses to nouns ('n').
# NOTE(review): the recorded result is rock_candy.n.01, which does not fit the sentence;
# presumably no definition overlaps with the filtered context and a tie is broken
# arbitrarily -- confirm against the lesk implementation.
syn = lesk(remove_stop_words(word_tokenize(sent_2)), 'rock', 'n')
print(syn, '\n', syn.definition())

Synset('rock_candy.n.01') 
 hard bright-colored stick candy (typically flavored with peppermint)


In [13]:
# w/o giving POS: disambiguate 'rock' in sent_1 after filtering the tokens through
# remove_stop_words, without restricting the part of speech.
syn = lesk(remove_stop_words(word_tokenize(sent_1)), 'rock')
print(syn, '\n', syn.definition())

Synset('rock.n.01') 
 a lump or mass of hard consolidated mineral matter


In [14]:
# w/o giving POS: disambiguate 'rock' in sent_2 after filtering the tokens through
# remove_stop_words, without restricting the part of speech.
# NOTE(review): the recorded result is again rock_candy.n.01, which does not fit the
# sentence -- see whether the filtered context overlaps with any definition at all.
syn = lesk(remove_stop_words(word_tokenize(sent_2)), 'rock')
print(syn, '\n', syn.definition())

Synset('rock_candy.n.01') 
 hard bright-colored stick candy (typically flavored with peppermint)
