In [1]:
from pprint import pprint

In [2]:
import nltk
# Download the NLTK data packages this notebook reads: the WordNet corpus,
# the stop word lists, and the 'punkt' tokenizer data (presumably what
# word_tokenize relies on -- confirm against the installed NLTK version).
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import wordnet as wn  # to get the synsets
from nltk.corpus import stopwords  # common stop words in English
from nltk.tokenize import word_tokenize  # to tokenize the sentences

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# Hand-written definitions: each key is a word, each value is my own
# definition of that word.
# NOTE: the spelling inside the strings is left exactly as written, because
# these strings are printed verbatim further down.
word_def = {
    "politics": (
        "a rather broad term accomodating the spectrum of ideas and choices "
        "of an individual or a group of individuals or a society as a whole."
    ),
    "justice": "the absolute right which can not be challenged",
    "food": "anything that one can literally eat or rather consume",
    "patience": "ability to stay calm and composed at pressing times",
}

In [4]:
# For example, given the word 'wood', WordNet gives several different synsets.
print("Different synsets of the word 'wood'")
wood_synsets = wn.synsets('wood')
pprint(wood_synsets)

# Each synset has its own definition in WordNet; print them side by side.
print("\nDefinition of each synset of the word 'wood'")
for synset in wood_synsets:
  print(synset, '\t\t', synset.definition())

Different synsets of the word 'wood'
[Synset('wood.n.01'),
 Synset('forest.n.01'),
 Synset('wood.n.03'),
 Synset('wood.n.04'),
 Synset('wood.n.05'),
 Synset('wood.n.06'),
 Synset('woodwind.n.01'),
 Synset('wood.n.08')]

Definition of each synset of the word 'wood'
Synset('wood.n.01') 		 the hard fibrous lignified substance under the bark of trees
Synset('forest.n.01') 		 the trees and other plants in a large densely wooded area
Synset('wood.n.03') 		 United States film actress (1938-1981)
Synset('wood.n.04') 		 English conductor (1869-1944)
Synset('wood.n.05') 		 English writer of novels about murders and thefts and forgeries (1814-1887)
Synset('wood.n.06') 		 United States painter noted for works based on life in the Midwest (1892-1942)
Synset('woodwind.n.01') 		 any wind instrument other than the brass instruments
Synset('wood.n.08') 		 a golf club with a long shaft used to hit long shots; originally made with a wooden head


In [5]:
# Common English stop words shipped with NLTK, followed by how many there are
english_stop_words = stopwords.words('english')
pprint(english_stop_words)

print("\nThere are %d stop words in the list" % len(english_stop_words))

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
def remove_stop_words(tokens, stop_words=None):
  """
  Return the tokens that are not stop words, in their original order.

  tokens: iterable of string tokens (e.g. the output of word_tokenize).
  stop_words: optional iterable of words to drop. When omitted, NLTK's
    English stop word list is used.

  Matching is case-insensitive, so a capitalised stop word such as 'The'
  is removed as well; the tokens that survive keep their original casing.
  """
  if stop_words is None:
    # Default: the common English stop words from NLTK's stopwords corpus
    stop_words = stopwords.words('english')
  # Lower-cased set: constant-time lookups and case-insensitive comparison
  stop_set = {stop_word.lower() for stop_word in stop_words}
  return [token for token in tokens if token.lower() not in stop_set]

In [7]:
# Compare each of my definitions with every WordNet sense of the same word
for word, my_definition in word_def.items():
  print("Word:", word)
  print("My definition:", my_definition)
  print()

  # Tokens of my definition, with the stop words removed
  tokens = remove_stop_words(word_tokenize(my_definition))

  # Go through the different synsets of the word
  for syn in wn.synsets(word):
    print(syn, syn.definition())

    # Tokens of the synset's definition, with the stop words removed
    defn_tokens = remove_stop_words(word_tokenize(syn.definition()))

    # Count how many of my tokens also appear in the synset's definition
    count = sum(1 for token in tokens if token in defn_tokens)

    print("Number of matching words:", count)
  print("----------------------------")

Word: politics
My definition: a rather broad term accomodating the spectrum of ideas and choices of an individual or a group of individuals or a society as a whole.

Synset('politics.n.01') social relations involving intrigue to gain authority or power
Number of matching words: 0
Synset('politics.n.02') the study of government of states and other political units
Number of matching words: 0
Synset('politics.n.03') the profession devoted to governing and to political affairs
Number of matching words: 0
Synset('politics.n.04') the opinion you hold with respect to political questions
Number of matching words: 0
Synset('politics.n.05') the activities and affairs involved in managing a state or a government
Number of matching words: 0
----------------------------
Word: justice
My definition: the absolute right which can not be challenged

Synset('justice.n.01') the quality of being just or fair
Number of matching words: 0
Synset('justice.n.02') judgment involved in the determination of right