In [1]:
import nltk

In [2]:
from nltk.corpus import wordnet as wn

### 1 - Sentence Tokenizer

In [4]:
parag = "My name is Vishal. I like nltk. I like python. I make tutorials."

In [5]:
from nltk.tokenize import sent_tokenize

In [6]:
sent_tokenize(parag)

['My name is Vishal.', 'I like nltk.', 'I like python.', 'I make tutorials.']

In [7]:
my_arr = sent_tokenize(parag)

In [8]:
import nltk.data

In [10]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [11]:
parag = "I like crayons. I fly high. I love the radio. You are awesome"

In [12]:
array_t = tokenizer.tokenize(parag)

In [13]:
array_t

['I like crayons.', 'I fly high.', 'I love the radio.', 'You are awesome']

### 2 - Word Tokenizer

In [1]:
from nltk.tokenize import word_tokenize

In [2]:
word_tokenize('This is Vishal hacking your PC')

['This', 'is', 'Vishal', 'hacking', 'your', 'PC']

In [3]:
arr_words = word_tokenize("This is Vishal hacking your PC")

In [4]:
from nltk.tokenize import TreebankWordTokenizer

In [5]:
tok2 = TreebankWordTokenizer()

In [6]:
from nltk.tokenize import WordPunctTokenizer

In [7]:
tok3 = WordPunctTokenizer()

In [8]:
sent = "This is Vishal hacking your pc"

In [9]:
word_tokenize(sent)

['This', 'is', 'Vishal', 'hacking', 'your', 'pc']

In [11]:
tok2.tokenize(sent)

['This', 'is', 'Vishal', 'hacking', 'your', 'pc']

In [12]:
tok3.tokenize(sent)

['This', 'is', 'Vishal', 'hacking', 'your', 'pc']

In [13]:
sent2 = "I won't let you bring cake"

In [14]:
word_tokenize(sent2)

['I', 'wo', "n't", 'let', 'you', 'bring', 'cake']

In [15]:
tok2.tokenize(sent2)

['I', 'wo', "n't", 'let', 'you', 'bring', 'cake']

In [16]:
tok3.tokenize(sent2)

['I', 'won', "'", 't', 'let', 'you', 'bring', 'cake']

### 3 - Regexp tokenizer

- [\w] = word
- [\d] = digit
- [\s] = space

In [17]:
from nltk.tokenize import regexp_tokenize

In [18]:
sent3 = "I can't do this. I won't do that"

In [26]:
regexp_tokenize(sent3, "[\w']+")

['I', "can't", 'do', 'this', 'I', "won't", 'do', 'that']

In [27]:
from nltk.tokenize import RegexpTokenizer

In [29]:
tokenizer = RegexpTokenizer("[\w']+")

In [30]:
tokenizer.tokenize(sent3)

['I', "can't", 'do', 'this', 'I', "won't", 'do', 'that']

### 4 - Stop words

In [31]:
from nltk.corpus import stopwords

In [32]:
ensw = stopwords.words('english')

In [33]:
ensw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [34]:
from nltk.tokenize import word_tokenize

In [35]:
parag1 = "What are you doing exactly right now? I wanted to get to my office."

#### Now I want to filter out the words that are important to me in this sentence, so I will remove out the stop words

In [36]:
parag_arr = word_tokenize(parag1)

In [38]:
filter_arr = [item for item in parag_arr if item not in ensw]

In [39]:
filter_arr

['What', 'exactly', 'right', '?', 'I', 'wanted', 'get', 'office', '.']

### 5 - Synsets, Hypernyms and Hyponyms

In [41]:
from nltk.corpus import wordnet

In [42]:
word1 = "weapon"

In [43]:
syn_array = wordnet.synsets(word1)

In [44]:
syn_array

[Synset('weapon.n.01'), Synset('weapon.n.02')]

In [45]:
woi = syn_array[0]

In [46]:
woi

Synset('weapon.n.01')

In [48]:
woi.definition()

'any instrument or instrumentality used in fighting or hunting'

In [49]:
woi.name()

'weapon.n.01'

In [50]:
woi.pos()

'n'

- **hypernyms** = more abstract than a given word eg. cat - animal, living beings etc
- **hyponyms** = more specific less abstract than hyponyms eg. cat - tiger, panther, cat-food

In [51]:
woi.hypernyms()

[Synset('instrument.n.01')]

In [52]:
woi.hyponyms()

[Synset('bow.n.04'),
 Synset('bow_and_arrow.n.01'),
 Synset('brass_knucks.n.01'),
 Synset('fire_ship.n.01'),
 Synset('flamethrower.n.01'),
 Synset('greek_fire.n.01'),
 Synset('gun.n.01'),
 Synset('knife.n.02'),
 Synset('light_arm.n.01'),
 Synset('missile.n.01'),
 Synset('pike.n.04'),
 Synset('projectile.n.01'),
 Synset('slasher.n.02'),
 Synset('sling.n.04'),
 Synset('spear.n.01'),
 Synset('stun_gun.n.01'),
 Synset('sword.n.01'),
 Synset('tomahawk.n.01'),
 Synset('weapon_of_mass_destruction.n.01')]

In [54]:
woi.hyponyms()[1].definition()

'a weapon consisting of arrows and the bow to shoot them'

In [55]:
woi.hypernym_paths()

[[Synset('entity.n.01'),
  Synset('physical_entity.n.01'),
  Synset('object.n.01'),
  Synset('whole.n.02'),
  Synset('artifact.n.01'),
  Synset('instrumentality.n.03'),
  Synset('device.n.01'),
  Synset('instrument.n.01'),
  Synset('weapon.n.01')]]

### 6 - Lemma, synonyms and antonyms

In [56]:
from nltk.corpus import wordnet

In [57]:
s_arr = wordnet.synsets('win')

In [67]:
s_arr # Array of synsets

[Synset('win.n.01'),
 Synset('winnings.n.01'),
 Synset('win.v.01'),
 Synset('acquire.v.05'),
 Synset('gain.v.05'),
 Synset('succeed.v.01')]

In [59]:
woi = s_arr[2]

In [60]:
woi

Synset('win.v.01')

In [61]:
woi.pos()

'v'

In [62]:
woi.definition()

'be the winner in a contest or competition; be victorious'

In [63]:
woi.lemmas()

[Lemma('win.v.01.win')]

In [68]:
woi.lemmas()[0].name()

'win'

#### Lemmas
- gateway to finding more synonyms or antonyms

In [66]:
syn_arr = [] # Array of synonyms

In [65]:
ant_arr = []

In [69]:
for syn in s_arr:
    for lem in syn.lemmas():
        syn_arr.append(lem.name())

In [70]:
syn_arr

['win',
 'winnings',
 'win',
 'profits',
 'win',
 'acquire',
 'win',
 'gain',
 'gain',
 'advance',
 'win',
 'pull_ahead',
 'make_headway',
 'get_ahead',
 'gain_ground',
 'succeed',
 'win',
 'come_through',
 'bring_home_the_bacon',
 'deliver_the_goods']

In [72]:
len(syn_arr)

20

In [71]:
set(syn_arr)

{'acquire',
 'advance',
 'bring_home_the_bacon',
 'come_through',
 'deliver_the_goods',
 'gain',
 'gain_ground',
 'get_ahead',
 'make_headway',
 'profits',
 'pull_ahead',
 'succeed',
 'win',
 'winnings'}

In [73]:
len(set(syn_arr))

14

In [81]:
woi.lemmas()[0].antonyms()[0].name()

'lose'

In [86]:
ant_arr = []

In [87]:
for syn in s_arr:
    for lem in syn.lemmas():
        for antonymn in lem.antonyms():
            ant_arr.append(antonymn.name())

In [88]:
ant_arr

['losings', 'lose', 'lose', 'fall_back', 'fail']

In [90]:
len(ant_arr)

5

In [89]:
set(ant_arr)

{'fail', 'fall_back', 'lose', 'losings'}

In [91]:
len(set(ant_arr))

4

### 7 - Wu Palmer Similarity

In [93]:
from nltk.corpus import wordnet

In [94]:
# cake, loaf, bread

In [95]:
s_cake = wordnet.synsets('cake')

In [96]:
s_loaf = wordnet.synsets('loaf')

In [97]:
s_bread = wordnet.synsets('bread')

In [98]:
s_cake

[Synset('cake.n.01'),
 Synset('patty.n.01'),
 Synset('cake.n.03'),
 Synset('coat.v.03')]

In [101]:
cake = s_cake[0]

In [99]:
s_loaf

[Synset('loaf_of_bread.n.01'),
 Synset('loaf.n.02'),
 Synset('bum.v.02'),
 Synset('loiter.v.01')]

In [102]:
loaf = s_loaf[1]
loafb = s_loaf[0]

In [100]:
s_bread

[Synset('bread.n.01'), Synset('boodle.n.01'), Synset('bread.v.01')]

In [103]:
bread = s_bread[0]

#### Wu Palmer similarity Uses the hypernym tree to find out similarity of two words

In [104]:
cake.wup_similarity(loaf)

0.3076923076923077

In [105]:
cake.wup_similarity(loafb)

0.26666666666666666

In [106]:
cake.wup_similarity(bread)

0.2857142857142857

In [107]:
loaf.wup_similarity(loafb)

0.7142857142857143

In [108]:
loaf.wup_similarity(bread)

0.7692307692307693

In [109]:
loafb.wup_similarity(bread)

0.9411764705882353

In [112]:
loaf.hyponyms()

[Synset('haslet.n.01'),
 Synset('headcheese.n.01'),
 Synset('lunch_meat.n.01'),
 Synset('pound_cake.n.01'),
 Synset('scrapple.n.01'),
 Synset('sugarloaf.n.01')]

In [117]:
for syn  in wordnet.synsets("loaf"):
    for lem in syn.lemmas():
        print(lem.name())

loaf_of_bread
loaf
loaf
bum
bum_around
bum_about
arse_around
arse_about
fuck_off
loaf
frig_around
waste_one's_time
lounge_around
loll
loll_around
lounge_about
loiter
lounge
footle
lollygag
loaf
lallygag
hang_around
mess_about
tarry
linger
lurk
mill_about
mill_around
