In [7]:
import nltk

# nltk.download()
# Sentence Tokenization

text = "Mary had a little lamb. Her fleece was white as snow."

from nltk.tokenize import word_tokenize, sent_tokenize

sents = sent_tokenize(text, 'english')
print(sents)

['Mary had a little lamb.', 'Her fleece was white as snow.']


In [9]:
# Word Tokenization
words = [word_tokenize(sent) for sent in sents]
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'white', 'as', 'snow', '.']]


In [11]:
# Stopwords [how to filter a text]

from nltk.corpus import stopwords
from string import punctuation

custom_stop_words = set(stopwords.words('english') + list(punctuation))
words_wo_stop_words = [word for word in word_tokenize(text) if word not in custom_stop_words]

print(words_wo_stop_words)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'white', 'snow']


In [17]:
# Ngrams [Bigrams]
from nltk.collocations import *

finder = BigramCollocationFinder.from_words(words_wo_stop_words)
sorted(finder.ngram_fd.items())

# Trigram
# finder2 = TrigramCollocationFinder.from_words(words_wo_stop_words)
# sorted(finder2.ngram_fd.items())

[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'white'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1),
 (('white', 'snow'), 1)]

In [19]:
text2 = "Mary closed on closing night when she was in the mood to close."

from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()
stemmed_words = [st.stem(word) for word in word_tokenize(text2)]
print(stemmed_words)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [20]:
nltk.pos_tag(word_tokenize(text2))

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [22]:
from nltk.corpus import wordnet as wn

for ss in wn.synsets('bass'):
    print(ss, ss.definition())

Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range


In [23]:
from nltk.wsd import lesk

sense1 = lesk(word_tokenize('Sing in a lower tone, along with the bass.'), 'bass')
print(sense1, sense1.definition())

Synset('bass.n.07') the member with the lowest range of a family of musical instruments


In [24]:
sense2 = lesk(word_tokenize('This sea bass is really hard to catch.'), 'bass')
print(sense2, sense2.definition())

Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae


In [30]:
# Test keywords
for ss1 in wn.synsets('learning'):
    print(ss1, ss1.definition())

Synset('learning.n.01') the cognitive process of acquiring skill or knowledge
Synset('eruditeness.n.01') profound scholarly knowledge
Synset('learn.v.01') gain knowledge or skills
Synset('learn.v.02') get to know or become aware of, usually accidentally
Synset('memorize.v.01') commit to memory; learn by heart
Synset('learn.v.04') be a student of a certain subject
Synset('teach.v.01') impart skills or knowledge to
Synset('determine.v.08') find out, learn, or determine with certainty, usually by making an inquiry or other effort


In [31]:
sense3 = lesk(word_tokenize('Machine learning is a future for Software Engineers.'), 'learning')
print(sense3, sense3.definition())

Synset('learn.v.04') be a student of a certain subject
