In [1]:
import nltk # requires the NLTK package; install it with: pip install -U nltk

In [8]:
########################## Tokenizing ##########################
from nltk.tokenize import sent_tokenize, word_tokenize

# Raw input text shared by the tokenizing and stopword cells.
# NOTE(review): "with" is probably a typo for "white"; it is left unchanged so
# that the recorded cell outputs stay valid.
text = "Mary had a little lamb. Her fleece was with as snow"

# Split the raw text into a list of sentence strings.
sents = sent_tokenize(text)
print(sents)

['Mary had a little lamb.', 'Her fleece was with as snow']


In [5]:
# Tokenize every sentence separately: the result is one list of tokens per sentence.
words = list(map(word_tokenize, sents))
print(words)

[['Mary', 'had', 'a', 'little', 'lamb', '.'], ['Her', 'fleece', 'was', 'with', 'as', 'snow']]


In [9]:
########################## Removing stopwords ##########################
from string import punctuation
from nltk.corpus import stopwords

# Stopword set = NLTK's English stopword list plus every character in string.punctuation.
customStopWords = set(stopwords.words('english')) | set(punctuation)

# Keep only the tokens of `text` that are not in the stopword set.
# NOTE: the membership test is case-sensitive, so a capitalised token such as
# 'Her' is kept even though it reads like a stopword.
wordsWOStopwords = list(
    filter(lambda token: token not in customStopWords, word_tokenize(text))
)
print(wordsWOStopwords)

['Mary', 'little', 'lamb', 'Her', 'fleece', 'snow']


In [13]:
########################## Identifying Bigrams ##############################
# N-grams are groups of N words that occur together in a piece of text.
# Bigrams are pairs of words that occur consecutively. This cell builds the
# bigrams of the stopword-filtered token list and shows how often each occurs.
# Explicit imports (instead of `import *`) keep the notebook namespace clean and
# make it obvious where each name comes from.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Association-measure helper; not used by this cell itself, kept so the name
# remains available to other cells.
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(wordsWOStopwords)

# finder.ngram_fd.items() yields (bigram, frequency) pairs.
# Sort so the most frequent bigrams come first; bigrams with equal frequency
# are ordered alphabetically so the display is deterministic.
sorted(finder.ngram_fd.items(), key=lambda item: (-item[1], item[0]))


[(('Her', 'fleece'), 1),
 (('Mary', 'little'), 1),
 (('fleece', 'snow'), 1),
 (('lamb', 'Her'), 1),
 (('little', 'lamb'), 1)]

In [16]:
########################## Stemming ##########################
# Stemming reduces the different morphological forms of a word to a common stem.
# The example sentence contains three forms of "close": closed, closing, close.
from nltk.stem.lancaster import LancasterStemmer

text2 = "Mary closed on closing night when she was in the mood to close."
st = LancasterStemmer()

# Stem every token of the sentence, punctuation included.
stemmedWords = list(map(st.stem, word_tokenize(text2)))
print(stemmedWords)

['mary', 'clos', 'on', 'clos', 'night', 'when', 'she', 'was', 'in', 'the', 'mood', 'to', 'clos', '.']


In [18]:
# Part-of-speech tagging: pair every token of text2 with its tag (noun, verb, ...).
text2_tokens = word_tokenize(text2)
nltk.pos_tag(text2_tokens)

[('Mary', 'NNP'),
 ('closed', 'VBD'),
 ('on', 'IN'),
 ('closing', 'NN'),
 ('night', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mood', 'NN'),
 ('to', 'TO'),
 ('close', 'VB'),
 ('.', '.')]

In [19]:
########################## Disambiguating word meanings ##########################
from nltk.corpus import wordnet as wn

# List every WordNet sense (synset) of the word 'bass' together with its definition.
bass_synsets = wn.synsets('bass')
for synset in bass_synsets:
    print(synset, synset.definition())

(Synset('bass.n.01'), u'the lowest part of the musical range')
(Synset('bass.n.02'), u'the lowest part in polyphonic music')
(Synset('bass.n.03'), u'an adult male singer with the lowest voice')
(Synset('sea_bass.n.01'), u'the lean flesh of a saltwater fish of the family Serranidae')
(Synset('freshwater_bass.n.01'), u'any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)')
(Synset('bass.n.06'), u'the lowest adult male singing voice')
(Synset('bass.n.07'), u'the member with the lowest range of a family of musical instruments')
(Synset('bass.n.08'), u'nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes')
(Synset('bass.s.01'), u'having or denoting a low vocal or instrumental range')


In [22]:
from nltk.wsd import lesk

# Use the Lesk algorithm to pick the sense of 'bass' that best fits the context
# sentence. The sentence is tokenized first because lesk() takes a list of tokens.
# (The context sentence previously misspelled the target word as "bbass".)
context_tokens = word_tokenize("Sing in a lower tone, along with the bass")
sensel = lesk(context_tokens, 'bass')
print(sensel, sensel.definition())

(Synset('bass.n.07'), u'the member with the lowest range of a family of musical instruments')


In [23]:
### To solve a classification problem, you can apply algorithms such as Naive Bayes or Support Vector Machines (SVM).
### To solve a clustering problem, you can apply algorithms such as K-Means or hierarchical clustering.