In [13]:
import nltk
# Fetch the NLTK data packages this part of the notebook relies on:
#   - 'wordnet'   : lexical database behind WordNetLemmatizer
#   - 'gutenberg' : corpus that contains the Moby Dick text loaded in the next cell
# NOTE(review): nltk.word_tokenize is called later in this notebook and normally
# needs the 'punkt' tokenizer models, which are not downloaded here -- confirm
# that an earlier cell fetches them so Restart & Run All works on a fresh kernel.
nltk.download('wordnet')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('gutenberg')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [14]:
# Load Melville's "Moby Dick" from the Gutenberg corpus as a sequence of
# word and punctuation tokens; `md` is reused by the following cells.
md = nltk.corpus.gutenberg.words("melville-moby_dick.txt")
# Preview the first 22 tokens (title line and the start of the ETYMOLOGY section).
md[:22]

['[',
 'Moby',
 'Dick',
 'by',
 'Herman',
 'Melville',
 '1851',
 ']',
 'ETYMOLOGY',
 '.',
 '(',
 'Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar',
 'School',
 ')']

In [15]:
# Print only the purely alphabetic tokens among the first 22, which drops
# the brackets, the full stop and the year "1851".
first_tokens = md[:22]
for token in filter(str.isalpha, first_tokens):
    print(token)

Moby
Dick
by
Herman
Melville
ETYMOLOGY
Supplied
by
a
Late
Consumptive
Usher
to
a
Grammar
School


In [16]:
# Print a lower-cased copy of each of the first 22 tokens; tokens without
# letters (brackets, digits, punctuation) come out unchanged.
for lowered in map(str.lower, md[:22]):
    print(lowered)

[
moby
dick
by
herman
melville
1851
]
etymology
.
(
supplied
by
a
late
consumptive
usher
to
a
grammar
school
)


# Tokenization

In [17]:
# Sample sentence containing a possessive ("boy's"), a contraction ("aren't")
# and sentence-final punctuation; the tokenizer cells below split this text.
text = "The boy's cars aren't different colors."

### WhitespaceTokenizer

In [18]:
# Whitespace tokenization splits on spaces only, so clitics and punctuation
# stay glued to their word ("boy's", "aren't", "colors.").
whitespace_tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokens = whitespace_tokenizer.tokenize(text)

# Two normalised views of the same tokens: lower-cased, and lower-cased with
# every token that is not purely alphabetic removed.
tokens_lower = list(map(str.lower, tokens))
tokens_lower_isalpha = [tok.lower() for tok in filter(str.isalpha, tokens)]

# Show raw -> lower-cased -> alphabetic-only, one list per line.
for stage in (tokens, tokens_lower, tokens_lower_isalpha):
    print(stage)

['The', "boy's", 'cars', "aren't", 'different', 'colors.']
['the', "boy's", 'cars', "aren't", 'different', 'colors.']
['the', 'cars', 'different']


### word_tokenize

In [19]:
# word_tokenize separates clitics and punctuation into their own tokens
# ("boy" + "'s", "are" + "n't", "colors" + ".").
tokens = nltk.word_tokenize(text)

# Two normalised views of the same tokens: lower-cased, and lower-cased with
# every token that is not purely alphabetic removed.
tokens_lower = list(map(str.lower, tokens))
tokens_lower_isalpha = [tok.lower() for tok in filter(str.isalpha, tokens)]

# Show raw -> lower-cased -> alphabetic-only, one list per line.
for stage in (tokens, tokens_lower, tokens_lower_isalpha):
    print(stage)

['The', 'boy', "'s", 'cars', 'are', "n't", 'different', 'colors', '.']
['the', 'boy', "'s", 'cars', 'are', "n't", 'different', 'colors', '.']
['the', 'boy', 'cars', 'are', 'different', 'colors']


### PorterStemmer

In [20]:
# Porter stemming on base/inflected word pairs. Stems need not be real words
# ("citi", "monthli"), and irregular plurals are not unified ("women").
porter = nltk.PorterStemmer()
my_list = [
    "cat", "cats",
    "lie", "lying",
    "run", "running",
    "city", "cities",
    "month", "monthly",
    "woman", "women",
]
print(my_list)

# One "word > stem" line per entry.
for word, stem in zip(my_list, map(porter.stem, my_list)):
    print(word, "   >  ", stem)

['cat', 'cats', 'lie', 'lying', 'run', 'running', 'city', 'cities', 'month', 'monthly', 'woman', 'women']
cat    >   cat
cats    >   cat
lie    >   lie
lying    >   lie
run    >   run
running    >   run
city    >   citi
cities    >   citi
month    >   month
monthly    >   monthli
woman    >   woman
women    >   women


### LancasterStemmer

In [21]:
# Lancaster stemming on base/inflected word pairs. It cuts harder than Porter
# on some entries ("month" -> "mon", "woman"/"women" -> "wom") while leaving
# "lying" untouched.
lancaster = nltk.LancasterStemmer()
my_list = [
    "eats", "cats",
    "lie", "lying",
    "run", "running",
    "city", "cities",
    "month", "monthly",
    "woman", "women",
]
print(my_list)

# One "word > stem" line per entry.
for word, stem in zip(my_list, map(lancaster.stem, my_list)):
    print(word, "   >  ", stem)

['eats', 'cats', 'lie', 'lying', 'run', 'running', 'city', 'cities', 'month', 'monthly', 'woman', 'women']
eats    >   eat
cats    >   cat
lie    >   lie
lying    >   lying
run    >   run
running    >   run
city    >   city
cities    >   city
month    >   mon
monthly    >   month
woman    >   wom
women    >   wom


### WordNetLemmatizer

In [22]:
# WordNet lemmatization returns dictionary forms ("feet" -> "foot",
# "women" -> "woman"). No part-of-speech is passed to lemmatize(), so NLTK's
# default (noun) applies and verb forms such as "lying" / "running" are
# returned unchanged.
wnlem = nltk.WordNetLemmatizer()
my_list = [
    "feet", "cats", "wolves",
    "lying", "run", "running",
    "city", "cities",
    "month", "monthly",
    "woman", "women",
]

# One "word -> lemma" line per entry.
for word, lemma in zip(my_list, map(wnlem.lemmatize, my_list)):
    print(word, "   ->   ", lemma)

feet    ->    foot
cats    ->    cat
wolves    ->    wolf
lying    ->    lying
run    ->    run
running    ->    running
city    ->    city
cities    ->    city
month    ->    month
monthly    ->    monthly
woman    ->    woman
women    ->    woman
