In [40]:
import nltk
from nltk.stem import PorterStemmer
# One stemmer instance, reused by the stemming cells that follow.
porter = PorterStemmer()

In [29]:
# Stem several inflected forms one by one with the Porter stemmer.
wordList = ["walking", "walked", "walks", "ran", "running", "bosses", "replacement"]
print(*(f'{word} HAS STEMMED FORMAT {porter.stem(word)}' for word in wordList), sep="\n")

print('---------------------------------------------------')
# Stem every whitespace-separated token of a sentence; all stems go on a
# single line, each followed by one space (including the last).
sentence = "Lemmatization is more sophisticated than stemming".split()
print("".join(f"{porter.stem(token)} " for token in sentence), end="")

print('\n---------------------------------------------------')
# Two further words, reported in the same per-word format as above.
wordList = ["unnecessary", "berry"]
print(*(f'{word} HAS STEMMED FORMAT {porter.stem(word)}' for word in wordList), sep="\n")

walking HAS STEMMED FORMAT walk
walked HAS STEMMED FORMAT walk
walks HAS STEMMED FORMAT walk
ran HAS STEMMED FORMAT ran
running HAS STEMMED FORMAT run
bosses HAS STEMMED FORMAT boss
replacement HAS STEMMED FORMAT replac
---------------------------------------------------
lemmat is more sophist than stem 
---------------------------------------------------
unnecessary HAS STEMMED FORMAT unnecessari
berry HAS STEMMED FORMAT berri


In [41]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Fetch the WordNet corpus data through NLTK's downloader.
nltk.download("wordnet")
# One lemmatizer instance, reused by the lemmatization cells that follow.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
# For each word, show the lemma from a plain lemmatize() call next to the
# lemma obtained when the word is explicitly tagged as a verb.
wordList = ["walking", "going", "ran"]
report_lines = [
  f'LEMMATIZED {word} IS {lemmatizer.lemmatize(word)} AND WITH TAGGING IS {lemmatizer.lemmatize(word, pos=wordnet.VERB)}'
  for word in wordList
]
print(*report_lines, sep="\n")

LEMMATIZED walking IS walking AND WITH TAGGING IS walk
LEMMATIZED going IS going AND WITH TAGGING IS go
LEMMATIZED ran IS ran AND WITH TAGGING IS run


In [43]:
# Side-by-side stem vs. lemma for a few irregular words.
# Each entry is (word, keyword arguments forwarded to lemmatize()); an empty
# dict means lemmatize() is called with the word only.
comparisons = [
  ("mice", {}),
  ("was", {"pos": wordnet.VERB}),
  ("is", {"pos": wordnet.VERB}),
  ("better", {"pos": wordnet.ADJ}),
]
for position, (term, lemma_kwargs) in enumerate(comparisons):
  # A separator goes between entries, never before the first one.
  if position > 0:
    print('\n---------------------------------------------------')
  print(f'ORIGINAL WORD : {term}')
  print(f'STEM OF WORD {term}: {porter.stem(term)}')
  print(f'LEMMA OF WORD {term}: {lemmatizer.lemmatize(term, **lemma_kwargs)}')

ORIGINAL WORD : mice
STEM OF WORD mice: mice
LEMMA OF WORD mice: mouse

---------------------------------------------------
ORIGINAL WORD : was
STEM OF WORD was: wa
LEMMA OF WORD was: be

---------------------------------------------------
ORIGINAL WORD : is
STEM OF WORD is: is
LEMMA OF WORD is: be

---------------------------------------------------
ORIGINAL WORD : better
STEM OF WORD better: better
LEMMA OF WORD better: good


In [46]:
def get_wordnet_pos(treebank_tag):
  """Translate a POS tag string into one of the `wordnet` POS constants.

  The decision is made on the tag's leading letter:
  'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
  'R' -> wordnet.ADV. Any other tag (including an empty string) falls
  back to wordnet.NOUN.
  """
  # Ordered (prefix, constant) pairs; the first matching prefix wins.
  prefix_table = (
    ('J', wordnet.ADJ),
    ('V', wordnet.VERB),
    ('N', wordnet.NOUN),
    ('R', wordnet.ADV),
  )
  for prefix, wordnet_pos in prefix_table:
    if treebank_tag.startswith(prefix):
      return wordnet_pos
  # No recognised prefix: fall back to noun.
  return wordnet.NOUN

In [47]:
# Fetch the 'averaged_perceptron_tagger' resource through NLTK's downloader;
# the call's return value is shown as this cell's output.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [48]:
# Split the sentence on whitespace into tokens.
sentence = "Donald Trump has a devoted following".split()

# Tag every token; the result is displayed as a list of (word, tag) pairs.
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('Donald', 'NNP'),
 ('Trump', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('devoted', 'VBN'),
 ('following', 'NN')]

In [50]:
# Lemmatize each tagged token with the WordNet POS derived from its tag, then
# print all lemmas on one line, each followed by a single space.
lemmas = [
  lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
  for word, tag in words_and_tags
]
print("".join(f"{lemma} " for lemma in lemmas), end="")

Donald Trump have a devote following 

In [51]:

# Split the sentence on whitespace into tokens.
sentence = "The cat was following the bird as it flew by".split()

# Tag every token; the result is displayed as a list of (word, tag) pairs.
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('The', 'DT'),
 ('cat', 'NN'),
 ('was', 'VBD'),
 ('following', 'VBG'),
 ('the', 'DT'),
 ('bird', 'NN'),
 ('as', 'IN'),
 ('it', 'PRP'),
 ('flew', 'VBD'),
 ('by', 'IN')]

In [52]:
# Map every (word, tag) pair to its lemma, using the tag to choose the WordNet
# POS, and emit the lemmas on a single line with a space after each one.
lemma_chunks = (
  f"{lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))} "
  for word, tag in words_and_tags
)
print("".join(lemma_chunks), end="")

The cat be follow the bird a it fly by 