# Introduction

- **Stemming** is very **crude** - it just chops off the end of the word. The result is not necessarily a real word

- **Lemmatization** is more sophisticated: it uses the actual rules of the language, so the true root (lemma) of the word is returned

In [1]:
import nltk

# PorterStemmer

In [1]:
from nltk.stem import PorterStemmer

In [2]:
porter = PorterStemmer()

In [3]:
porter.stem('walking')

'walk'

In [4]:
porter.stem('walked')

'walk'

In [5]:
porter.stem('walks')

'walk'

In [6]:
porter.stem('ran')

'ran'

In [7]:
porter.stem('running')

'run'

In [8]:
porter.stem('bosses')

'boss'

In [9]:
porter.stem('replacement')

'replac'

In [11]:
sentence = "Lemmatization is more sophisticated than stemming".split()
sentence

['Lemmatization', 'is', 'more', 'sophisticated', 'than', 'stemming']

In [12]:
for token in sentence:
    print(porter.stem(token), end=' ')

lemmat is more sophist than stem 

In [13]:
# interesting behaviour: the stem comes back ending in 'i' instead of 'y',
# so the result is not a real word
porter.stem('unnecessary')

'unnecessari'

In [14]:
porter.stem('berry')

'berri'

# WordNetLemmatizer

In [12]:
from nltk.stem import WordNetLemmatizer

In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/tamer/nltk_data...


True

In [10]:
from nltk.corpus import wordnet

In [13]:
lemmatizer = WordNetLemmatizer()

In [14]:
lemmatizer.lemmatize('walking')

'walking'

In [15]:
lemmatizer.lemmatize('walking', pos=wordnet.VERB)

'walk'

In [16]:
lemmatizer.lemmatize('going')

'going'

In [17]:
lemmatizer.lemmatize('going', pos=wordnet.VERB)

'go'

In [18]:
lemmatizer.lemmatize('ran')

'ran'

In [19]:
lemmatizer.lemmatize('ran', pos=wordnet.VERB)

'run'

In [20]:
porter.stem('mice')

'mice'

In [21]:
lemmatizer.lemmatize('mice')

'mouse'

In [22]:
 porter.stem('was')

'wa'

In [23]:
lemmatizer.lemmatize('was', pos=wordnet.VERB)

'be'

In [24]:
porter.stem('is')

'is'

In [25]:
lemmatizer.lemmatize('is', pos=wordnet.VERB)

'be'

In [26]:
porter.stem('better')

'better'

In [27]:
lemmatizer.lemmatize('better', pos=wordnet.VERB)

'better'

In [28]:
lemmatizer.lemmatize('better', pos=wordnet.ADJ)

'good'

# Tagging

In [37]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag falls back to wordnet.NOUN.

    Args:
        treebank_tag: POS tag string, e.g. the second element of the
            (word, tag) pairs produced by nltk.pos_tag.

    Returns:
        One of wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV.
    """
    # Checked in order; the first matching prefix wins.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Tags outside the four groups above are treated as nouns.
    return wordnet.NOUN

In [38]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tamer/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [39]:
sentence = "Donald Trump has a devoted following".split()

In [42]:
for token in sentence:
    print(porter.stem(token), end=' ')

donald trump ha a devot follow 

In [40]:
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('Donald', 'NNP'),
 ('Trump', 'NNP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('devoted', 'VBN'),
 ('following', 'NN')]

In [41]:
# Lemmatize every token using the WordNet POS derived from its tag.
# `pos` is passed by keyword, matching the other lemmatize calls in this notebook.
for word, tag in words_and_tags:
    lemma = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    print(lemma, end=' ')

Donald Trump have a devote following 

In [43]:
sentence = 'The cat was following the bird as it flew by'.split()

In [44]:
words_and_tags = nltk.pos_tag(sentence)
words_and_tags

[('The', 'DT'),
 ('cat', 'NN'),
 ('was', 'VBD'),
 ('following', 'VBG'),
 ('the', 'DT'),
 ('bird', 'NN'),
 ('as', 'IN'),
 ('it', 'PRP'),
 ('flew', 'VBD'),
 ('by', 'IN')]

In [45]:
# Lemmatize every token using the WordNet POS derived from its tag.
# NOTE: 'as' is tagged 'IN', which get_wordnet_pos maps to its NOUN fallback;
# the printed result shows it reduced to 'a' (presumably handled like a plural noun).
for word, tag in words_and_tags:
    lemma = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    print(lemma, end=' ')

The cat be follow the bird a it fly by 