In [1]:
import nltk

# Later cells call nltk.word_tokenize (needs the 'punkt' tokenizer models) and
# nltk.pos_tag (needs the averaged-perceptron tagger). Fetch them quietly here
# so the notebook runs top-to-bottom on a fresh environment.
# NOTE(review): recent NLTK releases may expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource, quiet=True)

# WordNet corpus used by WordNetLemmatizer; kept as the last expression so the
# cell still displays the download log and its boolean result.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Wordnet Lemmatizer with NLTK 

In [2]:
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance, reused by the cells below.
lemmatizer = WordNetLemmatizer()

# Lemmatize a few sample words without passing a POS argument.
for sample in ("bats", "are", "feet"):
    print(lemmatizer.lemmatize(sample))

bat
are
foot


### Sentence Lemmatization

Let’s lemmatize a simple sentence. We first tokenize the sentence into words using nltk.word_tokenize and then we will call lemmatizer.lemmatize() on each word.

In [3]:
# Example sentence that the following cells tokenize and lemmatize
sentence = "The striped bats are hanging on their feet for best"
# Bare expression: echoes the string as the cell output
sentence

'The striped bats are hanging on their feet for best'

In [4]:
# Tokenize: split the sentence into a list of word tokens with NLTK,
# then print the list so the individual tokens are visible
word_list = nltk.word_tokenize(sentence)
print(word_list)

['The', 'striped', 'bats', 'are', 'hanging', 'on', 'their', 'feet', 'for', 'best']


In [5]:
# Lemmatize every token (no POS argument supplied) and re-join into one string
lemmatized_output = ' '.join(map(lemmatizer.lemmatize, word_list))
print(lemmatized_output)

The striped bat are hanging on their foot for best


# Wordnet Lemmatizer with appropriate POS tag

In [6]:
# Define the sentence to be lemmatized (same text as the sentence defined
# earlier in the notebook, repeated so this section can be read on its own)
sentence = "The striped bats are hanging on their feet for best"
sentence

'The striped bats are hanging on their feet for best'

In [7]:
# POS-tag a single word passed on its own (no surrounding sentence context)
print(nltk.pos_tag(['feet']))

[('feet', 'NNS')]


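POS tags used here (Penn Treebank tag set):

- CC = coordinating conjunction
- RB = adverb
- IN = preposition
- NN = noun (NNS = plural noun)
- JJ = adjective (JJS = superlative adjective)
- DT = determiner
- VBP / VBG = verb (non-3rd-person present / gerund or present participle)
- PRP$ = possessive pronoun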

In [8]:
# Tokenize the whole sentence and POS-tag every token; prints (token, tag) pairs
print(nltk.pos_tag(nltk.word_tokenize(sentence)))

[('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best', 'JJS')]


In [9]:
# Lemmatize every token as if it were a verb ("v"), then re-join into one string
lemmatized_output = ' '.join(
    lemmatizer.lemmatize(token, "v") for token in word_list
)
print(lemmatized_output)

The strip bat be hang on their feet for best


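Notice it didn’t do a good job: forcing the verb tag on every word does convert ‘are’ to ‘be’ and ‘hanging’ to ‘hang’, but now ‘feet’ is no longer converted to ‘foot’ and ‘striped’ is wrongly reduced to ‘strip’. Each word needs to be lemmatized with its own POS tag.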

# TextBlob Lemmatizer

In [10]:
# Requires the third-party package: pip install textblob
from textblob import TextBlob, Word

# Lemmatize a single word: wrap the string in a TextBlob Word and call
# lemmatize() with no POS argument; the result is the cell output
word = 'stripes'
w = Word(word)
w.lemmatize()

'stripe'

In [19]:
# Wrap the example sentence in a TextBlob and display its (word, POS tag) pairs
sentence = "The striped bats are hanging on their feet for best"
sent = TextBlob(sentence)  # the blob exposes tokenized words (.words) and POS tags (.tags)
sent.tags

[('The', 'DT'),
 ('striped', 'JJ'),
 ('bats', 'NNS'),
 ('are', 'VBP'),
 ('hanging', 'VBG'),
 ('on', 'IN'),
 ('their', 'PRP$'),
 ('feet', 'NNS'),
 ('for', 'IN'),
 ('best', 'JJS')]

In [12]:
# Lemmatize each word of the blob (no POS argument) and re-join into one string
" ".join(token.lemmatize() for token in sent.words)

'The striped bat are hanging on their foot for best'

## TextBlob Lemmatizer with appropriate POS tag

In [20]:
def lemmatize_with_postag(sentence):
    """Lemmatize ``sentence`` with TextBlob, using each word's own POS tag.

    The first character of every tag in ``TextBlob(sentence).tags`` selects the
    POS code passed to ``lemmatize``: J -> 'a', N -> 'n', V -> 'v', R -> 'r'.
    Any other tag falls back to 'n'.

    Returns the lemmatized words joined by single spaces.
    """
    pos_codes = {
        "J": 'a',  # adjective
        "N": 'n',  # noun
        "V": 'v',  # verb
        "R": 'r',  # adverb
    }
    lemmas = []
    for token, tag in TextBlob(sentence).tags:
        # Tags outside the four mapped classes are lemmatized as nouns.
        lemmas.append(token.lemmatize(pos_codes.get(tag[0], 'n')))
    return " ".join(lemmas)

In [21]:
# Lemmatize the example sentence, picking each word's POS code from its TextBlob tag
sentence = "The striped bats are hanging on their feet for best"
lemmatize_with_postag(sentence)

'The striped bat be hang on their foot for best'

# Pattern Lemmatizer


In [15]:
import pattern
from pattern.en import lemma

In [37]:
# Split on whitespace and run Pattern's lemma() on each piece, then re-join
sentence = "The striped BATS Were Hanging on their feet and ate best fishes"
" ".join(map(lemma, sentence.split()))

'the stripe bat be hang on their feet and eat best fishes'

# Comparing 

In [38]:
import string
from nltk.corpus import wordnet


In [39]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased first
    character of that tag picks the constant: J -> ADJ, N -> NOUN, V -> VERB,
    R -> ADV. Any other tag falls back to NOUN.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun.
    return wordnet.NOUN

In [41]:
sentence = """Following mice attacks, caring farmers were marching to Delhi for better living conditions. 
Delhi police on Tuesday fired water cannons and teargas shells at protesting farmers as they tried to 
break barricades with their cars, automobiles and tractors."""

# --- NLTK WordNetLemmatizer ---
# Tokenize with NLTK, drop tokens found in string.punctuation, and lemmatize
# each remaining token with the POS returned by get_wordnet_pos for that token.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(" ".join(
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in nltk.word_tokenize(sentence)
    if token not in string.punctuation
))

# --- TextBlob ---
# POS-aware helper defined in an earlier cell.
print(lemmatize_with_postag(sentence))

# --- Pattern ---
# Plain whitespace split: punctuation stays attached to the neighbouring word,
# so this line is not tokenized the same way as the two above.
from pattern.en import lemma
print(" ".join(map(lemma, sentence.split())))

Following mouse attack care farmer be march to Delhi for well living condition Delhi police on Tuesday fire water cannon and teargas shell at protest farmer a they try to break barricade with their car automobile and tractor
Following mouse attack care farmer be march to Delhi for good living condition Delhi police on Tuesday fire water cannon and teargas shell at protest farmer a they try to break barricade with their car automobile and tractor
follow mice attacks, care farmer be march to delhi for better live conditions. delhi police on tuesday fire water cannon and tearga shell at protest farmer a they try to break barricade with their cars, automobile and tractors.
