In [None]:
"""
                ◘ Lemmatization
            • Process of grouping together the inflected forms of a word
              to be analyzed as a single root word or lemma.
            • Unlike stemming, it reduces the inflected words properly ensuring
              that the root word (lemma) belongs to the language
            • A lemma is the canonical form, dictionary form, or citation form
              of a set of words.
"""
"""
                ◘ Lemmatization vs stemming
            • speed vs Accuracy tradeoff
            • Stemming is typically faster
              ○ simply chops off the end of a word using heuritics,
              ○ no understanding of the context
            • Lemmatization is typically more accurate
              ○ Uses more informed analysis
              ○ Always reduces to a dictionary word
              ○ More accurate but computationally expensive

"""

### WordNetLemmatizer

In [4]:
import nltk
wn = nltk.WordNetLemmatizer()
#dir(wn)

In [5]:
print(wn.lemmatize('goose'))
print(wn.lemmatize('geese'))

goose
goose


In [6]:
print(wn.lemmatize('cactus'))
print(wn.lemmatize('cacti'))

cactus
cactus


### Apply wordnetLammatizer on data

In [7]:
import os
os.chdir("D:\\python\\2.NLTK")

In [9]:
import pandas as pd
import re
import string
pd.set_option('display.max_colwidth',100)
stopwods = nltk.corpus.stopwords.words('english')
data = pd.read_csv('SMSSpamCollection',sep='\t',header=None)
data.columns = ['label','msg']
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


### Data Cleaning

In [10]:
# Get rid of punctuation
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split('\W+',txt)
    txt = [word for word in tokens if word not in stopwods]
    return txt

In [11]:
data['msg_nostop'] = data['msg'].apply(lambda x : clean_text(x))
data.head()

Unnamed: 0,label,msg,msg_nostop
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[Nah, I, dont, think, goes, usf, lives, around, though]"


### Now ready to apply Lemmatization

In [12]:
def lemmatization(token_txt):
    text = [wn.lemmatize(word) for word in token_txt]
    return token_txt

In [13]:
data['msg_lemmatized'] = data['msg_nostop'].apply(lambda x: lemmatization(x))
data.head()

Unnamed: 0,label,msg,msg_nostop,msg_lemmatized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]","[Go, jurong, point, crazy, Available, bugis, n, great, world, la, e, buffet, Cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[Ok, lar, Joking, wif, u, oni]","[Ok, lar, Joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[U, dun, say, early, hor, U, c, already, say]","[U, dun, say, early, hor, U, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[Nah, I, dont, think, goes, usf, lives, around, though]","[Nah, I, dont, think, goes, usf, lives, around, though]"
