In [None]:
"""
                ◘ Stemming
                
        Process of reducing inflected (or derived) words to their root word
        or word stem.
        Example: all of the words below reduce to the root word "code"
            code
            coder
            coders           code
            coding
"""

"""
                ◘ Errors in Stemming
                
            1. Overstemming
                ○ Too much of the word is cut off (meaning is lost)
                ○ words of different stems reduced to same stem
            2. Understemming
                ○ Two words that share the same stem are mapped to different stems
"""
"""
                ◘ Why is stemming useful?
            • Reduces the corpus of words the model needs to work with
            • Explicitly correlates words with similar meaning
"""
"""
                ◘ Stemming Algorithms
            • Porter Stemmer
            • Snowball Stemmer
            • Lancaster Stemmer
            • Regex-based Stemmer
"""

### Porter Stemming

In [1]:
from nltk.stem import PorterStemmer
# Single shared PorterStemmer instance; later cells call ps.stem(...) on it.
ps = PorterStemmer()
# Uncomment the next line to list the attributes and methods of the stemmer.
#dir(ps)

In [2]:
# Stem three inflections of "code" and print one result per line.
for word in ('coder', 'coding', 'code'):
    print(ps.stem(word))

coder
code
code


In [3]:
# Stem the plural/singular pair "data" / "datum" and print one result per line.
for word in ('data', 'datum'):
    print(ps.stem(word))

data
datum


In [4]:
# Stem three words derived from "bowl" and print one result per line.
for word in ('bowl', 'bowling', 'bowler'):
    print(ps.stem(word))

bowl
bowl
bowler


In [None]:
## Applying Porter stemmer on data

In [7]:
import os

# Directory that holds the SMSSpamCollection file read by the next cell.
# The original notebook hard-coded an absolute Windows path, which only works
# on the author's machine. Set the SMS_DATA_DIR environment variable to point
# somewhere else; when it is unset the original location is used unchanged.
DATA_DIR = os.environ.get("SMS_DATA_DIR", "D:\\python\\2.NLTK")
os.chdir(DATA_DIR)

In [8]:
import pandas as pd
import re
import string
import nltk
# Show up to 100 characters of each text cell when a frame is displayed.
pd.set_option('display.max_colwidth', 100)

# English stop-word list from NLTK; clean_text() filters tokens against it.
stopwords = nltk.corpus.stopwords.words('english')

# The file is tab-separated with no header row, so the two column names
# are supplied explicitly at read time.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'msg'],
)
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


### Clean Text

In [9]:
def clean_text(text):
    """Strip punctuation from ``text``, tokenize it, and drop stop words.

    Parameters
    ----------
    text : str
        The message to clean. Case is not changed here; tokens are compared
        against the module-level ``stopwords`` list exactly as they appear,
        so callers should lowercase the text first if needed.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Empty tokens (which
        ``re.split`` produces for empty input or for text that starts or ends
        with a non-word character) are discarded.
    """
    # Remove every character listed in string.punctuation.
    no_punct = "".join(ch for ch in text if ch not in string.punctuation)
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # `tok and ...` filters out the empty strings left at the split boundaries.
    return [tok for tok in tokens if tok and tok not in stopwords]

In [12]:
# Lowercase every message, then tokenize it and remove stop words;
# the resulting token lists are stored in a new 'msg_nostop' column.
data['msg_nostop'] = data['msg'].str.lower().apply(clean_text)
data.head()

Unnamed: 0,label,msg,msg_nostop
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"


### Stem the text

In [13]:
def stemming(tokenized_text):
    """Stem every token in ``tokenized_text`` with the shared stemmer ``ps``.

    Parameters
    ----------
    tokenized_text : iterable of str
        Tokens to stem.

    Returns
    -------
    list of str
        One stemmed token per input token, in the same order.
    """
    stems = []
    for token in tokenized_text:
        stems.append(ps.stem(token))
    return stems

In [14]:
# Stem each token list from 'msg_nostop' and keep the result in 'msg_stemmed'.
# stemming() is passed directly; no lambda wrapper is needed.
data['msg_stemmed'] = data['msg_nostop'].apply(stemming)
data.head()

Unnamed: 0,label,msg,msg_nostop,msg_stemmed
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"
