### Basic Text Preprocessing

#### Stemming and Lemmatization

In [1]:
import nltk
from nltk.stem import PorterStemmer

Difference between Stemming and Lemmatization:
 
Stemming is very crude: it mostly just chops off the ends of words, so the result might not be an actual word.

Lemmatization uses the rules of the language, and hence returns actual words.


In [2]:
## One stemmer instance, reused by every porter.stem(...) call in the cells below.
porter = PorterStemmer()

In [3]:
## The "-ing" ending is stripped: the result is 'walk'.
porter.stem("walking")

'walk'

In [4]:
## The past-tense form reduces to the same stem, 'walk'.
porter.stem("walked")

'walk'

In [5]:
## A stem need not be a real word: this call returns 'replac'.
porter.stem("replacement")

'replac'

In [6]:
## Sample text shared by the stemming, lemmatization and POS-tagging cells.
## It is split on whitespace only, so punctuation stays attached to the
## neighbouring word (e.g. 'crude,' and 'mostly.').
sentence = (
    "Difference between Stemming and Lemmatization-Stemming is very crude, "
    "it just chops of end of the words mostly. "
    "Result might not be actual words. "
    "Lemmatization uses rules of language, hence returns actual words"
).split()

sentence

['Difference',
 'between',
 'Stemming',
 'and',
 'Lemmatization-Stemming',
 'is',
 'very',
 'crude,',
 'it',
 'just',
 'chops',
 'of',
 'end',
 'of',
 'the',
 'words',
 'mostly.',
 'Result',
 'might',
 'not',
 'be',
 'actual',
 'words.',
 'Lemmatization',
 'uses',
 'rules',
 'of',
 'language,',
 'hence',
 'returns',
 'actual',
 'words']

In [7]:
## Print the Porter stem of every token, one per line, in sentence order.
for stemmed in map(porter.stem, sentence):
    print(stemmed)

differ
between
stem
and
lemmatization-stem
is
veri
crude,
it
just
chop
of
end
of
the
word
mostly.
result
might
not
be
actual
words.
lemmat
use
rule
of
language,
henc
return
actual
word


In [8]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

## Fetch the corpora used by the lemmatization cells below; the last call's
## return value is what the cell displays.
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /home/souvik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/souvik/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
## One WordNet lemmatizer instance, reused by every lemmatize(...) call below.
lemmatizer = WordNetLemmatizer()

In [10]:
## No pos argument is passed here, and the word comes back unchanged ('walking').
lemmatizer.lemmatize('walking')

'walking'

In [11]:
## Specifying the POS for lemmatization: with pos=wordnet.VERB the same
## word is reduced to 'walk'.
lemmatizer.lemmatize('walking',pos = wordnet.VERB)

'walk'

In [12]:
## Another verb form: with pos=wordnet.VERB 'going' is reduced to 'go'.
lemmatizer.lemmatize('going',pos = wordnet.VERB)

'go'

In [13]:
## Irregular plural, no pos argument: 'mice' comes back as 'mouse'.
lemmatizer.lemmatize('mice')

'mouse'

In [14]:
## Lemmatize every token without a pos argument and print one result per line.
for lemma in map(lemmatizer.lemmatize, sentence):
    print(lemma)

Difference
between
Stemming
and
Lemmatization-Stemming
is
very
crude,
it
just
chop
of
end
of
the
word
mostly.
Result
might
not
be
actual
words.
Lemmatization
us
rule
of
language,
hence
return
actual
word


In [15]:
## With pos=wordnet.ADJ the comparative 'better' maps to 'good'.
lemmatizer.lemmatize('better',pos = wordnet.ADJ)

'good'

In [16]:
## Part of Speech tagging

In [17]:
def get_word_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Tags starting with 'J', 'V' or 'R' map to wordnet.ADJ, wordnet.VERB and
    wordnet.ADV respectively. Every other tag, including those starting
    with 'N', maps to wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Noun tags and anything unrecognised share the noun fallback.
    return wordnet.NOUN

In [18]:
nltk.download('averaged_perceptron_tagger') ## For detecting POS; fetched before the nltk.pos_tag call below

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/souvik/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
## Tag every token; the result is a list of (token, tag) pairs.
## Tokens that still carry punctuation get tagged as-is (e.g. 'words.' -> 'JJ').
words_tags = nltk.pos_tag(sentence)
words_tags

[('Difference', 'NN'),
 ('between', 'IN'),
 ('Stemming', 'VBG'),
 ('and', 'CC'),
 ('Lemmatization-Stemming', 'NNP'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('crude,', 'JJ'),
 ('it', 'PRP'),
 ('just', 'RB'),
 ('chops', 'NNS'),
 ('of', 'IN'),
 ('end', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('words', 'NNS'),
 ('mostly.', 'VBP'),
 ('Result', 'NNP'),
 ('might', 'MD'),
 ('not', 'RB'),
 ('be', 'VB'),
 ('actual', 'JJ'),
 ('words.', 'JJ'),
 ('Lemmatization', 'NNP'),
 ('uses', 'VBZ'),
 ('rules', 'NNS'),
 ('of', 'IN'),
 ('language,', 'JJ'),
 ('hence', 'NN'),
 ('returns', 'NNS'),
 ('actual', 'JJ'),
 ('words', 'NNS')]

In [20]:
## Lemmatize each token using the WordNet POS derived from its tag,
## printing one lemma per line.
for token, pos_label in words_tags:
    wordnet_pos = get_word_pos(pos_label)
    print(lemmatizer.lemmatize(token, pos=wordnet_pos))
    

Difference
between
Stemming
and
Lemmatization-Stemming
be
very
crude,
it
just
chop
of
end
of
the
word
mostly.
Result
might
not
be
actual
words.
Lemmatization
use
rule
of
language,
hence
return
actual
word
