<a href="https://colab.research.google.com/github/umeshrawat/AI_Math_Vedas/blob/master/1)_NLP1_Intro_Tokenization%2C_Stemming%2C_and_Lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dealing with Natural Language (Text)
We will use the Tweets dataset from Kaggle. The tweets are from US customers about their air travel experience.

In [None]:
import warnings
warnings.filterwarnings("ignore")
import nltk

In [None]:
# Install NLTK (Natural Language ToolKit) and download the resources used in this notebook.
# Note: `nltk` is already imported in the previous cell, before this install runs.
# NOTE(review): newer NLTK releases may need additional resources (e.g. 'punkt_tab') — confirm.
!pip install nltk
nltk.download('punkt')      # tokenizer models (unzipped under tokenizers/)
nltk.download('stopwords')  # stopword lists (unzipped under corpora/)
nltk.download('wordnet')    # WordNet data, used by the WordNetLemmatizer later on

nltk.download('averaged_perceptron_tagger')  # model used for PoS tagging (unzipped under taggers/)
nltk.download('tagsets')                     # tag documentation (unzipped under help/)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


True

In [None]:
import nltk
import pandas as pd
import numpy as np
import csv

In [None]:
# Load the airline tweets into a DataFrame; the CSV is read from the working directory.
tweets_csv_path = 'Tweets.csv'
df = pd.read_csv(tweets_csv_path)

In [None]:
# Peek at the first two rows to see which columns are available.
df.iloc[:2]

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


In [None]:
# Column 'text' contains the tweets.
# Draw 5 tweets at random (with replacement, the default for `choice`).
# A seeded generator is used so that a fresh "Restart & Run All" prints the
# same sample every time instead of a different one on each run.
rng = np.random.default_rng(42)
print(rng.choice(df['text'].to_numpy(), 5))  # 5 random tweets

['@AmericanAir LGA 2 Nashville Cancelled Flightled phone center no help. Fabulous staff at gate D4 helped-2 young men handled crowd well.'
 '@AmericanAir I was rebooked on a flight that was too Late Flight for my connection!'
 "@AmericanAir so what are you going to do for me since I can't take the option you gave me. What type of refund will you do for me?"
 '@united you need a bag bouncer. Get it together'
 "@united I should be fine. They automatically changed my connecting flight for me so I wouldn't miss it. A+ work."]


In [None]:
#Types of tokenization: a) by word b) by sentence

In [None]:
# Naive word tokenization: split each of the first six tweets (labels 0-5) on single spaces.
df['text'].loc[:5].str.split(' ')
# Some words are in uppercase, so it is probably better to lowercase everything first.

0             [@VirginAmerica, What, @dhepburn, said.]
1    [@VirginAmerica, plus, you've, added, commerci...
2    [@VirginAmerica, I, didn't, today..., Must, me...
3    [@VirginAmerica, it's, really, aggressive, to,...
4    [@VirginAmerica, and, it's, a, really, big, ba...
5    [@VirginAmerica, seriously, would, pay, $30, a...
Name: text, dtype: object

In [None]:
# Lowercase the tweets before splitting them on spaces.
# Next, we will remove special characters like '@'.
(
    df.loc[:5, 'text']
    .str.lower()
    .str.split(' ')
)

0             [@virginamerica, what, @dhepburn, said.]
1    [@virginamerica, plus, you've, added, commerci...
2    [@virginamerica, i, didn't, today..., must, me...
3    [@virginamerica, it's, really, aggressive, to,...
4    [@virginamerica, and, it's, a, really, big, ba...
5    [@virginamerica, seriously, would, pay, $30, a...
Name: text, dtype: object

In [None]:
# Remove '@', lowercase, then split on spaces.
(
    df.loc[:5, 'text']
    .str.replace('@', '')
    .str.lower()
    .str.split(' ')
)

# How many issues can be resolved this way? There are still quotes (single and double)
# and other special characters such as exclamation marks, hashtags, etc.
# Note that the last word still has a '.' attached at the end.
# There are many such trivial issues that would each need handling.

0               [virginamerica, what, dhepburn, said.]
1    [virginamerica, plus, you've, added, commercia...
2    [virginamerica, i, didn't, today..., must, mea...
3    [virginamerica, it's, really, aggressive, to, ...
4    [virginamerica, and, it's, a, really, big, bad...
5    [virginamerica, seriously, would, pay, $30, a,...
Name: text, dtype: object

In [None]:
# NLTK provides a function, word_tokenize, which takes care of most such issues.
lowercased_tweets = df.loc[:5, 'text'].str.lower()
for tweet in lowercased_tweets:
    tokens = nltk.word_tokenize(tweet)
    print(tokens)

['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.']
['@', 'virginamerica', 'plus', 'you', "'ve", 'added', 'commercials', 'to', 'the', 'experience', '...', 'tacky', '.']
['@', 'virginamerica', 'i', 'did', "n't", 'today', '...', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip', '!']
['@', 'virginamerica', 'it', "'s", 'really', 'aggressive', 'to', 'blast', 'obnoxious', '``', 'entertainment', "''", 'in', 'your', 'guests', "'", 'faces', '&', 'amp', ';', 'they', 'have', 'little', 'recourse']
['@', 'virginamerica', 'and', 'it', "'s", 'a', 'really', 'big', 'bad', 'thing', 'about', 'it']
['@', 'virginamerica', 'seriously', 'would', 'pay', '$', '30', 'a', 'flight', 'for', 'seats', 'that', 'did', "n't", 'have', 'this', 'playing', '.', 'it', "'s", 'really', 'the', 'only', 'bad', 'thing', 'about', 'flying', 'va']


# Removing Stopwords
> Stopwords are words that you want to ignore, so you filter them out of your text when you’re processing it.
> Very common words like 'in', 'is', and 'an' are often used as stop words since they don’t add a lot of meaning to a text in and of themselves.


In [None]:
# Next step: remove the stopwords (words that don't carry semantic importance).

from nltk.corpus import stopwords

# Fetch the English stopwords and keep them in a set for membership tests.
english_stopwords = stopwords.words('english')
sw_list = set(english_stopwords)
print(sw_list)

# The printed set contains no special characters/symbols.
# Let's extend it.

{'in', 'yourself', 'ourselves', 'through', "didn't", 'ma', 'don', 'during', 'no', "mustn't", "should've", 'which', 'm', "you're", "that'll", 'll', 'out', 'same', 'y', "it's", 'with', 'before', 'wouldn', 'are', "don't", 'all', 'whom', 'a', 'after', 'didn', 're', 'you', 'or', 'below', 'yourselves', 'itself', 'down', 'herself', 'my', 'haven', "she's", "wouldn't", "weren't", 'him', "isn't", 'here', 'who', 'not', 'having', "mightn't", 'but', "won't", 'be', 'should', 'his', 'then', 'under', 'at', 'above', 'until', 'has', 'once', "you've", 'off', 'nor', 'had', 'both', 'when', 't', "doesn't", 'theirs', 'your', 'she', 'where', 'ours', 'am', 'between', 'he', 'some', 'too', 'do', 'did', 'other', 'can', 'any', 'isn', 'doesn', "hasn't", 'was', 'mightn', 'ain', 'yours', 'these', 'myself', 'now', 'does', 'than', 'being', 'their', "you'll", 'more', 'have', 'those', 'over', 'few', 'won', 'of', 'to', 'been', 'such', 'that', 's', 'about', 'each', "hadn't", 'most', 'because', 'd', "couldn't", 'as', 'this'

In [None]:
# Also treat punctuation and the contraction fragments emitted by word_tokenize
# (such as "'ve" and "n't") as stopwords.
extra_stop_tokens = ('@', "'", '.', '"', '/', '!', ',', "'ve", "...", "n't", '$', "'s")
sw_list.update(extra_stop_tokens)
print(sw_list)

{'in', 'yourself', 'ourselves', 'through', "didn't", 'ma', 'don', 'during', 'no', "mustn't", '"', "should've", 'which', 'm', "you're", "that'll", 'll', 'out', 'same', 'y', "it's", 'with', 'before', 'wouldn', 'are', "don't", 'all', 'whom', 'a', 'after', 'didn', 're', 'you', 'or', 'below', 'yourselves', 'itself', 'down', 'herself', 'my', 'haven', "she's", '@', "wouldn't", "weren't", 'him', "isn't", 'here', 'who', 'not', 'having', "mightn't", 'but', "won't", 'be', 'should', 'his', 'then', 'under', 'at', 'above', 'until', 'has', 'once', "you've", 'off', 'nor', 'had', 'both', 'when', 't', "doesn't", 'theirs', 'your', 'she', 'where', '.', 'ours', 'am', 'between', 'he', 'some', ',', 'too', 'do', '/', 'did', 'other', 'can', 'any', 'isn', 'doesn', "hasn't", 'was', 'mightn', 'ain', 'yours', '$', 'these', 'myself', 'now', 'does', 'than', 'being', 'their', "you'll", 'more', 'have', 'those', 'over', 'few', '...', 'won', 'of', 'to', 'been', 'such', 'that', 's', 'about', 'each', "hadn't", 'most', 'be

In [None]:
# Tokenize the first three tweets (labels 0-2), lowercased: one token list per tweet.
tokenized_data = [
    nltk.word_tokenize(tweet)
    for tweet in df.loc[:2, 'text'].str.lower()
]
print(tokenized_data)

[['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.'], ['@', 'virginamerica', 'plus', 'you', "'ve", 'added', 'commercials', 'to', 'the', 'experience', '...', 'tacky', '.'], ['@', 'virginamerica', 'i', 'did', "n't", 'today', '...', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip', '!']]


In [None]:
# For every tweet, keep only the tokens that are not in the stopword set.
data = [
    [token for token in tweet_tokens if token not in sw_list]
    for tweet_tokens in tokenized_data
]
print(data)
# We are left with only semantically meaningful words.
# Depending on the output, more stopwords can be added to sw_list above.

[['virginamerica', 'dhepburn', 'said'], ['virginamerica', 'plus', 'added', 'commercials', 'experience', 'tacky'], ['virginamerica', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip']]


# Sentence Tokenization

In [None]:
# For some tasks, you might need to tokenize the data into sentences.
# Naive sentence tokenization: split the lowercased tweet on every '.'.
tweet_lower = df.loc[5, 'text'].lower()
tweet_lower.split('.')

["@virginamerica seriously would pay $30 a flight for seats that didn't have this playing",
 "\nit's really the only bad thing about flying va"]

In [None]:
# Alternative: use the sentence tokenizer from NLTK on the same lowercased tweet.
sample_tweet = df.loc[5, 'text']
nltk.sent_tokenize(sample_tweet.lower())

["@virginamerica seriously would pay $30 a flight for seats that didn't have this playing.",
 "it's really the only bad thing about flying va"]

# Stemming
Reduce words to their root, which is the core part of a word. Take note that the core part ('root') may not be a complete English word.

For example, the words “helping” and “helped” share the root “help.”

> Two commonly used stemmers in NLTK are PorterStemmer() and SnowballStemmer() (NLTK also ships others, such as LancasterStemmer). The Snowball stemmer, which is also called Porter2, is an improvement on the original Porter stemmer.

In [None]:
from nltk.stem import PorterStemmer

# Stem three inflections of the same word with the Porter stemmer.
p_stemmer = PorterStemmer()
for form in ('help', 'helped', 'helping'):
    print(p_stemmer.stem(form))

help
help
help


In [None]:
from nltk.stem import SnowballStemmer

# Stem the same three inflections with the (English) Snowball stemmer.
s_stemmer = SnowballStemmer('english')
for form in ('help', 'helped', 'helping'):
    print(s_stemmer.stem(form))

help
help
help


In [None]:
# Take a subset of the tweets (labels 0-100), lowercased, to compare the two stemmers on.
strings_for_stemming = df['text'].loc[:100].str.lower()
strings_for_stemming

0                    @virginamerica what @dhepburn said.
1      @virginamerica plus you've added commercials t...
2      @virginamerica i didn't today... must mean i n...
3      @virginamerica it's really aggressive to blast...
4      @virginamerica and it's a really big bad thing...
                             ...                        
96     @virginamerica i can't check in or add a bag. ...
97     @virginamerica - let 2 scanned in passengers l...
98     @virginamerica what is your phone number. i ca...
99     @virginamerica is anyone doing anything there ...
100    @virginamerica trying to add my boy prince to ...
Name: text, Length: 101, dtype: object

In [None]:
from nltk import word_tokenize

# One token list per tweet, in the same order as strings_for_stemming.
words_in_each_string = [word_tokenize(tweet) for tweet in strings_for_stemming]

words_in_each_string

[['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.'],
 ['@',
  'virginamerica',
  'plus',
  'you',
  "'ve",
  'added',
  'commercials',
  'to',
  'the',
  'experience',
  '...',
  'tacky',
  '.'],
 ['@',
  'virginamerica',
  'i',
  'did',
  "n't",
  'today',
  '...',
  'must',
  'mean',
  'i',
  'need',
  'to',
  'take',
  'another',
  'trip',
  '!'],
 ['@',
  'virginamerica',
  'it',
  "'s",
  'really',
  'aggressive',
  'to',
  'blast',
  'obnoxious',
  '``',
  'entertainment',
  "''",
  'in',
  'your',
  'guests',
  "'",
  'faces',
  '&',
  'amp',
  ';',
  'they',
  'have',
  'little',
  'recourse'],
 ['@',
  'virginamerica',
  'and',
  'it',
  "'s",
  'a',
  'really',
  'big',
  'bad',
  'thing',
  'about',
  'it'],
 ['@',
  'virginamerica',
  'seriously',
  'would',
  'pay',
  '$',
  '30',
  'a',
  'flight',
  'for',
  'seats',
  'that',
  'did',
  "n't",
  'have',
  'this',
  'playing',
  '.',
  'it',
  "'s",
  'really',
  'the',
  'only',
  'bad',
  'thing',
  'about',


In [None]:
# Flatten the per-tweet token lists into three parallel lists: the original
# tokens, their Porter stems, and their Snowball stems.
# Example of one token list:
# ['@', 'virginamerica', 'trying', 'to', 'add', 'my', 'boy', 'prince', 'to', 'my', 'ressie', '.', 'sf', 'this', 'thursday', '@', 'virginamerica', 'from', 'lax', 'http', ':', '//t.co/gsb2j3c4gm']
original_words_list = []
p_stemmed_list = []
s_stemmed_list = []

for tweet_tokens in words_in_each_string:
    print(tweet_tokens)
    original_words_list.extend(tweet_tokens)
    p_stemmed_list.extend(p_stemmer.stem(token) for token in tweet_tokens)
    s_stemmed_list.extend(s_stemmer.stem(token) for token in tweet_tokens)

['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.']
['@', 'virginamerica', 'plus', 'you', "'ve", 'added', 'commercials', 'to', 'the', 'experience', '...', 'tacky', '.']
['@', 'virginamerica', 'i', 'did', "n't", 'today', '...', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip', '!']
['@', 'virginamerica', 'it', "'s", 'really', 'aggressive', 'to', 'blast', 'obnoxious', '``', 'entertainment', "''", 'in', 'your', 'guests', "'", 'faces', '&', 'amp', ';', 'they', 'have', 'little', 'recourse']
['@', 'virginamerica', 'and', 'it', "'s", 'a', 'really', 'big', 'bad', 'thing', 'about', 'it']
['@', 'virginamerica', 'seriously', 'would', 'pay', '$', '30', 'a', 'flight', 'for', 'seats', 'that', 'did', "n't", 'have', 'this', 'playing', '.', 'it', "'s", 'really', 'the', 'only', 'bad', 'thing', 'about', 'flying', 'va']
['@', 'virginamerica', 'yes', ',', 'nearly', 'every', 'time', 'i', 'fly', 'vx', 'this', '“', 'ear', 'worm', '”', 'won', '’', 't', 'go', 'away', ':', ')']
['@', 'virgin

In [None]:
# Put the three parallel lists side by side: one row per token.
data = pd.DataFrame({
    'Original Word': original_words_list,
    'Porter_Stemming': p_stemmed_list,
    'Snowball_Stemming': s_stemmed_list,
})

data.iloc[:-50]  # everything except the last 50 rows

Unnamed: 0,Original Word,Porter_Stemming,Snowball_Stemming
0,@,@,@
1,virginamerica,virginamerica,virginamerica
2,what,what,what
3,@,@,@
4,dhepburn,dhepburn,dhepburn
...,...,...,...
2011,.,.,.
2012,i,i,i
2013,ca,ca,ca
2014,n't,n't,n't


In [None]:
# Let's look at only the unique words: drop repeated rows and keep the result under the same name.
data = data.drop_duplicates()
data
# You can see that only 672 words remain in the 'data' dataframe!

Unnamed: 0,Original Word,Porter_Stemming,Snowball_Stemming
0,@,@,@
1,virginamerica,virginamerica,virginamerica
2,what,what,what
4,dhepburn,dhepburn,dhepburn
5,said,said,said
...,...,...,...
2051,prince,princ,princ
2054,ressie,ressi,ressi
2056,sf,sf,sf
2058,thursday,thursday,thursday


In [None]:
# Let's see on which words the two stemmers give different results!
stems_differ = data['Porter_Stemming'].ne(data['Snowball_Stemming'])
data[stems_differ]

# You can observe that Snowball_Stemming is more accurate.
# Also, observe that the root of some of the words is an incomplete English word.

Unnamed: 0,Original Word,Porter_Stemming,Snowball_Stemming
9,plus,plu,plus
11,'ve,'ve,ve
87,this,thi,this
102,yes,ye,yes
104,nearly,nearli,near
137,https,http,https
155,was,wa,was
165,'re,'re,re
351,amazingly,amazingli,amaz
428,bos,bo,bos


# Lemmatization
Stemming is a process that chops the last few characters off a word, which often leads to incorrect meanings and spellings. Lemmatization instead considers the context and converts the word to its meaningful base form, which is called the lemma.

> For instance, stemming the word 'amazed' would return 'amaz'. However, lemmatizing the word 'amazed' would give 'amaze'.

In [None]:
# Stem 'amazed' with the Porter stemmer first, then with the Snowball stemmer.
for stemmer in (p_stemmer, s_stemmer):
    print(stemmer.stem('amazed'))

amaz
amaz


In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Lemmatize without giving a part of speech, so the lemmatizer's default is used.
lemmatizer = WordNetLemmatizer()
amazed_lemma = lemmatizer.lemmatize('amazed')
amazed_lemma

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'amazed'

In [None]:
# Note the difference in the result once the part of speech is supplied!
# Lemmatization needs the Part of Speech to generate the correct lemma.
# The default pos is 'n' (noun); 'v' stands for verb.
lemmatizer.lemmatize('amazed', 'v')

'amaze'

# Tagging Parts of Speech (PoS)

In [None]:
import nltk

# Take one tweet (label 5) with its original casing and show it.
text = df['text'].loc[5]
print(text)

# Split it into word-level tokens for PoS tagging.
tokens_list = nltk.word_tokenize(text)
tokens_list

@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA


['@',
 'VirginAmerica',
 'seriously',
 'would',
 'pay',
 '$',
 '30',
 'a',
 'flight',
 'for',
 'seats',
 'that',
 'did',
 "n't",
 'have',
 'this',
 'playing',
 '.',
 'it',
 "'s",
 'really',
 'the',
 'only',
 'bad',
 'thing',
 'about',
 'flying',
 'VA']

In [None]:
# Drop stopwords before tagging.
# The tokens keep their original casing here, while the entries of the stopword
# set are lowercase, so membership is checked on the lowercased token; otherwise
# capitalized stopwords such as 'The' or 'I' would slip through the filter.
# The surviving tokens themselves are kept with their original casing.
# (The 'averaged_perceptron_tagger' model is downloaded in the setup cell.)
tokens_after_SW = [word for word in tokens_list if word.lower() not in sw_list]

tagged_tokens = nltk.pos_tag(tokens_after_SW)  # tag each remaining word with its PoS
tagged_tokens

[('VirginAmerica', 'NNP'),
 ('seriously', 'RB'),
 ('would', 'MD'),
 ('pay', 'VB'),
 ('30', 'CD'),
 ('flight', 'NN'),
 ('seats', 'NNS'),
 ('playing', 'VBG'),
 ('really', 'RB'),
 ('bad', 'JJ'),
 ('thing', 'NN'),
 ('flying', 'VBG'),
 ('VA', 'NNP')]

In [None]:
# Refer to the complete list of tags here!
# Called without an argument, this prints every tag with its description and example words.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

FYI, NLTK has another Tagger: WordNet Tagger. You can explore it in your free time.

In [None]:
# Define a function to get the POS tag for lemmatization
def get_pos(tag):
    """Translate a PoS tag into the single-letter ``pos`` code used for lemmatization.

    The tag is matched on its leading letter:
    'J' -> 'a' (adjective), 'V' -> 'v' (verb), 'N' -> 'n' (noun), 'R' -> 'r' (adverb).
    Any other tag falls back to 'n' (noun).
    """
    prefix_to_pos = (
        ('J', 'a'),  # Adjective
        ('V', 'v'),  # Verb
        ('N', 'n'),  # Noun
        ('R', 'r'),  # Adverb
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return 'n'

In [None]:
# Lemmatization on a sample string (stopwords have been removed).
# Each token is lowercased and lemmatized using the pos code derived from its tag.
for token, tag in tagged_tokens:
    wordnet_pos = get_pos(tag)
    lemma = lemmatizer.lemmatize(token.lower(), pos=wordnet_pos)
    print("Word:", token, "\nLemmatized Word:", lemma)
    print('***********************')

Word: VirginAmerica 
Lemmatized Word: virginamerica
***********************
Word: seriously 
Lemmatized Word: seriously
***********************
Word: would 
Lemmatized Word: would
***********************
Word: pay 
Lemmatized Word: pay
***********************
Word: 30 
Lemmatized Word: 30
***********************
Word: flight 
Lemmatized Word: flight
***********************
Word: seats 
Lemmatized Word: seat
***********************
Word: playing 
Lemmatized Word: play
***********************
Word: really 
Lemmatized Word: really
***********************
Word: bad 
Lemmatized Word: bad
***********************
Word: thing 
Lemmatized Word: thing
***********************
Word: flying 
Lemmatized Word: fly
***********************
Word: VA 
Lemmatized Word: va
***********************
