# Stemming and Lemmatization

## Imports

In [8]:
import numpy as np
import pandas as pd

# nltk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# for multiprocessing
from pandarallel import pandarallel

# Ipython related imports
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

## Acquiring Data

In [2]:
# Load the pre-cleaned, tab-separated tweet data and display a preview of it.
df = pd.read_csv(
    filepath_or_buffer='pre-cleaned_consolidated_tweet_data.csv',
    sep='\t',
)
df

Unnamed: 0,sentiment,text
0,negative,oh no its fading away again
1,positive,bunnylake will kill me but i cant stop listeni...
2,negative,last day in cali partyin for the last time wi...
3,negative,is having a major soar throat
4,positive,my last day as 12 years old
...,...,...
1611536,negative,twisuz yeah and how did this happen i was up...
1611537,negative,smittygoalie im sorry about your dog
1611538,negative,posipat im already there i wish you were here
1611539,negative,is thinking in 12 hours ill be at the airport ...


## Stemming and Lemmatization

### Download necessary modules using nltk download

In [11]:
# Fetch the NLTK resources this notebook relies on. Naming the packages keeps the
# cell non-interactive; a bare nltk.download() opens the interactive downloader.
# 'punkt'/'punkt_tab' back word_tokenize and 'wordnet'/'omw-1.4' back
# WordNetLemmatizer; which of each pair is needed depends on the installed NLTK
# version, so all four are requested.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

### Porter Stemmer

In [4]:
# Porter stemmer instance shared by every call to stemSentence.
porter=PorterStemmer()

def stemSentence(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The sentence is split with NLTK's word_tokenize, each token is passed
    through porter.stem, and the stems are joined with single spaces.

    Parameters
    ----------
    sentence : str
        Text to stem.

    Returns
    -------
    str
        The stemmed tokens separated by single spaces. No trailing space is
        appended, and a sentence that yields no tokens gives "".
    """
    # Joining with " " avoids the stray trailing space that appending a
    # separator after every token would leave at the end of the result.
    return " ".join(porter.stem(word) for word in word_tokenize(sentence))

In [5]:
# Single-process variant: apply stemming to the text column of each row (the next cell does the same work across all cores)
# df['text'] = df['text'].apply(lambda x: stemSentence(x))
# df

In [6]:
# Stem the text column of every row in parallel: pandarallel spreads the rows
# over multiple worker processes so all cores are used.
# This works as is, but running it over a numpy array might save more time.
pandarallel.initialize()
df['text'] = df['text'].parallel_apply(stemSentence)
df

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,sentiment,text
0,negative,oh no it fade away again
1,positive,bunnylak will kill me but i cant stop listen t...
2,negative,last day in cali partyin for the last time wit...
3,negative,is have a major soar throat
4,positive,my last day as 12 year old
...,...,...
1611536,negative,twisuz yeah and how did thi happen i wa updat ...
1611537,negative,smittygoali im sorri about your dog
1611538,negative,posipat im alreadi there i wish you were here
1611539,negative,is think in 12 hour ill be at the airport thi ...


### Word Net Lemmatizer

In [9]:
# WordNet lemmatizer instance shared by every call to lemmatizeSentence.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatizeSentence(sentence):
    """Return `sentence` with every token replaced by its WordNet lemma.

    The sentence is split with NLTK's word_tokenize, each token is passed
    through wordnet_lemmatizer.lemmatize, and the lemmas are joined with
    single spaces.

    Parameters
    ----------
    sentence : str
        Text to lemmatize.

    Returns
    -------
    str
        The lemmatized tokens separated by single spaces. No trailing space
        is appended, and a sentence that yields no tokens gives "".
    """
    # NOTE(review): lemmatize() is called without a part-of-speech argument, so
    # NLTK's default applies (documented as noun). That would explain results
    # such as "as" -> "a"; confirm whether POS-aware lemmatization is wanted.
    # Joining with " " avoids the stray trailing space that appending a
    # separator after every token would leave at the end of the result.
    return " ".join(
        wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(sentence)
    )

In [12]:
# Lemmatize the text column of every row in parallel, using all cores through
# pandarallel's worker processes (initialised in the stemming cell above).
# The column was overwritten with stemmed text there, so the lemmatizer runs on stems.
# This works as is, but running it over a numpy array might save more time.
df['text'] = df['text'].parallel_apply(lemmatizeSentence)
df

Unnamed: 0,sentiment,text
0,negative,oh no it fade away again
1,positive,bunnylak will kill me but i cant stop listen t...
2,negative,last day in cali partyin for the last time wit...
3,negative,is have a major soar throat
4,positive,my last day a 12 year old
...,...,...
1611536,negative,twisuz yeah and how did thi happen i wa updat ...
1611537,negative,smittygoali im sorri about your dog
1611538,negative,posipat im alreadi there i wish you were here
1611539,negative,is think in 12 hour ill be at the airport thi ...


## Saving the DataFrame to a file for later use

In [13]:
# Persist the stemmed and lemmatized tweets as a tab-separated UTF-8 file,
# leaving the DataFrame index out of the output.
df.to_csv(
    'consolidated_tweet_data-cleaned-stemmed-lemmatized.csv',
    sep='\t',
    index=False,
    encoding='utf-8',
)