In [3]:
import pandas as pd
import numpy as np
import nltk
import string
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer


In [4]:
# Register tqdm's pandas integration; the `progress_apply` calls in later
# cells depend on this having been run.
tqdm.pandas()

In [5]:
# Relative directory the raw CSVs are read from; the prepped CSVs are written
# back to the same place at the end of the notebook.
input_path = 'data/input/'

In [6]:
# Load the raw train and test splits from the input directory.
train, test = (
    pd.read_csv(input_path + csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv')
)

In [7]:
# Label each row with the split it came from so the combined frame can be
# separated again once preprocessing is done.
for frame, split_name in ((train, 'train'), (test, 'test')):
    frame['cat'] = split_name

In [8]:
# Stack both splits row-wise so every preprocessing step runs once over all
# rows. The original row labels of each split are kept as-is.
df = pd.concat(objs=[train, test], axis=0)

In [9]:
# Peek at the first five training rows.
train.head(n=5)

Unnamed: 0,id,text,cat
0,0,Anyway Im getting of for a while,train
1,1,"My red, Apache isn't feelin too well this morn...",train
2,2,@danyelljoy you should be its great. friday w...,train
3,3,its 11:30pm and i dont wanna sleep; so i debat...,train
4,4,Why does twitter eat my DM's? Not happy,train


In [10]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,id,text,cat


## prep text data: naive bayes<br>
1. cast to lowercase
2. remove punctuation
3. remove numbers
4. remove stop words
5. stem or lemmatize
6. bag of words


#### cast to lowercase

In [11]:
# Normalise case: replace the `text` column with its lowercased version.
df = df.assign(text=df['text'].str.lower())

In [12]:
# Preview the first five lowercased texts.
df['text'].head(5)

0                    anyway im getting of for a while 
1    my red, apache isn't feelin too well this morn...
2    @danyelljoy you should be  its great. friday w...
3    its 11:30pm and i dont wanna sleep; so i debat...
4            why does twitter eat my dm's?  not happy 
Name: text, dtype: object

#### remove punctuation

In [13]:
# Every ASCII punctuation character except '?' and '!', which are kept in the
# text for now.
punc = ''.join(ch for ch in string.punctuation if ch not in '?!')


In [14]:
# Delete every character listed in `punc` from the lowercased text.
#
# A translation table is used instead of `str.replace('[...]', '')` because:
#   * `Series.str.replace` treats the pattern as a literal string unless
#     `regex=True` is passed (the default since pandas 2.0), so the
#     character-class pattern would silently remove nothing;
#   * inside a regex character class the `\]` in `punc` is read as an escaped
#     bracket, so the backslash itself was never removed.
# This also matches how the stop-word list is stripped of punctuation later.
punc_table = str.maketrans('', '', punc)
df['text_no_punc'] = df['text'].str.translate(punc_table)


In [15]:
# Preview the first five texts after punctuation removal.
df['text_no_punc'].head(5)

0                    anyway im getting of for a while 
1    my red apache isnt feelin too well this mornin...
2    danyelljoy you should be  its great friday wil...
3    its 1130pm and i dont wanna sleep so i debated...
4             why does twitter eat my dms?  not happy 
Name: text_no_punc, dtype: object

#### remove numbers

In [16]:
# The ten ASCII digit characters to strip from the text.
numerals = string.digits

In [17]:
# Strip every digit from the punctuation-free text.
# `regex=True` is required: without it pandas >= 2.0 treats the pattern as a
# literal string and no digits would be removed.
df['text_no_numerals'] = df['text_no_punc'].str.replace(
    '[{}]'.format(numerals), '', regex=True
)

In [18]:
# Preview the first five texts after digit removal.
df['text_no_numerals'].head(5)

0                    anyway im getting of for a while 
1    my red apache isnt feelin too well this mornin...
2    danyelljoy you should be  its great friday wil...
3    its pm and i dont wanna sleep so i debated wit...
4             why does twitter eat my dms?  not happy 
Name: text_no_numerals, dtype: object

#### remove stop words from text

In [21]:
# NOTE(review): `nltk` is already imported in the first cell, so this
# re-import is redundant (harmless, but the import belongs at the top).
import nltk
# Fetch the stop-word corpus that `stopwords.words('english')` reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fzaman/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [22]:
# Build the English stop-word list with the `punc` characters (apostrophes in
# particular) stripped, so entries such as "isn't" become "isnt" and match the
# punctuation-free text.
strip_punc_table = str.maketrans('', '', punc)
stop = [entry.translate(strip_punc_table) for entry in stopwords.words('english')]

In [23]:
# Drop stop words from each text and re-join the remaining tokens with single
# spaces.
# `stop` is a list, so `word not in stop` scans it linearly for every token of
# every row; a set built once up front gives the same answers with
# constant-time lookups.
stop_lookup = set(stop)
df['text_no_sw'] = df['text_no_numerals'].progress_apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_lookup)
)

100%|██████████| 1600498/1600498 [00:43<00:00, 37089.37it/s]


In [24]:
# Preview the first five texts after stop-word removal.
df['text_no_sw'].head(5)

0                                    anyway im getting
1         red apache feelin well morning httpmypictmen
2               danyelljoy great friday great tooooooo
3    pm wanna sleep debated end decided perfect tim...
4                               twitter eat dms? happy
Name: text_no_sw, dtype: object

#### stem text

In [25]:
# One instance of each stemmer, so their outputs can be compared side by side.
porter, lancaster = PorterStemmer(), LancasterStemmer()

In [26]:
def _porter_stem_text(sentence):
    """Stem each whitespace-separated token with the Porter stemmer and re-join with spaces."""
    return ' '.join(map(porter.stem, sentence.split()))


# Porter-stemmed variant of the stop-word-free text.
df['text_porter_stemmed'] = df['text_no_sw'].progress_apply(_porter_stem_text)

100%|██████████| 1600498/1600498 [04:17<00:00, 6215.10it/s]


In [27]:
# Preview the first five Porter-stemmed texts.
df['text_porter_stemmed'].head(5)

0                                        anyway im get
1             red apach feelin well morn httpmypictmen
2               danyelljoy great friday great tooooooo
3    pm wanna sleep debat end decid perfect time ba...
4                               twitter eat dms? happi
Name: text_porter_stemmed, dtype: object

In [28]:
# Lancaster-stemmed variant of the stop-word-free text: stem each
# whitespace-separated token, then re-join with single spaces.
df['text_lancaster_stemmed'] = df['text_no_sw'].progress_apply(
    lambda sentence: ' '.join(map(lancaster.stem, sentence.split()))
)

100%|██████████| 1600498/1600498 [02:51<00:00, 9321.23it/s] 


In [29]:
# Preview the first five Lancaster-stemmed texts.
df['text_lancaster_stemmed'].head(5)

0                                        anyway im get
1              red apach feelin wel morn httpmypictmen
2                   danyelljoy gre friday gre tooooooo
3    pm wann sleep deb end decid perfect tim bake! kid
4                                  twit eat dms? happy
Name: text_lancaster_stemmed, dtype: object

#### lemmatize text

In [30]:
# Lemmatizer instance used by the lemmatization cell below. The WordNet data
# it needs is downloaded in the following cells.
wordnet_lemmatizer = WordNetLemmatizer()

In [32]:
# Fetch the WordNet corpus used by `wordnet_lemmatizer`.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/fzaman/nltk_data...


True

In [34]:
# Fetch the 'omw-1.4' package as well.
# NOTE(review): presumably needed by the installed NLTK version's WordNet
# loader - confirm whether it is still required.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /Users/fzaman/nltk_data...


True

In [35]:
def _lemmatize_text(sentence):
    """Lemmatize each whitespace-separated token and re-join with spaces.

    `lemmatize` is called without a `pos` argument, so the lemmatizer's
    default part of speech applies to every token.
    """
    return ' '.join(wordnet_lemmatizer.lemmatize(token) for token in sentence.split())


# Lemmatized variant of the stop-word-free text.
df['text_lemmatized'] = df['text_no_sw'].progress_apply(_lemmatize_text)

100%|██████████| 1600498/1600498 [00:56<00:00, 28508.84it/s]


In [36]:
# Preview the first five lemmatized texts.
df['text_lemmatized'].head(5)

0                                    anyway im getting
1         red apache feelin well morning httpmypictmen
2               danyelljoy great friday great tooooooo
3    pm wanna sleep debated end decided perfect tim...
4                               twitter eat dms? happy
Name: text_lemmatized, dtype: object

In [37]:
# Re-check for missing values now that all derived text columns exist.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized


In [38]:
# Replace any missing values with empty strings. The null check in the
# previous cell displayed no rows, so this acts as a safeguard.
# NOTE(review): empty strings (including texts that became empty after
# stop-word removal) are written to CSV as empty fields, which
# `pd.read_csv` reads back as NaN by default - downstream loaders may need
# `keep_default_na=False` or their own `fillna('')`.
df = df.fillna('')

In [39]:
# Separate the combined frame back into its two splits using the `cat` tag.
is_train = df['cat'].eq('train')
is_test = df['cat'].eq('test')
train = df.loc[is_train]
test = df.loc[is_test]

In [40]:
# Persist the prepped training split next to the raw inputs, without the row index.
train_output_file = input_path + 'train_data_prepped.csv'
train.to_csv(train_output_file, index=False)

In [41]:
# Persist the prepped test split next to the raw inputs, without the row index.
test_output_file = input_path + 'test_data_prepped.csv'
test.to_csv(test_output_file, index=False)