In [1]:
import pandas as pd
import numpy as np
import nltk
import string
from tqdm import tqdm

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Hook tqdm into pandas so the `progress_apply` calls further down are available.
tqdm.pandas()

In [3]:
# Directory prefix (relative path, trailing slash included) used for every CSV read and written in this notebook.
input_path = 'data/input/'

In [4]:
# Read both raw splits from the input directory in one pass.
train, test = (
    pd.read_csv(input_path + csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv')
)

In [5]:
# Tag every row with the split it came from so the frames can be separated
# again after the shared cleaning steps.
for split_frame, split_label in ((train, 'train'), (test, 'test')):
    split_frame['cat'] = split_label

In [6]:
# Stack train and test so each cleaning step runs once over both;
# the `cat` column is used at the end of the notebook to split them apart again.
df = pd.concat([train, test])

In [7]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,id,text,cat
0,0,Anyway Im getting of for a while,train
1,1,"My red, Apache isn't feelin too well this morn...",train
2,2,@danyelljoy you should be its great. friday w...,train
3,3,its 11:30pm and i dont wanna sleep; so i debat...,train
4,4,Why does twitter eat my DM's? Not happy,train


In [8]:
# Show every row that has a missing value in at least one column.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,id,text,cat


## prep text data: naive bayes<br>
1. cast to lowercase
2. remove punctuation
3. remove numbers
4. remove stop words
5. stem or lemmatize
6. bag of words


#### cast to lowercase

In [9]:
# Lowercase every tweet in place; the URL patterns applied later only use lowercase character classes.
df['text'] = df['text'].str.lower()

In [10]:
# Preview the first five lowercased tweets.
df['text'].head()

0                    anyway im getting of for a while 
1    my red, apache isn't feelin too well this morn...
2    @danyelljoy you should be  its great. friday w...
3    its 11:30pm and i dont wanna sleep; so i debat...
4            why does twitter eat my dm's?  not happy 
Name: text, dtype: object

#### remove twitter handles

In [11]:
# Drop every whitespace-delimited token that starts with "@" (a Twitter handle).
# Handle tokens are filtered out rather than replaced by an empty string, so a
# handle in the middle of a tweet no longer leaves a double space behind, and
# the joined result needs no extra strip().
df['text'] = df['text'].apply(
    lambda text: ' '.join(token for token in text.split() if not token.startswith('@'))
)

#### remove links/urls

In [12]:
# URLs with an explicit scheme, e.g. "http://t.co/abc".
scheme_url = re.compile(r'https?://\S+')
# Scheme-less URLs: anything starting with "www.", or a (sub)domain that ends
# in ".com", together with any path that follows it. The previous pattern
# could not match "www.<name>.com" as a whole (it left "www." behind), kept
# trailing paths, and removed "x.com" from inside words like "x.community";
# the word boundary after ".com" prevents the latter.
bare_url = re.compile(r'www\.\S+|\b[a-z0-9-]+(?:\.[a-z0-9-]+)*\.com\b\S*')

# Strip scheme URLs first, then whatever bare URLs remain.
df['text'] = df['text'].apply(lambda text: bare_url.sub('', scheme_url.sub('', text)))

#### remove link placeholders

In [13]:
# Remove the literal "{link}" and "[video]" placeholder tokens, one pattern
# after the other.
for placeholder_pattern in (r'{link}', r"\[video\]"):
    df['text'] = df['text'].apply(
        lambda text, pattern=placeholder_pattern: re.sub(pattern, '', text)
    )

#### remove punctuation

In [14]:
# Characters to strip: all ASCII punctuation except "?" and "!", which are
# kept for now.
punc = ''.join(mark for mark in string.punctuation if mark not in '?!')


In [15]:
# Surround "?" and "!" with spaces so they survive as standalone tokens.
# Both replacements go through the `.str` accessor: a plain `Series.replace`
# only swaps values that are *entirely* "!", so exclamation marks inside a
# tweet were previously left untouched. regex=False makes the literal match
# explicit instead of relying on the pandas-version-dependent default.
df['text_no_punc'] = (
    df['text']
    .str.replace('?', ' ? ', regex=False)
    .str.replace('!', ' ! ', regex=False)
)
df['text_no_punc'][:5]

  """Entry point for launching an IPython kernel.


0                     anyway im getting of for a while
1    my red, apache isn't feelin too well this morn...
2    you should be its great. friday will be great ...
3    its 11:30pm and i dont wanna sleep; so i debat...
4            why does twitter eat my dm's ?  not happy
Name: text_no_punc, dtype: object

In [16]:
# Remove every character listed in `punc`.
# - re.escape keeps "\", "]", "^" and "-" literal inside the character class;
#   unescaped, the "\]" pair was read as an escaped "]" and the backslash
#   itself was never removed.
# - regex=True is passed explicitly: pandas 2.0 changed the default of
#   Series.str.replace to literal matching, which would turn this step into a
#   silent no-op.
df['text_no_punc'] = df['text_no_punc'].str.replace(
    '[{}]'.format(re.escape(punc)), '', regex=True
)


  """Entry point for launching an IPython kernel.


In [17]:
# Preview the first five tweets after punctuation removal.
df['text_no_punc'].head()

0                     anyway im getting of for a while
1     my red apache isnt feelin too well this morning 
2    you should be its great friday will be great t...
3    its 1130pm and i dont wanna sleep so i debated...
4             why does twitter eat my dms ?  not happy
Name: text_no_punc, dtype: object

#### remove numbers

In [18]:
# The ten ASCII digits that are stripped from the text.
numerals = string.digits

In [19]:
# Delete every digit listed in `numerals`. regex=True is explicit because the
# pattern is a character class; on pandas >= 2.0 the default is a literal
# match, which would leave all digits in place.
df['text_no_numerals'] = df['text_no_punc'].str.replace(
    '[{}]'.format(numerals), '', regex=True
)

  """Entry point for launching an IPython kernel.


In [20]:
# Preview the first five tweets after digit removal.
df['text_no_numerals'].head()

0                     anyway im getting of for a while
1     my red apache isnt feelin too well this morning 
2    you should be its great friday will be great t...
3    its pm and i dont wanna sleep so i debated wit...
4             why does twitter eat my dms ?  not happy
Name: text_no_numerals, dtype: object

#### remove stop words from text<br>
## try without removing stopwords

In [21]:
# NLTK's English stop words with every `punc` character (apostrophes included)
# deleted, so they line up with the punctuation-free text columns.
strip_punc_table = str.maketrans('', '', punc)
stop = [stop_word.translate(strip_punc_table) for stop_word in stopwords.words('english')]

In [22]:
# Build the lookup once as a set: membership tests against the `stop` list
# scan the whole list for every word of every tweet, while a set lookup is
# constant time. The resulting column is unchanged.
stop_lookup = set(stop)
df['text_no_sw'] = df['text_no_numerals'].progress_apply(
    lambda sentence: ' '.join(word for word in sentence.split() if word not in stop_lookup)
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600498/1600498 [00:34<00:00, 46505.53it/s]


In [23]:
# Preview the first five tweets after stop-word removal.
df['text_no_sw'].head()

0                                    anyway im getting
1                       red apache feelin well morning
2                          great friday great tooooooo
3    pm wanna sleep debated end decided perfect tim...
4                              twitter eat dms ? happy
Name: text_no_sw, dtype: object

#### stem text

In [24]:
# One instance of each stemmer, reused for every tweet.
porter, lancaster = PorterStemmer(), LancasterStemmer()

In [25]:
# Porter-stem each whitespace-separated word of the digit-free text
# (stop words are still present in this column).
df['text_porter_stemmed'] = df['text_no_numerals'].progress_apply(
    lambda sentence: ' '.join(map(porter.stem, sentence.split()))
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600498/1600498 [04:24<00:00, 6052.77it/s]


In [26]:
# Preview the first five Porter-stemmed tweets.
df['text_porter_stemmed'].head()

0                         anyway im get of for a while
1           my red apach isnt feelin too well thi morn
2    you should be it great friday will be great to...
3    it pm and i dont wanna sleep so i debat with m...
4                whi doe twitter eat my dm ? not happi
Name: text_porter_stemmed, dtype: object

In [27]:
# Lancaster-stem each whitespace-separated word of the digit-free text
# (stop words are still present in this column).
df['text_lancaster_stemmed'] = df['text_no_numerals'].progress_apply(
    lambda sentence: ' '.join(map(lancaster.stem, sentence.split()))
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600498/1600498 [03:26<00:00, 7740.36it/s]


In [28]:
# Preview the first five Lancaster-stemmed tweets.
df['text_lancaster_stemmed'].head()

0                          anyway im get of for a whil
1            my red apach isnt feelin too wel thi morn
2      you should be it gre friday wil be gre tooooooo
3    it pm and i dont wann sleep so i deb with myse...
4                  why doe twit eat my dms ? not happy
Name: text_lancaster_stemmed, dtype: object

#### lemmatize text

In [29]:
# Single lemmatizer instance, reused for every word in the lemmatization step.
wordnet_lemmatizer = WordNetLemmatizer()

In [30]:
# Lemmatize each whitespace-separated word of the digit-free text, using the
# lemmatizer's default arguments (stop words are still present in this column).
df['text_lemmatized'] = df['text_no_numerals'].progress_apply(
    lambda sentence: ' '.join(map(wordnet_lemmatizer.lemmatize, sentence.split()))
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1600498/1600498 [01:07<00:00, 23655.53it/s]


In [31]:
# Preview the first five lemmatized tweets.
df['text_lemmatized'].head()

0                     anyway im getting of for a while
1      my red apache isnt feelin too well this morning
2    you should be it great friday will be great to...
3    it pm and i dont wanna sleep so i debated with...
4                why doe twitter eat my dm ? not happy
Name: text_lemmatized, dtype: object

In [32]:
# Re-check for missing values now that all derived text columns exist.
df.loc[df.isna().any(axis='columns')]

Unnamed: 0,id,text,cat,text_no_punc,text_no_numerals,text_no_sw,text_porter_stemmed,text_lancaster_stemmed,text_lemmatized


In [33]:
# Replace any missing value with an empty string before writing the CSVs.
# The null check in the previous cell displayed no rows, so this is a safety net.
df = df.fillna('')

In [34]:
# Split the combined frame back into its original parts using the `cat` tag.
train = df.loc[df['cat'].eq('train')]
test = df.loc[df['cat'].eq('test')]

In [35]:
# Write the prepped training split next to the raw input, without the index column.
train_prepped_path = input_path + 'train_data_prepped.csv'
train.to_csv(train_prepped_path, index=False)

In [36]:
# Write the prepped test split next to the raw input, without the index column.
test_prepped_path = input_path + 'test_data_prepped.csv'
test.to_csv(test_prepped_path, index=False)