# Text Processing
## Import

In [1]:
import pandas as pd
import numpy as np

In [6]:
import matplotlib as plt
import seaborn as sns
plt.style.use('classic')
sns.set_style('whitegrid')
%matplotlib inline

## Read Data

In [137]:
# Load the HappyDB cleaned-moments CSV straight from the public GitHub repo.
happydb_csv_url = 'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/cleaned_hm.csv'
df = pd.read_csv(happydb_csv_url)

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


In [17]:
# Show column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100535 entries, 0 to 100534
Data columns (total 9 columns):
hmid                     100535 non-null int64
wid                      100535 non-null int64
reflection_period        100535 non-null object
original_hm              100535 non-null object
cleaned_hm               100535 non-null object
modified                 100535 non-null bool
num_sentence             100535 non-null int64
ground_truth_category    14125 non-null object
predicted_category       100535 non-null object
dtypes: bool(1), int64(3), object(5)
memory usage: 6.2+ MB


## Preliminary Cleaning of Text

### Convert all letters to lowercase

In [138]:
# Lower-case only the free-text / categorical columns. Casting the whole frame
# to str would also stringify the integer ids and the boolean `modified` flag,
# and would turn missing `ground_truth_category` values into the literal 'nan'.
# `.str.lower()` leaves missing values untouched.
text_columns = [
    'reflection_period',
    'original_hm',
    'cleaned_hm',
    'ground_truth_category',
    'predicted_category',
]
df[text_columns] = df[text_columns].apply(lambda column: column.str.lower())

In [156]:
# Make sure the identifier / count columns are stored as integers.
integer_columns = ['hmid', 'wid', 'num_sentence']
df[integer_columns] = df[integer_columns].astype(int)

In [157]:
# Preview the frame again to check the result of the conversions above.
df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,i went on a successful date with someone i fel...,i went on a successful date with someone i fel...,True,1,,affection
1,27674,2,24h,i was happy when my son got 90% marks in his e...,i was happy when my son got marks in his exami...,True,1,,affection
2,27675,1936,24h,i went to the gym this morning and did yoga.,i went to the gym this morning and did yoga,True,1,,exercise
3,27676,206,24h,we had a serious talk with some friends of our...,we had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,i went with grandchildren to butterfly display...,i went with grandchildren to butterfly display...,True,1,,affection


### Remove punctuation, numbers, empty words, and extra whitespace

In [54]:
import string

In [139]:
# Delete every ASCII punctuation character (string.punctuation) from each moment.
# Characters are dropped, not replaced by a space, so "well-known" -> "wellknown".
punctuation_table = str.maketrans('', '', string.punctuation)
df['cleaned_hm'] = df['cleaned_hm'].map(lambda text: text.translate(punctuation_table))

In [140]:
# Remove every run of digits. The pattern is a raw string (a bare '\d' is an
# invalid escape sequence) and regex=True is explicit: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave all digits in place.
df['cleaned_hm'] = df['cleaned_hm'].str.replace(r'\d+', '', regex=True)

In [152]:
# Collapse each run of whitespace into a single space; splitting on whitespace
# also discards leading/trailing blanks and any empty tokens.
df['cleaned_hm'] = df['cleaned_hm'].str.split().str.join(' ')

In [154]:
# Trim whitespace from both ends of every cleaned moment.
df['cleaned_hm'] = df['cleaned_hm'].str.strip()

In [155]:
# Compare the original text with the cleaned text side by side.
df[['original_hm', 'cleaned_hm']].head()

Unnamed: 0,original_hm,cleaned_hm
0,i went on a successful date with someone i fel...,i went on a successful date with someone i fel...
1,i was happy when my son got 90% marks in his e...,i was happy when my son got marks in his exami...
2,i went to the gym this morning and did yoga.,i went to the gym this morning and did yoga
3,we had a serious talk with some friends of our...,we had a serious talk with some friends of our...
4,i went with grandchildren to butterfly display...,i went with grandchildren to butterfly display...


### Remove stopwords

In [158]:
from nltk.corpus import stopwords

In [None]:
# Build the English stop-word set once. Evaluating stopwords.words('english')
# inside the comprehension rebuilt the list for every single word and made each
# membership test a linear scan; a set gives constant-time lookups.
# Requires the NLTK 'stopwords' corpus to be available locally.
english_stopwords = set(stopwords.words('english'))

# Each moment becomes a list of its non-stop-word tokens (not a string).
# NOTE(review): the cell expects strings, so it cannot be re-run on its own output.
df['cleaned_hm'] = df['cleaned_hm'].apply(
    lambda text: [word for word in text.split() if word not in english_stopwords]
)

In [None]:
# Compare the original text with the stop-word-filtered result.
df[['original_hm', 'cleaned_hm']].head()

### Stem words

In [35]:
# Record len() of each cleaned moment: a character count while `cleaned_hm`
# holds strings, a token count once the stop-word cell has turned it into lists.
# NOTE(review): this cell sits under the "Stem words" heading but does no stemming.
df['text_length'] = df['cleaned_hm'].map(len)