# Text Processing
## Import

In [274]:
import pandas as pd
import numpy as np

## Read Data

In [351]:
# Location of the HappyDB cleaned_hm.csv file that this notebook processes.
DATA_URL = 'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/cleaned_hm.csv'
df = pd.read_csv(DATA_URL)

In [165]:
df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100535 entries, 0 to 100534
Data columns (total 9 columns):
hmid                     100535 non-null int64
wid                      100535 non-null int64
reflection_period        100535 non-null object
original_hm              100535 non-null object
cleaned_hm               100535 non-null object
modified                 100535 non-null bool
num_sentence             100535 non-null int64
ground_truth_category    14125 non-null object
predicted_category       100535 non-null object
dtypes: bool(1), int64(3), object(5)
memory usage: 6.2+ MB


## Preliminary Cleaning of Text

### Convert all letters to lower case

In [352]:
# Lower-case only the text (object-dtype) columns. Casting every column with
# astype(str) would turn missing values into the literal string 'nan' and
# convert the boolean/integer columns into strings; `.str.lower()` leaves NaN
# untouched and the non-text columns keep their original dtype.
df = df.apply(lambda col: col.str.lower() if col.dtype == object else col)

In [353]:
# Make sure the id and sentence-count columns are integers
# (a no-op when they already are).
int_cols = ['hmid', 'wid', 'num_sentence']
df[int_cols] = df[int_cols].astype(int)

In [227]:
df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,i went on a successful date with someone i fel...,i went on a successful date with someone i fel...,True,1,,affection
1,27674,2,24h,i was happy when my son got 90% marks in his e...,i was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,i went to the gym this morning and did yoga.,i went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,we had a serious talk with some friends of our...,we had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,i went with grandchildren to butterfly display...,i went with grandchildren to butterfly display...,True,1,,affection


### Remove punctuation, numbers, empty words and extra white space

In [354]:
import string

In [355]:
# Delete every ASCII punctuation character (string.punctuation) from the
# cleaned text; the result is stored in a new 'text' column.
punctuation_table = str.maketrans('', '', string.punctuation)
df['text'] = df['cleaned_hm'].apply(lambda sentence: sentence.translate(punctuation_table))

In [356]:
# Remove every run of digits. `regex=True` is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the numbers in place. The raw string avoids the invalid
# "\d" escape-sequence warning.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)

In [357]:
# Collapse every run of whitespace into a single space by splitting on
# whitespace and re-joining the pieces.
df['text'] = df['text'].str.split().str.join(' ')

In [358]:
# Trim leading and trailing whitespace from each entry.
df['text'] = df['text'].str.strip()

In [245]:
df[['cleaned_hm', 'text']].head()

Unnamed: 0,cleaned_hm,text
0,i went on a successful date with someone i fel...,i went on a successful date with someone i fel...
1,i was happy when my son got 90% marks in his e...,i was happy when my son got marks in his exami...
2,i went to the gym this morning and did yoga.,i went to the gym this morning and did yoga
3,we had a serious talk with some friends of our...,we had a serious talk with some friends of our...
4,i went with grandchildren to butterfly display...,i went with grandchildren to butterfly display...


### Remove stopwords

In [359]:
from nltk.corpus import stopwords

In [360]:
# Build the English stop-word collection once, as a set. Calling
# stopwords.words('english') inside the comprehension fetches the whole list
# again for every single word, and a list makes each membership test linear.
english_stopwords = set(stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in english_stopwords)
)

In [361]:
df[['cleaned_hm', 'text']].head()

Unnamed: 0,cleaned_hm,text
0,i went on a successful date with someone i fel...,went successful date someone felt sympathy con...
1,i was happy when my son got 90% marks in his e...,happy son got marks examination
2,i went to the gym this morning and did yoga.,went gym morning yoga
3,we had a serious talk with some friends of our...,serious talk friends flaky lately understood g...
4,i went with grandchildren to butterfly display...,went grandchildren butterfly display crohn con...


### Remove words that don't have significant information

In [278]:
import nltk

In [363]:
# Tag all rows with one pos_tag_sents call instead of one nltk.pos_tag call
# per row: pos_tag sets up the tagger on every call, which dominates the run
# time on a corpus of this size. The result is the same Series of
# [(token, tag), ...] lists, aligned to df's index.
tag_df = pd.Series(
    nltk.pos_tag_sents(df['text'].str.split().tolist()),
    index=df.index,
    name='text',
    dtype=object,
)

In [374]:
# Part-of-speech tags whose tokens are dropped: conjunctions (CC),
# prepositions (IN), existential "there" (EX), predeterminers (PDT),
# pronouns (PRP, PRP$) and wh-words (WDT, WP, WP$, WRB).
excluded_tags = ['CC', 'IN', 'EX', 'PDT', 'PRP', 'PRP$', 'WDT', 'WP', 'WP$', 'WRB']
tag_df = tag_df.apply(
    lambda tagged: [pair for pair in tagged if pair[1] not in excluded_tags]
)

In [380]:
# Rebuild each sentence from the tokens that survived the tag filter,
# discarding the tags themselves.
df['text'] = tag_df.apply(lambda tagged: ' '.join(token for token, _ in tagged))

In [382]:
# Extra words to drop from every sentence, on top of the stop words.
words = [
    "happy", "ago", "yesterday", "lot",
    "today", "months", "month",
    "happier", "happiest", "last", "week",
    "past", "someone", "went", "felt", "came",
]

In [383]:
# Drop every token that appears in the custom `words` list.
df['text'] = df['text'].apply(
    lambda sentence: ' '.join(filter(lambda token: token not in words, sentence.split()))
)

In [385]:
df[['cleaned_hm', 'text']].head(10)

Unnamed: 0,cleaned_hm,text
0,i went on a successful date with someone i fel...,successful date sympathy connection
1,i was happy when my son got 90% marks in his e...,son got marks examination
2,i went to the gym this morning and did yoga.,gym morning yoga
3,we had a serious talk with some friends of our...,serious talk friends flaky lately understood g...
4,i went with grandchildren to butterfly display...,grandchildren butterfly display crohn conserva...
5,i meditated last night.,meditated night
6,"i made a new recipe for peasant bread, and it ...",made new recipe peasant bread spectacular
7,i got gift from my elder brother which was rea...,got gift elder brother really surprising
8,yesterday my moms birthday so i enjoyed,moms birthday enjoyed
9,watching cupcake wars with my three teen children,watching cupcake wars three teen children


### Stem words

In [386]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
porter = PorterStemmer()

In [387]:
def stemPorter(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The sentence is split with `word_tokenize`, each token is passed through
    the notebook-level `porter` stemmer, and the stems are joined with single
    spaces. No trailing space is appended; an empty sentence yields ''.
    """
    return " ".join(porter.stem(word) for word in word_tokenize(sentence))

In [388]:
# Preview only: take the first rows *before* stemming so the whole column
# is not processed just to display five results.
df['text'].head().apply(stemPorter)

0                       success date sympathi connect 
1                                 son got mark examin 
2                                       gym morn yoga 
3    seriou talk friend flaki late understood good ...
4    grandchildren butterfli display crohn conserva...
Name: text, dtype: object

Not good: PorterStemmer generates stems that are not actual English words (e.g. "sympathi", "examin", "seriou").

In [389]:
from nltk.stem import WordNetLemmatizer 
wnl = WordNetLemmatizer() 

In [390]:
def wnlSentence(sentence):
    """Return `sentence` with every token replaced by its WordNet lemma.

    The sentence is split with `word_tokenize`, each token is passed through
    the notebook-level `wnl` lemmatizer, and the lemmas are joined with single
    spaces. No trailing space is appended; an empty sentence yields ''.
    """
    # NOTE(review): lemmatize() is called without a part-of-speech argument;
    # presumably tokens are then treated as nouns, so verb forms are left
    # unchanged -- confirm this is intended.
    return " ".join(wnl.lemmatize(word) for word in word_tokenize(sentence))

In [391]:
# Preview only: take the first rows *before* lemmatizing so the whole column
# is not processed just to display five results.
df['text'].head().apply(wnlSentence)

0                 successful date sympathy connection 
1                            son got mark examination 
2                                    gym morning yoga 
3    serious talk friend flaky lately understood go...
4     grandchild butterfly display crohn conservatory 
Name: text, dtype: object

WordNetLemmatizer performs better: its output consists of real English words, so it is used for the final text.

In [392]:
# Replace the working text with its lemmatized form.
lemmatized_text = df['text'].apply(wnlSentence)
df['text'] = lemmatized_text

In [393]:
df[['cleaned_hm', 'text']].head()

Unnamed: 0,cleaned_hm,text
0,i went on a successful date with someone i fel...,successful date sympathy connection
1,i was happy when my son got 90% marks in his e...,son got mark examination
2,i went to the gym this morning and did yoga.,gym morning yoga
3,we had a serious talk with some friends of our...,serious talk friend flaky lately understood go...
4,i went with grandchildren to butterfly display...,grandchild butterfly display crohn conservatory


In [394]:
# Write the processed frame to disk without the index column.
output_path = r'../data/processed_data.csv'
df.to_csv(output_path, index=False)