# Text Processing
## Import

In [1]:
import pandas as pd
import numpy as np

In [6]:
import matplotlib as plt
import seaborn as sns
plt.style.use('classic')
sns.set_style('whitegrid')
%matplotlib inline

## Read Data

In [224]:
# Location of the cleaned HappyDB moments file read below.
HAPPYDB_CSV_URL = 'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/cleaned_hm.csv'
df = pd.read_csv(HAPPYDB_CSV_URL)

In [165]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


In [17]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100535 entries, 0 to 100534
Data columns (total 9 columns):
hmid                     100535 non-null int64
wid                      100535 non-null int64
reflection_period        100535 non-null object
original_hm              100535 non-null object
cleaned_hm               100535 non-null object
modified                 100535 non-null bool
num_sentence             100535 non-null int64
ground_truth_category    14125 non-null object
predicted_category       100535 non-null object
dtypes: bool(1), int64(3), object(5)
memory usage: 6.2+ MB


## Preliminary Cleaning of Text

### Convert all letters to lower case

In [225]:
# Lower-case only the text (object-dtype) columns. Casting the whole frame to
# str would also turn missing values into the literal string 'nan' and would
# convert the boolean and integer columns into strings.
text_columns = df.select_dtypes(include='object').columns
df[text_columns] = df[text_columns].apply(lambda col: col.str.lower())

In [226]:
# Make sure the identifier and sentence-count columns are integer-typed.
int_columns = ['hmid', 'wid', 'num_sentence']
df[int_columns] = df[int_columns].astype(int)

In [227]:
# Preview the frame after lower-casing.
df.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,i went on a successful date with someone i fel...,i went on a successful date with someone i fel...,True,1,,affection
1,27674,2,24h,i was happy when my son got 90% marks in his e...,i was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,i went to the gym this morning and did yoga.,i went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,we had a serious talk with some friends of our...,we had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,i went with grandchildren to butterfly display...,i went with grandchildren to butterfly display...,True,1,,affection


### Remove punctuation, numbers, empty words and extra white space

In [228]:
import string

In [241]:
# Delete every character listed in string.punctuation from each moment and
# store the result in a new 'text' column.
drop_punctuation = str.maketrans('', '', string.punctuation)
df['text'] = df['cleaned_hm'].apply(lambda moment: moment.translate(drop_punctuation))

In [242]:
# Remove runs of digits. The pattern is a raw string (a plain '\d' is an
# invalid escape sequence) and regex=True is passed explicitly, because
# pandas >= 2.0 treats the pattern as a literal string by default and would
# otherwise leave the digits in place.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)

In [243]:
# Split on any whitespace and re-join with single spaces, which collapses
# repeated whitespace inside each moment.
df['text'] = df['text'].str.split().str.join(' ')

In [244]:
# Trim leading and trailing whitespace from each moment.
df['text'] = df['text'].str.strip()

In [245]:
# Side-by-side preview of the source sentence and its cleaned version.
preview_columns = ['cleaned_hm', 'text']
df[preview_columns].head()

Unnamed: 0,cleaned_hm,text
0,i went on a successful date with someone i fel...,i went on a successful date with someone i fel...
1,i was happy when my son got 90% marks in his e...,i was happy when my son got marks in his exami...
2,i went to the gym this morning and did yoga.,i went to the gym this morning and did yoga
3,we had a serious talk with some friends of our...,we had a serious talk with some friends of our...
4,i went with grandchildren to butterfly display...,i went with grandchildren to butterfly display...


### Remove stopwords

In [246]:
from nltk.corpus import stopwords

In [247]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# generator rebuilds the word list for every token and scans it linearly; a
# set gives the same filtering result with constant-time membership tests.
english_stopwords = set(stopwords.words('english'))
df['text'] = df['text'].apply(
    lambda moment: ' '.join(word for word in moment.split() if word not in english_stopwords)
)

In [248]:
# Compare the source sentence with the text left after stop-word removal.
preview_columns = ['cleaned_hm', 'text']
df[preview_columns].head()

Unnamed: 0,cleaned_hm,text
0,i went on a successful date with someone i fel...,went successful date someone felt sympathy con...
1,i was happy when my son got 90% marks in his e...,happy son got marks examination
2,i went to the gym this morning and did yoga.,went gym morning yoga
3,we had a serious talk with some friends of our...,serious talk friends flaky lately understood g...
4,i went with grandchildren to butterfly display...,went grandchildren butterfly display crohn con...


### Stem words

In [249]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
# Stemmer instance used by stemPorter.
porter = PorterStemmer()

In [250]:
def stemPorter(sentence):
    """Return `sentence` with every token replaced by its Porter stem.

    The sentence is tokenized with `word_tokenize`, each token is stemmed with
    the notebook-level `porter` stemmer, and the stems are joined with single
    spaces. No trailing space is emitted; an empty sentence yields ''.
    """
    return " ".join(porter.stem(word) for word in word_tokenize(sentence))

In [251]:
# Only five rows are displayed, so stem just those rows instead of the whole column.
df['text'].head().apply(stemPorter)

0      went success date someon felt sympathi connect 
1                           happi son got mark examin 
2                                  went gym morn yoga 
3    seriou talk friend flaki late understood good ...
4    went grandchildren butterfli display crohn con...
Name: text, dtype: object

Not good: PorterStemmer generates stems that are not actual English words (e.g. "happi", "examin", "butterfli").

In [252]:
from nltk.stem import WordNetLemmatizer 
# Lemmatizer instance used by wnlSentence.
wnl = WordNetLemmatizer() 

In [253]:
def wnlSentence(sentence):
    """Return `sentence` with every token replaced by its WordNet lemma.

    The sentence is tokenized with `word_tokenize`, each token is passed to
    the notebook-level `wnl` lemmatizer, and the lemmas are joined with single
    spaces. No trailing space is emitted; an empty sentence yields ''.
    """
    return " ".join(wnl.lemmatize(word) for word in word_tokenize(sentence))

In [255]:
# Only five rows are displayed, so lemmatize just those rows instead of the whole column.
df['text'].head().apply(wnlSentence)

0    went successful date someone felt sympathy con...
1                      happy son got mark examination 
2                               went gym morning yoga 
3    serious talk friend flaky lately understood go...
4    went grandchild butterfly display crohn conser...
Name: text, dtype: object

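WordNetLemmatizer performs better: it returns real English words (e.g. "marks" becomes "mark" and "grandchildren" becomes "grandchild"), so we use it for the final text.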

In [256]:
# Replace the working text with its lemmatized form.
lemmatized_text = df['text'].map(wnlSentence)
df['text'] = lemmatized_text

In [257]:
# Final check: source sentence next to the fully processed text.
preview_columns = ['cleaned_hm', 'text']
df[preview_columns].head()

Unnamed: 0,cleaned_hm,text
0,i went on a successful date with someone i fel...,went successful date someone felt sympathy con...
1,i was happy when my son got 90% marks in his e...,happy son got mark examination
2,i went to the gym this morning and did yoga.,went gym morning yoga
3,we had a serious talk with some friends of our...,serious talk friend flaky lately understood go...
4,i went with grandchildren to butterfly display...,went grandchild butterfly display crohn conser...


In [262]:
from pathlib import Path

# Write to a path relative to the working directory instead of an absolute
# path inside one user's home folder, so the notebook runs on other machines.
# NOTE(review): adjust OUTPUT_DIR if the notebook is not run from the project root.
OUTPUT_DIR = Path('data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_DIR / 'processed_data.csv')