## Exploratory Data Analysis (EDA) on the ISEAR dataset

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import math

In [57]:
# Location of the ISEAR CSV export; kept in one named constant so it is easy to repoint.
ISEAR_CSV_PATH = '/home/nishesh/Desktop/Work/Trainings/ISEAR.csv'

# The file is read without a header row; the first column becomes the index
# and the remaining columns are named Emotion and Sentence.
df = pd.read_csv(
    ISEAR_CSV_PATH,
    header=None,
    names=['Emotion', 'Sentence'],
    index_col=0,
)

In [58]:
# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,Emotion,Sentence
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [59]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(7446, 2)

In [60]:
# Count missing values per column (all zeros means nothing needs imputing).
df.isna().sum()

Emotion     0
Sentence    0
dtype: int64

In [61]:
# Check whether the classes are balanced: number of sentences per emotion label.
df.groupby('Emotion').agg(len)

Unnamed: 0_level_0,Sentence
Emotion,Unnamed: 1_level_1
anger,1069
disgust,1059
fear,1063
guilt,1040
joy,1082
sadness,1074
shame,1059


In [62]:
# Sample sentence containing a contraction, used to track the cleaning steps below.
df.loc[2552, 'Sentence']

"I am afraid if I'll finish the term."

In [63]:
def expand_contractions(text):
    """Expand common English contractions in ``text``.

    Parameters
    ----------
    text : str
        Input sentence; its casing is left untouched.

    Returns
    -------
    str
        ``text`` with contractions replaced by their expanded forms,
        e.g. ``"I'm"`` -> ``"I am"``, ``"don't"`` -> ``"do not"``,
        ``"won't"`` -> ``"will not"``.

    Notes
    -----
    The rules are purely textual: ``'s`` is always expanded to ``" is"``
    (so possessives such as ``"John's"`` become ``"John is"``) and ``'d``
    is always expanded to ``" would"``.
    """
    # Irregular negations must be rewritten before the generic "n't" rule,
    # which would otherwise leave the stems "wo", "ca" and "sha" behind.
    # The captured first letter preserves the original capitalisation.
    text = re.sub(r"\b([Ww])on\'t\b", r"\1ill not", text)
    text = re.sub(r"\b([Cc])an\'t\b", r"\1an not", text)
    text = re.sub(r"\b([Ss])han\'t\b", r"\1hall not", text)

    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    # Matches both "I'm" and the misspelling "I'am"; the word boundary keeps
    # an apostrophe followed by a longer word starting with "m" untouched.
    text = re.sub(r"\'a?m\b", " am", text)
    return text

In [64]:
# Replace contractions with their expanded forms in every sentence.
df['Sentence'] = df['Sentence'].map(expand_contractions)

In [65]:
# Same sample sentence after contraction expansion.
df.loc[2552, 'Sentence']

'I am afraid if I will finish the term.'

In [66]:
# Normalise casing: convert every sentence to lowercase.
df['Sentence'] = df['Sentence'].map(str.lower)

In [67]:
# Same sample sentence after lowercasing.
df.loc[2552, 'Sentence']

'i am afraid if i will finish the term.'

In [68]:
# Sample sentence containing digits, punctuation and line breaks.
df.loc[25, 'Sentence']

'when my 2 year old son climbed up and sat on the 7th floor  \nbalcony with his legs hanging out.  he was holding on tightly to  \nthe upper railing of the balcony but he could have easily lost  \nhis balance when he sat down.'

In [69]:
# Remove digits and words containing digits (e.g. "2", "7th").
# The pattern is a raw string: "\w" and "\d" are not valid Python string
# escapes, so a plain literal triggers an invalid-escape warning on current
# Python versions.
df['Sentence'] = df['Sentence'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))

In [70]:
# Sample sentence after digit removal.
df.loc[25, 'Sentence']

'when my  year old son climbed up and sat on the  floor  \nbalcony with his legs hanging out.  he was holding on tightly to  \nthe upper railing of the balcony but he could have easily lost  \nhis balance when he sat down.'

In [71]:
# Strip every ASCII punctuation character listed in string.punctuation.
# The character class is built once instead of once per sentence.
punctuation_class = '[{}]'.format(re.escape(string.punctuation))
df['Sentence'] = df['Sentence'].map(
    lambda sentence: re.sub(punctuation_class, '', sentence)
)

In [72]:
# Sample sentence after punctuation removal.
df.loc[25, 'Sentence']

'when my  year old son climbed up and sat on the  floor  \nbalcony with his legs hanging out  he was holding on tightly to  \nthe upper railing of the balcony but he could have easily lost  \nhis balance when he sat down'

In [73]:
# Collapse each run of consecutive spaces into a single space.
df['Sentence'] = df['Sentence'].map(
    lambda sentence: re.sub(' +', ' ', sentence)
)

In [74]:
# Sample sentence after collapsing repeated spaces.
df.loc[25, 'Sentence']

'when my year old son climbed up and sat on the floor \nbalcony with his legs hanging out he was holding on tightly to \nthe upper railing of the balcony but he could have easily lost \nhis balance when he sat down'

In [75]:
# Remove \n
# Every whitespace run (including line breaks) is replaced by ONE space rather
# than deleted, so words separated only by a newline ("foo\nbar") are not fused
# into a single token. strip() drops the space a leading/trailing newline
# would otherwise leave behind.
df['Sentence'] = df['Sentence'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [76]:
# Sample sentence after line breaks have been removed.
df.loc[25, 'Sentence']

'when my year old son climbed up and sat on the floor balcony with his legs hanging out he was holding on tightly to the upper railing of the balcony but he could have easily lost his balance when he sat down'

In [78]:
# Requires the spaCy package and its small English model to be installed in the
# kernel's environment (e.g. `pip install spacy` followed by
# `python -m spacy download en_core_web_sm`); otherwise this cell fails with
# ModuleNotFoundError.
import spacy

# The parser and NER components are disabled because only lemmas and the
# stop-word flag are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Drop stop words and keep the lemma of every remaining token.
# nlp.pipe processes the sentences in batches instead of invoking the whole
# pipeline once per row; the resulting list is assigned positionally, one
# entry per row of df.
df['Lemmatized'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(df['Sentence'])
]
df.head()

ModuleNotFoundError: No module named 'spacy'