## EDA on the ISEAR dataset for emotion detection from text

### The dataset consists of 7 categories of emotion
- joy
- fear
- anger
- sadness
- disgust
- shame
- guilt

#### The dataset is roughly balanced (between 1,040 and 1,082 texts per emotion).

In [1]:
import numpy as np
import pandas as pd
# For visualizations
import matplotlib.pyplot as plt
%matplotlib inline
# For regular expressions
import re
# For handling string



ModuleNotFoundError: No module named 'numpy'

In [3]:
# Load the ISEAR data. Column names are passed explicitly, so pandas reads
# the first line of the file as data rather than as a header.
isear_csv_path = '../data/external/ISEAR.csv'
column_names = ['index', 'emotions', 'texts']
df = pd.read_csv(isear_csv_path, names=column_names)

In [4]:
df.shape

(7446, 3)

In [5]:
df.head()

Unnamed: 0,index,emotions,texts
0,0,joy,On days when I feel close to my partner and ot...
1,1,fear,Every time I imagine that someone I love or I ...
2,2,anger,When I had been obviously unjustly treated and...
3,3,sadness,When I think about the short time that we live...
4,4,disgust,At a gathering I found myself involuntarily si...


In [6]:
# Count the missing (null) values in each column.
df.isna().sum()

index       0
emotions    0
texts       0
dtype: int64

In [7]:
df.shape

(7446, 3)

## Data cleaning

In [8]:
# Discard the 'index' column read from the CSV; only 'emotions' and 'texts'
# are used from here on.
# errors='ignore' keeps this cell re-runnable: without it a second run
# raises KeyError because the column has already been removed.
df = df.drop(columns=['index'], errors='ignore')

In [9]:
df['emotions'].unique()

array(['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt'],
      dtype=object)

In [10]:
df['emotions'].value_counts()

joy        1082
sadness    1074
anger      1069
fear       1063
disgust    1059
shame      1059
guilt      1040
Name: emotions, dtype: int64

In [11]:
df['texts'].unique()

array(['On days when I feel close to my partner and other friends.   \nWhen I feel at peace with myself and also experience a close  \ncontact with people whom I regard greatly.',
       'Every time I imagine that someone I love or I could contact a  \nserious illness, even death.',
       'When I had been obviously unjustly treated and had no possibility  \nof elucidating this.',
       ...,
       "A few days back I was waiting for the bus at the bus stop.   \nBefore getting into the bus I had prepared the exact amount of  \ncoins to pay for the bus fair and when I got into the bus I put  \nthese coins into the box meant to collect the bus fair.  I  \nthought that I had paid and wanted to get inside.  However the  \nbus driver called me and asked me in an impolite way if the coins  \nwere stuck at the opening  of the box.  He had not seen me paying  \nand there wasn't a stack of coins in the box.  I could not  \nunderstand this and the driver kept questioning me.  He made me  \nfeel 

## Removing unnecessary symbols from the phrases

In [12]:
def clean_phrase(dataset):
    """Blank out everything except letters, whitespace and apostrophes.

    Every run of characters that is not a letter a-z (matched
    case-insensitively), a whitespace character or an apostrophe is
    replaced by one single space. Existing whitespace is left as it is,
    so the result can contain consecutive spaces.

    Parameters
    ----------
    dataset : str
        One text string (despite the parameter name, not a whole dataset).

    Returns
    -------
    str
        The text with digits, punctuation and other symbols replaced by
        spaces.
    """
    # Raw string: in a normal string literal "\s" is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r"[^a-z\s']+", " ", dataset, flags=re.IGNORECASE)
# Run the symbol clean-up over every text in the column.
df['texts'] = df['texts'].map(clean_phrase)

In [13]:
df.shape

(7446, 2)

In [14]:
# Lower-case every word and re-join the words with single spaces, which
# also collapses the line breaks and repeated spaces of the raw texts.
df['texts'] = df['texts'].map(
    lambda text: ' '.join(word.lower() for word in text.split())
)

In [15]:
df['texts'][1]

'every time i imagine that someone i love or i could contact a serious illness even death'

## Data preprocessing

### Tokenize the sentence

In [16]:
import nltk


In [17]:
# Fetch the NLTK data packages used further down in this notebook.
# 'punkt'   - presumably the tokenizer data behind nltk.word_tokenize
#             (TODO confirm the package name for the installed NLTK version)
# 'wordnet' - presumably the lexical database behind WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/sushmita/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sushmita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
# Split every cleaned text into a list of word tokens.
# The tokenizer is applied to the 'texts' column directly; this yields the
# same token lists as a row-wise DataFrame.apply(axis=1) but does not have
# to build a Series object for each row.
df['texts'] = df['texts'].apply(nltk.word_tokenize)

In [19]:
df['texts'][0]

['on',
 'days',
 'when',
 'i',
 'feel',
 'close',
 'to',
 'my',
 'partner',
 'and',
 'other',
 'friends',
 'when',
 'i',
 'feel',
 'at',
 'peace',
 'with',
 'myself',
 'and',
 'also',
 'experience',
 'a',
 'close',
 'contact',
 'with',
 'people',
 'whom',
 'i',
 'regard',
 'greatly']

### Removing stopwords

In [20]:
from nltk.corpus import stopwords

# Fetch the stop-word corpus, then load the English word list from it.
nltk.download('stopwords')
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sushmita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Drop every token that is an English stop word.
# The membership test runs once per token, so look tokens up in a set
# instead of scanning the stop-word list each time.
stop_word_set = set(stop_words)
df['texts'] = df['texts'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)

In [90]:
df['texts'][0]

['days',
 'feel',
 'close',
 'partner',
 'friends',
 'feel',
 'peace',
 'also',
 'experience',
 'close',
 'contact',
 'people',
 'regard',
 'greatly']

### Stemming 

In [69]:
# Not advised: Porter stemming cuts words down to crude stems
# (e.g. 'peace' -> 'peac', 'people' -> 'peopl').
# NOTE(review): this cell overwrites df['texts'] with the stems, so the
# lemmatization step after it would then receive stems instead of words.
# Its execution count is out of sequence and the later recorded outputs
# still contain 'peace' / 'people', so it appears to have been skipped in
# the final run - skip it when running the notebook top to bottom.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
df['texts'] = df['texts'].map(
    lambda tokens: [stemmer.stem(token) for token in tokens]
)

In [70]:
df['texts'][0]

['day',
 'feel',
 'close',
 'partner',
 'friend',
 'feel',
 'peac',
 'also',
 'experi',
 'close',
 'contact',
 'peopl',
 'regard',
 'greatli']

### Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer

# Reduce every token to its lemma, keeping one token list per text.
lemmatizer = WordNetLemmatizer()
df['texts'] = df['texts'].map(
    lambda tokens: [lemmatizer.lemmatize(token) for token in tokens]
)

In [23]:
df['texts'][0]

['day',
 'feel',
 'close',
 'partner',
 'friend',
 'feel',
 'peace',
 'also',
 'experience',
 'close',
 'contact',
 'people',
 'regard',
 'greatly']

In [24]:
# Glue each token list back into one space-separated string.
df['texts'] = df['texts'].map(' '.join)

In [25]:
df['texts'][0]

'day feel close partner friend feel peace also experience close contact people regard greatly'

In [26]:
df.head()

Unnamed: 0,emotions,texts
0,joy,day feel close partner friend feel peace also ...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility elucida...
3,sadness,think short time live relate period life think...
4,disgust,gathering found involuntarily sitting next two...


In [27]:
import cufflinks as cf

# NOTE(review): go_offline() is immediately followed by a config call that
# passes offline=False and world_readable=True - confirm this pairing is
# intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

# Histogram of the emotion labels, i.e. how many texts carry each label.
hist_options = dict(
    kind='hist',
    bins=100,
    xTitle='emotions',
    linecolor='black',
    yTitle='count',
    title='Emotion data distribution',
)
df['emotions'].iplot(**hist_options)

In [28]:
# Bar chart of the number of texts recorded for each emotion.
# The title and x-axis label describe the emotion labels that are actually
# plotted; only the 'texts' column is aggregated because it is the only
# one drawn.
df.groupby('emotions')['texts'].count().iplot(
    kind='bar',
    xTitle='Emotion',
    yTitle='Count',
    title='Number of texts per emotion',
    linecolor='black',
    opacity=0.8,
)