In [1]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
#os.environ['KERAS_BACKEND']='theano' # Why theano why not
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
%matplotlib inline
import neattext as nfx

In [2]:
def clean_str(string):
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


In [3]:
from IPython.display import Image
from IPython.core.display import HTML 

Image(url= "http://www.wildml.com/wp-content/uploads/2015/11/Screen-Shot-2015-11-06-at-12.05.40-PM.png")

In [4]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2

In [6]:
# reading data
df = pd.read_csv("dataset/Emotion-Stimulus.csv")
df = df.dropna()
df.head()

Unnamed: 0,Emotion,Text
0,happy,I suppose I am happy being so ` tiny' ; it mea...
1,happy,Lennox has always truly wanted to fight for th...
2,happy,"He was a professional musician now , still sen..."
3,happy,Holmes is happy having the freedom of the hous...
4,happy,I had problems with tutors trying to encourage...


In [7]:
df['Text'] = df['Text'].apply(nfx.remove_special_characters)

In [8]:
df['Text']=df['Text'].apply(nfx.remove_punctuations)

In [9]:
df['Text']=df['Text'].apply(nfx.remove_stopwords)

In [10]:
df['Text']=df['Text'].apply(nfx.remove_userhandles)

In [11]:
df = df.reset_index(drop=True)
print('Shape of dataset ',df.shape)
print(df.columns)
print('No. of unique classes',len(set(df['Emotion'])))

Shape of dataset  (2414, 2)
Index(['Emotion', 'Text'], dtype='object')
No. of unique classes 7


In [21]:
df.head()
df['Emotion'].value_counts()

sad         575
anger       483
happy       479
fear        423
surprise    213
shame       146
disgust      95
Name: Emotion, dtype: int64

In [22]:
sent_to_id  = {"sad":0, "anger":1, "happy":2, "fear":3, "surprise":4, "shame":5, "disgust":6}

In [23]:
df["sentiment_id"] = df['Emotion'].map(sent_to_id)

In [24]:
df

Unnamed: 0,Emotion,Text,sentiment_id
0,happy,suppose happy tiny means able surprise people ...,2
1,happy,Lennox truly wanted fight world title happy ta...,2
2,happy,professional musician sensitive happy loved,2
3,happy,Holmes happy freedom house,2
4,happy,problems tutors trying encourage diversity wor...,2
...,...,...,...
2409,shame,gets real humiliated leave,5
2410,shame,aimed higher status jobs felt humiliated unemp...,5
2411,shame,cursed lack selfcontrol knew old biddies seen ...,5
2412,shame,Ive thought forget happened comes feel guilty ...,5


In [25]:
targetnum=sorted(set(df['sentiment_id']))
targetnum_to_id = dict((note, number) for number, note in enumerate(targetnum))

def fun(i):
    return targetnum_to_id[i]

df['sentiment_id']=df['sentiment_id'].apply(fun)

In [26]:
texts = []
labels = []


for idx in range(df.Text.shape[0]):
    text = BeautifulSoup(df.Text[idx])
    texts.append(clean_str(str(text.get_text().encode())))

for idx in df['sentiment_id']:
    labels.append(idx)

In [27]:
df.head()

Unnamed: 0,Emotion,Text,sentiment_id
0,happy,suppose happy tiny means able surprise people ...,2
1,happy,Lennox truly wanted fight world title happy ta...,2
2,happy,professional musician sensitive happy loved,2
3,happy,Holmes happy freedom house,2
4,happy,problems tutors trying encourage diversity wor...,2


In [28]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 8244


In [29]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

Shape of Data Tensor: (2414, 1000)
Shape of Label Tensor: (2414, 7)


In [30]:
embeddings_index = {}
f = open('glove.6B.50d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 100d.


In [31]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [32]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(256, 3, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(3)(l_cov1)
l_cov2 = Conv1D(256, 3, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(3)(l_cov2)
l_cov3 = Conv1D(256, 3, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(256, activation='relu')(l_flat)
preds = Dense(len(targetnum), activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Simplified convolutional neural network
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1000)]            0         
                                                                 
 embedding (Embedding)       (None, 1000, 50)          412250    
                                                                 
 conv1d (Conv1D)             (None, 998, 256)          38656     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 332, 256)         0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 330, 256)          196864    
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 110, 256)         0         
 1D)                 

In [None]:
Cnn_1st=model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=2, batch_size=2,callbacks=[cp])

Epoch 1/2
Epoch 1: val_acc improved from -inf to 0.21992, saving model to model_cnn.hdf5
Epoch 2/2
  5/966 [..............................] - ETA: 24s - loss: 2.1594 - acc: 0.2000