In [130]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
#os.environ['KERAS_BACKEND']='theano' # Why theano why not
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
%matplotlib inline
import neattext as nfx

In [131]:
def clean_str(string):
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()


In [132]:
MAX_SEQUENCE_LENGTH = 60
MAX_NB_WORDS = 75000
EMBEDDING_DIM = 50
VALIDATION_SPLIT = 0.2

In [133]:
# reading data
#df = pd.read_csv("dataset/Emotion-Stimulus.csv")
#df=pd.read_csv('dataset/dailydialog.csv')
df=pd.read_csv('dataset/text_emotion.csv')

df.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [134]:
df['Text'] = df['content'].apply(nfx.remove_special_characters)

In [135]:
df['Text']=df['Text'].apply(nfx.remove_punctuations)

In [136]:
df['Text']=df['Text'].apply(nfx.remove_stopwords)

In [137]:
df['Text']=df['Text'].apply(nfx.remove_userhandles)

In [138]:
df = df.reset_index(drop=True)
print('Shape of dataset ',df.shape)
print(df.columns)
print('No. of unique classes',len(set(df['sentiment'])))

Shape of dataset  (40000, 5)
Index(['tweet_id', 'sentiment', 'author', 'content', 'Text'], dtype='object')
No. of unique classes 13


In [139]:
df.head()
df['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [140]:
sent_to_id  = {"neutral":0, "worry":1, "happiness":2, "sadness":3, "love":4, "surprise":5, "fun":6, "relief":7, "hate":8, "empty":9, "enthusiasm":10, "boredom":11, "anger":12}

In [141]:
df["sentiment_id"] = df['sentiment'].map(sent_to_id)

In [142]:
df

Unnamed: 0,tweet_id,sentiment,author,content,Text,sentiment_id
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...,9
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,Layin n bed headache ughhhhwaitin,3
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,Funeral ceremonygloomy friday,3
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants hang friends SOON,10
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,dannycastillo want trade Houston tickets,0
...,...,...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor,JohnLloydTaylor,0
39996,1753919001,love,drapeaux,Happy Mothers Day All my love,Happy Mothers Day love,4
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...,Happy Mothers Day mommies woman man long youre...,4
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,niariley WASSUP BEAUTIFUL FOLLOW PEEP NEW HIT ...,2


In [143]:
targetnum=sorted(set(df['sentiment_id']))
targetnum_to_id = dict((note, number) for number, note in enumerate(targetnum))

def fun(i):
    return targetnum_to_id[i]

df['sentiment_id']=df['sentiment_id'].apply(fun)

In [144]:
texts = []
labels = []


for idx in range(df.Text.shape[0]):
    text = BeautifulSoup(df.Text[idx])
    texts.append(clean_str(str(text.get_text().encode())))

for idx in df['sentiment_id']:
    labels.append(idx)

In [145]:
df.head()

Unnamed: 0,tweet_id,sentiment,author,content,Text,sentiment_id
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,tiffanylue know listenin bad habit earlier sta...,9
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,Layin n bed headache ughhhhwaitin,3
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,Funeral ceremonygloomy friday,3
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants hang friends SOON,10
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,dannycastillo want trade Houston tickets,0


In [146]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 57359


In [147]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

Shape of Data Tensor: (40000, 60)
Shape of Label Tensor: (40000, 13)


In [148]:
embeddings_index = {}
f = open('glove.6B.50d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

Total 400000 word vectors in Glove 6B 100d.


In [149]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [158]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_cov1= Conv1D(128, 3, activation='relu')(embedded_sequences)
l_pool1 = MaxPooling1D(3)(l_cov1)
l_cov2 = Conv1D(128, 3, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(3)(l_cov2)
l_cov3 = Conv1D(128, 3, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(3)(l_cov3)  # global max pooling
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(len(targetnum), activation='softmax')(l_dense)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Simplified convolutional neural network")
model.summary()
cp=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)

Simplified convolutional neural network
Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 60)]              0         
                                                                 
 embedding_3 (Embedding)     (None, 60, 50)            2868000   
                                                                 
 conv1d_34 (Conv1D)          (None, 58, 128)           19328     
                                                                 
 max_pooling1d_32 (MaxPoolin  (None, 19, 128)          0         
 g1D)                                                            
                                                                 
 conv1d_35 (Conv1D)          (None, 17, 128)           49280     
                                                                 
 max_pooling1d_33 (MaxPoolin  (None, 5, 128)           0         
 g1D)              

In [159]:
Cnn_1st=model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=5, batch_size=128,callbacks=[cp])

Epoch 1/5
Epoch 1: val_acc improved from -inf to 0.22000, saving model to model_cnn.hdf5
Epoch 2/5
Epoch 2: val_acc did not improve from 0.22000
Epoch 3/5
Epoch 3: val_acc improved from 0.22000 to 0.22013, saving model to model_cnn.hdf5
Epoch 4/5
Epoch 4: val_acc did not improve from 0.22013
Epoch 5/5
Epoch 5: val_acc did not improve from 0.22013
