In [1]:
import os
import re
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from factory_func import plot_confusion_matrix
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, roc_curve

In [2]:
from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, Lambda, LSTM, ConvLSTM2D, TimeDistributed, Masking, Bidirectional
from keras.layers import Reshape, Flatten, Dropout, Concatenate, Activation, MaxPooling1D, GlobalAveragePooling1D, MaxPooling2D, GlobalAveragePooling2D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Model, load_model, Sequential
from keras.optimizers import Adam
from keras.utils import to_categorical
import keras.backend as K

Using TensorFlow backend.


In [3]:
import gzip
from gensim.models import Word2Vec

### Accuracy Records  
SVM: 45.5172%  
Dense NN: 44.0613%  
CNN: 45.0192%

In [4]:
# Load the dev/train/test CSVs that sit in the current working directory.
data_dir = os.getcwd()
dev_raw, train_raw, test_raw = (
    pd.read_csv(os.path.join(data_dir, '%s_sent_emo.csv' % split_name))
    for split_name in ('dev', 'train', 'test')
)

In [5]:
# Preview the first rows of the dev split to check the column layout.
dev_raw.head()

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,"Oh my God, hes lost it. Hes totally lost it.",Phoebe,sadness,negative,0,0,4,7,"00:20:57,256","00:21:00,049"
1,2,What?,Monica,surprise,negative,0,1,4,7,"00:21:01,927","00:21:03,261"
2,3,"Or! Or, we could go to the bank, close our acc...",Ross,neutral,neutral,1,0,4,4,"00:12:24,660","00:12:30,915"
3,4,Youre a genius!,Chandler,joy,positive,1,1,4,4,"00:12:32,334","00:12:33,960"
4,5,"Aww, man, now we wont be bank buddies!",Joey,sadness,negative,1,2,4,4,"00:12:34,211","00:12:37,505"


In [6]:
# Swap the stray \x92 character for a plain apostrophe in every split's utterances.
for split_frame in (dev_raw, train_raw, test_raw):
    split_frame.Utterance = split_frame.Utterance.apply(
        lambda text: re.sub('\\x92', "'", text)
    )

In [7]:
# Utterance text is the model input; the Emotion column is the target label.
x_dev, y_dev = dev_raw['Utterance'], dev_raw['Emotion']
x_train, y_train = train_raw['Utterance'], train_raw['Emotion']
x_test, y_test = test_raw['Utterance'], test_raw['Emotion']

### Tfidf + SVM

In [223]:
# Learn the uni- to tri-gram TF-IDF vocabulary on the training text only,
# then project the dev and test text onto that same vocabulary.
tfidf = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1,3))
x_train_tf = tfidf.fit_transform(x_train)
x_dev_tf, x_test_tf = (tfidf.transform(split_text) for split_text in (x_dev, x_test))

In [224]:
# Disabled dimensionality reduction of the TF-IDF features.
# NOTE(review): if re-enabled, fit the SVD on the training matrix only and use
# svd.transform for dev/test; as written, fit_transform re-fits the SVD on each split.
# svd = TruncatedSVD(n_components=300)
# x_dev_tr = svd.fit_transform(x_dev_tf)
# x_train_tr = svd.fit_transform(x_train_tf)
# x_test_tr = svd.fit_transform(x_test_tf)

In [225]:
# Linear-kernel SVM (C=10) with probability estimates enabled.
svm = SVC(C=10, kernel='linear', probability=True)
# Disabled grid search over C and kernel; it references x_train_tr, which only
# exists when the commented-out SVD cell is enabled.
# param_grid = { 
#     'C': [1,10,100], 'kernel': ['linear', 'rbf']
# }
# clf = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5)
# clf.fit(x_train_tr, y_train)

In [226]:
# clf.best_estimator_

**Best model by GridSearchCV**  
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,  
decision_function_shape='ovr', degree=3, gamma='auto_deprecated',  
kernel='linear', max_iter=-1, probability=True, random_state=None,  
shrinking=True, tol=0.001, verbose=False)

In [227]:
# Fit on the training TF-IDF matrix and predict emotion labels for the test set.
y_pred_svm = svm.fit(x_train_tf, y_train).predict(x_test_tf)

In [16]:
# Support-weighted F1 of the SVM predictions on the test set.
f1_score(y_test, y_pred_svm, average='weighted')

0.42923347861223476

In [228]:
# Percentage of test utterances whose predicted label equals the true label.
svm_hits = np.sum(y_pred_svm == y_test)
svm_accuracy = 100 * svm_hits / len(y_pred_svm)
print('Test accuracy: %.4f%%' % svm_accuracy)

Test accuracy: 45.5172%


### Keras TF-IDF tokenizer + Neural nets

A neural network cannot accept sentences of different lengths (i.e. numbers of words) as input. When padding the inputs, we choose a maximum number of words per sentence and zero-pad any input that is shorter than that length. Inputs that exceed the maximum length are truncated, either from the beginning or from the end.  
*Ref_1* https://keras.io/preprocessing/sequence/  
*Ref_2* https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-11-cnn-word2vec-41f5e28eda74

https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b

In [143]:
# Count the emotion classes from the raw label column rather than from `y_train`:
# a later cell overwrites `y_train` with a one-hot matrix, and `set()` over its rows
# raises TypeError if this cell is re-run afterwards.
num_classes = train_raw.Emotion.nunique()

# Fully connected classifier: 20000-wide input -> 32 -> 64 -> 128 -> softmax over classes.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=20000))
# model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
# One-hot targets are used, hence categorical cross-entropy.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 32)                640032    
_________________________________________________________________
dense_40 (Dense)             (None, 64)                2112      
_________________________________________________________________
dense_41 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_42 (Dense)             (None, 7)                 903       
Total params: 651,367
Trainable params: 651,367
Non-trainable params: 0
_________________________________________________________________


Conv1D is generally a good fit for text, whereas Conv2D is a good fit for audio and images, where spatial structure matters.

In [134]:
# vocabulary_size = 20000
# tokenizer = Tokenizer(num_words= vocabulary_size)
# tokenizer.fit_on_texts(x_train)
# sequences = tokenizer.texts_to_sequences(x_train)
# data = pad_sequences(sequences, maxlen=50)

In [276]:
# Start again from the raw text and labels so this cell can be re-run safely:
# x_train, x_test and y_train are overwritten with numeric arrays further down.
x_dev = dev_raw.Utterance
y_dev = dev_raw.Emotion
x_train = train_raw.Utterance
y_train = train_raw.Emotion
x_test = test_raw.Utterance
y_test = test_raw.Emotion

# Create the tokenizer here: it was previously defined only in a commented-out cell,
# so a fresh kernel hit NameError. A new instance per run also stops word counts
# accumulating across re-runs. The vocabulary size matches the dense model's
# input_dim of 20000.
vocabulary_size = 20000
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(x_train)
# Each utterance becomes one fixed-width TF-IDF row vector.
x_train = tokenizer.texts_to_matrix(x_train, mode='tfidf')
x_test = tokenizer.texts_to_matrix(x_test, mode='tfidf')

# Encode the emotion names as sorted integer codes, then one-hot them for the softmax output.
targets, uniques = pd.factorize(y_train, sort=True)
y_train = to_categorical(targets, num_classes)

In [277]:
# Disabled experiments with tokenizing the dev split.
# tokenizer.fit_on_sequences(x_dev)
# tokenizer.texts_to_sequences(x_dev)
# tokenizer.texts_to_matrix(x_dev, mode='tfidf')
# Inspect the one-hot encoded training labels.
y_train

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [166]:
# Train the dense network for 10 epochs, holding out 40% of the training rows for
# validation (the log below reports 5993 training and 3996 validation samples).
# NOTE(review): the separately loaded dev split is not used here — confirm that is intended.
model.fit(x_train, y_train, validation_split=0.4, epochs=10)

Train on 5993 samples, validate on 3996 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b63ad98668>

In [168]:
# test_data = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=50)
# Predicted class index for every test utterance.
model.predict_classes(x_test)

array([4, 4, 4, ..., 0, 3, 0], dtype=int64)

In [169]:
# Distribution of predicted class indices over the test set. A single batched
# predict call replaces the previous one-call-per-row loop; the argmax of each
# row's class scores is the same.
Counter(np.argmax(model.predict(x_test), axis=1))

Counter({4: 1578, 3: 324, 0: 261, 5: 145, 6: 232, 2: 44, 1: 26})

In [172]:
# Class distribution of the integer-encoded training labels.
Counter(targets)

Counter({4: 4710, 6: 1205, 2: 268, 5: 683, 3: 1743, 1: 271, 0: 1109})

In [188]:
# Map each emotion name to the integer code assigned when the training labels were factorized.
label_map = dict(zip(list(uniques), range(num_classes)))
y_true = list(map(label_map.get, y_test))
# A label not seen in training would map to None and silently count as a miss.
assert None not in y_true, 'test set contains an emotion label not seen in training'
# Check against the test matrix rather than y_pred, which is only defined in a
# later cell and raised NameError here on a fresh kernel.
len(y_true) == x_test.shape[0]

True

In [200]:
# y_pred = [np.argmax(model.predict(np.expand_dims(seq, axis=0))) for seq in x_test]
# Score the dense network: share of test rows whose predicted class equals the true class.
y_pred = model.predict_classes(x_test)
n_correct = np.sum(y_pred == y_true)
test_accuracy = 100 * n_correct / len(y_pred)
print('Test accuracy: %.4f%%' % test_accuracy)

Test accuracy: 44.0613%


In [59]:
# Sanity check of to_categorical: labels 0..3 with no explicit num_classes
# produce a 4-column one-hot matrix (see output below).
to_categorical(np.array([1, 3, 2, 0, 3, 2, 2, 1, 0, 1]))

array([[0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.]], dtype=float32)

In [233]:
# Dimensions of the training TF-IDF matrix (rows = utterances, columns = vocabulary slots).
x_train.shape

(9989, 20000)

In [210]:
# Emotion names in sorted order; a name's position is its integer class code.
uniques

Index(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'], dtype='object')

In [270]:
num_classes = 7
n_length, n_features = x_train.shape

# Conv1D needs a trailing channel axis: (samples, features) -> (samples, features, 1).
x_train_reshaped = x_train.reshape(x_train.shape + (1,))
x_test_reshaped = x_test.reshape(len(x_test), n_features, 1)

In [266]:
# Dimensions of the test TF-IDF matrix; the column count should match the training matrix.
x_test.shape
# x_train_reshaped

(2610, 20000)

In [298]:
# 1-D CNN over the TF-IDF vector: three conv/pool stages with a doubling filter count,
# then a flattened dense head ending in a softmax over the emotion classes.
cnn = Sequential([
    Conv1D(filters=16, kernel_size=2, padding='same', activation='relu', input_shape=(n_features, 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=2, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    # GlobalAveragePooling1D(),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
cnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
cnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_38 (Conv1D)           (None, 20000, 16)         48        
_________________________________________________________________
max_pooling1d_32 (MaxPooling (None, 10000, 16)         0         
_________________________________________________________________
conv1d_39 (Conv1D)           (None, 10000, 32)         1056      
_________________________________________________________________
max_pooling1d_33 (MaxPooling (None, 5000, 32)          0         
_________________________________________________________________
conv1d_40 (Conv1D)           (None, 5000, 64)          4160      
_________________________________________________________________
max_pooling1d_34 (MaxPooling (None, 2500, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 160000)            0         
__________

In [299]:
# Train the CNN for 5 epochs with the same 40% validation hold-out as the dense network.
cnn.fit(x_train_reshaped, y_train, validation_split=0.4, epochs=5)

Train on 5993 samples, validate on 3996 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1b646058080>

In [302]:
# Predicted class index for every test utterance from the CNN.
y_pred_cnn = cnn.predict_classes(x_test_reshaped)
# y_pred_cnn = [np.argmax(x) for x in cnn.predict(x_test_reshaped)]

In [303]:
# Distribution of the CNN's predicted class indices on the test set.
Counter(y_pred_cnn)

Counter({4: 1557, 3: 352, 5: 153, 6: 317, 0: 173, 1: 33, 2: 25})

In [304]:
# Share of test utterances the CNN classified correctly, as a percentage.
cnn_hits = np.sum(y_pred_cnn == y_true)
cnn_accuracy = 100 * cnn_hits / len(y_pred_cnn)
print('Test accuracy: %.4f%%' % cnn_accuracy)

Test accuracy: 45.0192%


In [8]:
# ENGLISH_STOP_WORDS

### Word2Vec + LSTM  
- checkpoint: try a pre-trained embedding layer e.g. GloVe Embedding

In [13]:
# Hyper-parameters for the embedding + Conv1D + LSTM classifier.
max_features = 20000
embedding_size = 128
lstm_output_size = 70
num_classes = 7

# Learned embedding over sequences of length 100, a conv/pool stage that shortens
# the sequence, then an LSTM whose final state feeds the softmax classifier.
lstm = Sequential([
    Embedding(input_dim=max_features, output_dim=embedding_size, input_length=100),
    Dropout(0.25),
    Conv1D(filters=64, kernel_size=5, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=4),
    LSTM(units=lstm_output_size),
    Dense(num_classes, activation='softmax'),
])

lstm.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 96, 64)            41024     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 70)                37800     
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 497       
Total params: 2,639,321
Trainable params: 2,639,321
Non-trainable params: 0
_________________________________________________________________


In [None]:
# clstm = Sequential()
# clstm.add(ConvLSTM2D(nb_filter=40, nb_row=3, nb_col=3, input_shape=(n_features, 1),
#                        border_mode='same', return_sequences=True))
# clstm.add(BatchNormalization())
# clstm.add(ConvLSTM2D(nb_filter=40, nb_row=3, nb_col=3,
#                    border_mode='same', return_sequences=True))
# clstm.add(BatchNormalization())
# clstm.add(ConvLSTM2D(nb_filter=40, nb_row=3, nb_col=3,
#                    border_mode='same', return_sequences=True))
# clstm.add(BatchNormalization())
# clstm.add(ConvLSTM2D(nb_filter=40, nb_row=3, nb_col=3,
#                    border_mode='same', return_sequences=True))
# clstm.add(BatchNormalization())
# clstm.add(Convolution3D(nb_filter=1, kernel_dim1=1, kernel_dim2=3,
#                       kernel_dim3=3, activation='sigmoid',
#                       border_mode='same', dim_ordering='tf'))
# clstm.compile(loss='categorical_crossentropy', optimizer='adadelta')

### Word2Vec

In [None]:
# Tokenize from the raw utterance columns: `x_train` has been overwritten with a
# TF-IDF ndarray by this point, so iterating it and calling `.split()` fails.
x_dev_tokens = [sentence.split() for sentence in dev_raw.Utterance]
x_train_tokens = [sentence.split() for sentence in train_raw.Utterance]

# Train 150-dimensional word vectors on the training utterances, dropping words
# that occur fewer than 2 times.
# NOTE(review): this rebinds `model`, which earlier cells use for the dense Keras network.
model = Word2Vec(
    x_train_tokens,
    size=150,
    window=10,
    min_count=2,
    workers=10)

The first parameter passed to gensim.models.Word2Vec is an iterable of sentences. Each sentence is itself a list of words.

In [None]:
# Continue training for 10 more epochs. total_examples is taken from the token
# corpus passed in, not from `x_train`, which may hold a different object by now.
model.train(x_train_tokens, total_examples=len(x_train_tokens), epochs=10)

# Words whose learned vectors lie closest to the query word(s).
w = ['good']
# w = filter(lambda x: x in model.vocab, x_train.tokens)
model.wv.most_similar(positive=w,
#                       topn=6
                     )