In [None]:
import pandas as pd
import numpy as np
import os

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization
from keras.layers import Flatten, Input, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Dropout
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical

# data from:
# https://github.com/SenticNet/MELD/blob/master/baseline/baseline.py

# Load the three emorynlp_* CSV splits from the local ./data directory.
train = pd.read_csv(os.path.join('data', 'emorynlp_train_final.csv'))
val = pd.read_csv(os.path.join('data', 'emorynlp_dev_final.csv'))
test = pd.read_csv(os.path.join('data', 'emorynlp_test_final.csv'))

# Shuffle the training rows once. The seed is pinned so a fresh
# Restart-and-Run-All produces the same row order every time; without it the
# tail slice that `validation_split` later carves off changes between runs.
train = train.sample(frac=1, random_state=42)
train.head()

In [None]:
# Map each emotion string to an integer code through a pandas categorical.
emotion_codes = train['Emotion'].astype('category').cat.codes

# Codes start at 0, so the largest code plus one is the number of classes.
n_classes = emotion_codes.max() + 1

# One-hot encode the codes for use with categorical cross-entropy.
labels = to_categorical(emotion_codes)
labels, n_classes

In [None]:
# code from:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training utterances only.
# `num_words` is the Keras 2 name of this argument; the previous `nb_words`
# spelling is a Keras 1 leftover that is only accepted as a deprecated alias.
# It caps the ids emitted by `texts_to_sequences` to the most frequent words.
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(train['Utterance'])
sequences = tokenizer.texts_to_sequences(train['Utterance'])

# `word_index` still lists every token seen during fitting (it is not
# truncated by `num_words`), so the count printed here is the full vocabulary.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every utterance to a fixed length of 33 token ids; the model input
# layers and the embedding layer are declared with the same length.
data = pad_sequences(sequences, maxlen=33)

In [None]:
# Parse the GloVe text file into a {word: float32 vector} lookup table.
# Each line is "<word> <v1> <v2> ...", separated by whitespace.
embeddings_index = {}
glove_path = os.path.join('data', 'glove.6B', 'glove.6B.300d.txt')

# The context manager closes the handle even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# One 300-dimensional row per tokenizer id. The extra row accounts for id 0,
# which `word_index` never assigns to a word.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
from keras.layers import Embedding

# Embedding layer initialised from the pre-built matrix and frozen
# (trainable=False), so training does not update the word vectors.
# This single layer object is reused by the models defined further down.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=300,
    weights=[embedding_matrix],
    input_length=33,
    trainable=False,
)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.utils import shuffle

# The oversampler is constructed but currently unused: the resampling call
# below is disabled, so the training set is the padded data as-is.
ros = RandomOverSampler(random_state=42)
# train_x, train_y = ros.fit_sample(data, labels)

print(data.shape, labels.shape)

# Shuffle features and targets together with a fixed seed; shapes are
# unchanged by the shuffle, so the second print matches the first.
train_x, train_y = shuffle(data, labels, random_state=42)
print(train_x.shape, train_y.shape)


In [26]:
# 1D-CNN classifier over the frozen word embeddings.
# Input: a batch of sequences of 33 integer token ids.
sequence_input = Input(shape=(33,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Two conv -> max-pool -> dropout stages ...
x = Conv1D(filters=128, kernel_size=5, activation='relu')(embedded_sequences)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(rate=0.3)(x)
x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(rate=0.3)(x)

# ... then a final conv collapsed over the time axis by global max pooling.
x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(rate=0.3)(x)

# Dense head ending in a softmax over the emotion classes.
x = Dense(units=128, activation='relu')(x)
x = Dropout(rate=0.3)(x)
preds = Dense(units=n_classes, activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.summary()
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

# Hold out the last 10% of the (already shuffled) training arrays for validation.
model.fit(x=train_x, y=train_y, validation_split=0.1, epochs=5)
# baseline: 60%

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 33)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 33, 300)           1538700   
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 29, 128)           192128    
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 14, 128)           0         
_________________________________________________________________
dropout_30 (Dropout)         (None, 14, 128)           0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 10, 128)           82048     
_________________________________________________________________
max_pooling1d_17 (MaxPooling (None, 5, 128)            0         
__________

KeyboardInterrupt: 

In [None]:
from keras.layers import GRU, LSTM

# Recurrent alternative: a single GRU over the same frozen embeddings.
# Note that this rebinds `sequence_input`, `preds` and `model` from the
# previous cell.
sequence_input = Input(shape=(33,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# GRU with dropout on both the inputs and the recurrent connections,
# followed by a dense layer and a softmax over the emotion classes.
x = GRU(128, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)
x = Dense(units=128, activation='relu')(x)
preds = Dense(units=n_classes, activation='softmax')(x)

model = Model(inputs=sequence_input, outputs=preds)
model.summary()
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['acc'])

# Train for 50 epochs, validating on the last 10% of the training arrays.
model.fit(x=train_x, y=train_y, validation_split=0.1, epochs=50)