In [None]:
import pandas as pd
import numpy as np
import os

from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, BatchNormalization
from keras.layers import Flatten, Input, Conv1D, MaxPooling1D, Dense, GlobalMaxPooling1D, Dropout, LSTM
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Origin of the train/dev/test CSV splits read below:
# https://github.com/SenticNet/MELD/blob/master/baseline/baseline.py

# Resolve the three split files under ./data, then read each into a DataFrame.
train_csv = os.path.join('data', 'emorynlp_train_final.csv')
val_csv = os.path.join('data', 'emorynlp_dev_final.csv')
test_csv = os.path.join('data', 'emorynlp_test_final.csv')
train, val, test = (pd.read_csv(csv_path)
                    for csv_path in (train_csv, val_csv, test_csv))

# Last expression of the cell: preview the first training rows.
train.head()

In [None]:
# Fit the vocabulary on the training utterances only, so validation/test
# text never influences the word -> index mapping.
train_utterances = train['Utterance']
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(train_utterances)

# word -> integer index lookup, reused when building the embedding matrix.
word_index = tokenizer.word_index

In [None]:
def create_xy(data, tokenizer, categories=None):
    """Turn a dataframe of utterances into padded sequences and one-hot labels.

    Args:
        data: DataFrame with an 'Utterance' text column and an 'Emotion'
            label column.
        tokenizer: fitted tokenizer exposing `texts_to_sequences`.
        categories: optional ordered collection of emotion labels fixing the
            label -> column mapping. When omitted, the categories are
            inferred from `data` itself (the previous behaviour), which is
            only consistent across splits if every split contains every
            emotion.

    Returns:
        (x, y): `x` is the integer sequence matrix padded/truncated to 33
        tokens; `y` is the one-hot label matrix with one column per category.

    Raises:
        ValueError: if an 'Emotion' value is missing or not in `categories`.
    """
    sequences = tokenizer.texts_to_sequences(data['Utterance'])
    x = pad_sequences(sequences, maxlen=33)

    emotions = pd.Categorical(data['Emotion'], categories=categories)
    labels = emotions.codes
    # pandas encodes missing/unknown labels as -1; one-hot encoding that
    # would silently mark the *last* class, so fail loudly instead.
    if (labels < 0).any():
        raise ValueError("'Emotion' contains missing values or labels "
                         "outside the given categories")
    # Pin the width so every split gets the same number of columns even if
    # the highest-coded emotion does not occur in it.
    y = to_categorical(labels, num_classes=len(emotions.categories))
    return x, y


# Fix the label order once, from the training split, so column k of every
# y matrix refers to the same emotion in train, test and val.
emotion_categories = train['Emotion'].astype('category').cat.categories
# Class count at notebook scope: the model cells below read `n_classes`.
n_classes = len(emotion_categories)

train_x, train_y = create_xy(train, tokenizer, emotion_categories)
test_x, test_y = create_xy(test, tokenizer, emotion_categories)
val_x, val_y = create_xy(val, tokenizer, emotion_categories)

In [None]:
# Parse the pre-trained GloVe text file into a {word: float32 vector} lookup.
embeddings_index = {}
glove_path = os.path.join('data', 'glove.6B', 'glove.6B.100d.txt')
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding; `with` closes the file even if a line fails to parse.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        # First token is the word, the remainder are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# One row per tokenizer index (plus row 0, which no word maps to), 100 columns
# to match the 100-d GloVe vectors.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, 100))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # No pre-trained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = glove_vector

In [None]:
from keras.layers import Embedding

# Embedding layer initialised from the GloVe matrix and left trainable so the
# vectors can be fine-tuned during training.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=100,
    weights=[embedding_matrix],
    input_length=33,
    trainable=True,
)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Oversample the training set because the emotion classes are imbalanced.
# A fixed random_state keeps the resampling reproducible.
ros = RandomOverSampler(random_state=42)
# `fit_resample` is the supported API; the older `fit_sample` alias was
# deprecated and has since been removed from imbalanced-learn.
train_x, train_y = ros.fit_resample(train_x, train_y)

In [None]:
# --- 1D-CNN classifier over the embedded utterance ---

# Number of emotion classes = width of the one-hot training labels.
# (The `n_classes` assigned inside create_xy is a local and is never
# visible at notebook scope, so it must be defined here.)
n_classes = train_y.shape[1]

sequence_input = Input(shape=(33,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Three Conv1D stages; the first two are followed by local max-pooling, the
# last by global max-pooling, each with dropout for regularisation.
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(2)(x)
x = Dropout(0.3)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Dropout(0.3)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(0.3)(x)

# Dense head ending in a softmax over the emotion classes.
x = Dense(128, activation='relu')(x)
x = Dropout(0.3)(x)
preds = Dense(n_classes, activation='softmax')(x)

model = Model(sequence_input, preds)
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0005),
              metrics=['acc'])

model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=5)
# baseline: 0.3333

In [None]:
# --- Conv1D + LSTM classifier ---
model = Sequential()
# Build a fresh embedding layer from the GloVe matrix. The shared
# `embedding_layer` is trainable and has already been updated by the previous
# model's fit(), so reusing it would start this experiment from another
# model's fine-tuned weights and make the comparison unfair.
model.add(Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=33,
                    trainable=True))
model.add(Conv1D(10, 1, activation="relu"))
model.add(MaxPooling1D(4))
# return_sequences=True keeps the per-timestep outputs, flattened below.
model.add(LSTM(100, return_sequences=True))
model.add(Flatten())
model.add(Dense(500, activation='relu'))
model.add(Dense(300, activation='relu'))
# Output width follows the one-hot labels instead of a hard-coded 7.
model.add(Dense(train_y.shape[1], activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
model.summary()

model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=50)


In [None]:
from keras.layers import SpatialDropout1D

# --- Plain LSTM classifier ---
embed_dim = 128  # not used below; the embedding width comes from embedding_matrix
lstm_out = 196

model = Sequential()
# Build a fresh embedding layer from the GloVe matrix. The shared
# `embedding_layer` is trainable and has already been updated by the earlier
# models' fit() calls, so reusing it would leak their fine-tuned weights
# into this experiment.
model.add(Embedding(embedding_matrix.shape[0],
                    embedding_matrix.shape[1],
                    weights=[embedding_matrix],
                    input_length=33,
                    trainable=True))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Output width follows the one-hot labels instead of a hard-coded 7.
model.add(Dense(train_y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints by itself and returns None; wrapping it in print() emitted
# a stray "None" line.
model.summary()

model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=50)


In [None]:
from keras.layers import GRU, LSTM

# --- GRU classifier ---

# Number of emotion classes = width of the one-hot training labels.
# (The `n_classes` assigned inside create_xy is a local and is never
# visible at notebook scope, so it must be defined here.)
n_classes = train_y.shape[1]

sequence_input = Input(shape=(33,), dtype='int32')
# Build a fresh embedding layer from the GloVe matrix. The shared
# `embedding_layer` is trainable and has already been updated by the earlier
# models' fit() calls, so reusing it would leak their fine-tuned weights
# into this experiment.
embedded_sequences = Embedding(embedding_matrix.shape[0],
                               embedding_matrix.shape[1],
                               weights=[embedding_matrix],
                               input_length=33,
                               trainable=True)(sequence_input)

x = GRU(units=128, dropout=0.2, recurrent_dropout=0.2)(embedded_sequences)
x = Dense(128, activation='relu')(x)
preds = Dense(n_classes, activation='softmax')(x)

model = Model(sequence_input, preds)
model.summary()
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Validate on the held-out dev split, like the other model cells.
# `validation_split=0.1` would instead hold out the tail of the *oversampled*
# training array, i.e. rows that are duplicated in the training portion, so
# the reported validation score would be inflated by leakage.
model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=50)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print a status line and plot the confusion matrix as an annotated heatmap.

    Args:
        y_true: true labels, one per sample.
        y_pred: predicted labels, one per sample.
        classes: sequence of class names used as tick labels on both axes.
        normalize: when True, each row is divided by its total so cells show
            the fraction of that true class assigned to each prediction.
        title: plot title; a default is chosen from `normalize` when empty.
        cmap: matplotlib colormap for the heatmap.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # When the labels are integer indices into `classes`, build the matrix
    # over *all* classes. Otherwise a class that never occurs in y_true or
    # y_pred is dropped from the matrix and the tick labels no longer line
    # up with its rows/columns. For any other label type, keep sklearn's
    # default of using only the observed labels.
    observed = unique_labels(y_true, y_pred)
    if (observed.size > 0
            and np.issubdtype(observed.dtype, np.integer)
            and observed.min() >= 0
            and observed.max() < len(classes)):
        labels = np.arange(len(classes))
    else:
        labels = None

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    if normalize:
        # Rows with no true samples would divide by zero and yield NaN;
        # leave those rows at 0 instead.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; switch to white text on dark (above half-max) cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

# Emotion names in category order, used as the axis tick labels.
class_names = list(train['Emotion'].astype('category').dtype.categories)

# Predicted class index per validation utterance (argmax over the model output).
pred_y = model.predict(val_x).argmax(axis=-1)

# Plot the row-normalized confusion matrix for the validation split.
# The title now matches normalize=True (it previously claimed
# "without normalization").
plot_confusion_matrix(val_y.argmax(axis=-1), pred_y, classes=class_names,
                      title='Normalized confusion matrix', normalize=True)

plt.show()