In [1]:
import os

import numpy as np 
import pandas as pd 
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import keras.backend as K
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from keras.layers import Dropout
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [2]:
def f1(y_true, y_pred):
    """Mean F1 score, written with backend ops so Keras can use it as a metric.

    Predictions are rounded to 0/1, true positives, false positives and false
    negatives are summed along axis 0, an F1 value is computed per remaining
    position, and the mean of those values is returned.

    Args:
        y_true: tensor of 0/1 targets (assumed shape (batch, classes), as the
            one-hot labels built in this notebook -- confirm for other uses).
        y_pred: tensor of predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor holding the averaged F1 score.
    """
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # K.epsilon() keeps the divisions finite when a count is zero.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    score = 2 * precision * recall / (precision + recall + K.epsilon())
    # Defensive: map any NaN that still slipped through to 0.
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
    return K.mean(score)

In [3]:
# Import only the name this notebook uses from `utils`; a star import would
# silently overwrite any same-named object imported earlier (np, Tokenizer, ...).
from utils import functions

# Load the EmoContext training split.
df = functions.parse_file(r"raw_data/EmoContext/train.txt", "EmoContext")
df.head(5)

Unnamed: 0,id,turn1,turn2,turn3,label
0,0,Don't worry I'm girl,hmm how do I know if you are,What's ur name?,others
1,1,When did I?,saw many times i think -_-,No. I never saw you,angry
2,2,By,by Google Chrome,Where you live,others
3,3,U r ridiculous,I might be ridiculous but I am telling the truth.,U little disgusting whore,angry
4,4,Just for time pass,wt do u do 4 a living then,Maybe,others
5,5,I'm a dog person,youre so rude,Whaaaat why,others
6,6,So whatsup,Nothing much. Sitting sipping and watching TV....,What are you watching on tv?,others
7,7,Ok,ok im back!!,"So, how are u",others
8,8,Really?,really really really really really,Y saying so many times...i can hear you,others
9,9,Bay,in the bay,😘 love you,others


In [4]:
# Join the three conversation turns of every row into one
# "turn1. turn2. turn3." string.
text_data = [
    "{}. {}. {}.".format(first, second, third)
    for first, second, third in zip(df['turn1'], df['turn2'], df['turn3'])
]

In [5]:
NR_WORDS = 5000          # vocabulary cap handed to the Tokenizer (num_words)
MAX_PROP_LENGTH = 300    # every sequence is padded/truncated to this length

tokenizer = Tokenizer(
    num_words=NR_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)

# Learn the word index from the training texts.
tokenizer.fit_on_texts(text_data)

# Texts -> lists of word ids -> fixed-width integer matrix.
train_sequences = tokenizer.texts_to_sequences(text_data)
X_train = pad_sequences(train_sequences, maxlen=MAX_PROP_LENGTH)

In [6]:
def one_hot_vector(word,label=None):
    """Encode an emotion label as a one-hot list.

    Two modes:

    * ``label`` is ``None`` (default): multi-class encoding. Returns a list
      with one slot per known class ("others", "angry", "sad", "happy") and
      a 1 at the position of ``word``.
    * ``label`` is given: binary encoding. Returns ``[1, 0]`` when ``label``
      equals ``word`` and ``[0, 1]`` otherwise.

    Raises:
        KeyError: in multi-class mode, when ``word`` is not a known class.
    """
    words = {"others": 0, "angry": 1, "sad":2, "happy": 3}
    # Identity check: `== None` can be answered arbitrarily by a custom __eq__.
    if label is None:
        y = [0] * len(words)
        y[words[word]] = 1
        return y
    return [1,0] if label == word else [0,1]

# One 4-way one-hot row per training example, stacked into a numpy array.
Y_train = np.array([one_hot_vector(emotion) for emotion in df['label']])

In [7]:
# Load the labeled dev split. NOTE: this rebinds `df`, so the training frame
# loaded earlier is no longer reachable under that name from here on.
DEV_LABELED_PATH = r"raw_data/EmoContext/devwithlabels.txt"
df = functions.parse_file(DEV_LABELED_PATH, "EmoContext")
df.head(5)

Unnamed: 0,id,turn1,turn2,turn3,label
0,0,Then dont ask me,YOURE A GUY NOT AS IF YOU WOULD UNDERSTAND,IM NOT A GUY FUCK OFF,angry
1,1,Mixed things such as??,the things you do.,Have you seen minions??,others
2,2,Today I'm very happy,and I'm happy for you ❤,I will be marry,happy
3,3,Woah bring me some,left it there oops,Brb,others
4,4,it is thooooo,I said soon master.,he is pressuring me,others
5,5,Wont u ask my age??,hey at least I age well!,Can u tell me how can we get closer??,others
6,6,I said yes,What if I told you I'm not?,Go to hell,angry
7,7,Where I ll check,why tomorrow?,No I want now,others
8,8,Shall we meet,you say- you're leaving soon...anywhere you wa...,?,others
9,9,Let's change the subject,I just did it .l.,You're broken,sad


In [8]:
# Same "turn1. turn2. turn3." concatenation as for the training split,
# now applied to the dev rows currently held in `df`.
text_data = [
    "{}. {}. {}.".format(first, second, third)
    for first, second, third in zip(df['turn1'], df['turn2'], df['turn3'])
]

In [9]:
# Encode the dev texts with the tokenizer that was fitted on the TRAINING
# texts. Fitting a fresh Tokenizer on the dev set would assign different
# integer ids to the same words, so the ids fed to the Embedding layer at
# validation time would not match the ones it was trained on.
X_test = tokenizer.texts_to_sequences(text_data)
X_test = pad_sequences(X_test, maxlen = MAX_PROP_LENGTH)

In [10]:
# `one_hot_vector` is already defined earlier in the notebook (the cell that
# builds Y_train); reuse it instead of redefining an identical copy here.
# One 4-way one-hot row per dev example, stacked into a numpy array.
Y_test = np.array([one_hot_vector(emotion) for emotion in df['label']])

In [11]:
# Hyper-parameters.
embed_dim = 256    # size of each word embedding vector
lstm_out = 128     # number of LSTM units
batch_size = 64    # mini-batch size, used by model.fit further down

# Optimizer objects kept for experimentation; compile() below does not use
# them and selects "adagrad" by name instead.
adam = optimizers.Adam(lr=0.01)
rmsprop = optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)

# Embedding -> LSTM -> Dense(100) -> Dropout -> 4-way softmax.
model = Sequential([
    Embedding(NR_WORDS, embed_dim, input_length=X_train.shape[1]),
    LSTM(lstm_out),
    Dense(100),  # no activation given, so this layer is linear
    Dropout(0.3),
    Dense(4, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer="adagrad",
    metrics=['accuracy', f1],
)


In [12]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 256)          1280000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_1 (Dense)              (None, 100)               12900     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 404       
Total params: 1,490,424
Trainable params: 1,490,424
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Make sure the checkpoint folder exists up front, so the first save after a
# (long) epoch cannot fail because of a missing directory.
os.makedirs("trained_models", exist_ok=True)

# Keep only checkpoints that improve validation accuracy; the achieved
# val_acc is embedded in the file name.
mdcheck = ModelCheckpoint("trained_models/best_model_val_acc{val_acc:.4f}.h5", monitor='val_acc', save_best_only=True)

In [None]:
# Inverse-frequency class weights: (number of training samples) divided by
# (number of samples of that class), one entry per column of the one-hot
# label matrix, so rarer classes weigh more in the loss.
total = len(Y_train)
class_weight = {
    class_idx: total / int(np.count_nonzero(Y_train[:, class_idx] == 1.0))
    for class_idx in range(Y_train.shape[1])
}

# Train on the full training split, validate on the labeled dev split, and
# let the checkpoint callback save the best model seen so far.
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    epochs=20, verbose=1, batch_size=batch_size,
                    class_weight=class_weight,
                    callbacks=[mdcheck])

Train on 30160 samples, validate on 2755 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
 6592/30160 [=====>........................] - ETA: 7:40 - loss: 1.0408 - acc: 0.8959 - f1: 0.8874

In [None]:
# Persist the architecture as JSON and the learned weights in a separate file.
model_json = model.to_json()
with open("lstm_normal_model.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("lstm_normal_12_epochs_cv.h5")

In [None]:
# Load the unlabeled dev split that the submission file is produced for.
DEV_UNLABELED_PATH = r"raw_data/EmoContext/devwithoutlabels.txt"
df_test = functions.parse_file(DEV_UNLABELED_PATH, "EmoContext")
df_test.head()

In [None]:
# Build the "turn1. turn2. turn3." strings for the unlabeled rows.
text_data = [
    "{}. {}. {}.".format(first, second, third)
    for first, second, third in zip(df_test['turn1'], df_test['turn2'], df_test['turn3'])
]

# Encode exactly these texts and predict on them. Previously `text_data` was
# built and then ignored: predictions were made on X_test (the labeled dev
# matrix) and only matched df_test if both files happened to hold the same
# rows in the same order.
X_submit = pad_sequences(tokenizer.texts_to_sequences(text_data), maxlen=MAX_PROP_LENGTH)

res = model.predict(X_submit, batch_size=128, verbose=1)
res[3]

In [None]:
# Class index -> emotion name (inverse of the mapping used for one-hot encoding).
revers_words = {0:"others", 1:"angry", 2:"sad", 3:"happy"}

def softmax_convert(res):
    """Return the emotion name whose score in ``res`` is the largest.

    Args:
        res: indexable sequence of scores, one per class index in
            ``revers_words`` (e.g. one row of ``model.predict`` output).

    Returns:
        The label string for the highest-scoring class. Ties resolve to the
        lowest class index.

    Raises:
        IndexError: if ``res`` has fewer entries than there are classes.
    """
    # Arg-max over the known class indices. Unlike a manual scan seeded with
    # max_v = 0, this also works when every score is <= 0 (e.g. raw logits).
    best_idx = max(sorted(revers_words), key=lambda idx: res[idx])
    return revers_words[best_idx]

In [None]:
# Turn every probability row into its emotion name.
results = [softmax_convert(scores) for scores in res]

# Attach the predictions and write the tab-separated submission file.
df_test['label'] = results
df_test.to_csv("lstm_normal_12_epochs_cv.txt",index=False , sep="\t")

# Preview as the cell's last expression so the notebook actually renders it
# (a bare expression in the middle of a cell is evaluated and discarded).
df_test.head(30)