In [1]:
import os
# Work from the parent directory so the relative paths used by later cells
# (raw_data/..., trained_models/...) resolve.
# The starting directory is remembered on the first run; re-executing this
# cell therefore lands in the same place instead of climbing one level
# further each time.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, ".."))

In [2]:
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

import keras.backend as K

Using TensorFlow backend.


In [3]:
def f1(y_true, y_pred):
    """Mean per-column F1 score, usable as a Keras metric.

    Predictions are rounded with ``K.round``; true positives, false positives
    and false negatives are summed over axis 0 (one count per output column),
    an F1 value is computed per column and the mean of those values is
    returned.

    Args:
        y_true: ground-truth indicator tensor (assumed to hold 0/1 values
            with the same shape as ``y_pred`` -- TODO confirm for new callers).
        y_pred: predicted scores; rounded before counting.

    Returns:
        Scalar tensor holding the mean of the per-column F1 values.
    """
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    # K.epsilon() keeps the divisions finite when a column has no positives.
    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())
    per_column_f1 = 2 * precision * recall / (precision + recall + K.epsilon())

    # tf.is_nan only exists in TF 1.x; later releases expose tf.math.is_nan.
    is_nan = getattr(tf, "is_nan", None) or tf.math.is_nan
    # NaNs can still arrive through the inputs themselves; score them as 0.
    per_column_f1 = tf.where(is_nan(per_column_f1),
                             tf.zeros_like(per_column_f1),
                             per_column_f1)
    return K.mean(per_column_f1)

In [4]:
from utils import *
# Parse the EmoContext training file; the preview below shows the parsed
# columns (id, label, turn1, turn2, turn3).
TRAIN_FILE = r"raw_data/EmoContext/train3.txt"
df = functions.parse_file(TRAIN_FILE, "EmoContext")
df.head()

Unnamed: 0,id,label,turn1,turn2,turn3
0,0,others,dont worry ism girl,hmm how do i know if you are,whats ur name ?
1,1,angry,when did in,saw many times i think,no . i never saw you
2,2,others,by,by google chrome,where you live
3,3,angry,u r ridiculous,i might be ridiculous but i am telling the tru...,u little disgusting whore
4,4,others,just for time pass,wt do u do a a living then,maybe


In [5]:
# One string per conversation: the three turns joined as "t1. t2. t3.".
text_data = [
    "{}. {}. {}.".format(first, second, third)
    for first, second, third in zip(df['turn1'], df['turn2'], df['turn3'])
]

In [6]:
# Vocabulary cap and fixed sequence length used for every tokenised split.
NR_WORDS = 8000
MAX_PROP_LENGTH = 128

# Build the word index from the conversation strings assembled above.
tokenizer = Tokenizer(
    num_words=NR_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)
tokenizer.fit_on_texts(text_data)

# Texts -> integer id sequences -> fixed-length matrix of width MAX_PROP_LENGTH.
train_sequences = tokenizer.texts_to_sequences(text_data)
X_train = pad_sequences(train_sequences, maxlen=MAX_PROP_LENGTH)

In [7]:
def one_hot_vector(word, label=None):
    """Encode an emotion name as a one-hot list.

    Args:
        word: one of "others", "angry", "sad", "happy".
        label: optional reference label. When omitted, a 4-way one-hot list
            is returned. When given, a 2-way list is returned instead:
            ``[1, 0]`` if ``label == word``, otherwise ``[0, 1]``.

    Returns:
        A list of ints with exactly one element set to 1.

    Raises:
        KeyError: if ``label`` is omitted and ``word`` is not a known emotion.
    """
    words = {"others": 0, "angry": 1, "sad": 2, "happy": 3}
    # Identity test: `== None` would call the label's own __eq__, which can
    # report equality for objects that are not None.
    if label is None:
        y = [0] * len(words)
        y[words[word]] = 1
        return y
    if label == word:
        return [1, 0]
    return [0, 1]

# 4-way one-hot target matrix, one row per training conversation.
Y_train = np.array([one_hot_vector(emotion) for emotion in df['label']])

In [8]:
# Parse the labelled dev file. Note that this rebinds `df`, which held the
# training frame until now.
DEV_FILE = r"raw_data/EmoContext/devwithlabels3.txt"
df = functions.parse_file(DEV_FILE, "EmoContext")
df.head()

Unnamed: 0,id,label,turn1,turn2,turn3
0,0,angry,then dont ask me,youre a guy not as if you would understand,im not a guy fuck off
1,1,others,mixed things such as,the things you do .,have you seen minions
2,2,happy,today ism very happy,and ism happy for you,i will be marry
3,3,others,noah bring me some,left it there oops,orb
4,4,others,it is thoo,i said soon master .,he is pressuring me


In [9]:
# Same "t1. t2. t3." formatting as for training, now for the dev frame.
# `text_data` is rebound, replacing the training strings.
text_data = [
    "{}. {}. {}.".format(first, second, third)
    for first, second, third in zip(df['turn1'], df['turn2'], df['turn3'])
]

In [10]:
# The tokenizer fitted earlier is reused as-is (no re-fit), so dev-set words
# map to the same integer ids as in X_train.
dev_sequences = tokenizer.texts_to_sequences(text_data)
X_test = pad_sequences(dev_sequences, maxlen=MAX_PROP_LENGTH)

In [11]:
# NOTE(review): this redefines the one_hot_vector already declared in an
# earlier cell; a single shared definition would be enough.
def one_hot_vector(word, label=None):
    """Encode an emotion name as a one-hot list.

    Args:
        word: one of "others", "angry", "sad", "happy".
        label: optional reference label. When omitted, a 4-way one-hot list
            is returned. When given, a 2-way list is returned instead:
            ``[1, 0]`` if ``label == word``, otherwise ``[0, 1]``.

    Returns:
        A list of ints with exactly one element set to 1.

    Raises:
        KeyError: if ``label`` is omitted and ``word`` is not a known emotion.
    """
    words = {"others": 0, "angry": 1, "sad": 2, "happy": 3}
    # Identity test: `== None` would call the label's own __eq__, which can
    # report equality for objects that are not None.
    if label is None:
        y = [0] * len(words)
        y[words[word]] = 1
        return y
    if label == word:
        return [1, 0]
    return [0, 1]

# 4-way one-hot target matrix, one row per dev conversation.
Y_test = np.array([one_hot_vector(emotion) for emotion in df['label']])

In [12]:
# Hyper-parameters.
embed_dim = 128   # width of each word embedding
lstm_out = 32     # number of LSTM units
batch_size = 64   # mini-batch size used by model.fit

# Two optimizers are instantiated; only `rmsprop` is handed to compile().
adam = optimizers.Adam(lr=0.01)
rmsprop = optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)

# Embedding -> LSTM -> 4-way softmax.
# An explicit Dropout(0.3) layer after the embedding is currently disabled.
embedding_layer = Embedding(
    NR_WORDS,
    embed_dim,
    input_length=X_train.shape[1],
    name="ingrid_embedding_model",
    trainable=True,
)
recurrent_layer = LSTM(lstm_out, dropout=0.5)
output_layer = Dense(4, activation='softmax')

model = Sequential()
model.name = "ingrid_model"
for layer in (embedding_layer, recurrent_layer, output_layer):
    model.add(layer)

model.compile(
    loss='categorical_crossentropy',
    optimizer=rmsprop,
    metrics=['accuracy', f1],
)

In [13]:
# Print the layer-by-layer output shapes and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
ingrid_embedding_model (Embe (None, 128, 128)          1024000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 132       
Total params: 1,044,740
Trainable params: 1,044,740
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Checkpoint that keeps only the best model according to validation F1.
# The direction is given explicitly: Keras' default mode="auto" treats a
# monitor as higher-is-better only when its name contains "acc" (or starts
# with "fmeasure"), so "val_f1" would otherwise be minimised and the
# worst-scoring model would be kept.
# The file name embeds val_acc, not the monitored val_f1.
mdcheck = ModelCheckpoint("trained_models/best_model_val_acc{val_acc:.4f}.h5",
                          monitor='val_f1', mode='max', save_best_only=True)

In [15]:
from sklearn.model_selection import KFold


def _inverse_frequency_class_weights(y_onehot):
    """Return {class_index: n_samples / n_samples_of_that_class}.

    Args:
        y_onehot: 2-D array of one-hot rows (one column per class).

    Returns:
        dict mapping each column index to its inverse-frequency weight.
    """
    n_samples = len(y_onehot)
    class_counts = (y_onehot == 1.0).sum(axis=0)
    # max(..., 1) avoids a ZeroDivisionError when a fold contains no sample
    # of some class; the weight of an absent class is never applied.
    return {
        class_index: n_samples / max(int(count), 1)
        for class_index, count in enumerate(class_counts)
    }


Y_train = np.array(Y_train)
# Number of training samples; not used by the fold loop below.
total = len(Y_train)

# Train on 6 successive K-fold training subsets.
# NOTE(review): the `validation` indices of each split are not used. Every
# fold is evaluated on (X_test, Y_test), and the same model instance keeps
# training from fold to fold, so this is not an independent cross-validation.
# A single-run alternative (fit on the full training set for 20 epochs with
# callbacks=[mdcheck] and shuffle=True) used to live here as commented-out
# code.
kfold = KFold(n_splits=6)
for train, validation in kfold.split(X_train):
    X_fold, Y_fold = X_train[train], Y_train[train]
    history = model.fit(
        X_fold, Y_fold,
        validation_data=(X_test, Y_test),
        epochs=3, verbose=1, batch_size=batch_size,
        class_weight=_inverse_frequency_class_weights(Y_fold),
    )

Train on 25133 samples, validate on 2755 samples
Epoch 1/3


InternalError: Blas GEMM launch failed : a.shape=(64, 32), b.shape=(32, 32), m=64, n=32, k=32
	 [[{{node lstm_1/while/MatMul_7}} = MatMul[T=DT_FLOAT, _class=["loc:@training/RMSprop/gradients/AddN_4"], transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/while/Switch_3:1, lstm_1/while/MatMul_7/Enter)]]
	 [[{{node metrics/f1/Mean_1/_87}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_2072_metrics/f1/Mean_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

In [None]:
# Input description handed to save_model together with the network: the text
# columns that are fed to the model plus the tokenizer limits used above.
# NOTE(review): save_model arrives via `from utils import *`; what it does
# with `indices` is not visible here -- confirm against utils.
input_spec = {
    "indices": ["turn1", "turn2", "turn3"],
    "NR_WORDS": NR_WORDS,
    "MAX_PROP_LENGTH": MAX_PROP_LENGTH,
}
save_model(model, indices=[input_spec])

In [None]:
# Parse the labelled dev file again into its own frame; the predictions are
# later written into this frame.
dev_path = r"raw_data/EmoContext/devwithlabels3.txt"
df_test = functions.parse_file(dev_path, "EmoContext")
df_test.head(5)

In [None]:
# Build the model input from df_test itself, so the predictions line up
# row-for-row with the frame they are written back into. Previously
# `text_data` was assembled here but never tokenised, and the prediction ran
# on X_test left over from an earlier cell.
text_data = [
    "{}. {}. {}.".format(first, second, third)
    for first, second, third in zip(df_test['turn1'], df_test['turn2'], df_test['turn3'])
]
X_pred = pad_sequences(tokenizer.texts_to_sequences(text_data),
                       maxlen=MAX_PROP_LENGTH)

res = model.predict(X_pred, batch_size=64, verbose=1)
# Show one sample row of class probabilities.
res[3]

In [None]:
# Class index -> emotion name (inverse of the mapping in one_hot_vector).
revers_words = {0:"others", 1:"angry", 2:"sad", 3:"happy"}

def softmax_convert(res):
    """Return the emotion name of the highest-scoring class.

    Args:
        res: indexable sequence of per-class scores, with one entry for each
            index in ``revers_words``.

    Returns:
        The name from ``revers_words`` whose score is largest. Ties go to the
        lowest index. NaN scores are never selected; if no score compares
        greater than -inf, the name for index 0 is returned.

    Raises:
        IndexError: if ``res`` has fewer entries than ``revers_words``.
    """
    best_index = 0
    # Start from -inf rather than 0 so that all-negative scores (logits or
    # log-probabilities) still yield their true maximum.
    best_score = float("-inf")
    for index in sorted(revers_words):
        if res[index] > best_score:
            best_score = res[index]
            best_index = index
    return revers_words[best_index]

In [None]:
# Map every row of class scores to its emotion name.
results = [softmax_convert(scores) for scores in res]

# The parsed labels are replaced by the model's predictions before export.
df_test['label'] = results
df_test.to_csv("ingrid_model.txt", index=False, sep="\t")

# Preview goes last: a notebook only renders the final expression of a cell,
# so a head() call in the middle of the cell was never shown.
df_test.head(50)