In [20]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, GRU, Embedding, Dropout, Activation, BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
import gensim
from keras.layers import Layer
import inspect
from keras.layers import Merge
from keras.backend import tf

from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
# Read the three input CSVs (training data, test data and the sample
# submission file) in one pass; the order of the paths matches the
# order of the names on the left-hand side.
train, test, subm = map(
    pd.read_csv,
    ('input_data/train.csv',
     'input_data/test.csv',
     'input_data/sample_submission.csv'),
)

In [4]:
# Sequence / vocabulary hyper-parameters shared by the cells below.
maxlen = 100          # number of tokens each comment is padded/truncated to
max_features = 20000  # vocabulary cap (upper bound on embedding-matrix rows)
embed_size = 100      # dimensionality of each word vector

In [5]:
# Target columns, and the label matrix taken from the training frame.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Raw comment text for both splits; missing comments become the "_na_" placeholder.
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_test = test["comment_text"].fillna("_na_").values

# Train and test text back to back (train first), used wherever the whole corpus is needed.
full_data = np.concatenate([list_sentences_train, list_sentences_test], axis=0)

In [6]:
# Fit the tokenizer on train + test text so the vocabulary covers both splits.
# '*' is deliberately absent from the filter list, so asterisks stay inside tokens.
# The tokenizer is intentionally created WITHOUT num_words: the Word2Vec cell
# round-trips the full corpus through this tokenizer and should see every word.
filters='!"#$%&()+,-./:;<=>?@[\\]^_`{|}~\t\n'
tokenizer = Tokenizer(filters=filters)
tokenizer.fit_on_texts(list(full_data))


def _clip_to_vocab(sequences, vocab_size):
    """Return `sequences` with every token id >= `vocab_size` removed.

    The embedding layer is built with `max_features` rows, so any id at or
    above that bound would be an out-of-range lookup. Dropping those ids here
    keeps the model inputs within the embedding table while leaving the
    tokenizer itself unrestricted.
    """
    return [[idx for idx in seq if idx < vocab_size] for seq in sequences]


list_tokenized_train = _clip_to_vocab(
    tokenizer.texts_to_sequences(list_sentences_train), max_features)
list_tokenized_test = _clip_to_vocab(
    tokenizer.texts_to_sequences(list_sentences_test), max_features)

# Pad/truncate every comment to exactly `maxlen` token ids.
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [7]:
# def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
# embeddings_index = dict(get_coefs(*o.strip().split()) for o in open('input_data/glove.6B.100d.txt'))

In [8]:
# Train custom word vectors on the combined train + test corpus.
# The tokenizer's integer sequences are mapped back to word strings so that
# Word2Vec is trained on exactly the tokens the Keras tokenizer produces.
# `.items()` (rather than `.iteritems()`) works on both Python 2 and Python 3.
inv_index_map = {index: word for word, index in tokenizer.word_index.items()}
X = [[inv_index_map[index] for index in sequence]
     for sequence in tokenizer.texts_to_sequences(full_data)]

# Vector size is tied to `embed_size` so the vectors always fit the embedding matrix.
# NOTE: `model` is rebound to the Keras network in a later cell; only
# `embeddings_index` is consumed downstream.
model = gensim.models.Word2Vec(X, size=embed_size)
embeddings_index = dict(zip(model.wv.index2word, model.wv.syn0))

In [9]:
# Mean and standard deviation over every component of every trained vector;
# these parameterise the random initialisation of the embedding matrix.
# The dict view is materialised with list() because np.stack expects a real
# sequence (a bare dict view is rejected by newer NumPy releases).
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

(-0.018556716, 0.38440287)

In [10]:
# Build the initial embedding weights: rows start as random draws matching the
# trained vectors' mean/std, then rows for known words are overwritten with
# their Word2Vec vector.
word_index = tokenizer.word_index
# Keras word ids start at 1 (0 is the padding id), so a vocabulary of N words
# needs N + 1 rows; without the +1 the highest id overflows the matrix whenever
# the vocabulary is smaller than `max_features`.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Skip ids that have no row in the matrix.
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    # Words without a trained vector keep their random initialisation.
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [25]:
# inp = Input(shape=(maxlen,))
# x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(inp)
# x = Bidirectional(LSTM(100, return_sequences=True,dropout=0.1, recurrent_dropout=0.1))(x)
# x = GlobalMaxPool1D()(x)
# x = BatchNormalization()(x)
# x = Dense(100, activation="relu")(x)
# #x = BatchNormalization()(x)
# x = Dropout(0.1)(x)
# x = Dense(6, activation="sigmoid")(x)
# model = Model(inputs=inp, outputs=x)


# Two stacked bidirectional LSTMs on top of the pre-initialised, trainable embeddings.
model = Sequential()
model.add(Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True))
model.add(Bidirectional(LSTM(100, return_sequences=True,
                        dropout=0.1, recurrent_dropout=0.1)))
model.add(Bidirectional(LSTM(100)))
# One output unit per label column.
model.add(Dense(len(list_classes)))
# The model is trained with binary cross-entropy, which treats each output as
# its own probability. Softmax would force the six outputs to sum to 1, so a
# per-unit sigmoid is used instead.
model.add(Activation('sigmoid'))

import keras.backend as K
def loss(y_true, y_pred):
    """Binary cross-entropy between targets and predictions.

    Thin wrapper that forwards both tensors, unchanged and in the same order,
    to the Keras backend's ``binary_crossentropy`` and returns its result.
    """
    crossentropy = K.binary_crossentropy(y_true, y_pred)
    return crossentropy
    
# `loss` is a custom callable, so Keras cannot recognise it as binary
# cross-entropy and would resolve the generic 'accuracy' alias to categorical
# accuracy. Request per-label binary accuracy explicitly instead.
model.compile(loss=loss, optimizer='nadam', metrics=['binary_accuracy'])

In [28]:
def schedule(ind):
    """Return the learning rate for the epoch with 0-based index `ind`.

    Rates come from a fixed per-epoch table. Epochs past the end of the table
    reuse the last entry instead of raising IndexError, so the function stays
    usable when training runs for more epochs than the table has entries.
    """
    a = [0.002,0.003, 0.000]
    # Clamp so that any epoch index beyond the table maps to its final entry.
    return a[min(ind, len(a) - 1)]
# Wrap the per-epoch schedule in a Keras callback, then train for a single
# epoch with 10% of the training rows held out for validation.
lr = callbacks.LearningRateScheduler(schedule)
model.fit(
    X_t,
    y,
    batch_size=64,
    epochs=1,
    validation_split=0.1,
    callbacks=[lr]
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/1


<keras.callbacks.History at 0x7fe8e4523f50>

In [13]:
# Score the test set and copy the per-label predictions into the
# sample-submission frame.
y_test = model.predict([X_te], batch_size=1024, verbose=1)
subm[list_classes] = y_test

# Make sure the output directory exists before writing, so a missing folder
# does not throw away the predictions computed above.
submission_path = 'submission/LSTM-base-submission1.csv'
submission_dir = os.path.dirname(submission_path)
if not os.path.isdir(submission_dir):
    os.makedirs(submission_dir)
subm.to_csv(submission_path, index=False)

