# Library

In [None]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from keras.preprocessing import text, sequence

from keras.models import Model, Sequential
from keras.layers import Flatten, Dense

from keras.layers import Dense, Embedding, Input, SimpleRNN, LSTM, Bidirectional
from keras.layers import Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D

from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Data

## Train

In [None]:
# Load the labelled training set and pick out the six label columns.
df_train = pd.read_csv("../data/input/train.csv")
targets = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
# Missing comments become empty strings (the same treatment the test set gets)
# so the word-count plot and the tokenizer never receive NaN.
# fillna returns a new Series, so df_train itself is left untouched.
X_train = df_train.comment_text.fillna("")
y_train = df_train[targets]

## Test

In [None]:
# Read the unlabelled test set and replace missing comments with empty
# strings directly in df_test, then expose the comment column as X_test.
df_test = pd.read_csv("../data/input/test.csv")
missing_comment = df_test["comment_text"].isnull()
df_test.loc[missing_comment, "comment_text"] = ""
X_test = df_test["comment_text"]

# Global parameters

In [None]:
# Training hyper-parameters.
batch_size = 32
epochs = 3
# Vocabulary size kept by the tokenizer and number of tokens kept per comment.
max_features = 20000
maxlen = 100

# Seed NumPy's global RNG so the random initialisation of embedding rows for
# words without a pre-trained vector is reproducible across runs.
# NOTE(review): the deep-learning backend may need its own seed for fully
# deterministic training — confirm for the installed Keras version.
seed = 42
np.random.seed(seed)

# Preprocessing: Tokens

In [None]:
# Histogram of comment lengths (in space-separated words) with the truncation
# length `maxlen` marked in red.
# The lengths are computed from df_train rather than X_train so this cell
# still works after X_train has been replaced by the padded token matrix.
comment_lengths = df_train.comment_text.fillna("").map(
    lambda comment: len(comment.split(" ")))

fig, ax = plt.subplots()
comment_lengths.hist(bins=100, ax=ax)
# axvline spans the full axis height, so no hard-coded ymax is needed.
ax.axvline(maxlen, color="red")
ax.set_title("Comment length (number of words)")
ax.set_xlabel("Number of words")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Take the raw texts from the data frames (not from X_train / X_test) so this
# cell can be re-run safely: X_train / X_test are overwritten below with
# integer matrices, which must never be fed back into the tokenizer.
train_texts = df_train.comment_text.fillna("").tolist()
test_texts = df_test.comment_text.fillna("").tolist()

# The vocabulary is learned from the training comments only and limited to
# the `max_features` most frequent words.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_texts)

list_tokenized_train = tokenizer.texts_to_sequences(train_texts)
list_tokenized_test = tokenizer.texts_to_sequences(test_texts)

# Pad / truncate every sequence to exactly `maxlen` token ids.
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

# Pre-trained embedding

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its embedding coefficients.

    Parameters
    ----------
    word : str
        The token the coefficients belong to.
    *arr
        The coefficients, as numbers or numeric strings.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D float32 NumPy array
        built from ``arr``.
    """
    coefficients = np.asarray(arr, dtype='float32')
    return word, coefficients


def get_embedding_matrix(tokenizer, path_glove, embed_size,
                         max_features=20000):
    """Build an embedding matrix for the tokenizer vocabulary from a vectors file.

    Each line of the file is expected to hold a token followed by
    ``embed_size`` whitespace-separated coefficients. Row ``i`` of the result
    is the pre-trained vector of the word whose tokenizer index is ``i``.
    Rows without a pre-trained vector (including row 0, which no word in
    ``word_index`` maps to when indices start at 1) are drawn from a normal
    distribution with the mean and standard deviation of all loaded
    coefficients.

    Parameters
    ----------
    tokenizer
        Object exposing ``word_index``, a mapping from word to integer index.
    path_glove : str
        Path to the UTF-8 text file containing the pre-trained vectors.
    embed_size : int
        Dimension of the vectors; lines with a different number of
        coefficients are skipped.
    max_features : int, default 20000
        Upper bound on the number of rows in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(min(max_features, len(word_index) + 1), embed_size)``.

    Raises
    ------
    ValueError
        If the file contains no vector of dimension ``embed_size``.
    """
    embeddings_index = {}
    # The context manager closes the file; the explicit encoding avoids
    # depending on the platform default when tokens are non-ASCII.
    with open(path_glove, encoding="utf-8") as glove_file:
        for line in glove_file:
            fields = line.split()
            # Skip blank lines, header lines and vectors of another dimension
            # instead of failing later when the vectors are stacked.
            if len(fields) != embed_size + 1:
                continue
            embeddings_index[fields[0]] = np.asarray(
                fields[1:], dtype='float32')

    if not embeddings_index:
        raise ValueError(
            "No %d-dimensional vectors found in %r" % (embed_size, path_glove))

    # np.stack needs a real sequence, not a dict view.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    word_index = tokenizer.word_index
    # One extra row because the highest index equals len(word_index) when
    # indices start at 1; without it the last word would be out of bounds.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words,
                                                            embed_size))
    for word, i in word_index.items():
        # Guard on the real row count rather than max_features.
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [None]:
# Build the embedding matrix from the pre-trained vectors file; the embed_size
# passed below (50) has to match the dimension of the vectors in that file.
glove_path = "../data/glove/glove.6B/glove.6B.50d.txt"
embedding_matrix = get_embedding_matrix(
    tokenizer, glove_path, 50, max_features=max_features)

# LSTM, pooling and regularization

In [None]:
# Define model
# Take the embedding layer's dimensions from the matrix itself so the layer
# and its initial weights cannot disagree on shape.
vocab_size, embed_dim = embedding_matrix.shape

model = Sequential()
# Embedding initialised with the pre-trained matrix.
model.add(Embedding(vocab_size, embed_dim, weights=[embedding_matrix]))
# return_sequences=True keeps one output per time step so that the pooling
# layer below can take the maximum over the sequence.
model.add(
    Bidirectional(
        LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
model.add(GlobalMaxPool1D())
model.add(Dense(64, activation="relu"))
# One sigmoid output per target column.
model.add(Dense(len(targets), activation='sigmoid'))

In [None]:
# Convert the model to a dot graph (with layer shapes), render it to SVG with
# the `dot` program and display the result as the cell output.
model_graph = model_to_dot(model, show_shapes=True)
SVG(model_graph.create(prog='dot', format='svg'))

In [None]:
# Compile model: binary cross-entropy over the sigmoid outputs, optimised
# with Adam, reporting accuracy during training.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Show model: print Keras' textual summary of the layers defined above.
model.summary()

In [None]:
# Fit model on the full training set (no validation split is used here);
# the returned History object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
# Persist the trained weights (weights only, not the architecture).
weights_path = '../data/work/complete_lstm_submission.h5'
model.save_weights(weights_path)

# Predict for submission

In [None]:
# Per-label scores for every test comment. `predict` is used instead of the
# deprecated `Sequential.predict_proba`, which was removed from later Keras
# releases; with sigmoid outputs `predict` already returns the probabilities.
predict = model.predict(X_test, verbose=1)

# Submission

In [None]:
# Assemble the submission: one row per test id, one column per target.
submission = pd.DataFrame(predict, columns=targets, index=df_test.id)
# Move the ids from the index into a regular column; the rename only has an
# effect if the index came through unnamed (reset_index then calls it "index").
submission = submission.reset_index().rename(columns={"index": "id"})
submission.to_csv(
    "../submissions/complete_lstm_submission.csv", index=False)