# Kaggle competition: Jigsaw Toxic Comment Classification Challenge (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

In [1]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [5]:
# Vocabulary cap: passed to the Tokenizer as num_words and to the Embedding
# layer as its input dimension.
max_features = 20000
# Length every tokenised comment is padded to; also the model's input length.
maxlen = 100

In [7]:
train = pd.read_csv("./data/train.csv")
# test = pd.read_csv("../input/test.csv")
# Shuffle the rows. A fixed random_state makes the shuffle, and therefore the
# rows that end up in the validation split during model.fit, reproducible
# across kernel restarts.
train = train.sample(frac=1, random_state=42)

In [8]:
# Peek at the first rows of the shuffled training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
130271,b8ec63e6e8c9ab11,"Hello neo-nazi \n\nplease do so, ban me from w...",1,0,1,0,1,1
49972,859b63179882ef7f,"""\nCalling me rude doesn't make me less rude, ...",0,0,0,0,0,0
47257,7e3846449a33b36c,"""\n\nMissing article\nI look but couldn't see ...",0,0,0,0,0,0
142390,f9a4cc9e7b1bff02,"""\n\nYou are a plagiarizing, lying, full-of-sh...",1,0,0,0,1,0
27770,498936b12e15a9d2,"""\nIt has a number of issues that would warran...",0,0,0,0,0,0


In [11]:
# Summary statistics of the numeric label columns.
train.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# Count missing values per column.
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [14]:
# Raw comment strings as a NumPy array; any missing comment is replaced with
# the placeholder string "CVxTz".
list_sentences_train = train["comment_text"].fillna("CVxTz").values
# list_sentences_test = test["comment_text"].fillna("CVxTz").values

In [15]:
# Names of the label columns, one 0/1 target per toxicity category.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Label matrix: one row per comment, one column per entry of list_classes.
y = train[list_classes].values

In [17]:
# Number of training comments.
len(list_sentences_train)

159571

In [25]:
# Inspect the label matrix (rows: comments, columns: list_classes).
y

array([[1, 0, 1, 0, 1, 1],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [26]:
# Word-level tokenizer limited to max_features words, with its vocabulary
# learned from the training comments only.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())

In [27]:
# Convert each training comment into a list of integer word indices using the
# fitted tokenizer.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
# list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

In [28]:
# Bring every token list to exactly maxlen entries so they form one 2-D array;
# shorter sequences are zero-padded on the left (visible in X_t[0] below).
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
# X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [34]:
# Token count of the first comment before padding.
len(list_tokenized_train[0])

83

In [35]:
# The same first comment after padding to maxlen.
X_t[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,  302, 2978, 1189,   45,   33,
         37,  700,   35,   31,   28,    7,  382,   42, 2608, 2978, 2608,
         25,    1, 2608,    9,   14,  707,    3,   11,   11,  103,   14,
        278,    9,   20,  723,  555,    8,   31, 2073,   25,  436, 1175,
        436, 1175,    6,   19,    5,  394, 1189,    5, 3362, 1405,    5,
       1207,    5, 9659,  129,    6,  417, 3192,   22,    6,  105,    7,
         47,  261,   29,  418,   25,  728,  865,  445,   29,  300,    7,
       3483,  227, 6864, 6865, 9659, 1403,    1, 9016, 7720, 1655,    3,
        300], dtype=int32)

In [36]:
def get_model(embed_size=128, lstm_units=50, dense_units=50,
              dropout_rate=0.1, num_classes=6):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Layer stack: integer token ids -> Embedding -> Bidirectional LSTM that
    returns the full sequence -> global max pooling -> Dropout ->
    Dense (relu) -> Dropout -> Dense (sigmoid, one unit per label).

    The input length and the embedding vocabulary size are read from the
    notebook-level ``maxlen`` and ``max_features``. Calling the function with
    no arguments builds exactly the network the notebook originally
    hard-coded.

    Parameters
    ----------
    embed_size : int
        Dimension of each word embedding vector.
    lstm_units : int
        Number of units in each direction of the bidirectional LSTM.
    dense_units : int
        Width of the hidden fully connected layer.
    dropout_rate : float
        Dropout rate applied after pooling and after the hidden dense layer.
    num_classes : int
        Number of independent sigmoid outputs (one per label column).

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer and
        the accuracy metric.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    # return_sequences=True keeps every time step so the pooling layer below
    # can take the maximum over the whole sequence.
    x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # Sigmoid rather than softmax: each label is predicted independently.
    x = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [37]:
# Build and compile the network defined by get_model.
model = get_model()

In [38]:
# Training settings passed to model.fit below.
batch_size = 32
epochs = 2

In [39]:
# File that receives the model weights whenever validation loss improves.
file_path = "weights_base.best.hdf5"

# Save a checkpoint only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Stop early once val_loss has not improved for `patience` epochs.
# NOTE(review): with epochs = 2 a patience of 20 can never trigger — confirm
# whether a longer run or a smaller patience was intended.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

callbacks_list = [checkpoint, early]

# Hold out 10% of the samples for validation while training.
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=callbacks_list,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11cd21eb8>

In [None]:
# Restore the weights stored at file_path, i.e. the checkpoint written by
# ModelCheckpoint for the lowest validation loss seen during training.
model.load_weights(file_path)

In [None]:
# X_te was never built: every test-set loading / tokenising / padding line
# earlier in the notebook is commented out, so predicting on it raised a
# NameError. Prepare the test comments here with the same tokenizer and
# padding length that were used for the training data.
# NOTE(review): the test CSV is assumed to sit next to train.csv — confirm path.
test = pd.read_csv("./data/test.csv")
list_sentences_test = test["comment_text"].fillna("CVxTz").values
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

# Per-label probabilities, one row per test comment.
y_test = model.predict(X_te)