In [48]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import pickle
import keras
import os
import math
import random
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer, text
from keras.preprocessing import sequence
from keras.backend.tensorflow_backend import set_session
from keras.layers import Input, Embedding, LSTM, Dense, BatchNormalization, Dropout, Bidirectional, GlobalMaxPool1D
from keras.models import Model, load_model, Sequential
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import losses
from gensim.models import Word2Vec
from nltk import word_tokenize
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from matplotlib.pyplot import plot

# 1. Load data

In [2]:
df_train = pd.read_csv('jigsaw-toxic-comment-classification-challenge/train.csv')
df_test = pd.read_csv('jigsaw-toxic-comment-classification-challenge/test.csv')

In [38]:
comments = []
for i, row in df_train.iterrows():
    temp = []
    string = row['comment_text']
    string = string.lower()
    string = re.sub(r"[^\x00-\x7F]+", " ", string)
    string = re.sub(r"what's", "what is ", string)
    string = re.sub(r"\'s", " ", string)
    string = re.sub(r"\'ve", " have ", string)
    string = re.sub(r"can't", "cannot ", string)
    string = re.sub(r"n t", " not ", string)
    string = re.sub(r"n't", " not ", string)
    string = re.sub(r"i'm", "i am ", string)
    string = re.sub(r"\'re", " are ", string)
    string = re.sub(r"\'d", " would ", string)
    string = re.sub(r"\'ll", " will ", string)
    string = re.sub(r'\n', '', string)
    string = re.sub(r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})', ' ', string)
    string = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', string)
    string = re.sub(r'(?:[\d]{2}):(?:[\d]{2}).*(?:[\d]{4})', ' ', string)
    string = re.sub(r'(?:[\d]{2}):(?:[\d]{2})', '', string)
    month = '|'.join(['january', 'febuary', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'])
    string = re.sub(month, '', string)
    time_zone = '|'.join(['(est)', '(ast)', '(utc)', '(cst)', '(mst)', '(pst)', '(akst)', '(hst)'])
    string = re.sub(time_zone, '', string)
    string = re.sub(r"[^A-Za-z0-9]+", " ", string)

    # tokenize the sentence into words
    for j in word_tokenize(string):
        temp.append(j)
    comments.append(temp)

In [39]:
md_w2v = Word2Vec(comments, min_count=20, size=300, window=5, iter=100, sg=1)

In [40]:
pickle.dump(md_w2v, open('w2v_model.pkl', 'wb'))

In [5]:
with open('w2v_model.pkl','rb') as f:
    md_w2v = pickle.load(f)

In [41]:
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.8
set_session(tf.Session(config=config))

In [42]:
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = df_train[classes].values

# Kernel

In [64]:
word_num = len(md_w2v.wv.vocab) # 17241
vocab20 = list(md_w2v.wv.vocab)
comments20 = []
for c in comments:
    t = []
    for w in c:
        if w in vocab20:
            t.append(w)
    comments20.append(t)

In [65]:
tokenizer = Tokenizer(word_num)
tokenizer.fit_on_texts(comments20)
description_token_sequence = tokenizer.texts_to_sequences(comments20)

In [66]:
X_t = sequence.pad_sequences(description_token_sequence, maxlen=100)

In [67]:
# Create embedding matrix
vector_size = 300
embedding_matrix =  np.zeros((word_num+1, vector_size))
ind = 0
for word, i in tokenizer.word_index.items():
    vector = md_w2v.wv[word]
    embedding_matrix[i] = vector

In [70]:
main_input = Input(shape=(100,), dtype='int32', name='First_Input')
# This embedding layer will encode the input sequence
# into a sequence of dense 300-dimensional vectors.
# Using pretrained word2vec skip gram model
x = Embedding(output_dim=vector_size, input_dim=word_num+1,
              input_length=100, weights=[embedding_matrix],
              trainable=False, name="Embedding")(main_input)

In [72]:
# A LSTM will transform the vector sequence into a single vector,
# containing information about the entire sequence
lstm_out = LSTM(200, recurrent_dropout=0.1, dropout=0.1, name="LSTM")(x)
# Fully-connected layer and batch normalization layers
x = Dense(256, activation='relu')(lstm_out)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(6, activation="sigmoid")(x)

In [73]:
NN_model = Model(inputs=main_input, outputs=x)
NN_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [74]:
batch_size = 32
epochs = 2
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early] #early
NN_model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)

Train on 143613 samples, validate on 15958 samples
Epoch 1/2


  6464/143613 [>.............................] - ETA: 4:05:51 - loss: 1.2006 - acc: 0.32 - ETA: 2:11:17 - loss: 1.1168 - acc: 0.36 - ETA: 1:33:04 - loss: 1.0860 - acc: 0.38 - ETA: 1:14:06 - loss: 1.0419 - acc: 0.41 - ETA: 1:02:44 - loss: 1.0103 - acc: 0.42 - ETA: 54:59 - loss: 0.9852 - acc: 0.4418 - ETA: 49:29 - loss: 0.9673 - acc: 0.45 - ETA: 45:20 - loss: 0.9398 - acc: 0.47 - ETA: 42:11 - loss: 0.9128 - acc: 0.48 - ETA: 39:38 - loss: 0.8872 - acc: 0.50 - ETA: 37:30 - loss: 0.8615 - acc: 0.51 - ETA: 35:39 - loss: 0.8383 - acc: 0.53 - ETA: 34:09 - loss: 0.8190 - acc: 0.54 - ETA: 32:52 - loss: 0.7946 - acc: 0.56 - ETA: 31:49 - loss: 0.7781 - acc: 0.57 - ETA: 30:54 - loss: 0.7572 - acc: 0.58 - ETA: 30:05 - loss: 0.7410 - acc: 0.60 - ETA: 29:21 - loss: 0.7209 - acc: 0.61 - ETA: 28:41 - loss: 0.7049 - acc: 0.62 - ETA: 28:06 - loss: 0.6897 - acc: 0.63 - ETA: 27:34 - loss: 0.6732 - acc: 0.65 - ETA: 27:10 - loss: 0.6570 - acc: 0.66 - ETA: 26:44 - loss: 0.6429 - acc: 0.67 - ETA: 26:19 - loss: 

 19520/143613 [===>..........................] - ETA: 15:50 - loss: 0.1433 - acc: 0.94 - ETA: 15:50 - loss: 0.1431 - acc: 0.94 - ETA: 15:49 - loss: 0.1431 - acc: 0.94 - ETA: 15:49 - loss: 0.1429 - acc: 0.94 - ETA: 15:49 - loss: 0.1427 - acc: 0.94 - ETA: 15:48 - loss: 0.1426 - acc: 0.94 - ETA: 15:48 - loss: 0.1425 - acc: 0.94 - ETA: 15:48 - loss: 0.1423 - acc: 0.94 - ETA: 15:47 - loss: 0.1421 - acc: 0.95 - ETA: 15:47 - loss: 0.1421 - acc: 0.95 - ETA: 15:47 - loss: 0.1419 - acc: 0.95 - ETA: 15:46 - loss: 0.1419 - acc: 0.95 - ETA: 15:46 - loss: 0.1420 - acc: 0.95 - ETA: 15:46 - loss: 0.1419 - acc: 0.95 - ETA: 15:46 - loss: 0.1416 - acc: 0.95 - ETA: 15:45 - loss: 0.1416 - acc: 0.95 - ETA: 15:45 - loss: 0.1414 - acc: 0.95 - ETA: 15:45 - loss: 0.1412 - acc: 0.95 - ETA: 15:45 - loss: 0.1411 - acc: 0.95 - ETA: 15:44 - loss: 0.1409 - acc: 0.95 - ETA: 15:44 - loss: 0.1407 - acc: 0.95 - ETA: 15:44 - loss: 0.1407 - acc: 0.95 - ETA: 15:43 - loss: 0.1405 - acc: 0.95 - ETA: 15:43 - loss: 0.1402 - acc

 26048/143613 [====>.........................] - ETA: 14:52 - loss: 0.1262 - acc: 0.95 - ETA: 14:52 - loss: 0.1262 - acc: 0.95 - ETA: 14:52 - loss: 0.1262 - acc: 0.95 - ETA: 14:51 - loss: 0.1261 - acc: 0.95 - ETA: 14:51 - loss: 0.1259 - acc: 0.95 - ETA: 14:51 - loss: 0.1261 - acc: 0.95 - ETA: 14:50 - loss: 0.1260 - acc: 0.95 - ETA: 14:50 - loss: 0.1259 - acc: 0.95 - ETA: 14:50 - loss: 0.1259 - acc: 0.95 - ETA: 14:50 - loss: 0.1258 - acc: 0.95 - ETA: 14:49 - loss: 0.1257 - acc: 0.95 - ETA: 14:49 - loss: 0.1256 - acc: 0.95 - ETA: 14:49 - loss: 0.1255 - acc: 0.95 - ETA: 14:48 - loss: 0.1254 - acc: 0.95 - ETA: 14:48 - loss: 0.1253 - acc: 0.95 - ETA: 14:48 - loss: 0.1252 - acc: 0.95 - ETA: 14:48 - loss: 0.1251 - acc: 0.95 - ETA: 14:47 - loss: 0.1250 - acc: 0.95 - ETA: 14:47 - loss: 0.1251 - acc: 0.95 - ETA: 14:47 - loss: 0.1250 - acc: 0.95 - ETA: 14:47 - loss: 0.1250 - acc: 0.95 - ETA: 14:47 - loss: 0.1248 - acc: 0.95 - ETA: 14:46 - loss: 0.1248 - acc: 0.95 - ETA: 14:46 - loss: 0.1248 - acc

 32576/143613 [=====>........................] - ETA: 13:58 - loss: 0.1142 - acc: 0.96 - ETA: 13:58 - loss: 0.1142 - acc: 0.96 - ETA: 13:58 - loss: 0.1142 - acc: 0.96 - ETA: 13:57 - loss: 0.1142 - acc: 0.96 - ETA: 13:57 - loss: 0.1141 - acc: 0.96 - ETA: 13:57 - loss: 0.1143 - acc: 0.96 - ETA: 13:56 - loss: 0.1143 - acc: 0.96 - ETA: 13:56 - loss: 0.1142 - acc: 0.96 - ETA: 13:56 - loss: 0.1141 - acc: 0.96 - ETA: 13:56 - loss: 0.1141 - acc: 0.96 - ETA: 13:55 - loss: 0.1141 - acc: 0.96 - ETA: 13:55 - loss: 0.1141 - acc: 0.96 - ETA: 13:55 - loss: 0.1140 - acc: 0.96 - ETA: 13:55 - loss: 0.1140 - acc: 0.96 - ETA: 13:54 - loss: 0.1139 - acc: 0.96 - ETA: 13:54 - loss: 0.1139 - acc: 0.96 - ETA: 13:54 - loss: 0.1139 - acc: 0.96 - ETA: 13:54 - loss: 0.1138 - acc: 0.96 - ETA: 13:53 - loss: 0.1138 - acc: 0.96 - ETA: 13:53 - loss: 0.1137 - acc: 0.96 - ETA: 13:53 - loss: 0.1137 - acc: 0.96 - ETA: 13:53 - loss: 0.1137 - acc: 0.96 - ETA: 13:52 - loss: 0.1136 - acc: 0.96 - ETA: 13:52 - loss: 0.1135 - acc


































Epoch 00001: val_loss improved from inf to 0.05681, saving model to weights_base.best.hdf5
Epoch 2/2


  6528/143613 [>.............................] - ETA: 17:31 - loss: 0.0206 - acc: 0.99 - ETA: 17:29 - loss: 0.0437 - acc: 0.98 - ETA: 17:23 - loss: 0.0568 - acc: 0.97 - ETA: 17:27 - loss: 0.0588 - acc: 0.97 - ETA: 17:27 - loss: 0.0580 - acc: 0.97 - ETA: 17:08 - loss: 0.0614 - acc: 0.97 - ETA: 17:06 - loss: 0.0595 - acc: 0.97 - ETA: 17:03 - loss: 0.0545 - acc: 0.97 - ETA: 17:03 - loss: 0.0523 - acc: 0.97 - ETA: 17:01 - loss: 0.0516 - acc: 0.97 - ETA: 17:00 - loss: 0.0508 - acc: 0.98 - ETA: 17:01 - loss: 0.0511 - acc: 0.98 - ETA: 17:00 - loss: 0.0516 - acc: 0.98 - ETA: 16:58 - loss: 0.0506 - acc: 0.98 - ETA: 16:58 - loss: 0.0519 - acc: 0.98 - ETA: 16:59 - loss: 0.0528 - acc: 0.98 - ETA: 16:58 - loss: 0.0513 - acc: 0.98 - ETA: 16:56 - loss: 0.0499 - acc: 0.98 - ETA: 16:51 - loss: 0.0527 - acc: 0.98 - ETA: 16:51 - loss: 0.0519 - acc: 0.98 - ETA: 16:51 - loss: 0.0552 - acc: 0.98 - ETA: 16:50 - loss: 0.0551 - acc: 0.98 - ETA: 16:51 - loss: 0.0591 - acc: 0.97 - ETA: 16:52 - loss: 0.0608 - acc

 19584/143613 [===>..........................] - ETA: 15:21 - loss: 0.0594 - acc: 0.97 - ETA: 15:21 - loss: 0.0595 - acc: 0.97 - ETA: 15:21 - loss: 0.0595 - acc: 0.97 - ETA: 15:21 - loss: 0.0594 - acc: 0.97 - ETA: 15:20 - loss: 0.0593 - acc: 0.97 - ETA: 15:20 - loss: 0.0593 - acc: 0.97 - ETA: 15:20 - loss: 0.0594 - acc: 0.97 - ETA: 15:20 - loss: 0.0593 - acc: 0.97 - ETA: 15:20 - loss: 0.0592 - acc: 0.97 - ETA: 15:19 - loss: 0.0593 - acc: 0.97 - ETA: 15:19 - loss: 0.0591 - acc: 0.97 - ETA: 15:19 - loss: 0.0591 - acc: 0.97 - ETA: 15:18 - loss: 0.0591 - acc: 0.97 - ETA: 15:18 - loss: 0.0590 - acc: 0.97 - ETA: 15:18 - loss: 0.0590 - acc: 0.97 - ETA: 15:18 - loss: 0.0589 - acc: 0.97 - ETA: 15:18 - loss: 0.0588 - acc: 0.97 - ETA: 15:17 - loss: 0.0588 - acc: 0.97 - ETA: 15:17 - loss: 0.0589 - acc: 0.97 - ETA: 15:17 - loss: 0.0588 - acc: 0.97 - ETA: 15:17 - loss: 0.0588 - acc: 0.97 - ETA: 15:17 - loss: 0.0589 - acc: 0.97 - ETA: 15:16 - loss: 0.0589 - acc: 0.97 - ETA: 15:16 - loss: 0.0592 - acc

 26112/143613 [====>.........................] - ETA: 14:35 - loss: 0.0581 - acc: 0.97 - ETA: 14:35 - loss: 0.0582 - acc: 0.97 - ETA: 14:34 - loss: 0.0581 - acc: 0.97 - ETA: 14:34 - loss: 0.0581 - acc: 0.97 - ETA: 14:34 - loss: 0.0581 - acc: 0.97 - ETA: 14:34 - loss: 0.0581 - acc: 0.97 - ETA: 14:33 - loss: 0.0580 - acc: 0.97 - ETA: 14:33 - loss: 0.0580 - acc: 0.97 - ETA: 14:33 - loss: 0.0581 - acc: 0.97 - ETA: 14:33 - loss: 0.0581 - acc: 0.97 - ETA: 14:32 - loss: 0.0581 - acc: 0.97 - ETA: 14:32 - loss: 0.0581 - acc: 0.97 - ETA: 14:32 - loss: 0.0581 - acc: 0.97 - ETA: 14:32 - loss: 0.0581 - acc: 0.97 - ETA: 14:31 - loss: 0.0581 - acc: 0.97 - ETA: 14:31 - loss: 0.0581 - acc: 0.97 - ETA: 14:31 - loss: 0.0581 - acc: 0.97 - ETA: 14:31 - loss: 0.0582 - acc: 0.97 - ETA: 14:31 - loss: 0.0582 - acc: 0.97 - ETA: 14:30 - loss: 0.0581 - acc: 0.97 - ETA: 14:30 - loss: 0.0581 - acc: 0.97 - ETA: 14:30 - loss: 0.0581 - acc: 0.97 - ETA: 14:30 - loss: 0.0581 - acc: 0.97 - ETA: 14:29 - loss: 0.0581 - acc

 32640/143613 [=====>........................] - ETA: 13:47 - loss: 0.0583 - acc: 0.97 - ETA: 13:47 - loss: 0.0584 - acc: 0.97 - ETA: 13:47 - loss: 0.0584 - acc: 0.97 - ETA: 13:46 - loss: 0.0584 - acc: 0.97 - ETA: 13:46 - loss: 0.0584 - acc: 0.97 - ETA: 13:46 - loss: 0.0584 - acc: 0.97 - ETA: 13:46 - loss: 0.0585 - acc: 0.97 - ETA: 13:45 - loss: 0.0584 - acc: 0.97 - ETA: 13:45 - loss: 0.0585 - acc: 0.97 - ETA: 13:45 - loss: 0.0584 - acc: 0.97 - ETA: 13:45 - loss: 0.0585 - acc: 0.97 - ETA: 13:45 - loss: 0.0585 - acc: 0.97 - ETA: 13:44 - loss: 0.0585 - acc: 0.97 - ETA: 13:44 - loss: 0.0585 - acc: 0.97 - ETA: 13:44 - loss: 0.0585 - acc: 0.97 - ETA: 13:44 - loss: 0.0585 - acc: 0.97 - ETA: 13:44 - loss: 0.0584 - acc: 0.97 - ETA: 13:43 - loss: 0.0584 - acc: 0.97 - ETA: 13:43 - loss: 0.0584 - acc: 0.97 - ETA: 13:43 - loss: 0.0583 - acc: 0.97 - ETA: 13:43 - loss: 0.0584 - acc: 0.97 - ETA: 13:42 - loss: 0.0584 - acc: 0.97 - ETA: 13:42 - loss: 0.0584 - acc: 0.97 - ETA: 13:42 - loss: 0.0584 - acc


































Epoch 00002: val_loss improved from 0.05681 to 0.05383, saving model to weights_base.best.hdf5


<keras.callbacks.History at 0x260aeabd208>

In [95]:
tmp = X_t[6].reshape(1,100)

In [96]:
NN_model.predict(tmp)

array([[0.99591583, 0.3230673 , 0.9393855 , 0.07412554, 0.858789  ,
        0.17352752]], dtype=float32)

# Test

In [None]:
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [98]:
test_comment = []
for i, row in df_test.iterrows():
    temp = []
    string = row['comment_text']
    string = string.lower()
    string = re.sub(r"[^\x00-\x7F]+", " ", string)
    string = re.sub(r"what's", "what is ", string)
    string = re.sub(r"\'s", " ", string)
    string = re.sub(r"\'ve", " have ", string)
    string = re.sub(r"can't", "cannot ", string)
    string = re.sub(r"n t", " not ", string)
    string = re.sub(r"n't", " not ", string)
    string = re.sub(r"i'm", "i am ", string)
    string = re.sub(r"\'re", " are ", string)
    string = re.sub(r"\'d", " would ", string)
    string = re.sub(r"\'ll", " will ", string)
    string = re.sub(r'\n', '', string)
    string = re.sub(r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})', ' ', string)
    string = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', string)
    string = re.sub(r'(?:[\d]{2}):(?:[\d]{2}).*(?:[\d]{4})', ' ', string)
    string = re.sub(r'(?:[\d]{2}):(?:[\d]{2})', '', string)
    month = '|'.join(['january', 'febuary', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december'])
    string = re.sub(month, '', string)
    time_zone = '|'.join(['(est)', '(ast)', '(utc)', '(cst)', '(mst)', '(pst)', '(akst)', '(hst)'])
    string = re.sub(time_zone, '', string)
    string = re.sub(r"[^A-Za-z0-9]+", " ", string)

    # tokenize the sentence into words
    for j in word_tokenize(string):
        temp.append(j)
    test_comment.append(temp)

In [99]:
tokenizer.fit_on_texts(test_comment)
token_test = tokenizer.texts_to_sequences(test_comment)

In [100]:
X_te = sequence.pad_sequences(token_test, maxlen=100)

In [102]:
y_test = NN_model.predict(X_te)