In [6]:
import logging
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

import numpy as np, pandas as pd
np.random.seed(42)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Model, load_model
from keras.engine import Layer
from keras.layers import K, Activation, Average, Maximum
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D,GlobalMaxPooling2D
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler, TerminateOnNaN
from keras.layers import Bidirectional
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [3]:
data_dir="data/"
train = pd.read_csv(data_dir+"train.csv")
test = pd.read_csv(data_dir+"test.csv")
submission = pd.read_csv(data_dir+"sample_submission.csv")

embedding_path = data_dir+"fasttext-crawl-300d-2m/crawl-300d-2M.vec"
#embedding_path = data_dir+"glove840b300dtxt/glove.840B.300d.txt"

max_features = 30000
max_len = 100
embed_size = 300

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
train["comment_text"].fillna("no comment")
test["comment_text"].fillna("no comment")

X_train, X_valid, Y_train, Y_valid = train_test_split(train, y, test_size = 0.1)

raw_text_train = X_train["comment_text"].str.lower()
raw_text_valid = X_valid["comment_text"].str.lower()
raw_text_test = test["comment_text"].str.lower()

tk = Tokenizer(num_words = max_features, lower = True)
tk.fit_on_texts(raw_text_train)
X_train["comment_seq"] = tk.texts_to_sequences(raw_text_train)
X_valid["comment_seq"] = tk.texts_to_sequences(raw_text_valid)
test["comment_seq"] = tk.texts_to_sequences(raw_text_test)

X_train = pad_sequences(X_train.comment_seq, maxlen = max_len)
X_valid = pad_sequences(X_valid.comment_seq, maxlen = max_len)
test = pad_sequences(test.comment_seq, maxlen = max_len)

In [7]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))

In [8]:
word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [9]:
def squash(x, axis=-1):
    # s_squared_norm is really small
    # s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    # scale = K.sqrt(s_squared_norm)/ (0.5 + s_squared_norm)
    # return scale * x
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale


# A Capsule Implement with Pure Keras
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [10]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\nROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))
            for i in range(len(list_classes)):
                score2 = roc_auc_score(self.y_val[:,i], y_pred[:,i])
                print("ROC-AUC of class {}- epoch: {:d} - score: {:.6f}".format(list_classes[i],epoch+1, score2))

In [11]:
file_path = "best_model.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,save_best_only = True, mode = "min")
ra_val = RocAucEvaluation(validation_data=(X_valid, Y_valid), interval = 1)
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)
lr_scheduler = LearningRateScheduler(schedule=lambda epoch_n: self.init_lr / (5**(epoch_n)), verbose = 1)
TON = TerminateOnNaN()

Routings = 5
Num_capsule = 8
Dim_capsule = 16
dropout_p = 0.25
rate_drop_dense = 0.28
filter_sizes = [1,2,3,5]
num_filters = 32

def build_model(lr = 0.0):
    
    # Input
    inp = Input(shape = (max_len,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix],trainable=False)(inp)
    
    # CNN2D
    y = SpatialDropout1D(0.4)(x)
    y = Reshape((max_len, embed_size, 1))(y)
    conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',activation='elu')(y)
    conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',activation='elu')(y)
    conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',activation='elu')(y)
    conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size), kernel_initializer='normal',activation='elu')(y)
    maxpool_0 = MaxPool2D(pool_size=(max_len - filter_sizes[0] + 1, 1))(conv_0)
    maxpool_1 = MaxPool2D(pool_size=(max_len - filter_sizes[1] + 1, 1))(conv_1)
    maxpool_2 = MaxPool2D(pool_size=(max_len - filter_sizes[2] + 1, 1))(conv_2)
    maxpool_3 = MaxPool2D(pool_size=(max_len - filter_sizes[3] + 1, 1))(conv_3)
    y = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])   
    y = Flatten()(y)
    y = Dropout(0.1)(y)
    
    # Bigru conv1D
    z = Bidirectional(GRU(128, activation='relu', return_sequences=True))(x)
    conv1D = Conv1D(64, kernel_size = 2, padding = "valid", kernel_initializer = "he_uniform")(z)
    avg_pool = GlobalAveragePooling1D()(conv1D)
    max_pool = GlobalMaxPooling1D()(conv1D)
    z = concatenate([avg_pool, max_pool])
    
    # Capsule
    c = Bidirectional(GRU(128, activation='relu', dropout=dropout_p,recurrent_dropout=dropout_p, return_sequences=True))(x)
    c = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings,share_weights=True)(c)
    c = Flatten()(c)
    c = Dropout(dropout_p)(c)
    
    # output
    a = Average()([y,z,c])
    m = Maximum()([y,z,c])
    out = concatenate([a,m])
    out = Dropout(0.1)(out)
    out = Dense(6, activation = "sigmoid")(out)
    model = Model(inputs=inp, outputs=out)
    
    # model
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr), metrics = ["accuracy"])
    history = model.fit(X_train, Y_train, batch_size = 32, epochs = 6, validation_data = (X_valid, Y_valid), 
                        verbose = 1, callbacks = [ra_val, check_point, early_stop, TON])
    model = load_model(file_path)
    return model

In [12]:
model = build_model(lr = 1e-3)
pred = model.predict(test, batch_size = 512, verbose = 1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/4
ROC-AUC - epoch: 1 - score: 0.987818
ROC-AUC of class toxic- epoch: 1 - score: 0.981749
ROC-AUC of class severe_toxic- epoch: 1 - score: 0.988598
ROC-AUC of class obscene- epoch: 1 - score: 0.989557
ROC-AUC of class threat- epoch: 1 - score: 0.995471
ROC-AUC of class insult- epoch: 1 - score: 0.986915
ROC-AUC of class identity_hate- epoch: 1 - score: 0.984617




Epoch 00001: val_loss improved from inf to 0.04490, saving model to best_model.hdf5
Epoch 2/4
ROC-AUC - epoch: 2 - score: 0.988339
ROC-AUC of class toxic- epoch: 2 - score: 0.982269
ROC-AUC of class severe_toxic- epoch: 2 - score: 0.988708
ROC-AUC of class obscene- epoch: 2 - score: 0.989866
ROC-AUC of class threat- epoch: 2 - score: 0.995933
ROC-AUC of class insult- epoch: 2 - score: 0.987237
ROC-AUC of class identity_hate- epoch: 2 - score: 0.986024




Epoch 00002: val_loss improved from 0.04490 to 0.04235, saving model to best_model.hdf5
Epoch 3/4
ROC-AUC - epoch

ValueError: Unknown layer: Capsule

In [None]:
With caps 1 bigru & fastext :
Epoch 1/4
43680/143613 [========>.....................] - ETA: 22:54 - loss: 0.0637 - acc: 0.9780
                    
143584/143613 [============================>.] - ETA: 0s - loss: 0.0525 - acc: 0.9809
ROC-AUC - epoch: 1 - score: 0.987818

In [9]:
With caps 1 bigru :
Epoch 1/4
 43232/143613 [========>.....................] - ETA: 37:59 - loss: 0.0662 - acc: 0.9771

SyntaxError: invalid syntax (<ipython-input-9-9641b31fab80>, line 1)

In [None]:
With caps 2 different bi gru:
  8256/143613 [>.............................] - ETA: 1:12:08 - loss: 0.0892 - acc: 0.9714

In [None]:
Without caps :
Epoch 1/4
 31648/143613 [=====>........................] - ETA: 36:26 - loss: 0.0653 - acc: 0.9780

In [None]:
With caps :
Epoch 1/4
 17984/143613 [==>...........................] - ETA: 1:10:20 - loss: 0.0717 - acc: 0.9758

In [None]:
Train on 143613 samples, validate on 15958 samples
Epoch 1/4
143584/143613 [============================>.] - ETA: 1s - loss: 0.0520 - acc: 0.9810
ROC-AUC - epoch: 1 - score: 0.987583

ROC-AUC of class toxic- epoch: 1 - score: 0.982831

ROC-AUC of class severe_toxic- epoch: 1 - score: 0.986617

ROC-AUC of class obscene- epoch: 1 - score: 0.990645

ROC-AUC of class threat- epoch: 1 - score: 0.992318

ROC-AUC of class insult- epoch: 1 - score: 0.987175

ROC-AUC of class identity_hate- epoch: 1 - score: 0.985910

Epoch 00001: val_loss improved from inf to 0.04189, saving model to best_model.hdf5
143613/143613 [==============================] - 6860s 48ms/step - loss: 0.0520 - acc: 0.9810 - val_loss: 0.0419 - val_acc: 0.9836
Epoch 2/4
143584/143613 [============================>.] - ETA: 1s - loss: 0.0438 - acc: 0.9833
ROC-AUC - epoch: 2 - score: 0.989061

ROC-AUC of class toxic- epoch: 2 - score: 0.984466

ROC-AUC of class severe_toxic- epoch: 2 - score: 0.987612

ROC-AUC of class obscene- epoch: 2 - score: 0.991312

ROC-AUC of class threat- epoch: 2 - score: 0.995831

ROC-AUC of class insult- epoch: 2 - score: 0.988579

ROC-AUC of class identity_hate- epoch: 2 - score: 0.986566

Epoch 00002: val_loss did not improve
143613/143613 [==============================] - 6816s 47ms/step - loss: 0.0438 - acc: 0.9833 - val_loss: 0.0460 - val_acc: 0.9825
Epoch 3/4
 35104/143613 [======>.......................] - ETA: 1:19:43 - loss: 0.0407 - acc: 0.9846

In [None]:
Last with num_vars

Epoch 1/10
12128/159571 [=>............................] - ETA: 1:47:40 - loss: 0.0738 - acc: 0.9759

Epoch 1/10
21760/159571 [===>..........................] - ETA: 1:40:56 - loss: 0.0657 - acc: 0.9776

Epoch 1/10
37088/159571 [=====>........................] - ETA: 1:29:43 - loss: 0.0601 - acc: 0.9790

Epoch 1/10
64896/159571  [===========>..................] - ETA: 1:09:30 - loss: 0.0555 - acc: 0.9801

Epoch 1/10
103264/159571 [==================>...........] - ETA: 41:44 - loss: 0.0522 - acc: 0.9809

Epoch 1/10
138144/159571 [========================>.....] - ETA: 16:41 - loss: 0.0507 - acc: 0.9813

Epoch 1/10
159571/159571 [==============================] - 7443s 47ms/step - loss: 0.0500 - acc: 0.9815
        
Epoch 2/10
 22048/159571 [===>..........................] - ETA: 1:45:59 - loss: 0.0416 - acc: 0.9836

                        

In [None]:
With caps, multiple dense:
    
Epoch 1/10
 18944/159571 [==>...........................] - ETA: 3:03:43 - loss: 0.0766 - acc: 0.9730
                        
Epoch 1/10
 29184/159571 [====>.........................] - ETA: 4:14:37 - loss: 0.0697 - acc: 0.9757
                        
Epoch 1/10
 40192/159571 [======>.......................] - ETA: 3:58:25 - loss: 0.0660 - acc: 0.9767

In [None]:
Without caps

Epoch 1/10
  8576/159571 [>.............................] - ETA: 2:00:27 - loss: 0.1634 - acc: 0.9488
                        
Epoch 1/10
 49280/159571 [========>.....................] - ETA: 2:03:26 - loss: 0.0833 - acc: 0.9726
                        


In [None]:
from keras.utils.vis_utils import plot_model
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
submission[list_classes] = (pred)
submission.to_csv("submission.csv", index = False)

In [None]:
submission.head()