https://www.kaggle.com/yekenot/pooled-gru

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
from nltk.corpus import stopwords

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import re
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K

Using TensorFlow backend.


In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from keras/numpy — confirm that is intended.
warnings.filterwarnings('ignore')

In [4]:
import os
# Set OMP_NUM_THREADS to '4' in this process's environment.
# NOTE(review): presumably meant to cap OpenMP worker threads; numpy and keras
# were already imported in earlier cells — confirm the setting still takes effect.
os.environ['OMP_NUM_THREADS'] = '4'

In [5]:
# Path to the pre-trained word-vector text file (the file name suggests 100-dimensional GloVe vectors).
EMBEDDING_FILE = '../input/glove.6B.100d.txt'

In [6]:
# Load the labelled training comments, the unlabelled test comments and the
# submission template that is filled in with predictions at the end.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
submission = pd.read_csv('../input/sample_submission.csv')

In [7]:
# PREPROCESSING PART
# Whole-token replacement table: a lower-cased, whitespace-split token that is
# exactly equal to a key is swapped for the mapped text.
#
# Every key is listed exactly once. The previous literal repeated many keys
# (the emoticons and "m"); Python keeps only the last value of a repeated key,
# so the values below are the ones that were actually in effect.
repl = {
    # --- emoticons and interjections ---
    # NOTE(review): "&lt;3" and ":&gt;" look like HTML-escaped "<3" and ":>" —
    # confirm the raw comments really contain the escaped spelling.
    "&lt;3": " heart ",
    ":d": " smile ",
    ":dd": " smile ",
    ":p": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    # --- regex-style spellings ---
    # NOTE(review): the cleaning loops look tokens up by exact equality, so
    # these patterns only match a token that is literally e.g. "\br\b". They are
    # kept so the mapping is unchanged; the plain spellings below do the work.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # --- abbreviations and contractions ---
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll": "i will",
    "its": "it is",
    "it's": "it is",
    "'s": " is",
    "that's": "that is",
    "weren't": "were not",
}

In [8]:
# List of every replacement key, in the dictionary's insertion order.
keys = list(repl)

In [9]:
# Accumulators for the cleaned comments (filled by the cleaning loops below).
new_train_data = []
new_test_data = []
# Raw comment texts as plain Python lists, in DataFrame row order.
ltr = train["comment_text"].tolist()
lte = test["comment_text"].tolist()

In [10]:
# English stop words from NLTK, held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

In [11]:
# Clean the training comments token by token:
#   1. lower-case each whitespace-separated token,
#   2. drop URL-like tokens (starting with "http" or "www"),
#   3. drop stop words,
#   4. swap tokens that have an entry in `repl`.
# The list is rebuilt from scratch so that re-running this cell cannot append a
# second copy of every comment (which would break the column assignment later).
# NOTE(review): stop words are removed *before* the `repl` lookup, so any key of
# `repl` that is also a stop word is never replaced — confirm this is intended.
new_train_data = []
for comment in ltr:
    kept_tokens = []
    for token in str(comment).split():
        token = token.lower()
        if token.startswith(('http', 'www')):
            continue
        if token in stop_words:
            continue
        # Dictionary lookup replaces the former linear scan over the `keys` list.
        kept_tokens.append(repl.get(token, token))
    # Every token is followed by one space (including the last), as before.
    new_train_data.append("".join(word + " " for word in kept_tokens))

In [12]:
# Clean the test comments with the same token-level steps as the training set:
#   1. lower-case each whitespace-separated token,
#   2. drop URL-like tokens (starting with "http" or "www"),
#   3. drop stop words,
#   4. swap tokens that have an entry in `repl`.
# The list is rebuilt from scratch so that re-running this cell cannot append a
# second copy of every comment (which would break the column assignment later).
# NOTE(review): stop words are removed *before* the `repl` lookup, so any key of
# `repl` that is also a stop word is never replaced — confirm this is intended.
new_test_data = []
for comment in lte:
    kept_tokens = []
    for token in str(comment).split():
        token = token.lower()
        if token.startswith(('http', 'www')):
            continue
        if token in stop_words:
            continue
        # Dictionary lookup replaces the former linear scan over the `keys` list.
        kept_tokens.append(repl.get(token, token))
    # Every token is followed by one space (including the last), as before.
    new_test_data.append("".join(word + " " for word in kept_tokens))

In [13]:
# Store the token-cleaned comments as a new column on each frame.
train["new_comment_text"] = new_train_data
test["new_comment_text"] = new_test_data
print("crap removed")

crap removed


In [14]:
# Pull the cleaned comments back out as plain lists for the character-level pass.
trate = train["new_comment_text"].tolist()
tete = test["new_comment_text"].tolist()

In [15]:
# Remove every run of characters other than ASCII letters, spaces, '?' and '!'.
# Train and test now go through exactly the same normalisation; previously only
# the training texts were passed through str() and lower-cased.
non_letter_run = re.compile('[^a-zA-Z ?!]+')
trate = [non_letter_run.sub('', str(comment).lower()) for comment in trate]
tete = [non_letter_run.sub('', str(comment).lower()) for comment in tete]

In [16]:
# NOTE: this overwrites the raw `comment_text` column with the cleaned text, so
# the original comments are no longer available in `train`/`test` afterwards.
train["comment_text"] = trate
test["comment_text"] = tete
print('only alphabets')

only alphabets


In [17]:
# Raw model inputs/targets as numpy arrays.
# The cleaned column was built from Python strings, so the fillna is presumably
# only a guard — TODO confirm no NaN can reach this point.
X_train = train["comment_text"].fillna("fillna").values
# One binary target column per toxicity label, in submission order.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

In [18]:
# Vocabulary cap passed to the tokenizer and used to size the embedding layer.
max_features = 20000
# Every comment is padded/truncated to this many tokens.
maxlen = 100
# Width of each word vector; presumably must match the vectors in EMBEDDING_FILE — verify.
embed_size = 100

In [19]:
# Fit one vocabulary on the train and test texts together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
# NOTE: X_train / X_test are rebound here from arrays of text to lists of
# integer sequences; the padded model inputs are the lower-case x_train / x_test.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [20]:
def get_coefs(word, *arr):
    """Pair a word with its embedding vector.

    :param word: the token (first field of an embedding-file line).
    :param arr: the remaining fields, convertible to floats.
    :return: ``(word, vector)`` where ``vector`` is a float32 numpy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [21]:
# Parse the embedding text file into {word: float32 vector}.
# The file is opened in a `with` block so the handle is closed deterministically,
# and with an explicit encoding so parsing does not depend on the platform default.
# Blank lines are skipped because get_coefs needs at least the word field.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split())
        for line in embedding_file
        if line.strip()
    )

In [22]:
# Stack all pre-trained vectors into one 2D array to measure their overall
# mean and standard deviation. np.stack needs a real sequence: a dict_values
# view is rejected by newer NumPy releases, hence the explicit list().
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [23]:
word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is the padding value), so a vocabulary
# of N words needs N + 1 rows. The "+ 1" only matters when the vocabulary is
# smaller than max_features; otherwise the cap decides, exactly as before.
nb_words = min(max_features, len(word_index) + 1)
# Rows without a pre-trained vector keep this random initialisation, drawn with
# the same mean/std as the pre-trained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

In [24]:
# Copy the pre-trained vector of every in-vocabulary word into its row.
for word, i in word_index.items():
    # Bound the index by the matrix that is actually written to, so a matrix
    # with fewer than max_features rows can never be indexed out of range.
    if i >= embedding_matrix.shape[0]:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from the embedding file keep their random initial row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [25]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC after selected epochs.

    :param validation_data: ``(X_val, y_val)`` pair used for scoring. It is
        unpacked into two values, so anything that is not a 2-item sequence
        (including the empty default) raises ``ValueError``.
    :param interval: score on every epoch whose zero-based index is a
        multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; the previous
        # super(Callback, self) call skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out data and print the ROC-AUC score.

        :param epoch: zero-based epoch index supplied by Keras.
        :param logs: metrics dict supplied by Keras; not used here.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [26]:
from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints


class Attention(Layer):
    """Attention-weighted sum over the step axis of a 3D tensor.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756].
    Put it on top of a layer that returns the full sequence
    (GRU/LSTM/SimpleRNN with ``return_sequences=True``).

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    # Example
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))  # step_dim = number of time steps
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        :param step_dim: number of time steps of the input; `call` reshapes
            the attention scores to `(-1, step_dim)`.
        :param W_regularizer: regularizer for the scoring weights (or None).
        :param b_regularizer: regularizer for the per-step bias (or None).
        :param W_constraint: constraint for the scoring weights (or None).
        :param b_constraint: constraint for the per-step bias (or None).
        :param bias: whether to add a learned per-step bias to the scores.
        :param kwargs: forwarded to the base `Layer` initializer.
        """
        # Initialise the base layer first so that it cannot overwrite the
        # attributes assigned below (in particular `supports_masking`).
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create one scoring weight per feature and, optionally, one bias per step."""
        assert len(input_shape) == 3

        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over the step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score every step: flatten to 2D, multiply by W as a column vector,
        # then fold back to (samples, steps). A direct K.dot(x, self.W) is
        # avoided because, per the original author's note, the TF backend
        # doesn't support it.
        flat_x = K.reshape(x, (-1, features_dim))
        w_column = K.reshape(self.W, (features_dim, 1))
        eij = K.reshape(K.dot(flat_x, w_column), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis and sum over steps.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and re-created."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [27]:
def get_model():
    """Build and compile the bidirectional-GRU + attention classifier.

    Architecture: pre-trained embedding -> spatial dropout -> bidirectional
    GRU (full sequence) -> spatial dropout -> Attention -> 6 sigmoid outputs.
    Reads the module-level `maxlen`, `embed_size` and `embedding_matrix`.

    :return: a compiled Keras `Model` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    inp = Input(shape=(maxlen, ))
    # Size the layer from the weight matrix itself so the declared vocabulary
    # size can never disagree with the weights being loaded.
    x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    x = SpatialDropout1D(0.5)(x)
    x = Attention(maxlen)(x)
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [32]:
# Build a freshly compiled model and fix the training hyper-parameters.
# NOTE(review): this cell's execution count (32) is higher than those of the
# cells that follow it, i.e. it was re-run out of order — verify the notebook
# with Restart Kernel -> Run All.
model = get_model()
batch_size = 32
epochs = 3

In [29]:
# Hold out a quarter of the padded training data for validation (fixed seed),
# and score that hold-out set with ROC-AUC after every epoch.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.75, random_state=233
)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

In [30]:
def exp_decay(init, fin, steps):
    """Return the per-step rate ``(init / fin) ** (1 / (steps - 1)) - 1``.

    :param init: starting value (e.g. initial learning rate).
    :param fin: final value; must be non-zero.
    :param steps: total number of steps; must not be 1.
    :return: the rate as a float.
    :raises ZeroDivisionError: if ``fin == 0`` or ``steps == 1``.
    """
    # 1.0 keeps the exponent a true division even under integer-division semantics.
    return (init/fin)**(1.0/(steps-1)) - 1
# Total number of optimizer updates: full batches per epoch times epochs.
steps = int(len(X_tra)/batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0005
lr_decay = exp_decay(lr_init, lr_fin, steps)
# Overwrite the compiled optimizer's learning rate and decay in place.
# NOTE(review): exp_decay yields an exponential-schedule rate — confirm it
# matches the way this optimizer applies its `decay` value.
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

In [None]:
# Train on the 75% split; Keras reports loss/accuracy on the hold-out split and
# the RocAuc callback additionally prints its ROC-AUC after each epoch.
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=1)

Train on 119678 samples, validate on 39893 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.977318 

Epoch 2/3

##### record results:
-  gru_preprocessing_moredropouts.csv
        x = SpatialDropout1D(0.2)(x)

        x = Bidirectional(GRU(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)

        x = SpatialDropout1D(0.5)(x)

        avg_pool = GlobalAveragePooling1D()(x)

        max_pool = GlobalMaxPooling1D()(x)

        conc = concatenate([avg_pool, max_pool])

        outp = Dense(6, activation="sigmoid")(conc)
    
        Epoch 1/2
        119678/119678 [==============================] - 1120s 9ms/step - loss: 0.0580 - acc: 0.9795 - val_loss: 0.0484 - val_acc: 0.9826

         ROC-AUC - epoch: 1 - score: 0.982405 

        Epoch 2/2
        119678/119678 [==============================] - 1053s 9ms/step - loss: 0.0436 - acc: 0.9835 - val_loss: 0.0443 - val_acc: 0.9833

        ROC-AUC - epoch: 2 - score: 0.984421 




-  gru_preprocessing_attention.csv 
        x = SpatialDropout1D(0.2)(x)

        x = Bidirectional(GRU(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)

        x = SpatialDropout1D(0.5)(x)

        x = Attention(maxlen)(x)

        outp = Dense(6, activation="sigmoid")(x)

        Epoch 1/2
        119678/119678 [==============================] - 1002s 8ms/step - loss: 0.0394 - acc: 0.9847 - val_loss: 0.0456 - val_acc: 0.9825

        ROC-AUC - epoch: 1 - score: 0.984845 

        Epoch 2/2
        119678/119678 [==============================] - 1075s 9ms/step - loss: 0.0362 - acc: 0.9859 - val_loss: 0.0460 - val_acc: 0.9835

        ROC-AUC - epoch: 2 - score: 0.984626 

- gru_preprocessing_pool_attention.csv
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(GRU(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = SpatialDropout1D(0.5)(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = concatenate([avg_pool, max_pool])
        conc = Attention(maxlen)(conc)
        outp = Dense(6, activation="sigmoid")(conc)
        
        Epoch 1/2
        119678/119678 [==============================] - 1062s 9ms/step - loss: 0.0336 - acc: 0.9868 - val_loss: 0.0488 - val_acc: 0.9830

         ROC-AUC - epoch: 1 - score: 0.983471 

        Epoch 2/2
        119678/119678 [==============================] - 940s 8ms/step - loss: 0.0310 - acc: 0.9877 - val_loss: 0.0503 - val_acc: 0.9824

         ROC-AUC - epoch: 2 - score: 0.982497 

- gru_preprocessing_pool_attention_concat.csv
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(GRU(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = SpatialDropout1D(0.5)(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        atten = Attention(maxlen)(x)
        conc = concatenate([avg_pool, max_pool, atten])
        outp = Dense(6, activation="sigmoid")(conc)
        Epoch 1/2
        119678/119678 [==============================] - 1105s 9ms/step - loss: 0.0286 - acc: 0.9887 - val_loss: 0.0521 - val_acc: 0.9823

         ROC-AUC - epoch: 1 - score: 0.982510 

        Epoch 2/2
        119678/119678 [==============================] - 1119s 9ms/step - loss: 0.0263 - acc: 0.9897 - val_loss: 0.0561 - val_acc: 0.9820

         ROC-AUC - epoch: 2 - score: 0.980674 
         
- 'gru_preprocessing_pool_attention_pool.csv'
        x = SpatialDropout1D(0.2)(x)
        x = Bidirectional(GRU(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = SpatialDropout1D(0.5)(x)
        x = Attention(maxlen)(x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = concatenate([avg_pool, max_pool])
        outp = Dense(6, activation="sigmoid")(conc)
        
        Train on 119678 samples, validate on 39893 samples
        Epoch 1/2
        119678/119678 [==============================] - 975s 8ms/step - loss: 0.0244 - acc: 0.9904 - val_loss: 0.0613 - val_acc: 0.9816

         ROC-AUC - epoch: 1 - score: 0.979430 

        Epoch 2/2
        119678/119678 [==============================] - 931s 8ms/step - loss: 0.0226 - acc: 0.9911 - val_loss: 0.0631 - val_acc: 0.9814

         ROC-AUC - epoch: 2 - score: 0.978290
       
- gru_preprocessing_attention_moredropouts.csv
        x = SpatialDropout1D(0.5)(x)
        x = Bidirectional(GRU(80, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = SpatialDropout1D(0.5)(x)
        x = Attention(maxlen)(x)
        outp = Dense(6, activation="sigmoid")(x)
        
        Train on 119678 samples, validate on 39893 samples
        Epoch 1/2
        119678/119678 [==============================] - 1016s 8ms/step - loss: 0.0211 - acc: 0.9920 - val_loss: 0.0648 - val_acc: 0.9811

         ROC-AUC - epoch: 1 - score: 0.977665 

        Epoch 2/2
        119678/119678 [==============================] - 1035s 9ms/step - loss: 0.0195 - acc: 0.9925 - val_loss: 0.0683 - val_acc: 0.9810

         ROC-AUC - epoch: 2 - score: 0.978057 

In [None]:
# Predict the six label probabilities for the test set and write them into the
# submission template, one column per label.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('gru_preprocessing_attention_stopwords.csv', index=False)

## analyze false positive & false negative

In [None]:
def false_prediction(pred, real, df):
    """Print every sample whose predicted label row differs from the true one.

    For each mismatching row the comment text, the label names, the predicted
    row and the real row are printed. Nothing is returned.

    :param pred: 2D array of predicted labels, one row per sample.
    :param real: 2D array of true labels, with the same number of rows.
    :param df: DataFrame with a ``comment_text`` column, indexed positionally
        in the same row order as ``pred`` / ``real``.
    :return: None. If the row counts differ, "Error in shape" is printed and
        the function returns early.
    """
    # The previously allocated (and never used) result DataFrame was removed.
    if pred.shape[0] != real.shape[0]:
        print("Error in shape")
        return
    for row_idx in range(pred.shape[0]):
        pred_row = pred[row_idx, :]
        real_row = real[row_idx, :]
        if np.array_equal(pred_row, real_row):
            continue
        print("comment: {}".format(df.iloc[row_idx]['comment_text']))
        print('t: ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]')
        print("p: {}".format(pred_row))
        print("r: {}\n".format(real_row))

In [None]:
# Let pandas show up to 1000 characters per cell so long comments are not truncated.
pd.options.display.max_colwidth = 1000

In [None]:
# Predicted label probabilities for the full padded training set (including the rows held out for validation).
y_train_pred = model.predict(x_train)

In [None]:
# Turn the probabilities into hard 0/1 labels by rounding to the nearest integer.
y_train_pred_round = np.round(y_train_pred).astype(int)

In [None]:
# Print every training comment whose rounded prediction differs from its labels.
# NOTE: `train["comment_text"]` holds the cleaned text at this point, not the raw comment.
false_prediction(y_train_pred_round, y_train, train)

In [None]:
# Display the rounded predictions (the full array is the cell output).
y_train_pred_round

In [None]:
# Display the raw predicted probabilities (the full array is the cell output).
y_train_pred