https://www.kaggle.com/yekenot/pooled-gru-fasttext/code

In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Conv1D
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

Using TensorFlow backend.


In [2]:
# Load the competition CSVs: the training comments with their label columns,
# the test comments, and the sample submission used as the output template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv',
                     '../input/test.csv',
                     '../input/sample_submission.csv')
)

In [3]:
def normalize(s):
    """Lower-case and clean one raw comment string before tokenisation.

    Steps, in the order they must run:
      1. remove URLs (``http://``, ``https://`` and ``www.`` forms);
      2. replace dotted-quad IP addresses with the token ``_ip_``;
      3. expand common English contractions;
      4. spell out ``&``, ``@`` and the digits 0-9 as words;
      5. replace every remaining non-word character with a space and
         collapse runs of whitespace.

    URL, IP and contraction handling all rely on punctuation (``:``, ``/``,
    ``.``, ``'``) still being present, so they run before any punctuation is
    stripped.

    :param s: raw comment text (``str``).
    :return: cleaned text containing only word characters separated by
        single spaces, with no leading or trailing space.
    """
    s = s.lower()

    # Remove urls first; once ':' '/' '.' are gone no URL pattern can match.
    s = re.sub(r'https?://\S+', ' ', s)
    s = re.sub(r'www\.\S+', ' ', s)

    # Replace ips (must precede the digit -> word replacement below)
    s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)

    # Expand contractions while the apostrophes are still attached to words.
    # "'scuse" is handled before the generic "'s" so it is not swallowed by it.
    s = re.sub(r"what's", "what is ", s)
    s = re.sub(r"\'scuse", " excuse ", s)
    s = re.sub(r"can't", "cannot ", s)
    s = re.sub(r"n't", " not ", s)
    s = re.sub(r"\bi'm", "i am ", s)
    s = re.sub(r"\'ve", " have ", s)
    s = re.sub(r"\'re", " are ", s)
    s = re.sub(r"\'d", " would ", s)
    s = re.sub(r"\'ll", " will ", s)
    s = re.sub(r"\'s", " ", s)

    # Replace numbers and symbols with language
    replacements = (
        ('&', ' and '), ('@', ' at '),
        ('0', ' zero '), ('1', ' one '), ('2', ' two '), ('3', ' three '),
        ('4', ' four '), ('5', ' five '), ('6', ' six '), ('7', ' seven '),
        ('8', ' eight '), ('9', ' nine '),
    )
    for symbol, spoken in replacements:
        s = s.replace(symbol, spoken)

    # Everything that is not a word character (all punctuation, newlines and
    # special symbols) becomes a space, then whitespace runs are collapsed.
    s = re.sub(r'\W', ' ', s)
    s = re.sub(r'\s+', ' ', s)
    return s.strip(' ')

In [4]:
# Raw comment strings for both splits; missing comments are replaced by the
# placeholder text "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values
    for frame in (train, test)
)
# One target column per label, as a 2-D array aligned with X_train.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [5]:
def normalize_array(a):
    """Run `normalize` over every element of array `a`, overwriting it in place.

    :param a: numpy array whose elements are strings accepted by `normalize`.
    :return: the same array object `a`, now holding the normalised strings.
    """
    for position in np.ndindex(a.shape):
        a[position] = normalize(a[position])
    return a

In [6]:
# Clean every comment in both splits; normalize_array rewrites each array in
# place and hands the same array back.
X_train, X_test = (normalize_array(comments) for comments in (X_train, X_test))

In [7]:
# Size constants shared by the tokenizer, the embedding matrix and the model.
# Vocabulary cap: given to the Tokenizer as num_words and to the Embedding
# layer as its input dimension.
max_features = 100000
# Length every comment is padded/truncated to; also the model's input length.
maxlen = 150
# Width of each embedding-matrix row / Embedding layer output vector.
embed_size = 300

In [8]:
# Build the vocabulary from the cleaned comments of both splits, keeping at
# most `max_features` words when converting text to integer ids.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Keep the cleaned text in X_train / X_test and store the integer sequences
# under separate names. Overwriting the text arrays made this cell
# non-idempotent: a second run would fit the tokenizer on integer lists.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Fixed-length (maxlen) integer matrices that are fed to the network.
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)

In [9]:
# Path of the pre-trained word-vector text file that is parsed into
# `embeddings_index` (presumably the fastText 300-d Common Crawl vectors,
# judging by the file name — confirm against the dataset).
EMBEDDING_FILE = '../input/crawl-300d-2M.vec'

In [10]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    :param word: the token (first field of an embedding-file line).
    :param arr: the remaining fields, each convertible to a float.
    :return: ``(word, vector)`` where ``vector`` is a float32 numpy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

In [11]:
# Parse the embedding file into {word: float32 vector}.
# - `with` closes the file handle (the original one-liner leaked it).
# - The encoding is given explicitly so parsing does not depend on the
#   platform default.
# - Splitting from the right at most `embed_size` times keeps the token intact
#   and lets lines without exactly `embed_size` coefficients (such as a
#   "<count> <dim>" header line) be skipped rather than stored as a
#   wrong-length vector.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.rstrip().rsplit(' ', embed_size)
        if len(fields) != embed_size + 1:
            continue
        token, coefs = get_coefs(*fields)
        embeddings_index[token] = coefs

In [12]:
# Build the initial weight matrix for the Embedding layer: row i holds the
# pre-trained vector of the word whose tokenizer index is i; words without a
# pre-trained vector keep an all-zero row.
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is reserved for padding), so a vocabulary of
# N words occupies rows 0..N — hence the "+ 1". The original omitted it, which
# raises IndexError whenever the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Always allocate max_features rows so the shape matches
# Embedding(max_features, embed_size) regardless of vocabulary size; rows at
# or beyond nb_words are never looked up and simply stay zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [13]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    :param validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
        ``model.predict`` and ``y_val`` to ``roc_auc_score``.
    :param interval: the score is computed on epochs whose zero-based index
        is a multiple of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # super() must name this class: super(Callback, self) would start the
        # lookup *after* Callback and so skip Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        :param epoch: zero-based epoch index supplied by Keras.
        :param logs: metrics dict supplied by Keras; not used here. Defaults
            to ``None`` instead of a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [14]:
from keras import backend as K
from keras.engine.topology import Layer
#from keras import initializations
from keras import initializers, regularizers, constraints


class Attention(Layer):
    """Attention-weighted sum over the time axis of a 3D tensor.

    Keras Layer that implements an Attention mechanism for temporal data.
    Supports Masking.
    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with
    return_sequences=True. The feature dimension is read from the input shape
    in `build`; the number of steps has to be passed explicitly as `step_dim`
    and must equal the input's step count, because `call` reshapes the scores
    to `(-1, step_dim)` before adding a bias of length `input_shape[1]`.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        :param step_dim: number of time steps of the input sequences.
        :param W_regularizer: regularizer for the scoring vector `W`.
        :param b_regularizer: regularizer for the per-step bias `b`.
        :param W_constraint: constraint for `W`.
        :param b_constraint: constraint for `b`.
        :param bias: whether to add a per-step bias to the scores.
        :param kwargs: forwarded to `Layer.__init__`.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring vector `W` and, if enabled, the bias `b`."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword: in the Keras 2 signature the first
        # positional argument of add_weight is `name`, not the shape.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per time step.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over its step axis."""
        # eij = K.dot(x, self.W) TF backend doesn't support it, so the input is
        # flattened to 2D, multiplied by W as a column vector, and the scores
        # are reshaped back to one row of `step_dim` scores per sample.
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Broadcast the per-step weights over the feature axis.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output is `(samples, features)`: the step axis is summed away."""
        return input_shape[0],  self.features_dim

In [24]:
def get_model(gru_units=128, conv_filters=64, kernel_size=2, dropout_rate=0.2):
    """Build and compile the embedding -> BiGRU -> Conv1D -> pooled classifier.

    The layer sizes are keyword parameters so that architecture variants can
    be tried without copying the function; the defaults reproduce the
    original hard-coded network, so ``get_model()`` is unchanged.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``.

    :param gru_units: units per direction of the bidirectional GRU.
    :param conv_filters: number of Conv1D filters.
    :param kernel_size: Conv1D kernel size.
    :param dropout_rate: SpatialDropout1D rate applied to the embeddings.
    :return: a compiled Keras ``Model`` with 6 sigmoid outputs, trained with
        binary cross-entropy and the Adam optimizer.
    """
    inp = Input(shape=(maxlen, ))
    # Embedding weights are initialised from the pre-trained matrix.
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(dropout_rate)(x)
    x = Bidirectional(GRU(gru_units, return_sequences=True))(x)
    x = Conv1D(conv_filters, kernel_size=kernel_size, padding="valid",
               kernel_initializer="he_uniform")(x)
    # Summarise the sequence with both its mean and its max over time.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
# model = get_model()

In [25]:
# Build a freshly compiled network and print its layer-by-layer summary.
model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 150, 300)     30000000    input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_4 (SpatialDro (None, 150, 300)     0           embedding_4[0][0]                
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 150, 256)     329472      spatial_dropout1d_4[0][0]        
__________________________________________________________________________________________________
conv1d_4 (

In [17]:
# Training settings: batch_size is passed to both model.fit and
# model.predict, epochs to model.fit.
batch_size = 64
epochs = 2

In [18]:
# Keep 95% of the padded training data for fitting and hold the rest out for
# validation; the fixed random_state makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233
)
# Callback that reports ROC-AUC on the held-out part after every epoch.
RocAuc = RocAucEvaluation(interval=1, validation_data=(X_val, y_val))

In [26]:
# Train on the fitting split; Keras reports loss/accuracy on the held-out
# split and the RocAuc callback additionally prints ROC-AUC per epoch.
hist = model.fit(
    X_tra,
    y_tra,
    validation_data=(X_val, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
    callbacks=[RocAuc]
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.988667 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.989051 



In [27]:
# Predict the six sigmoid outputs for every padded test comment.
y_pred = model.predict(x_test, batch_size=batch_size)
# Write the predictions into the label columns of the sample submission
# (column order matches the one used to build y_train).
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# Save without the DataFrame index column.
submission.to_csv('submission_gru_cnn.csv', index=False)

In [23]:
# `sklearn.cross_validation` has been removed from scikit-learn; KFold now
# lives in `sklearn.model_selection` (already used above for train_test_split)
# and yields index pairs through `.split(...)`.
from sklearn.model_selection import KFold

# 5 consecutive, unshuffled folds — the same partition the old
# KFold(len(x_train), n_folds=5) produced.
kf = KFold(n_splits=5)
cvscores = []

# The fold index arrays get their own names: the original loop variables
# `train` / `test` overwrote the DataFrames loaded at the top of the notebook.
for fold_train_idx, fold_val_idx in kf.split(x_train):
    model = get_model()
    hist = model.fit(x_train[fold_train_idx], y_train[fold_train_idx],
                     batch_size=512, epochs=2, verbose=1)

    # evaluate the model on the held-out fold; scores[1] is the compiled
    # 'accuracy' metric (scores[0] is the loss)
    scores = model.evaluate(x_train[fold_val_idx], y_train[fold_val_idx], verbose=0)
    print("%s: %.10f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

# Mean and standard deviation of the per-fold accuracy.
print("%.10f%% (+/- %.10f%%)" % (np.mean(cvscores), np.std(cvscores)))

Epoch 1/2
 16896/127656 [==>...........................] - ETA: 30:43 - loss: 0.2240 - acc: 0.9380

KeyboardInterrupt: 

### log results

### submission_grufasttext.csv

inp = Input(shape=(maxlen, ))

x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

x = SpatialDropout1D(0.2)(x)

x = Bidirectional(GRU(80, return_sequences=True))(x)

avg_pool = GlobalAveragePooling1D()(x)

max_pool = GlobalMaxPooling1D()(x)

conc = concatenate([avg_pool, max_pool])

outp = Dense(6, activation="sigmoid")(conc)

- result:

Epoch 1/2
151592/151592 [==============================] - 2463s 16ms/step - loss: 0.0457 - acc: 0.9828 - val_loss: 0.0454 - val_acc: 0.9829

 ROC-AUC - epoch: 1 - score: 0.987610 

Epoch 2/2
151592/151592 [==============================] - 2059s 14ms/step - loss: 0.0374 - acc: 0.9854 - val_loss: 0.0456 - val_acc: 0.9826

 ROC-AUC - epoch: 2 - score: 0.987506

#### LB: 0.9812

x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

x = SpatialDropout1D(0.2)(x)

x = Bidirectional(GRU(80, return_sequences=True))(x)

x = SpatialDropout1D(0.5)(x)

avg_pool = GlobalAveragePooling1D()(x)

max_pool = GlobalMaxPooling1D()(x)

conc = concatenate([avg_pool, max_pool])

- result

Epoch 1/2
151592/151592 [==============================] - 2598s 17ms/step - loss: 0.0544 - acc: 0.9806 - val_loss: 0.0454 - val_acc: 0.9827

 ROC-AUC - epoch: 1 - score: 0.986717 

Epoch 2/2
151592/151592 [==============================] - 2195s 14ms/step - loss: 0.0412 - acc: 0.9842 - val_loss: 0.0460 - val_acc: 0.9824

 ROC-AUC - epoch: 2 - score: 0.986632 


### submission_grufasttext_attention.csv

inp = Input(shape=(maxlen, ))

x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

x = SpatialDropout1D(0.2)(x)

x = Bidirectional(GRU(80, return_sequences=True))(x)

x = Attention(maxlen)(x)

outp = Dense(6, activation="sigmoid")(x)

- result

Epoch 1/2
151592/151592 [==============================] - 2307s 15ms/step - loss: 0.0511 - acc: 0.9818 - val_loss: 0.0470 - val_acc: 0.9821

 ROC-AUC - epoch: 1 - score: 0.984153 

Epoch 2/2
151592/151592 [==============================] - 2148s 14ms/step - loss: 0.0388 - acc: 0.9850 - val_loss: 0.0455 - val_acc: 0.9829

 ROC-AUC - epoch: 2 - score: 0.986293 

### submission_grufasttext_attention_2.csv

inp = Input(shape=(maxlen, ))

x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

x = SpatialDropout1D(0.2)(x)

x = Bidirectional(GRU(80, return_sequences=True))(x)

x = SpatialDropout1D(0.5)(x)

x = Attention(maxlen)(x)

outp = Dense(6, activation="sigmoid")(x)

- result

Epoch 1/2
151592/151592 [==============================] - 2337s 15ms/step - loss: 0.0550 - acc: 0.9808 - val_loss: 0.0480 - val_acc: 0.9823

 ROC-AUC - epoch: 1 - score: 0.980985 

Epoch 2/2
151592/151592 [==============================] - 2280s 15ms/step - loss: 0.0419 - acc: 0.9841 - val_loss: 0.0461 - val_acc: 0.9827

 ROC-AUC - epoch: 2 - score: 0.984969 


### submission_2grufasttext_attention.csv

inp = Input(shape=(maxlen, ))

x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)

x = SpatialDropout1D(0.2)(x)

x = Bidirectional(GRU(80, return_sequences=True))(x)

x = SpatialDropout1D(0.5)(x)

x = Bidirectional(GRU(80, return_sequences=True))(x)

x = Attention(maxlen)(x)

outp = Dense(6, activation="sigmoid")(x)

- result

Train on 151592 samples, validate on 7979 samples
Epoch 1/2
151592/151592 [==============================] - 3107s 20ms/step - loss: 0.0522 - acc: 0.9813 - val_loss: 0.0474 - val_acc: 0.9826

 ROC-AUC - epoch: 1 - score: 0.984434 

Epoch 2/2
151592/151592 [==============================] - 3054s 20ms/step - loss: 0.0406 - acc: 0.9842 - val_loss: 0.0464 - val_acc: 0.9827

 ROC-AUC - epoch: 2 - score: 0.986317 
 
 #### LB: 0.9813

### submission_2grufasttext_attention_10fold.csv