In [12]:
# OMP_NUM_THREADS has to be in the environment *before* numpy / scikit-learn /
# TensorFlow are imported: their native OpenMP/BLAS runtimes read it when they
# are loaded, so assigning it after the imports has no effect.
import os
os.environ['OMP_NUM_THREADS'] = '4'

import numpy as np
np.random.seed(42)  # seed NumPy's global RNG immediately after importing it
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from tensorflow.keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import Callback

import warnings
# NOTE(review): this hides every warning raised from here on, including
# deprecation notices; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')




In [14]:
# Location of the pre-trained fastText word vectors (downloaded beforehand).
EMBEDDING_FILE = os.path.join('wordvector','crawl-300d-2M.vec')

# Read the three competition CSVs (train, test, sample submission) as UTF-8.
train, test, submission = (
    pd.read_csv(os.path.join('jigsaw-toxic-comment-classification-challenge', csv_name), encoding='utf8')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Preprocessing / model hyper-parameters.
max_features = 30000  # vocabulary cap handed to the tokenizer
maxlen = 100          # every comment is padded/truncated to this many tokens
embed_size = 300      # number of columns in the embedding matrix

# Raw comment strings; missing comments become the literal placeholder "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values for frame in (train, test)
)

# One binary target column per toxicity label.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

# create encoded sequence and word embedding index

In [15]:
# Build the vocabulary. With num_words set, the tokenizer only emits indices
# below max_features (the most frequent words); rarer words are dropped.
tokenizer = text.Tokenizer(num_words=max_features)
# NOTE(review): the vocabulary is fitted on train AND test comments together.
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Encode each comment as a list of vocabulary indices. The raw text in
# X_train / X_test is deliberately NOT overwritten, so this cell stays
# idempotent: re-running it fits the tokenizer on text again rather than on
# already-encoded integer lists.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Left-pad with zeros (and truncate) to exactly maxlen tokens per comment;
# pad_sequences pads at the front by default.
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)

In [4]:
#create embedding_index from fast text file
def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(word, vector)`` pair.

    Args:
        word: the token (first field of the line).
        *arr: the remaining fields, i.e. the vector components (strings or numbers).

    Returns:
        Tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
def _load_embeddings_index(path):
    """Read a fastText ``.vec`` text file into a ``{word: float32 vector}`` dict.

    The file is opened as UTF-8 and closed deterministically. The optional
    first line of a ``.vec`` file ("<n_words> <dim>") is a header, not a word
    vector, and is skipped; blank lines are skipped as well.
    """
    index = {}
    with open(path, encoding='utf-8') as vec_file:
        for line_no, line in enumerate(vec_file):
            fields = line.rstrip().split(' ')
            if line_no == 0 and len(fields) == 2:
                continue  # header line: vocabulary size and vector dimension
            if len(fields) < 2:
                continue  # blank / malformed line with no vector components
            word, vector = get_coefs(*fields)
            index[word] = vector
    return index


embeddings_index = _load_embeddings_index(EMBEDDING_FILE)

## storing and loading embedding index

In [1]:
import pickle

In [2]:
# Load the cached embedding index. The context manager closes the file handle
# as soon as unpickling finishes.
# NOTE(security): pickle.load can execute arbitrary code; only load a cache
# file that you created yourself.
with open('embedding_index.dat', 'rb') as cache_file:
    embeddings_index = pickle.load(cache_file)

In [106]:
# Spot-check the lookup table: display the stored vector for the word 'hello'.
embeddings_index['hello']

array([-0.0963, -0.4002,  0.1611, -0.4041,  0.2315,  0.202 ,  0.0109,
       -0.1781, -0.1908, -0.1324,  0.4132,  0.1422,  0.2847,  0.0166,
       -0.1647,  0.1435, -0.0917,  0.0537,  0.4343, -0.081 , -0.3396,
        0.1352, -0.4702,  0.0374, -0.3001,  0.2118,  0.5446,  0.1056,
        0.1796,  0.1589, -0.4196, -0.1554, -0.4651,  0.1017, -0.0182,
        0.2802,  0.14  , -0.2426, -0.2327,  0.1124, -0.374 ,  0.1927,
        0.1121, -0.009 , -0.0009,  0.1431, -0.021 ,  0.4263,  0.0913,
       -0.2215, -0.4352,  0.1586,  0.1729,  0.0088, -0.2693, -0.174 ,
       -0.0967,  0.0622, -0.4991, -0.0239, -0.1385,  0.1755,  0.0472,
        0.1328,  0.1317,  0.1584,  0.3414, -0.1608, -0.2105, -0.2295,
       -0.1174, -0.3036, -0.1816, -0.09  ,  0.3642,  0.1882, -0.1771,
        0.2296,  0.1375, -0.2877,  0.1672, -0.2132,  0.0552, -0.0641,
       -0.0297, -0.0938, -0.0734, -0.0783, -0.0185,  0.1572,  0.1998,
        0.1876,  0.1425, -0.2362, -0.1263, -0.3583,  0.1297,  0.0814,
       -0.5309, -0.1

## create the embedding matrix, where row i holds the word vector of the word with index i in tokenizer.word_index

In [110]:
# Row i of the matrix is the pre-trained vector of the word whose tokenizer
# index is i. Tokenizer indices start at 1 (0 is the padding id), so the matrix
# needs len(word_index) + 1 rows when the vocabulary is smaller than
# max_features; without the +1 the last word index would be out of bounds.
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # beyond the vocabulary cap: no row exists for this index
    embedding_vector = embeddings_index.get(word)
    # Words missing from the pre-trained file keep their all-zero row. Vectors
    # of the wrong length are skipped too, so they cannot be silently broadcast
    # across the whole row.
    if embedding_vector is not None and len(embedding_vector) == embed_size:
        embedding_matrix[i] = embedding_vector

In [121]:
# Sanity check: the matrix was allocated as (nb_words, embed_size).
embedding_matrix.shape

(30000, 300)

In [122]:
# Sanity check: one row per training comment, padded to maxlen columns.
x_train.shape

(159571, 100)

## define customised callback

In [113]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair used for the evaluation.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value (1 = every epoch).

    Raises:
        ValueError: if ``validation_data`` is not a two-element pair.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. The previous
        # super(Callback, self) started the lookup *after* Callback and
        # therefore skipped Callback.__init__ entirely.
        super().__init__()

        self.interval = interval
        try:
            self.X_val, self.y_val = validation_data
        except ValueError:
            raise ValueError(
                "validation_data must be an (X_val, y_val) pair")

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print the validation ROC-AUC for the finished epoch.

        ``logs`` is accepted for Keras API compatibility and is not modified;
        it defaults to None instead of a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # Keras epochs are zero-based; report them one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

## create model

In [125]:
import tensorflow as tf

In [150]:
def get_model():
    """Build and compile the bidirectional-GRU multi-label classifier.

    Reads the module-level ``maxlen`` and ``embedding_matrix``. The embedding
    layer's dimensions are taken from ``embedding_matrix`` itself, so the layer
    always matches the weights it is initialised with.

    Returns:
        A compiled Keras ``Model`` mapping padded index sequences of length
        ``maxlen`` to 6 sigmoid outputs (one per toxicity label).
    """
    vocab_size, vector_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    # Look up the pre-trained vector of every token: (batch, maxlen, vector_dim).
    x = Embedding(vocab_size, vector_dim, weights=[embedding_matrix])(inp)
    # Drops whole embedding channels rather than individual elements.
    x = SpatialDropout1D(0.2)(x)
    # Forward and backward GRUs with 80 units each; Bidirectional concatenates
    # their outputs by default, giving 160 features per time step.
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    # Pool over the time axis (maxlen steps): one value per feature.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    # Join both pooled summaries along the feature axis.
    conc = concatenate([avg_pool, max_pool])
    # Independent sigmoid per label (multi-label, not softmax).
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [151]:
# Instantiate the compiled network defined by get_model().
model=get_model()

In [152]:
# Training configuration.
batch_size, epochs = 32, 2

# Hold out 5% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.95, random_state=233
)

# Print the validation ROC-AUC after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2,
)

# Score the test set and write the predictions into the submission template,
# one column per label.
y_pred = model.predict(x_test, batch_size=1024)
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission.csv', index=False)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.986282 

151592/151592 - 2013s - loss: 0.0505 - acc: 0.9817 - val_loss: 0.0446 - val_acc: 0.9828
Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986960 

151592/151592 - 1713s - loss: 0.0378 - acc: 0.9852 - val_loss: 0.0454 - val_acc: 0.9824
