In [1]:
#dpcnn http://ai.tencent.com/ailab/media/publications/ACL3-Brady.pdf
#dpcnn with conv1d, model architecture and all parameters copied from neptune-ml since it's publicly available
#https://github.com/neptune-ml/kaggle-toxic-starter/blob/master/best_configs/fasttext_dpcnn.yaml
#Got it to PLB 0.984 with 10fold cv on local computer after playing with parameters
#Try to improve score on your own local pc or throw it in the blender with the rest of them :)
import os
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Flatten, Conv1D, Conv2D, SpatialDropout1D, Reshape, Concatenate
from keras.layers import add, Dropout, PReLU, BatchNormalization, GlobalMaxPooling1D, MaxPool2D, MaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from keras import optimizers
from keras import initializers, regularizers, constraints, callbacks

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

def schedule(ind): # seems like not used later
    a = [0.001, 0.0005, 0.0001, 0.0001]
    return a[ind] 

In [4]:
train = pd.read_csv('~/data/toxic/data/train_preprocessed_clean.csv')
test = pd.read_csv('~/data/toxic/data/test_preprocessed_clean.csv')

X_train = train["comment_text"].fillna("unknown").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("unknown").values

max_features = 100000
maxlen = 200

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [None]:
embeddings_index_lex, embed_size = word2Vec('lex')
embeddings_index_glc, embed_size = word2Vec('gl-common')
embeddings_index_glt, embed_size = word2Vec('gl-twitter')
embeddings_index_ftc, embed_size = word2Vec('ft-common')
embeddings_index_ftw, embed_size = word2Vec('ft-wiki')

In [None]:
def word2Vec(source):
    embed_size = 300
    if source.lower() == 'ft-common':
        file = '/home/kai/data/resources/FastText/crawl-300d-2M.vec'
    elif source.lower() == 'ft-wiki':
        file = '/home/kai/data/resources/FastText/wiki.en.vec'
    elif source.lower() == 'lex':
        file = '/home/kai/data/resources/lexvec/lexvec.commoncrawl.300d.W.pos.vectors'
    elif source.lower() == 'gl-common':
        file = '/home/kai/data/resources/glove/glove.840B.300d.txt'
    elif source.lower() == 'gl-twitter':
        file = '/home/kai/data/resources/glove/glove.twitter.27B.200d.txt'
        embed_size = 200
    def get_coefs(word,*arr): 
        try:
            return word, np.asarray(arr, dtype='float32') 
        except ValueError:
            return 'nnnnnnnaaaaaaa@@!',np.zeros(embed_size)
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(file, encoding='utf8'))
    return embeddings_index, embed_size

In [None]:
import gc
gc.collect()

all_embs = np.stack(embeddings_index.values())

all_embs.shape

emb_mean, emb_std = all_embs.mean(), all_embs.std()

emb_mean,  emb_std

del all_embs, X_train, X_test, train, test
gc.collect()

In [19]:
import json as js
with open('/home/kai/data/kaggle/toxic/wl/models/RNN/rnn/dirty_word_dict.json', 'r') as file:
    bad_word_dict = js.load(file)
print(len(bad_word_dict))
    
def get_embedding_matrix(embeddings_index, embed_size, max_features, tokenizer, bad_word_dict):
    word_index = tokenizer.word_index
    #prepare embedding matrix
    num_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, embed_size))
    still_not_found_word = {}
    replaced_word = {}
    for word, i in word_index.items():
        if i >= max_features:
            continue
        try: 
            embedding_vector = embeddings_index[word] # w2v_model['/en/'+ word] #w2v_model[word]#
        except KeyError:
            replacement = bad_word_dict.get(word)
            embedding_vector = embeddings_index.get(replacement, None)
    #         embedding_vector = None #np.zeros(embed_size)
            if embedding_vector is None:
                still_not_found_word[word] = tokenizer.word_counts[word]#i
            else:
                if word not in replaced_word:
                    replaced_word[word] = replacement
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    print('{} words not found in embedding file (after replacement attempt)'.format(len(still_not_found_word)))
    print('{} words are replaced:'.format(len(replaced_word)))
    print(replaced_word)
    return embedding_matrix, still_not_found_word, replaced_word

228


In [13]:
def get_model(maxlen, max_features, embedding_matrix, embed_size, num_filters=64):    
    
    filter_sizes = [1,2,3,5]
    num_filters = num_filters

    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.4)(x)
    x = Reshape((maxlen, embed_size, 1))(x)

    conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)
    conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size), kernel_initializer='normal',
                                                                                    activation='elu')(x)

    maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
    maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
    maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)
    maxpool_3 = MaxPool2D(pool_size=(maxlen - filter_sizes[3] + 1, 1))(conv_3)

    z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])   
    z = Flatten()(z)
    z = Dropout(0.1)(z)

    outp = Dense(6, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [22]:
batch_epoch_patience_pool = [
    (1024, 20, 5, 64),
#     (1024, 10, 10),
#     (1024, 3, 3),
#     (1024, 3, 3),
#     (1024, 3, 3),
    (512, 10, 3, 48),
#     (512, 2, 2),
#     (512, 2, 2),
#     (512, 2, 2),
    (256, 5, 2, 32)
#     (128, 5, 5),
#     (128, 2, 2),
#     (128, 2, 2),
#     (128, 1, 1),
#     (32, 3, 3),
#     (32, 2, 2),
#     (32, 1, 1),
#     (32, 1, 1)
]

In [16]:
embeddings_index_pool = [
    (embeddings_index_lex, 300, 'lex'),
    (embeddings_index_glc, 300, 'glc'),
    (embeddings_index_glt, 200, 'glt'),#,
    (embeddings_index_ftc, 300, 'ftc'), 
    (embeddings_index_ftw, 300, 'ftw')
]

In [23]:
for embeddings_index, embed_size, embedding_name in embeddings_index_pool:
    
    embedding_matrix, _, _ = get_embedding_matrix(embeddings_index, embed_size, max_features, tokenizer, bad_word_dict)

    for batch_size, epochs, patience, num_filters in batch_epoch_patience_pool:

        model = get_model(maxlen, max_features, embedding_matrix, embed_size)

        X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.90)#, random_state=233)

        run_name = "cnn_{}_{}_{}_w_dict".format(embedding_name, batch_size, num_filters)
        model_file = './PureCnnModels/' + run_name + '.hdf5'
        try: 
            print('load model: ' + str(model_file))
            model.load_weights(model_file)
        except OSError: 
            print('no model found')

        early = EarlyStopping(monitor="val_loss", mode="min", patience=patience)
        checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')    
        roc_auc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
        hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                         callbacks=[roc_auc, checkpoint, early], verbose=1)


        model.load_weights(model_file)
        y_pred = model.predict(x_test, batch_size=1024)
        submission = pd.read_csv('~/data/toxic/data/sample_submission.csv')
        submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
        import time
        sub_id = str(int(time.time()))[3:]
        print(sub_id)
        submission.to_csv('./PureCnnPreds/' + run_name + '_' + sub_id + '.csv.gz', index=False, compression='gzip')

9806 words not found in embedding file (after replacement attempt)
0 words are replaced:
{}
load model: ./PureCnnModels/cnn_lex_1024_64_w_dict.hdf5
no model found
Train on 143613 samples, validate on 15958 samples
Epoch 1/20
 ROC-AUC - epoch: 1 - score: 0.927154 


Epoch 00001: val_loss improved from inf to 0.05199, saving model to ./PureCnnModels/cnn_lex_1024_64_w_dict.hdf5
Epoch 2/20
 ROC-AUC - epoch: 2 - score: 0.979105 


Epoch 00002: val_loss improved from 0.05199 to 0.04583, saving model to ./PureCnnModels/cnn_lex_1024_64_w_dict.hdf5
Epoch 3/20
 ROC-AUC - epoch: 3 - score: 0.982844 


Epoch 00003: val_loss improved from 0.04583 to 0.04359, saving model to ./PureCnnModels/cnn_lex_1024_64_w_dict.hdf5
Epoch 4/20
 ROC-AUC - epoch: 4 - score: 0.984700 


Epoch 00004: val_loss improved from 0.04359 to 0.04240, saving model to ./PureCnnModels/cnn_lex_1024_64_w_dict.hdf5
Epoch 5/20
 ROC-AUC - epoch: 5 - score: 0.985489 


Epoch 00005: val_loss improved from 0.04240 to 0.04194, saving mod

load model: ./PureCnnModels/cnn_glc_1024_64_w_dict.hdf5
no model found
Train on 143613 samples, validate on 15958 samples
Epoch 1/20
 ROC-AUC - epoch: 1 - score: 0.900212 


Epoch 00001: val_loss improved from inf to 0.05468, saving model to ./PureCnnModels/cnn_glc_1024_64_w_dict.hdf5
Epoch 2/20
 ROC-AUC - epoch: 2 - score: 0.981074 


Epoch 00002: val_loss improved from 0.05468 to 0.04819, saving model to ./PureCnnModels/cnn_glc_1024_64_w_dict.hdf5
Epoch 3/20
 ROC-AUC - epoch: 3 - score: 0.985072 


Epoch 00003: val_loss improved from 0.04819 to 0.04586, saving model to ./PureCnnModels/cnn_glc_1024_64_w_dict.hdf5
Epoch 4/20
 ROC-AUC - epoch: 4 - score: 0.986628 


Epoch 00004: val_loss improved from 0.04586 to 0.04539, saving model to ./PureCnnModels/cnn_glc_1024_64_w_dict.hdf5
Epoch 5/20
 ROC-AUC - epoch: 5 - score: 0.987409 


Epoch 00005: val_loss improved from 0.04539 to 0.04465, saving model to ./PureCnnModels/cnn_glc_1024_64_w_dict.hdf5
Epoch 6/20
 ROC-AUC - epoch: 6 - score: 0.

1568812
load model: ./PureCnnModels/cnn_glc_256_32_w_dict.hdf5
no model found
Train on 143613 samples, validate on 15958 samples
Epoch 1/5
 ROC-AUC - epoch: 1 - score: 0.984675 


Epoch 00001: val_loss improved from inf to 0.04726, saving model to ./PureCnnModels/cnn_glc_256_32_w_dict.hdf5
Epoch 2/5
 ROC-AUC - epoch: 2 - score: 0.986972 


Epoch 00002: val_loss improved from 0.04726 to 0.04616, saving model to ./PureCnnModels/cnn_glc_256_32_w_dict.hdf5
Epoch 3/5
 ROC-AUC - epoch: 3 - score: 0.987860 


Epoch 00003: val_loss improved from 0.04616 to 0.04568, saving model to ./PureCnnModels/cnn_glc_256_32_w_dict.hdf5
Epoch 4/5
 ROC-AUC - epoch: 4 - score: 0.987976 


Epoch 00004: val_loss did not improve
Epoch 5/5
 ROC-AUC - epoch: 5 - score: 0.987664 


Epoch 00005: val_loss improved from 0.04568 to 0.04556, saving model to ./PureCnnModels/cnn_glc_256_32_w_dict.hdf5
1569004
32291 words not found in embedding file (after replacement attempt)
2 words are replaced:
{'assraped': 'ass', 'dic

Epoch 10/20
 ROC-AUC - epoch: 10 - score: 0.985534 


Epoch 00010: val_loss did not improve
Epoch 11/20
 ROC-AUC - epoch: 11 - score: 0.984837 


Epoch 00011: val_loss did not improve
Epoch 12/20
 ROC-AUC - epoch: 12 - score: 0.984454 


Epoch 00012: val_loss did not improve
1569960
load model: ./PureCnnModels/cnn_ftc_512_48_w_dict.hdf5
no model found
Train on 143613 samples, validate on 15958 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.980079 


Epoch 00001: val_loss improved from inf to 0.04722, saving model to ./PureCnnModels/cnn_ftc_512_48_w_dict.hdf5
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.986083 


Epoch 00002: val_loss improved from 0.04722 to 0.04391, saving model to ./PureCnnModels/cnn_ftc_512_48_w_dict.hdf5
Epoch 3/10
 ROC-AUC - epoch: 3 - score: 0.987362 


Epoch 00003: val_loss improved from 0.04391 to 0.04312, saving model to ./PureCnnModels/cnn_ftc_512_48_w_dict.hdf5
Epoch 4/10
 ROC-AUC - epoch: 4 - score: 0.987666 


Epoch 00004: val_loss improved from 0.04312 to

Epoch 8/20
 ROC-AUC - epoch: 8 - score: 0.990170 


Epoch 00008: val_loss did not improve
Epoch 9/20
 ROC-AUC - epoch: 9 - score: 0.989961 


Epoch 00009: val_loss did not improve
Epoch 10/20
 ROC-AUC - epoch: 10 - score: 0.989662 


Epoch 00010: val_loss did not improve
Epoch 11/20
 ROC-AUC - epoch: 11 - score: 0.989278 


Epoch 00011: val_loss did not improve
1570705
load model: ./PureCnnModels/cnn_ftw_512_48_w_dict.hdf5
no model found
Train on 143613 samples, validate on 15958 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.978889 


Epoch 00001: val_loss improved from inf to 0.04693, saving model to ./PureCnnModels/cnn_ftw_512_48_w_dict.hdf5
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.985816 


Epoch 00002: val_loss improved from 0.04693 to 0.04421, saving model to ./PureCnnModels/cnn_ftw_512_48_w_dict.hdf5
Epoch 3/10
 ROC-AUC - epoch: 3 - score: 0.988028 


Epoch 00003: val_loss improved from 0.04421 to 0.04249, saving model to ./PureCnnModels/cnn_ftw_512_48_w_dict.hdf5
Epoch 4/10

session_conf = tf.ConfigProto(intra_op_parallelism_threads=4, inter_op_parallelism_threads=4)
K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))

#model
#wrote out all the blocks instead of looping for simplicity
filter_nr = 64
filter_size = 3
max_pool_size = 3
max_pool_strides = 2
dense_nr = 256
spatial_dropout = 0.2
dense_dropout = 0.5
train_embed = False
conv_kern_reg = regularizers.l2(0.00001)
conv_bias_reg = regularizers.l2(0.00001)

comment = Input(shape=(maxlen,))
emb_comment = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=train_embed)(comment)
emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment)

block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
block1 = BatchNormalization()(block1)
block1 = PReLU()(block1)
block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1)
block1 = BatchNormalization()(block1)
block1 = PReLU()(block1)

#we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output
#if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output
resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
resize_emb = PReLU()(resize_emb)
    
block1_output = add([block1, resize_emb])
block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output)

block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1_output)
block2 = BatchNormalization()(block2)
block2 = PReLU()(block2)
block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2)
block2 = BatchNormalization()(block2)
block2 = PReLU()(block2)
    
block2_output = add([block2, block1_output])
block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output)

block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2_output)
block3 = BatchNormalization()(block3)
block3 = PReLU()(block3)
block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3)
block3 = BatchNormalization()(block3)
block3 = PReLU()(block3)
    
block3_output = add([block3, block2_output])
block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output)

block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3_output)
block4 = BatchNormalization()(block4)
block4 = PReLU()(block4)
block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4)
block4 = BatchNormalization()(block4)
block4 = PReLU()(block4)

block4_output = add([block4, block3_output])
block4_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block4_output)

block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4_output)
block5 = BatchNormalization()(block5)
block5 = PReLU()(block5)
block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5)
block5 = BatchNormalization()(block5)
block5 = PReLU()(block5)

block5_output = add([block5, block4_output])
block5_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block5_output)

block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5_output)
block6 = BatchNormalization()(block6)
block6 = PReLU()(block6)
block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6)
block6 = BatchNormalization()(block6)
block6 = PReLU()(block6)

block6_output = add([block6, block5_output])
block6_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block6_output)

block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6_output)
block7 = BatchNormalization()(block7)
block7 = PReLU()(block7)
block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
            kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block7)
block7 = BatchNormalization()(block7)
block7 = PReLU()(block7)

block7_output = add([block7, block6_output])
output = GlobalMaxPooling1D()(block7_output)

output = Dense(dense_nr, activation='linear')(output)
output = BatchNormalization()(output)
output = PReLU()(output)
output = Dropout(dense_dropout)(output)
output = Dense(6, activation='sigmoid')(output)

model = Model(comment, output)


model.compile(loss='binary_crossentropy', 
            optimizer=optimizers.Adam(lr=1e-4),
            metrics=['accuracy'])
            
batch_size = 128
epochs = 25

Xtrain, Xval, ytrain, yval = train_test_split(x_train, y_train, train_size=0.9)#, random_state=233)

run_name = "cnn_{}_{}_w_dict".format('ftc', batch_size)
model_file = './PureCnnModels/' + run_name + '.hdf5'
try: 
    print('load model: ' + str(model_file))
    model.load_weights(model_file)
except OSError: 
    print('no model found')

#lr = callbacks.LearningRateScheduler(schedule) # instead using fixed lr: 1e-4
early = EarlyStopping(monitor="val_loss", mode="min", patience=10)
ra_val = RocAucEvaluation(validation_data=(Xval, yval), interval = 1)
checkpoint = ModelCheckpoint(model_file, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(Xtrain, ytrain, batch_size=batch_size, epochs=epochs, validation_data=(Xval, yval), 
          callbacks = [ra_val, early, checkpoint] ,verbose=1)


model.load_weights(model_file)
y_pred = model.predict(x_test,batch_size=1024,verbose=1)
submission = pd.read_csv('~/data/toxic/data/sample_submission.csv')
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
import time
sub_id = str(int(time.time()))[3:]
print(sub_id)
submission.to_csv('./PureCnnPreds/' + run_name + '_' + sub_id + '.csv.gz', index=False, compression='gzip')

In [81]:
y_pred.shape

(153164, 6)

In [82]:
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.993278,0.3908302,0.952778,0.192109,0.826996,0.2555371
1,0000247867823ef7,0.000467,9.759669e-07,1.1e-05,2.2e-05,2.4e-05,1.748934e-06
2,00013b17ad220c46,0.000141,1.736008e-07,1e-06,1.1e-05,7e-06,1.260338e-06
3,00017563c3f7919a,3.4e-05,1.98434e-06,4e-06,3e-06,1.1e-05,9.113581e-07
4,00017695ad8997eb,0.000953,9.894452e-07,1.9e-05,4e-05,4.7e-05,3.017124e-06


In [83]:
submission.to_csv('./PureCnnPreds/' + run_name + '_' + sub_id + '.csv.gz', index=False, compression='gzip')

In [84]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 200, 300)     30000000    input_6[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_6 (SpatialDro (None, 200, 300)     0           embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_76 (Conv1D)              (None, 200, 64)      57664       spatial_dropout1d_6[0][0]        
__________________________________________________________________________________________________
batch_norm