In [1]:
# import importlib
# import local_utils; importlib.reload(local_utils)
from local_utils import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Seed: 7961730


In [2]:
# Load Data
# load_data() supplies ids, raw texts, the label matrix Y and the label<->index maps.
ids, comments, Y, test_ids, test_comments, inx2label, label2inx = load_data()

# Y extended with one extra column that is 1 for rows where no label in Y is set.
no_label_flag = (~Y.any(axis=1)).astype(int)
Y_wblank = np.concatenate([Y, np.expand_dims(no_label_flag, 1)], axis=1)


def _preprocess_all(texts):
    """Run `preprocess` over every text using `cpu_cores` parallel jobs."""
    return Parallel(n_jobs=cpu_cores)(delayed(preprocess)(text) for text in texts)


# NOTE: both names are rebound to their cleaned versions, so re-running this
# cell without re-loading would preprocess already-preprocessed text.
comments = _preprocess_all(comments)
test_comments = _preprocess_all(test_comments)

In [3]:
# Maximum sequence length handed to the analyzer. The saved output of the next
# cell reports mean_len + 3*std of roughly 437, so 450 sits just above that.
max_len = 450

# Pre-trained embedding vectors plus the index<->word lookups.
vectors, inx2word, word2inx = load_embs()

text_analyzer = TextAnalyzer(
    word2inx, vectors,
    process_oov_words=True,
    oov_min_doc_hits=5,  # presumably the minimum document count for an OOV word to be kept — confirm in TextAnalyzer
    max_len=max_len,
    cpu_cores=cpu_cores,
)

In [4]:
# Fit the analyzer on train and test comments together, then split the results
# back by position: the first len(comments) rows belong to the training set.
seq, meta = text_analyzer.fit_on_texts(comments + test_comments)

n_train = len(comments)
X = seq[:n_train]
test_X = seq[n_train:]

# Standardise each meta column with statistics taken over train + test rows.
meta_mean = meta.mean(axis=0)
meta_std = meta.std(axis=0)
# A constant column has std == 0; dividing by it would fill the feature with
# NaN/inf. Such columns are divided by 1 instead, which leaves them at exactly 0
# after centring. meta_std itself is left untouched for the report below.
meta_scale = np.where(meta_std == 0, 1.0, meta_std)
meta = (meta - meta_mean)/meta_scale

# Column 0 is reported as the length statistic (per the labels printed here).
print("mean_len: {}".format(meta_mean[0]))
print("mean_len + 2*std: {}".format(meta_mean[0]+2*meta_std[0]))
print("mean_len + 3*std: {}".format(meta_mean[0]+3*meta_std[0]))

X_meta = meta[:n_train]
test_X_meta = meta[n_train:]

Docs: 312735
Selected words: 154006
Processed OOV words: 8451
mean_len: 77.40685244695989
mean_len + 2*std: 316.8939801273775
mean_len + 3*std: 436.63754396758634


In [5]:
# Train/Valid splitting
# Second argument of stratified_sampling: the saved output shows about 10% of
# the rows landing in the validation split, so it is presumably the hold-out share.
holdout_fraction = 0.1
trn_inx, val_inx = stratified_sampling(Y, holdout_fraction, seed)

split_sizes = (len(trn_inx), len(val_inx))
print("train: {}, valid: {}".format(*split_sizes))
# plot_stratified_sampling(Y, trn_inx, val_inx, inx2label)

train: 143613, valid: 15958


In [6]:
# CNN baseline (disabled: every line of this cell is commented out)
# def cnn_block(x, filters, kernel_size, attention=0):
#     cnn = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(x)
    
#     if attention == 0: cnn = GlobalMaxPooling1D()(cnn)
#     elif attention == 1: cnn = AttentionWeightedAverage()(cnn)
#     elif attention == 2: cnn = Attention()(cnn)

#     return cnn

# def getCNNModel(input_shape, classes, num_words, emb_size, emb_matrix,
#                 attention=0, dense=False, emb_trainable=False):

#     x_input = Input(shape=(input_shape,))
    
#     emb = Embedding(num_words, emb_size, weights=[emb_matrix], trainable=emb_trainable, name='embs')(x_input)
#     emb = SpatialDropout1D(0.15)(emb)
        
#     cnn1 = cnn_block(emb, 100, 3, attention=attention)
#     cnn2 = cnn_block(emb, 100, 4, attention=attention)
#     cnn3 = cnn_block(emb, 100, 5, attention=attention)
#     x = concatenate([cnn1, cnn2, cnn3])

#     x = Dropout(0.15)(x)
    
#     if dense: 
#         x = Dense(50, activation='relu')(x)
#         x = Dropout(0.15)(x)
    
#     x_output = Dense(classes, activation='sigmoid')(x)
#     return Model(inputs=x_input, outputs=x_output)

In [102]:
# LSTM-CNN
def getLSTMCNNModel(input_shape, classes, num_words, emb_size, emb_matrix,
                    attention=0, dense=False, emb_trainable=False):
    """Build an (uncompiled) recurrent -> convolutional multi-label classifier.

    Pipeline: embedding -> spatial dropout -> bidirectional CuDNNGRU(64)
    -> Conv1D(128, kernel 3) -> sequence pooling -> dropout
    -> optional Dense(32) block -> sigmoid output.
    Despite the "LSTM" in the name, the recurrent layer is a GRU.

    Args:
        input_shape: length of the integer token sequence fed to the model.
        classes: number of sigmoid output units.
        num_words: number of rows of the embedding table.
        emb_size: width of each embedding vector.
        emb_matrix: initial weights for the embedding layer.
        attention: pooling selector; 1 -> AttentionWeightedAverage,
            2 -> Attention, any other value -> GlobalMaxPooling1D.
        dense: when truthy, insert Dense(32, relu) + Dropout(0.3) before the output.
        emb_trainable: whether the embedding weights are updated in training.

    Returns:
        A Keras `Model` mapping the token input to `classes` sigmoid scores.
    """
    tokens = Input(shape=(input_shape,))

    embedded = Embedding(num_words, emb_size, weights=[emb_matrix],
                         trainable=emb_trainable, name='embs')(tokens)
    embedded = SpatialDropout1D(0.3)(embedded)

    recurrent = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)

    # Single conv branch; wider variants (kernel sizes 4, 5, 6 concatenated
    # with this one) were tried previously and are not active.
    conv = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(recurrent)

    # Pick the layer class that collapses the time axis.
    pool_cls = GlobalMaxPooling1D
    if attention == 1:
        pool_cls = AttentionWeightedAverage
    elif attention == 2:
        pool_cls = Attention
    hidden = pool_cls()(conv)

    hidden = Dropout(0.3)(hidden)

    if dense:
        hidden = Dense(32, activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)

    scores = Dense(classes, activation='sigmoid')(hidden)
    return Model(inputs=tokens, outputs=scores)

In [103]:
# CNN-LSTM
def getCNNLSTMModel(input_shape, classes, num_words, emb_size, emb_matrix,
                    attention=0, dense=False, emb_trainable=False):
    """Build an (uncompiled) convolutional -> recurrent multi-label classifier.

    Pipeline: embedding -> spatial dropout -> three parallel Conv1D(64)
    branches (kernel sizes 3, 4, 5) -> concatenate -> bidirectional
    CuDNNGRU(64) -> sequence pooling -> dropout -> optional Dense(32) block
    -> sigmoid output. Despite the "LSTM" in the name, the recurrent layer is a GRU.

    Args:
        input_shape: length of the integer token sequence fed to the model.
        classes: number of sigmoid output units.
        num_words: number of rows of the embedding table.
        emb_size: width of each embedding vector.
        emb_matrix: initial weights for the embedding layer.
        attention: pooling selector; 1 -> AttentionWeightedAverage,
            2 -> Attention, any other value -> GlobalMaxPooling1D.
        dense: when truthy, insert Dense(32, relu) + Dropout(0.3) before the output.
        emb_trainable: whether the embedding weights are updated in training.

    Returns:
        A Keras `Model` mapping the token input to `classes` sigmoid scores.
    """
    tokens = Input(shape=(input_shape,))

    embedded = Embedding(num_words, emb_size, weights=[emb_matrix],
                         trainable=emb_trainable, name='embs')(tokens)
    embedded = SpatialDropout1D(0.3)(embedded)

    # One 'same'-padded conv branch per kernel width, all reading the embeddings.
    conv_branches = [
        Conv1D(filters=64, kernel_size=width, activation='relu', padding='same')(embedded)
        for width in (3, 4, 5)
    ]

    merged = concatenate(conv_branches)
    recurrent = Bidirectional(CuDNNGRU(64, return_sequences=True))(merged)

    # Pick the layer class that collapses the time axis.
    pool_cls = GlobalMaxPooling1D
    if attention == 1:
        pool_cls = AttentionWeightedAverage
    elif attention == 2:
        pool_cls = Attention
    hidden = pool_cls()(recurrent)

    hidden = Dropout(0.3)(hidden)

    if dense:
        hidden = Dense(32, activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)

    scores = Dense(classes, activation='sigmoid')(hidden)
    return Model(inputs=tokens, outputs=scores)

In [104]:
# MaxPool2D
def getBiCuDNNGRUMaxPool2DModel(input_shape, classes, num_words, emb_size, emb_matrix,
                                attention=0, dense=False, emb_trainable=False):
    """Build an (uncompiled) bidirectional-GRU classifier pooled along both axes.

    Pipeline: embedding -> spatial dropout -> bidirectional CuDNNGRU(128)
    -> pooling applied twice (once to the GRU output as-is, once to the same
    output with its two non-batch axes swapped) -> concatenate -> dropout
    -> optional Dense(32) block -> sigmoid output.

    Args:
        input_shape: length of the integer token sequence fed to the model.
        classes: number of sigmoid output units.
        num_words: number of rows of the embedding table.
        emb_size: width of each embedding vector.
        emb_matrix: initial weights for the embedding layer.
        attention: pooling selector used for both pooled views;
            1 -> AttentionWeightedAverage, 2 -> Attention,
            any other value -> GlobalMaxPooling1D.
        dense: when truthy, insert Dense(32, relu) + Dropout(0.3) before the output.
        emb_trainable: whether the embedding weights are updated in training.

    Returns:
        A Keras `Model` mapping the token input to `classes` sigmoid scores.
    """
    tokens = Input(shape=(input_shape,))

    embedded = Embedding(num_words, emb_size, weights=[emb_matrix],
                         trainable=emb_trainable, name='embs')(tokens)
    embedded = SpatialDropout1D(0.3)(embedded)

    # Single recurrent layer; a stacked two-layer variant was tried previously
    # and is not active.
    recurrent = Bidirectional(CuDNNGRU(128, return_sequences=True))(embedded)

    # The same kind of pooling layer is used for both views.
    pool_cls = GlobalMaxPooling1D
    if attention == 1:
        pool_cls = AttentionWeightedAverage
    elif attention == 2:
        pool_cls = Attention

    pooled_direct = pool_cls()(recurrent)
    # Permute((2, 1)) swaps the two non-batch axes before the second pooling.
    pooled_swapped = pool_cls()(Permute((2, 1))(recurrent))

    hidden = concatenate([pooled_direct, pooled_swapped])
    hidden = Dropout(0.3)(hidden)

    if dense:
        hidden = Dense(32, activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)

    scores = Dense(classes, activation='sigmoid')(hidden)
    return Model(inputs=tokens, outputs=scores)

In [105]:
# import importlib
# importlib.reload(toxic.models)
# from toxic.models import *

# Base name for the checkpoint files written by the later cells.
model_name = 'lstm_cnn_model'

# Everything the builder needs, gathered in one place so that switching to a
# different get*Model function only means changing the call below.
model_kwargs = dict(
    input_shape=X.shape[1],                   # second dimension of X
    classes=Y.shape[1],                       # one sigmoid unit per label column
    num_words=len(text_analyzer.inx2emb),
    emb_size=text_analyzer.emb_size,
    emb_matrix=text_analyzer.emb_vectors,
    attention=0,                              # 0 -> GlobalMaxPooling1D
    dense=False,
    emb_trainable=False,
)
model = getLSTMCNNModel(**model_kwargs)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 450)               0         
_________________________________________________________________
embs (Embedding)             (None, 450, 300)          46201800  
_________________________________________________________________
spatial_dropout1d_15 (Spatia (None, 450, 300)          0         
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 450, 128)          140544    
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 450, 128)          49280     
_________________________________________________________________
global_max_pooling1d_23 (Glo (None, 128)               0         
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
__________

In [106]:
# File that always holds the weights with the lowest validation loss so far.
best_model_path = models_dir+model_name+'.h5'
model_checkpoint = ModelCheckpoint(
    best_model_path, monitor='val_loss', mode='min', verbose=1, save_best_only=True)

# NOTE: lr_schedule and tensorboard are constructed here but are not in the
# callbacks list passed to fit_generator in this notebook's training cell.
lr_schedule = LearningRateScheduler(lr_change, verbose=1)

# Multiply the learning rate by 0.9 after 1 epoch without val_loss improvement,
# never going below 1e-4.
lr_reduce = ReduceLROnPlateau(
    monitor='val_loss', factor=0.9, patience=1, min_lr=0.0001, verbose=1)

# Stop after 4 consecutive epochs without val_loss improvement.
early_stop = EarlyStopping(
    monitor='val_loss', min_delta=0, patience=4, verbose=1, mode='auto')

tensorboard = TensorBoard(log_dir='logs', write_graph=False)

# Reports ROC AUC on the validation rows (see the "roc-auc_val" lines in the
# saved training output); 512 is presumably its prediction batch size — confirm.
roc_auc_callback = ROCAUCCallback(X[val_inx], Y[val_inx], 512)

In [107]:
batch_size = 256

# NOTE(review): class weights are derived from the full label matrix Y, i.e.
# including the validation rows — confirm this is intended.
weights = getClassWeights(Y)

# Batch generators over tokens, meta features and labels. Only the training
# generator is asked to shuffle.
# (A StratifiedFeatureSequence over X/Y alone was an earlier alternative.)
trn_seq = FeatureSequence(
    X[trn_inx], X_meta[trn_inx], Y[trn_inx],
    batch_size, shuffle=True)
val_seq = FeatureSequence(
    X[val_inx], X_meta[val_inx], Y[val_inx],
    batch_size)

In [108]:
# RMSprop at 1e-3 with both per-value and norm-based gradient clipping enabled.
rmsprop = optimizers.RMSprop(0.001, clipvalue=1.0, clipnorm=1.0)
model.compile(loss='binary_crossentropy', optimizer=rmsprop)
# Alternative tried earlier: loss=roc_auc_loss with optimizers.RMSprop(0.003).

In [109]:
# Upper bound on training epochs; early_stop may end the run sooner.
epochs = 10

fit_options = dict(
    generator=trn_seq,
    steps_per_epoch=len(trn_seq),
    validation_data=val_seq,
    validation_steps=len(val_seq),
    initial_epoch=0,
    epochs=epochs,
    shuffle=False,  # the training sequence was already built with shuffle=True
    verbose=1,
    class_weight=weights,
    callbacks=[model_checkpoint, lr_reduce, early_stop, roc_auc_callback],
    use_multiprocessing=False,
    workers=cpu_cores,
    max_queue_size=8*cpu_cores,
)
# Kept as the cell's last expression so the History object is still displayed.
model.fit_generator(**fit_options)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.04626, saving model to models/lstm_cnn_model.h5
roc-auc_val: 0.97992711                                                                                                    
Epoch 2/10

Epoch 00002: val_loss improved from 0.04626 to 0.04369, saving model to models/lstm_cnn_model.h5
roc-auc_val: 0.98352894                                                                                                    
Epoch 3/10

Epoch 00003: val_loss improved from 0.04369 to 0.04181, saving model to models/lstm_cnn_model.h5
roc-auc_val: 0.9859852                                                                                                    
Epoch 4/10

Epoch 00004: val_loss did not improve
roc-auc_val: 0.98650348                                                                                                    
Epoch 5/10

Epoch 00005: val_loss did not improve

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0009000000427477062.
r

<keras.callbacks.History at 0x7f29496993c8>

In [43]:
# Drop the in-memory model and reload the checkpoint written by
# ModelCheckpoint (the epoch with the lowest val_loss).
del model

# Custom layers and losses that load_model must be able to resolve by name.
custom_layers_and_losses = {
    'Attention':Attention,
    'AttentionWeightedAverage':AttentionWeightedAverage,
    'focal_loss':focal_loss,
    'roc_auc_loss':roc_auc_loss,
}
model = load_model(models_dir+model_name+'.h5', compile=True,
                   custom_objects=custom_layers_and_losses)

In [62]:
# Score the current model on the validation rows.
val_tokens = X[val_inx]
Y_val_pred = model.predict(val_tokens, batch_size=1024, verbose=0)
losses = compute_losses(Y[val_inx], Y_val_pred, eps=1e-5)

# Plain average of the values returned by compute_losses.
val_loss = sum(losses)/len(losses)
val_auc = metrics.roc_auc_score(Y[val_inx], Y_val_pred)

print()
for template, value in (("avg_loss: {}", val_loss), ("ROC AUC: {}", val_auc)):
    print(template.format(value))


avg_loss: 0.040130243035343405
ROC AUC: 0.9905758285834526


In [17]:
# Intentional halt: `stop` is an undefined name, so this cell raises NameError
# (see the saved output below) and a "Run All" does not continue into the
# submission cells.
stop

NameError: name 'stop' is not defined

In [63]:
# Encode the validation scores in the file name so runs can be told apart.
# NOTE(review): the prefix mentions "gru / max_pool2d" while model_name in this
# notebook is 'lstm_cnn_model' — confirm the prefix is still the intended one.
loss_tag = str(round(val_loss, 5))
auc_tag = str(round(val_auc, 5))
submission_name = 'fasttext__gru__max_pool2d__submission_'+loss_tag+'_'+auc_tag+'.csv'
submission_path = results_dir+submission_name

# Fill the label columns of the sample submission with the test predictions.
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')
test_pred = model.predict(test_X, batch_size=1024, verbose=1)
sample_submission[inx2label] = test_pred
sample_submission.to_csv(submission_path, index=False)

FileLink(submission_path)



In [None]:
# pseudo
# Intentional halt before the pseudo-labelling section: `stop` is an undefined
# name, so this cell raises NameError and a "Run All" does not continue past it.
stop

In [None]:
# Pseudo-labelling: fine-tune on the training rows mixed with test rows that
# are labelled by the model's own current predictions.
pseudo_model_path = models_dir+model_name+'_pseudo.h5'
model_loss_checkpoint = ModelCheckpoint(pseudo_model_path, monitor='val_loss', verbose=1,
                                        mode='min', save_best_only=True)

# Re-compile with a smaller learning rate (1e-4) and the auc_roc metric.
model.compile(optimizer=optimizers.RMSprop(0.0001), loss='binary_crossentropy', metrics=[auc_roc])

ps_epochs = 3
for ps_inx in range(ps_epochs):
    # Epoch numbering continues after the `epochs` of the main training run.
    first_epoch = epochs+ps_inx

    # Refresh the pseudo-labels from the current state of the model.
    test_Y = model.predict(test_X, batch_size=1024, verbose=1)

    # The test rows get an all-zero meta block of width 2 rather than test_X_meta.
    test_meta_placeholder = np.zeros((test_X.shape[0], 2))

    # 182 and 74 are presumably the per-batch counts of labelled and
    # pseudo-labelled rows (182 + 74 = 256, the batch_size used earlier) —
    # confirm against PseudoFeatureSequence.
    trn_ps_seq = PseudoFeatureSequence(X[trn_inx], X_meta[trn_inx], Y[trn_inx], 182,
                                       test_X, test_meta_placeholder, test_Y, 74,
                                       shuffle=True)

    # Exactly one epoch per refresh of the pseudo-labels.
    model.fit_generator(
        generator=trn_ps_seq, steps_per_epoch=len(trn_ps_seq),
        validation_data=val_seq, validation_steps=len(val_seq),
        initial_epoch=first_epoch, epochs=first_epoch+1,
        shuffle=False, verbose=1,
        class_weight=weights,
        callbacks=[model_loss_checkpoint],
        use_multiprocessing=False, workers=cpu_cores, max_queue_size=4*cpu_cores)

In [None]:
# Score the model on the validation rows after the pseudo-labelling epochs.
val_tokens = X[val_inx]
Y_val_pred = model.predict(val_tokens, batch_size=512, verbose=0)
losses = compute_losses(Y[val_inx], Y_val_pred, eps=1e-5)

# Plain average of the values returned by compute_losses.
val_loss = sum(losses)/len(losses)
val_auc = metrics.roc_auc_score(Y[val_inx], Y_val_pred)

print()
for template, value in (("avg_loss: {}", val_loss), ("ROC AUC: {}", val_auc)):
    print(template.format(value))

In [None]:
# Encode the validation scores and the number of pseudo-labelling epochs in
# the file name so runs can be told apart.
# ps_epochs is an int, so it must be converted with str() before concatenation;
# adding it to a str directly raises TypeError.
# NOTE(review): the prefix mentions "gru / max_pool2d" while model_name in this
# notebook is 'lstm_cnn_model' — confirm the prefix is still the intended one.
submission_name = ('fasttext__gru__max_pool2d__submission_'
                   + str(round(val_loss, 5)) + '_' + str(round(val_auc, 5))
                   + '_pseudo' + str(ps_epochs) + '.csv')

# Fill the label columns of the sample submission with the test predictions.
sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')
test_pred = model.predict(test_X, batch_size=1024, verbose=1)
sample_submission[inx2label] = test_pred
sample_submission.to_csv(results_dir+submission_name, index=False)

FileLink(results_dir+submission_name)