In [1]:
# import importlib
# import local_utils; importlib.reload(local_utils)
from local_utils import *

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Seed: 7961730


In [2]:
# Load the raw train/test comments, the label matrix and the label<->index maps.
ids, comments, Y, test_ids, test_comments, inx2label, label2inx = load_data()

# Label matrix with one extra trailing column that is 1 for rows where no label is set.
Y_wblank = np.concatenate([Y, (~Y.any(axis=1)).astype(int).reshape(-1, 1)], axis=1)

print("Original:\n" + comments[0], end="\n\n")

# Apply preprocess(text, False) to every comment in parallel: train set first, then test set.
comments = Parallel(n_jobs=cpu_cores)(delayed(preprocess)(doc, False) for doc in comments)
test_comments = Parallel(n_jobs=cpu_cores)(delayed(preprocess)(doc, False) for doc in test_comments)

print("Processed:\n" + comments[0])

# comments_fr, comments_de, comments_es = load_augmented_data()

Original:
Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27

Processed:
Explanation Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now. 


In [3]:
# Load three pre-trained embedding sets through the project helper `load_embs`.
# Each call is unpacked into three values; going by the target names these are the
# embedding vectors, an index->word map and a word->index map.
# NOTE(review): the exact return types of load_embs are not visible here — confirm.
vectors_ft, inx2word_ft, word2inx_ft = load_embs(embs_name='crawl-300d-2M')
vectors_gl, inx2word_gl, word2inx_gl = load_embs(embs_name='glove-300d-840B')
vectors_gl_tw, inx2word_gl_tw, word2inx_gl_tw = load_embs(embs_name='glove-twitter-200d-27B')

In [4]:
# def tokenize(text):
#     return glove_twitter_tokenizer.tokenize(text, word2inx)

# docs = Parallel(n_jobs=cpu_cores)(delayed(tokenize)(text) for text in comments + test_comments)
# pickle.dump(docs, open('data/tokenized_twitter_comments.pkl', 'wb'))

In [5]:
# Load the cached tokenised documents (one pickle per tokenizer variant).
# Context managers close the file handles deterministically instead of leaking them.
# NOTE: pickle.load can execute arbitrary code — only load pickles produced by this project.
with open('data/tokenized_comments.pkl', 'rb') as pkl_file:
    docs = pickle.load(pkl_file)
with open('data/tokenized_twitter_comments.pkl', 'rb') as pkl_file:
    docs_tw = pickle.load(pkl_file)

In [6]:
max_len = 450


def fit_text_analyzer(title, word2inx, vectors, tokenized_docs):
    """Fit one TextAnalyzer on the tokenised documents.

    Prints a header line for `title`, builds a `TextAnalyzer` for the given
    vocabulary/vectors (using the notebook-level `max_len`, OOV processing
    enabled, `oov_min_doc_hits=5`) and fits it on `tokenized_docs`.

    Returns:
        (analyzer, sequences): the fitted analyzer and the first value
        returned by `fit_on_docs`; the second returned value is discarded.
    """
    print("\n" + title + "...")
    analyzer = TextAnalyzer(word2inx, vectors, max_len=max_len,
                            process_oov_words=True, oov_min_doc_hits=5)
    sequences, _ = analyzer.fit_on_docs(tokenized_docs)
    return analyzer, sequences


# The fitted sequences are split positionally into train / test below, so the cached
# tokenised docs must line up with the currently loaded comments (train first, then test).
n_train = len(comments)
n_total = n_train + len(test_comments)
assert len(docs) == n_total, "tokenized_comments.pkl does not match the loaded comments"
assert len(docs_tw) == n_total, "tokenized_twitter_comments.pkl does not match the loaded comments"

text_analyzer_ft, seq_ft = fit_text_analyzer("FastText", word2inx_ft, vectors_ft, docs)
X_ft, test_X_ft = seq_ft[:n_train], seq_ft[n_train:]

text_analyzer_gl, seq_gl = fit_text_analyzer("GloVe", word2inx_gl, vectors_gl, docs)
X_gl, test_X_gl = seq_gl[:n_train], seq_gl[n_train:]

# The Twitter embeddings use the documents tokenised with the Twitter tokenizer.
text_analyzer_gl_tw, seq_gl_tw = fit_text_analyzer("GloVe Twitter", word2inx_gl_tw, vectors_gl_tw, docs_tw)
X_gl_tw, test_X_gl_tw = seq_gl_tw[:n_train], seq_gl_tw[n_train:]


FastText...
Docs: 312735
Selected words: 159585
Processed OOV words: 5518

GloVe...
Docs: 312735
Selected words: 160751
Processed OOV words: 5155

GloVe Twitter...
Docs: 312735
Selected words: 129743
Processed OOV words: 9465


In [7]:
# Stack the per-embedding index matrices along a new leading axis:
# axis 0 selects the embedding vocabulary, axis 1 the sample.
X = np.stack((X_ft, X_gl, X_gl_tw), axis=0)
test_X = np.stack((test_X_ft, test_X_gl, test_X_gl_tw), axis=0)

In [8]:
# Train/Valid splitting
trn_inx, val_inx = stratified_sampling(Y, 0.1, seed)

print("train: {}, valid: {}".format(len(trn_inx), len(val_inx)))
# plot_stratified_sampling(Y, trn_inx, val_inx, inx2label)

train: 143613, valid: 15958


In [9]:
# Current exp model
def getExpModel(input_shape, classes, num_words, emb_sizes, emb_matrixes,
                emb_dropout=0.5, attention=0, dense=False, emb_trainable=False):
    """Build a multi-embedding, two-layer bidirectional GRU classifier.

    One integer-sequence input and one `Embedding` layer are created per entry of
    `num_words`. The embedded sequences are concatenated, passed through spatial
    dropout and two stacked bidirectional `CuDNNGRU(64)` layers whose outputs are
    concatenated, pooled over time, and fed to a sigmoid output layer.

    Args:
        input_shape: length of each input sequence.
        classes: number of output units (one sigmoid per label).
        num_words: vocabulary size per embedding.
        emb_sizes: embedding dimensionality per embedding.
        emb_matrixes: initial weight matrix per embedding.
        emb_dropout: rate of the `SpatialDropout1D` applied to the embeddings.
        attention: 1 -> `AttentionWeightedAverage`, 2 -> `Attention`,
            any other value -> `GlobalMaxPooling1D`.
        dense: if true, insert `Dense(32, relu)` + `Dropout(0.3)` before the output.
        emb_trainable: whether the embedding weights are trainable.

    Returns:
        An uncompiled `Model` with one input per embedding.

    Raises:
        ValueError: if no embedding is given or the three per-embedding
            lists differ in length.
    """
    n_embs = len(num_words)
    if n_embs == 0:
        raise ValueError("getExpModel needs at least one embedding")
    if len(emb_sizes) != n_embs or len(emb_matrixes) != n_embs:
        raise ValueError(
            "num_words, emb_sizes and emb_matrixes must have the same length, got "
            "{}, {} and {}".format(n_embs, len(emb_sizes), len(emb_matrixes)))

    x_inputs = []
    embs = []

    for i in range(n_embs):
        x_input = Input(shape=(input_shape,))
        emb = Embedding(num_words[i], emb_sizes[i], weights=[emb_matrixes[i]],
                        trainable=emb_trainable, name="embs_" + str(i))(x_input)
        x_inputs.append(x_input)
        embs.append(emb)

    # Keras' concatenate requires at least two tensors, so a single embedding is used as-is.
    emb = embs[0] if n_embs == 1 else concatenate(embs)
    emb = SpatialDropout1D(emb_dropout)(emb)

    rnn1 = Bidirectional(CuDNNGRU(64, return_sequences=True))(emb)
    rnn2 = Bidirectional(CuDNNGRU(64, return_sequences=True))(rnn1)
    # Keep the outputs of both recurrent layers for the pooling step.
    x = concatenate([rnn1, rnn2])

    if attention == 1:
        x = AttentionWeightedAverage()(x)
    elif attention == 2:
        x = Attention()(x)
    else:
        x = GlobalMaxPooling1D()(x)

    if dense:
        x = Dense(32, activation='relu')(x)
        x = Dropout(0.3)(x)

    x_output = Dense(classes, activation='sigmoid')(x)
    return Model(inputs=x_inputs, outputs=x_output)

In [10]:
# import importlib
# importlib.reload(toxic.models)
# from toxic.models import *

model_name = 'exp_comb_model'

# One analyzer per embedding, in the same order as the stacked inputs
# (FastText, GloVe, GloVe Twitter).
_analyzers = (text_analyzer_ft, text_analyzer_gl, text_analyzer_gl_tw)

model = getExpModel(
    input_shape=max_len,
    classes=Y.shape[1],
    num_words=[len(analyzer.inx2emb) for analyzer in _analyzers],
    emb_sizes=[analyzer.emb_size for analyzer in _analyzers],
    emb_matrixes=[analyzer.emb_vectors for analyzer in _analyzers],
    emb_dropout=0.25, attention=0, dense=False, emb_trainable=False)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 450)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 450)          0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 450)          0                                            
__________________________________________________________________________________________________
embs_0 (Embedding)              (None, 450, 300)     47875500    input_1[0][0]                    
__________________________________________________________________________________________________
embs_1 (Em

In [11]:
# Training callbacks.
# model_checkpoint writes models_dir+model_name+'.h5' and, with save_best_only=True and
# mode='min', only when the monitored val_loss improves.
model_checkpoint = ModelCheckpoint(models_dir+model_name+'.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
# Stop training once val_loss has not improved for 6 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=6, verbose=1, mode='auto')
# NOTE(review): lr_schedule and lr_reduce are not in the active callbacks lists of the
# training cells visible in this notebook (lr_reduce only appears in commented-out
# alternatives); `lr_change` is not defined in the visible cells — presumably star-imported.
lr_schedule = LearningRateScheduler(lr_change, verbose=1)
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=1, min_lr=0.0001, verbose=1)
# tensorboard = TensorBoard(log_dir='logs', write_graph=False)

In [12]:
# Mini-batch size for the training sequence and for the validation sequence / ROC-AUC callback.
batch_size = 256
val_batch_size = 1024
# Per-class weights from the project helper getClassWeights. In the visible cells they are
# only passed as class_weight in the pseudo-labelling cell (and in a commented-out line).
# NOTE(review): the meaning of `mu` is defined by getClassWeights — confirm.
weights = getClassWeights(Y, mu=0.5)

class FeatureManySequence(Sequence):
    """Batch provider for several aligned feature arrays that share one label array.

    `X` is indexed as `X[:, sample_indices]`, i.e. its first axis enumerates the
    feature arrays and its second axis the samples; `Y` is indexed by sample.
    Each batch is returned as `(list_of_feature_batches, label_batch)`.
    When `shuffle` is true the sample order is permuted with `np.random.shuffle`
    on construction and again at the end of every epoch.
    """

    def __init__(self, X, Y, batch_size, shuffle=False):
        self.X = X
        self.Y = Y
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Sample order used to cut batches; starts as 0..n-1.
        self.inx = np.arange(0, self.Y.shape[0])
        self._reshuffle_if_enabled()

    def _reshuffle_if_enabled(self):
        """Permute the sample order in place when shuffling is switched on."""
        if self.shuffle:
            np.random.shuffle(self.inx)

    def __len__(self):
        # Number of batches; the last batch may be smaller than batch_size.
        n_samples = self.inx.shape[0]
        return math.ceil(n_samples / self.batch_size)

    def __getitem__(self, i):
        first = i * self.batch_size
        last = first + self.batch_size
        batch_inx = self.inx[first:last]

        # One array per feature set, all restricted to the same samples.
        features = list(self.X[:, batch_inx])
        labels = self.Y[batch_inx]
        return features, labels

    def on_epoch_end(self):
        self._reshuffle_if_enabled()

# Training batches are reshuffled every epoch; validation batches keep a fixed order.
trn_seq = FeatureManySequence(X=X[:, trn_inx], Y=Y[trn_inx], batch_size=batch_size, shuffle=True)
val_seq = FeatureManySequence(X=X[:, val_inx], Y=Y[val_inx], batch_size=val_batch_size)
# The ROC-AUC callback receives the validation inputs as a list with one array per embedding.
roc_auc_eval = RocAucEvaluation(list(X[:, val_inx]), Y[val_inx], batch_size=val_batch_size)

In [13]:
# Upper bound on the number of epochs; early_stop may end training sooner
# (the recorded run below stopped after epoch 10).
epochs=32

# Cyclic learning-rate callback; step_size is set to two epochs' worth of training batches.
clr = CyclicLR(base_lr=0.0001, max_lr=0.003, step_size=2*len(trn_seq), mode='triangular2')
model.compile(loss="binary_crossentropy", optimizer=optimizers.Nadam())

# shuffle=False here: trn_seq reshuffles its own sample order in on_epoch_end.
model.fit_generator(
    generator=trn_seq, steps_per_epoch=len(trn_seq),
    validation_data=val_seq, validation_steps=len(val_seq),
    initial_epoch=0, epochs=epochs, shuffle=False, verbose=1,
    callbacks=[model_checkpoint, clr, early_stop, roc_auc_eval],
#     callbacks=[model_checkpoint, lr_reduce, early_stop, roc_auc_eval],
    use_multiprocessing=False, workers=cpu_cores, max_queue_size=8*cpu_cores)

Epoch 1/32

Epoch 00001: val_loss improved from inf to 0.04442, saving model to models/exp_comb_model.h5
ROC-AUC: 0.97746925


Epoch 2/32

Epoch 00002: val_loss improved from 0.04442 to 0.04072, saving model to models/exp_comb_model.h5
ROC-AUC: 0.98649338


Epoch 3/32

Epoch 00003: val_loss improved from 0.04072 to 0.03873, saving model to models/exp_comb_model.h5
ROC-AUC: 0.98960346


Epoch 4/32

Epoch 00004: val_loss improved from 0.03873 to 0.03757, saving model to models/exp_comb_model.h5
ROC-AUC: 0.99054679


Epoch 5/32

Epoch 00005: val_loss did not improve
ROC-AUC: 0.99048541


Epoch 6/32

Epoch 00006: val_loss did not improve
ROC-AUC: 0.99063626


Epoch 7/32

Epoch 00007: val_loss did not improve
ROC-AUC: 0.99073064


Epoch 8/32

Epoch 00008: val_loss did not improve
ROC-AUC: 0.99090321


Epoch 9/32

Epoch 00009: val_loss did not improve
ROC-AUC: 0.99076427


Epoch 10/32

Epoch 00010: val_loss did not improve
ROC-AUC: 0.99052896


Epoch 00010: early stopping


<keras.callbacks.History at 0x7fb0d4e26668>

In [14]:
# matplotlib.rcParams['figure.figsize'] = (24,8)

# plt.xlabel('Learning Rate')
# plt.ylabel('Loss')
# plt.plot(clr.history['lr'], clr.history['loss'])

In [15]:
# Drop the in-memory model and reload the checkpoint file written by model_checkpoint
# (same path; that callback only saves when val_loss improves).
# The custom attention layers are registered so load_model can resolve them by name.
del model
model = load_model(models_dir+model_name+'.h5', compile=True, 
                   custom_objects={'Attention':Attention, 'AttentionWeightedAverage':AttentionWeightedAverage})

In [16]:
# Score the reloaded model on the training split: per-label losses first,
# then their mean and the ROC AUC.
Y_trn_pred = model.predict([X_emb[trn_inx] for X_emb in X], batch_size=1024, verbose=0)
losses = compute_losses(Y[trn_inx], Y_trn_pred, eps=1e-5)

for label_and_loss in zip(inx2label, losses):
    print("{}: {}".format(*label_and_loss))
print()

trn_auc = metrics.roc_auc_score(Y[trn_inx], Y_trn_pred)
trn_loss = sum(losses)/len(losses)

print()
print("avg_loss: {}".format(trn_loss))
print("ROC AUC: {}".format(trn_auc))

toxic: 0.06960281656512934
severe_toxic: 0.019473836328844602
obscene: 0.0365750524786678
threat: 0.0068126235100814175
insult: 0.04887471573575669
identity_hate: 0.01620135985454672


avg_loss: 0.03292340074550443
ROC AUC: 0.9928868517878993


In [17]:
# Score the reloaded model on the validation split: per-label losses first,
# then their mean and the ROC AUC.
Y_val_pred = model.predict([X_emb[val_inx] for X_emb in X], batch_size=1024, verbose=0)
losses = compute_losses(Y[val_inx], Y_val_pred, eps=1e-5)

for label_and_loss in zip(inx2label, losses):
    print("{}: {}".format(*label_and_loss))
print()

val_auc = metrics.roc_auc_score(Y[val_inx], Y_val_pred)
val_loss = sum(losses)/len(losses)

print()
print("avg_loss: {}".format(val_loss))
print("ROC AUC: {}".format(val_auc))

toxic: 0.08423508211303908
severe_toxic: 0.01992617911071482
obscene: 0.04173761610106832
threat: 0.007220680647989223
insult: 0.05627580592223156
identity_hate: 0.016020324807685685


avg_loss: 0.03756928145045479
ROC AUC: 0.9905467907083395


In [18]:
# NOTE(review): `stop` is an undefined name; evaluating it raises NameError (see the
# recorded output). Presumably a deliberate guard that halts "Run All" before the
# optional cells below — confirm intent.
stop

NameError: name 'stop' is not defined

In [None]:
# Optional second training stage on the reloaded model.
# Epoch numbering continues after the first stage's upper bound: initial_epoch=epochs and
# epochs=epochs1 allow at most 8 further epochs.
epochs1=epochs+8
# New cyclic schedule with a lower max_lr than the first stage, and a shorter early-stopping patience.
clr = CyclicLR(base_lr=0.0001, max_lr=0.001, step_size=2*len(trn_seq), mode='triangular2')
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=4, verbose=1, mode='auto')
# Recompile with a fresh Adam optimizer (the first stage used Nadam).
model.compile(loss="binary_crossentropy", optimizer=optimizers.Adam())
# model.compile(loss=art_loss, optimizer=optimizers.Adam())
# model_checkpoint and roc_auc_eval are the same callback objects used in the first stage.
model.fit_generator(
    generator=trn_seq, steps_per_epoch=len(trn_seq),
    validation_data=val_seq, validation_steps=len(val_seq),
    initial_epoch=epochs, epochs=epochs1, shuffle=False, verbose=1,
#     class_weight=weights,
    callbacks=[model_checkpoint, clr, early_stop, roc_auc_eval],
#     callbacks=[model_checkpoint, lr_reduce, early_stop, roc_auc_eval],
    use_multiprocessing=False, workers=cpu_cores, max_queue_size=8*cpu_cores)

In [None]:
# Reload the checkpoint file again after the optional second training stage.
# NOTE(review): `art_loss` is not defined in the visible cells (it only appears in a
# commented-out compile call); it must be provided elsewhere, e.g. by the star import,
# otherwise building this dict raises NameError — confirm.
del model
model = load_model(models_dir+model_name+'.h5', compile=True, 
                   custom_objects={'Attention':Attention, 'AttentionWeightedAverage':AttentionWeightedAverage, 'art_loss':art_loss})

In [None]:
# Validation metrics for the reloaded model: mean of the per-label losses and ROC AUC.
# X is stacked with the embedding axis first and the model has one input per embedding,
# so the samples are selected on axis 1 and the result is split into a list of arrays.
# (X[val_inx] would index the embedding axis with sample indices.)
Y_val_pred = model.predict(list(X[:, val_inx]), batch_size=1024, verbose=0)
losses = compute_losses(Y[val_inx], Y_val_pred, eps=1e-5)

val_loss = sum(losses)/len(losses)
val_auc = metrics.roc_auc_score(Y[val_inx], Y_val_pred)

print()
print("avg_loss: {}".format(val_loss))
print("ROC AUC: {}".format(val_auc))

In [None]:
# Build the submission file name from the rounded validation loss / ROC AUC.
submission_name = 'exp__focal_loss_a1.0_g0.5__fasttext__d0.3__submission_'+str(round(val_loss, 5))+'_'+str(round(val_auc, 5))+'.csv'

sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')
# test_X is stacked with the embedding axis first; the model expects one array per
# embedding input, so it is split into a list along that axis.
test_pred = model.predict(list(test_X), batch_size=1024, verbose=1)
# Overwrite the label columns of the sample submission with the predictions.
sample_submission[inx2label] = test_pred
sample_submission.to_csv(results_dir+submission_name, index=False)

FileLink(results_dir+submission_name)

In [None]:
# pseudo
# NOTE(review): `stop` is an undefined name, so this cell raises NameError when run;
# presumably a guard so the pseudo-labelling cells below only run when executed by hand — confirm.
stop

In [None]:
# Pseudo-labelling: repeatedly predict labels for the test set and train one more epoch on
# batches built from the training data and the test data with those predicted labels.
# Checkpoints go to a separate '_pseudo.h5' file, saved only when val_loss improves.
model_loss_checkpoint = ModelCheckpoint(models_dir+model_name+'_pseudo.h5', monitor='val_loss', verbose=1, mode='min', save_best_only=True)
model.compile(optimizer=optimizers.Nadam(0.0001), loss='binary_crossentropy')

ps_epochs = 3
for ps_inx in range(0, ps_epochs):  
    # NOTE(review): the model has one input per embedding and test_X is a single stacked
    # array; the evaluation cells above pass list(X[:, ...]) instead — likely needs list(test_X).
    test_Y = model.predict(test_X, batch_size=1024, verbose=1)
    
    # NOTE(review): X_meta and PseudoFeatureSequence are not defined in the visible cells, and
    # X[trn_inx] indexes the stacked embedding axis rather than the sample axis. This cell
    # looks carried over from a single-input notebook — confirm before running.
    # NOTE(review): 182 and 74 are presumably the labelled / pseudo-labelled sample counts
    # per batch (they add up to batch_size = 256) — confirm against PseudoFeatureSequence.
    trn_ps_seq = PseudoFeatureSequence(X[trn_inx], X_meta[trn_inx], Y[trn_inx], 182, 
                                       test_X, np.zeros((test_X.shape[0], 2)), test_Y, 74,  
                                       shuffle=True)
    # Exactly one epoch per iteration; epoch numbering continues after `epochs`.
    model.fit_generator(
        generator=trn_ps_seq, steps_per_epoch=len(trn_ps_seq),  
        validation_data=val_seq, validation_steps=len(val_seq),
        initial_epoch=epochs+ps_inx, epochs=epochs+ps_inx+1, 
        shuffle=False, verbose=1,
        class_weight=weights,
        callbacks=[model_loss_checkpoint, roc_auc_eval],
        use_multiprocessing=False, workers=cpu_cores, max_queue_size=4*cpu_cores)

In [None]:
# Validation metrics after pseudo-labelling: mean of the per-label losses and ROC AUC.
# X is stacked with the embedding axis first and the model has one input per embedding,
# so the samples are selected on axis 1 and the result is split into a list of arrays.
# (X[val_inx] would index the embedding axis with sample indices.)
Y_val_pred = model.predict(list(X[:, val_inx]), batch_size=512, verbose=0)
losses = compute_losses(Y[val_inx], Y_val_pred, eps=1e-5)

val_loss = sum(losses)/len(losses)
val_auc = metrics.roc_auc_score(Y[val_inx], Y_val_pred)

print()
print("avg_loss: {}".format(val_loss))
print("ROC AUC: {}".format(val_auc))

In [None]:
# Build the submission file name from the rounded validation loss / ROC AUC and the number
# of pseudo-labelling epochs. ps_epochs is an int, so it is converted with str() before
# concatenation (str + int raises TypeError).
submission_name = 'fasttext__gru__max_pool2d__submission_'+str(round(val_loss, 5))+'_'+str(round(val_auc, 5))+'_pseudo'+str(ps_epochs)+'.csv'

sample_submission = pd.read_csv(data_dir + 'sample_submission.csv')
# test_X is stacked with the embedding axis first; the model expects one array per
# embedding input, so it is split into a list along that axis.
test_pred = model.predict(list(test_X), batch_size=1024, verbose=1)
# Overwrite the label columns of the sample submission with the predictions.
sample_submission[inx2label] = test_pred
sample_submission.to_csv(results_dir+submission_name, index=False)

FileLink(results_dir+submission_name)