# Jigsaw Multilingual Competition - Custom Embedding v2 Hyperopt Search

https://www.kaggle.com/c/jigsaw-multilingual-toxic-comment-classification

### Model Search Constants and Constraints:

 --------------------------------------------------------------
 - Max Sequence Length = 220
 - Total Unique Words = 200000 (200,000 selected due to memory issues)
 - Text Preprocessing = yes (punctuation and special symbols replaced with spaces; ? and ! kept)
 - Embedding = custom word2vec 100d (5 iter) + fastText crawl 300d + GloVe 840B 300d, concatenated to 700d
 - Data Split Strategy = Train/Validation (2000 val records)
 - Total Runs = 300
 - Various Architectures = Yes
 --------------------------------------------------------------
 

# Installs

In [1]:
# !pip install hyperopt

# Imports

In [2]:
import time

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [4]:
import os
print(os.getcwd())
print(os.listdir(os.getcwd()))

/home/mattyates/notebooks
['AzureML', 'SparkML', 'pytorch', 'Jigsaw_Custom_Embedding_Hyperopt_v2_azure_hvd2.ipynb', 'MMLSpark', 'julia', 'Jigsaw_Custom_Embedding_Hyperopt_v1_azure_hvd1.ipynb', '.ipynb_checkpoints', 'h2o', 'catboost', 'hvd-hyperopt-custom-lstm-preds1']


In [5]:
# TF Imports
import tensorflow as tf
tf.__version__

'2.1.0'

In [6]:
# Keras Imports
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Bidirectional, Conv1D
from tensorflow.compat.v1.keras.layers import CuDNNLSTM, CuDNNGRU
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence 
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# Numpy
import numpy
numpy.random.seed(1331)
# Pandas
import pandas as pd
# Sklearn
from sklearn import metrics
# Visualizations
import matplotlib.pyplot as plt
%matplotlib inline
# Garbage Collector
import gc
import sys
# Hyperopt
from hyperopt import fmin, tpe, hp, anneal, Trials, space_eval

In [7]:
import urllib.request

In [8]:
# Every model in this notebook is built with tf.keras, so use Horovod's
# tf.keras binding. `horovod.keras` targets the standalone Keras package
# (hence the "Using TensorFlow backend." banner) and its callbacks and
# optimizer wrapper are built on standalone-Keras classes.
import horovod.tensorflow.keras as hvd

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand
config.gpu_options.visible_device_list = str(hvd.local_rank())
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

Using TensorFlow backend.


# Variable Constants

In [9]:
# Directory (relative to the working dir) where per-trial submission CSVs are written.
test_score_loc = "hvd-hyperopt-custom-lstm-preds1/"
# Base URL of the blob container holding the CSVs and embedding files.
data_path = "https://<your-resource-group>.blob.core.windows.net/<your-container>/"
# Every comment is padded/truncated to this many tokens.
maxlen = 220
# Vocabulary cap for the tokenizer and row count of the embedding matrix.
tot_uniq_words = 200000
# Width of the concatenated embedding: 100 (custom) + 300 + 300 (pre-trained).
embed_size = 700
# 1-based trial counter; incremented by objective() and used in submission file names.
expCounter = 1

# Tokenize & Padding Data

In [10]:
def preprocess(data):
    '''
    Replace punctuation and special symbols with spaces, leaving ? and ! intact.

    Every element of `data` (a pandas Series) is cast to str before cleaning;
    the cleaned Series is returned.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    '''
    punct = "/-'.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    # One translation table maps every listed character to a single space.
    blank_table = str.maketrans({symbol: ' ' for symbol in punct})
    return data.astype(str).apply(lambda comment: comment.translate(blank_table))

In [11]:
# Cleaned comments from the original training CSV; used only to fit the tokenizer below.
text = preprocess(pd.read_csv(data_path+"jigsaw-toxic-comment-train.csv")['comment_text'])

In [12]:
# create a tokenizer
# num_words caps the vocabulary at tot_uniq_words when texts are converted to sequences.
# NOTE(review): Keras' Tokenizer default `filters` appear to include `!` and `?`,
# which would drop the characters preprocess() deliberately keeps - confirm.
token = keras.preprocessing.text.Tokenizer(num_words=tot_uniq_words)
token.fit_on_texts(text)
# word -> integer index mapping learned from `text`; reused to build the embedding matrix.
word_index = token.word_index

In [13]:
# The raw text is no longer needed once the tokenizer is fitted; release it.
del text
gc.collect()

23

# Training Data Gen

In [14]:
def sent_generator(chunksize, threshold, maxlen):
    """Endlessly stream (padded_sequences, binary_labels) batches from the training CSV.

    The CSV is re-opened from the start each time it is exhausted, so the
    generator never terminates. Each batch holds up to `chunksize` rows;
    a row is labelled 1 when its 'toxic' value exceeds `threshold`, else 0.
    Sequences are padded/truncated to `maxlen` tokens.
    """
    def to_label(raw_score):
        return int(float(raw_score) > threshold)

    while True:
        reader = pd.read_csv(data_path+"jigsaw_mjy_train.csv", chunksize=chunksize, iterator=True)
        for batch in reader:
            cleaned = preprocess(batch['comment_text'])
            labels = batch['toxic'].apply(to_label)
            padded = sequence.pad_sequences(token.texts_to_sequences(cleaned), maxlen=maxlen)
            yield (padded, labels)

In [15]:
def sent_generator2(chunksize, threshold, maxlen):
    """Endlessly stream (padded_sequences, binary_labels) batches from the augmented CSV.

    Same contract as sent_generator, but reads the 'comment' and 'target'
    columns of train_augmented1605_sample.csv. The file is re-opened each time
    it is exhausted; a row is labelled 1 when 'target' exceeds `threshold`.
    """
    def to_label(raw_score):
        return int(float(raw_score) > threshold)

    while True:
        reader = pd.read_csv(data_path+"train_augmented1605_sample.csv", chunksize=chunksize, iterator=True)
        for batch in reader:
            cleaned = preprocess(batch['comment'])
            labels = batch['target'].apply(to_label)
            padded = sequence.pad_sequences(token.texts_to_sequences(cleaned), maxlen=maxlen)
            yield (padded, labels)

# Validation Data

In [16]:
# Hold-out validation set, loaded fully into memory (unlike the streamed training data).
dfval = pd.read_csv(data_path+"jigsaw_mjy_val.csv")
# Same clean -> tokenize -> pad pipeline as the training generators.
X_val = sequence.pad_sequences(token.texts_to_sequences(preprocess(dfval['comment_text'])), maxlen=maxlen)
# Labels are used as-is (no threshold applied) - presumably already 0/1; TODO confirm.
y_val = dfval['toxic']

In [17]:
# X_val / y_val hold everything needed from here on; drop the source frame.
del dfval
gc.collect()

20

# Test Data

In [18]:
def test_data_prep():
    """Load the translated test set and return it as padded token sequences.

    Reads the 'translated' column of the test CSV, cleans it with
    preprocess(), and pads every sequence to `maxlen` tokens.
    """
    # load test data
    translated = preprocess(
        pd.read_csv(data_path+'jigsaw_miltilingual_test_translated.csv')['translated'].astype(str)
    )
    # the source frame is unreferenced by now; reclaim it before tokenizing
    gc.collect()
    return sequence.pad_sequences(token.texts_to_sequences(translated), maxlen=maxlen)

X_test = test_data_prep()

# Embedding

In [19]:
%%time

def get_coefs(word, *arr):
    """Pair a token with its embedding components converted to a float32 vector."""
    vector = np.array(arr, dtype=np.float32)
    return (word, vector)

# Pre-trained 300-dimensional embedding files, loaded in this order by BMAT300.
EMBEDDING_FILES3 = [
    data_path+'crawl-300d-2M.vec',
    data_path+'glove.840B.300d.txt'
]

def load_embeddings(path):
    """Stream a text embedding file from a URL into a {word: vector} dict.

    Each line is expected to be ``word v1 v2 ... vN`` separated by single
    spaces. Lines with two or fewer fields are skipped: they are either blank
    or the ``<count> <dim>`` header of a ``.vec`` file, which would otherwise
    be stored as a one-element "vector" and silently broadcast over a whole
    embedding row. The response is closed as soon as the file has been read.

    Parameters
    ----------
    path : str
        URL understood by ``urllib.request.urlopen``.

    Returns
    -------
    dict
        Maps each word to the float32 array produced by ``get_coefs``.
    """
    embedding_index = {}
    with urllib.request.urlopen(path) as response:
        for raw_line in response:
            fields = raw_line.decode('utf-8').strip().split(' ')
            if len(fields) <= 2:
                # header or blank line, not a real embedding row
                continue
            word, vector = get_coefs(*fields)
            embedding_index[word] = vector
    return embedding_index

# Build MATrix
def BMAT300(word_index, path):
    """Build a (len(word_index) + 1, 300) embedding matrix from the file at `path`.

    Row i holds the vector of the word whose index is i; words missing from
    the embedding file (and row 0) stay all-zero.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((len(word_index) + 1, 300))
    for term, row in word_index.items():
        if term in vectors:
            matrix[row] = vectors[term]
    return matrix

# Custom 100-dimensional embedding file, loaded by BMAT100.
EMBEDDING_FILES1 = data_path+'custom_word2vec_100d_5iter.txt'

def BMAT100(word_index, path):
    """Build a (len(word_index) + 1, 100) embedding matrix from the file at `path`.

    Row i holds the vector of the word whose index is i; words missing from
    the embedding file (and row 0) stay all-zero.
    """
    vectors = load_embeddings(path)
    matrix = np.zeros((len(word_index) + 1, 100))
    for term, row in word_index.items():
        if term in vectors:
            matrix[row] = vectors[term]
    return matrix

# Stack the two 300d matrices side by side (600 columns) ...
embedding_matrix = np.concatenate([BMAT300(token.word_index, f) for f in EMBEDDING_FILES3], axis=-1)
# ... then prepend the custom 100d block: final column layout is [custom 100 | 300 | 300] = 700.
embedding_matrix = np.concatenate([BMAT100(token.word_index, EMBEDDING_FILES1), embedding_matrix], axis=-1)
# Keep only the first tot_uniq_words rows so the matrix matches the Embedding layer's input size.
embedding_matrix = embedding_matrix[0:tot_uniq_words,:]
embedding_matrix.shape

CPU times: user 5min 25s, sys: 8.74 s, total: 5min 33s
Wall time: 5min 54s


(200000, 700)

In [20]:
gc.collect()

20

# Define Objective

In [21]:
def objective(params):
    """Run one hyperopt trial and return the negated validation ROC AUC.

    Builds one or more bidirectional-LSTM models from ``params``, trains them
    on the streaming generators, and averages the per-round validation and
    test predictions with ``wgtFactor ** global_epoch`` weights. The trial is
    appended to ``hyperopt_history.csv`` and a submission CSV is written for
    the test set.

    Training schedules, selected by ``params['architecture']``:

    * 1 - one model per ``model_idx``; the same model is trained for
      ``g_epochs`` consecutive rounds and scored after each round.
    * 2 - a fresh model is created for every (``model_idx``, round) pair.
    * 3 - like 2, but with exactly two model indices: index 0 trains on
      ``sent_generator`` and index 1 on ``sent_generator2``.

    A round whose epoch budget ``int(epochs * epochsR2 ** round)`` is zero is
    skipped. Otherwise architectures 2 and 3 would average in predictions
    from a freshly created, never-trained model.

    Parameters
    ----------
    params : dict
        One sample from the search space. Keys read here: ``learnR``,
        ``lrFactor``, ``drpt_amt``, ``deep_nn``, ``lstm1_nrns``,
        ``lstm2_nrns``, ``chunksize``, ``threshold``, ``epochs``,
        ``epochsR2``, ``g_epochs``, ``num_models``, ``wgtFactor`` and
        ``architecture``. A ``score`` key is added to the dict in place.

    Returns
    -------
    float
        ``-AUC`` on the validation set (hyperopt minimises its objective).

    Raises
    ------
    ValueError
        If ``architecture`` is not 1, 2 or 3, or if no training round ran.
    """
    start_time = time.time()
    global expCounter
    print('running experiment number: ', expCounter)
    print ('Params testing: ', params)

    def create_model(learnR=params['learnR'],
                     lrFactor=params['lrFactor'],
                     model_idx=0,
                     tot_uniq_words=tot_uniq_words,
                     embed_size=embed_size,
                     drpt_amt=params['drpt_amt'],
                     deep_nn=params['deep_nn'],
                     lstm1_nrns=int(params['lstm1_nrns']),
                     lstm2_nrns=int(params['lstm2_nrns'])):
        """Build and compile the model; return ``(model, callbacks)``."""
        # clear the TF backend, otherwise we run into memory issues
        K.clear_session()

        # create model
        model = Sequential()
        model.add(Embedding(tot_uniq_words, embed_size, input_length=maxlen, weights=[embedding_matrix], trainable=True))
        if deep_nn:
            # optional extra recurrent layer in front of the final one
            model.add(Dropout(drpt_amt))
            model.add(Bidirectional(CuDNNLSTM(lstm1_nrns, return_sequences=True)))
        model.add(Dropout(drpt_amt))
        model.add(Bidirectional(CuDNNLSTM(lstm2_nrns)))
        model.add(Dropout(drpt_amt))
        model.add(Dense(1, activation='sigmoid'))

        # The base learning rate shrinks geometrically with the model index.
        # Horovod: adjust learning rate based on number of GPUs.
        lr = learnR * (lrFactor ** model_idx)
        opt = keras.optimizers.Adadelta(lr=lr * hvd.size())

        # Horovod: add Horovod Distributed Optimizer.
        opt = hvd.DistributedOptimizer(opt)

        # Compile model
        model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'], experimental_run_tf_function=False)

        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),

            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),

            # Reduce the learning rate if training plateaus.
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
        ]

        # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
        # if hvd.rank() == 0:
        #    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

        return model, callbacks

    # calculate steps
    chunksize = int(params['chunksize'])
    # 2131743 is the hard-coded training row count - NOTE(review): confirm it matches the CSV.
    train_batches = 2131743 // chunksize
    n_steps = train_batches // hvd.size()
    val_batches = 2000 // chunksize
    n_rounds = int(params['g_epochs'])

    # Per-round predictions and their ensemble weights, filled by train_round().
    checkpoint_predictions = []
    checkpoint_predictions_test = []
    weights = []

    def epochs_for(global_epoch):
        """Epoch budget for a round; decays geometrically with the round number."""
        return int(params['epochs'] * (params['epochsR2'] ** global_epoch))

    def skip_round(global_epoch):
        """Return True (and say so) when a round has no epochs to train."""
        if epochs_for(global_epoch) >= 1:
            return False
        print("skipping round", global_epoch, "- zero epochs scheduled")
        return True

    def train_round(model, callbacks, generator_fn, global_epoch):
        """Fit `model` for one round, then record its weighted predictions."""
        print("model running...")
        model.fit(
                generator_fn(chunksize=chunksize,
                             threshold=params['threshold'],
                             maxlen=maxlen),
                steps_per_epoch=n_steps,
                epochs=epochs_for(global_epoch),
                verbose=1,
                callbacks=callbacks,
                validation_data=(X_val,y_val),
                validation_steps=3 * val_batches // hvd.size()
        )
        checkpoint_predictions.append(model.predict(X_val, batch_size=200).flatten())
        checkpoint_predictions_test.append(model.predict(X_test, batch_size=200).flatten())
        weights.append(params['wgtFactor'] ** global_epoch)

    def release():
        """Drop backend state between models to keep memory in check."""
        K.clear_session()
        gc.collect()

    architecture = params['architecture']
    if architecture == 1:
        # run the same model across all rounds; weights carry over between rounds
        for model_idx in range(int(params['num_models'])):
            model, callbacks = create_model(model_idx=model_idx)
            for global_epoch in range(n_rounds):
                if skip_round(global_epoch):
                    continue
                train_round(model, callbacks, sent_generator, global_epoch)
            del model
            release()
    elif architecture in (2, 3):
        # every round is a new model w/ similar hyperparameters except learning rate and model weights
        if architecture == 2:
            plan = [(idx, sent_generator) for idx in range(int(params['num_models']))]
        else:
            # architecture 3: second model index trains on the augmented data
            plan = [(0, sent_generator), (1, sent_generator2)]
        for model_idx, generator_fn in plan:
            for global_epoch in range(n_rounds):
                if skip_round(global_epoch):
                    continue
                model, callbacks = create_model(model_idx=model_idx)
                train_round(model, callbacks, generator_fn, global_epoch)
                del model
                release()
    else:
        raise ValueError("unknown architecture: %r (expected 1, 2 or 3)" % (architecture,))

    if not checkpoint_predictions:
        raise ValueError("no training round ran; check 'epochs', 'g_epochs' and 'num_models'")

    # validation score
    predictions = np.average(checkpoint_predictions, weights=weights, axis=0)
    fpr, tpr, thresholds = metrics.roc_curve(y_val, predictions, pos_label=1)
    score = metrics.auc(fpr, tpr)

    # save hyperopt results to disk
    params['score'] = score
    print("score: ", score)
    histdf = pd.DataFrame.from_dict(params, orient="index").T
    histdf = histdf.reindex(sorted(histdf.columns), axis=1) # order columns a-z
    histdf.to_csv("hyperopt_history.csv",mode='a',header=False)

    # score test set and save results to disk
    predictions_test = np.average(checkpoint_predictions_test, weights=weights, axis=0)
    test_df = pd.read_csv(data_path+"jigsaw_miltilingual_test_translated.csv")
    submission = pd.DataFrame.from_dict({
        'id': test_df.id,
        'toxic': predictions_test
    })

    submission.to_csv(test_score_loc+'submission-custom-v'+str(expCounter)+'.csv', index=False)
    expCounter += 1
    print("runtime (mins):", (time.time() - start_time)/60)

    # hyperopt minimises, so return the negated AUC
    return score*(-1)

In [22]:
# example of how checkpoint_predictions is working:
# a list of per-round prediction arrays is averaged element-wise with per-round weights
a = []
b = np.array([1,2,3])
c = np.array([4,5,6])
a.append(b)
a.append(c)
print("a: ", a)
print("avg: ", np.average(a, weights=[2,3], axis=0))
# hand check of the first averaged element: (2*1 + 3*4) / (2 + 3)
(2*1+3*4)/5

a:  [array([1, 2, 3]), array([4, 5, 6])]
avg:  [2.8 3.8 4.8]


2.8

# Define Search Space

In [27]:
# possible values of parameters
# hp.quniform samples arrive as floats (e.g. 2051.0), so objective() casts
# counts and sizes with int(). With 'deep_nn' fixed to False the first LSTM
# layer is never built, so 'lstm1_nrns' has no effect on the model.
space={'lstm1_nrns': hp.quniform('lstm1_nrns', 5, 40, 1),
       'lstm2_nrns': hp.quniform('lstm2_nrns', 5, 15, 1),
       'deep_nn': hp.choice('deep_nn', [False]),
       'epochs': hp.quniform('epochs', 1, 2, 1),
       'g_epochs': hp.quniform('g_epochs', 1, 2, 1),
       'epochsR2': hp.quniform('epochsR2', 0.00001, 0.99, 0.00001),
       'learnR': hp.quniform('learnR', 0.00001, 0.99, 0.00001),
       'lrFactor': hp.quniform('lrFactor', 0.00001, 0.99, 0.00001),
       'wgtFactor': hp.quniform('wgtFactor', 0.01, 4, 0.01),
       'chunksize': hp.quniform('chunksize', 2000, 2200, 1),
       'drpt_amt': hp.choice('drpt_amt', [0.40, 0.50, 0.60, 0.70, 0.80, 0.85]),
       'threshold': hp.choice('threshold', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8]),
       'num_models': hp.quniform('num_models', 1, 2, 1),
       'architecture': hp.choice('architecture', [1,2,3])
      }

# Initialize Tracking

In [24]:
# Start the history file with a header and one placeholder row of zeros;
# columns are in a-z order so rows appended by objective() line up.
histdf = pd.DataFrame({name: [0] for name in sorted(list(space.keys())+['score'])})
histdf.to_csv("hyperopt_history.csv")

# Run Hyperopt

In [25]:
# suppress TF 2.0 warnings: only messages at ERROR level are logged
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
# Minimise objective() (the negated validation AUC) with TPE over 300 trials.
# NOTE(review): no Trials object or random state is passed, so per-trial
# results survive only in hyperopt_history.csv and runs are not seeded here.
best = fmin(objective, space, algo=tpe.suggest, max_evals=300)

running experiment number:                             
23                                                     
Params testing:                                        
{'architecture': 2, 'chunksize': 2051.0, 'deep_nn': False, 'drpt_amt': 0.7, 'epochs': 2.0, 'epochsR2': 0.6478100000000001, 'g_epochs': 1.0, 'learnR': 0.7312500000000001, 'lrFactor': 0.42793000000000003, 'lstm1_nrns': 35.0, 'lstm2_nrns': 11.0, 'num_models': 2.0, 'threshold': 0.2, 'wgtFactor': 1.36}
model running...                                       
Epoch 1/2                                              
   1/1039 [..............................]             
 - ETA: 4:09:42 - loss: 0.7065 - accuracy: 0.5427      
                                                       
   2/1039 [..............................]             
 - ETA: 2:13:16 - loss: 0.6924 - accuracy: 0.5685      
                                                       
   3/1039 [..............................]             
 - ETA: 1:34:28 - loss: 0.674

# Print Results

In [None]:
# best run as returned by fmin: for hp.choice parameters this holds the index
# of the chosen option, not the option value (decoded with space_eval below)
print(best)

In [None]:
# best run values: space_eval maps the fmin result back onto the search space
print(space_eval(space, best))

In [None]:
# best validation AUC: `best` stores hp.choice selections as indices, so it
# must be decoded with space_eval before being passed to objective().
# objective() returns the negated AUC, hence the sign flip. Note this
# retrains the best configuration from scratch.
print(-objective(space_eval(space, best)))

# Resources:

https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/

https://keras.io/layers/embeddings/

https://keras.io/layers/convolutional/

https://keras.io/layers/pooling/

https://machinelearningmastery.com/use-keras-deep-learning-models-scikit-learn-python/

https://stackoverflow.com/questions/40993626/list-memory-usage-in-ipython-and-jupyter

https://www.kaggle.com/ilialar/hyperparameters-tunning-with-hyperopt?utm_medium=email&utm_source=intercom&utm_campaign=datanotes-2019

https://www.kaggle.com/ilialar/hyperparameters-tunning-with-hyperopt

https://machinelearningmastery.com/use-keras-deep-learning-models-scikit-learn-python/

https://www.kaggle.com/inspector/keras-hyperopt-example-sketch

http://hyperopt.github.io/hyperopt/

Azure Specific:

https://stackoverflow.com/questions/1393324/in-python-given-a-url-to-a-text-file-what-is-the-simplest-way-to-read-the-cont