In [29]:
import pickle
import pandas as pd
import numpy as np
from importlib import reload
from helpers import constants; reload(constants)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from helpers.helper_functions import LossAndErrorPrintingCallback
from tensorflow.keras.callbacks import CSVLogger
from datetime import datetime
import os
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.initializers import Constant

In [30]:
select_label = constants.SELECT_LABEL

# Data locations defined in helpers.constants
intermediate_path, model_data_path, sample_data_path = (
    constants.ITM_DATA_DIR,
    constants.PRCD_DATA_DIR,
    constants.SAMPLE_DATA_DIR,
)

# Text / embedding sizes
max_len = constants.MAX_SEQUENCE_LENGTH  # maximum number of words of a post to use
max_word_no = constants.MAX_NUM_WORDS  # number of unique words to use (i.e. rows of the embedding matrix)
embedding_dim = constants.EMBEDDING_DIM  # number of elements per word in the GloVe embedding

In [31]:
# Load data
# Context managers make sure both pickle files are closed once they are read.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files produced by this project's own preprocessing step.
with open(model_data_path, 'rb') as model_data_file:
    data_train, labels_train, data_test, labels_test = pickle.load(model_data_file)

with open(sample_data_path, 'rb') as sample_data_file:
    data_train_sample, labels_train_sample, data_test_sample, labels_test_sample = pickle.load(sample_data_file)

In [32]:
# The model below is trained and evaluated on the arrays loaded from
# sample_data_path; the arrays loaded from model_data_path are not used here.
X_train, y_train = data_train_sample, labels_train_sample
X_test, y_test = data_test_sample, labels_test_sample

In [33]:
# Initialize parameters and hyper-parameters

# Per-class weight = (largest column sum of y_train) / (that class's column sum),
# so the class with the largest sum gets weight 1 and the others get >= 1.
class_counts = np.sum(y_train, axis=0)
weight_vec = list(np.max(class_counts) / class_counts)
class_weight = dict(enumerate(weight_vec))

# Settings held fixed across every trial of the grid search
no_epoch = 20
conv_filters = 128

In [34]:
# Root log directory for this experiment, with hyper-parameter tuning runs in a sub-directory
log_dir = os.path.join('..', 'log_gender_self_cnn')
hparam_dir = os.path.join(log_dir, 'hparam_tuning')

In [35]:
# Name under which each trial's test accuracy is logged
METRIC_ACCURACY = 'accuracy'

# Search space: every HParam lists the discrete values tried by the grid search
HP_EMD_OP = hp.HParam('emd_output', hp.Discrete([64, 128]))
HP_LR = hp.HParam('learning_rate', hp.Discrete([0.0001, 0.001, 0.01]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.1, 0.2, 0.3]))
HP_BATCH = hp.HParam('batch_size', hp.Discrete([128, 256, 512]))

tuned_hparams = [HP_EMD_OP, HP_LR, HP_DROPOUT, HP_BATCH]
reported_metrics = [hp.Metric(METRIC_ACCURACY, display_name='Accuracy')]

# Write the experiment-level HParams configuration into hparam_dir
with tf.summary.create_file_writer(hparam_dir).as_default():
    hp.hparams_config(hparams=tuned_hparams, metrics=reported_metrics)

In [38]:
def train_test_model(hparams, log_dir):
    """Build, train and evaluate the multi-kernel 1D CNN for one hyper-parameter set.

    Args:
        hparams: dict keyed by the HParam objects HP_EMD_OP (embedding output
            size), HP_LR (Adam learning rate), HP_BATCH (batch size) and
            HP_DROPOUT (rate used by every Dropout layer; falls back to 0.2
            when the key is absent).
        log_dir: directory the TensorBoard callback writes its logs to
            (this parameter shadows the notebook-level ``log_dir``).

    Returns:
        The accuracy value returned by ``model.evaluate`` on (X_test, y_test).

    Reads the notebook-level names max_len, max_word_no, conv_filters,
    no_epoch, class_weight, X_train, y_train, X_test and y_test.
    """
    layers = tf.keras.layers

    # Use the tuned dropout rate; previously every Dropout layer was fixed at
    # 0.2, so the 'dropout' hyper-parameter had no effect on the model.
    dropout_rate = hparams.get(HP_DROPOUT, 0.2)

    # construct the model
    # train a 1D convnet with global maxpooling
    sequence_input = layers.Input(shape=(max_len,), dtype='int32')
    embedding_layer = layers.Embedding(input_dim=max_word_no, output_dim=hparams[HP_EMD_OP],
                                       input_length=max_len, trainable=True)
    emb = embedding_layer(sequence_input)  # turn word index into word embedding

    # One convolution branch per kernel size (i.e. n-gram width); each branch is
    # Conv1D -> BatchNorm -> Dropout -> ReLU -> GlobalMaxPooling1D over the embedding.
    pooled_branches = []
    for kernel_size in (3, 4, 5, 6):
        branch = layers.Conv1D(filters=conv_filters, kernel_size=kernel_size)(emb)
        branch = layers.BatchNormalization()(branch)
        branch = layers.Dropout(dropout_rate)(branch)
        branch = layers.Activation('relu')(branch)
        pooled_branches.append(layers.GlobalMaxPooling1D()(branch))

    # Gather all convolution branches
    cnct = layers.concatenate(pooled_branches, axis=1)
    drp1 = layers.Dropout(dropout_rate)(cnct)

    dns1 = layers.Dense(32, activation='relu')(drp1)
    btch1 = layers.BatchNormalization()(dns1)
    drp2 = layers.Dropout(dropout_rate)(btch1)

    # One softmax output unit per label column of y_train
    out = layers.Dense(y_train.shape[1], activation='softmax')(drp2)

    model = tf.keras.models.Model(inputs=sequence_input, outputs=out)

    # `learning_rate` replaces the deprecated `lr` alias; `decay=0.0` was a
    # no-op and is dropped because newer optimizer versions do not accept it.
    adam = Adam(learning_rate=hparams[HP_LR], beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # NOTE(review): a softmax output is paired with binary_crossentropy here.
    # That should match categorical cross-entropy only when y_train is a
    # two-column one-hot matrix -- confirm, or switch to
    # 'categorical_crossentropy' if more classes are used.
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, validation_split=0.1, epochs=no_epoch, batch_size=hparams[HP_BATCH],
              shuffle=True, class_weight=class_weight, verbose=1,
              callbacks=[tf.keras.callbacks.TensorBoard(log_dir, histogram_freq=1)])

    # evaluate returns [loss, accuracy] given the single compiled metric
    _, accuracy = model.evaluate(X_test, y_test)
    return accuracy

In [None]:
# Exhaustive grid search: train and evaluate one model per combination of the
# declared hyper-parameter values, logging each trial for the HParams dashboard.
session_num = 0
for emb_output in HP_EMD_OP.domain.values:
    for dropout_rate in HP_DROPOUT.domain.values:
        for learning_rate in HP_LR.domain.values:
            for batch_size in HP_BATCH.domain.values:
                hparams = { HP_EMD_OP: emb_output, HP_DROPOUT: dropout_rate, HP_LR: learning_rate, HP_BATCH: batch_size }
                run_name = "run-%d" % session_num
                print('--- Starting trial: %s' % run_name)
                print({h.name: hparams[h] for h in hparams})
                # The directory name carries every tuned value (including the
                # embedding size, which was previously omitted and made trials
                # that differ only in emd_output share one log directory).
                # os.path.join keeps the path valid on non-Windows systems too.
                trial_name = 'emd_%s_dp_%s_lr_%s_bz_%s' % (emb_output, dropout_rate, learning_rate, batch_size)
                logdir = os.path.join(hparam_dir, trial_name)
                with tf.summary.create_file_writer(os.path.join(logdir, 'metrics')).as_default():
                    hp.hparams(hparams) # record the values used in this trial
                    accuracy = train_test_model(hparams, logdir)
                    tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)
                session_num += 1

--- Starting trial: run-0
{'emd_output': 64, 'dropout': 0.1, 'learning_rate': 0.0001, 'batch_size': 128}
Train on 40480 samples, validate on 4498 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


--- Starting trial: run-1
{'emd_output': 64, 'dropout': 0.1, 'learning_rate': 0.0001, 'batch_size': 256}
Train on 40480 samples, validate on 4498 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
 5376/40480 [==>...........................] - ETA: 11:19 - loss: 0.3063 - accuracy: 0.8761