In [56]:
import os
import pickle
import pandas as pd
import numpy as np
from importlib import reload
from helpers import constants; reload(constants)
from helpers.helper_functions import LossAndErrorPrintingCallback
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D, Activation, concatenate
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, BatchNormalization, SpatialDropout1D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, TensorBoard
from tensorflow.keras.initializers import Constant

In [50]:
# Load some constants
select_label = constants.SELECT_LABEL
intermediate_path = constants.ITM_DATA_DIR
model_data_path = constants.PRCD_DATA_DIR
tokenizer_path = constants.TOKEN_DIR
sample_data_path = constants.SAMPLE_DATA_DIR

embedding_path = constants.GLOVE_DIR
embedding_dim = constants.EMBEDDING_DIM # the number of element for one word in Glove Embedding

log_dir = constants.LOG_DIR
max_len = constants.MAX_SEQUENCE_LENGTH # max number of words in a post to use 
max_word_no = constants.MAX_NUM_WORDS # how many unique words to use (i.e num rows in embedding vector)

# read in the fitted tokenizer from pickle
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [51]:
#Load data
data_train, labels_train, data_test, labels_test = pickle.load(open(model_data_path,'rb'))
data_train_sample, labels_train_sample = pickle.load(open(sample_data_path,'rb'))

In [26]:
embeddings_index = {}
with open(os.path.join(embedding_path, 'glove.6B.100d.txt'),'r', encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [31]:
# prepare embedding matrix
word_index = tokenizer.word_index
num_words = min(max_word_no, len(word_index) + 1) # Only use the top num_words words from the training corp
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_word_no: # if the word index exceeds the limit
        continue # then this word embedding is not included into the embedding
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words, # input dim - vocabulary size
                            embedding_dim, # output dim - dense embedding dimension
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_len,
                            trainable=False)


In [32]:
# Initialize parameters
conv_filters = 128 # No. filters to use for each convolution
weight_vec = list(np.max(np.sum(labels_train, axis=0))/np.sum(labels_train, axis=0))
class_weight = {i: weight_vec[i] for i in range(labels_train.shape[1])}

In [33]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(max_len,), dtype='int32')
emb = embedding_layer(sequence_input) # turn word index into word embedding

# Specify each convolution layer and their kernel siz i.e. n-grams 
conv1_1 = Conv1D(filters=conv_filters, kernel_size=3)(emb)
btch1_1 = BatchNormalization()(conv1_1)
drp1_1  = Dropout(0.2)(btch1_1)
actv1_1 = Activation('relu')(drp1_1)
glmp1_1 = GlobalMaxPooling1D()(actv1_1)

conv1_2 = Conv1D(filters=conv_filters, kernel_size=4)(emb)
btch1_2 = BatchNormalization()(conv1_2)
drp1_2  = Dropout(0.2)(btch1_2)
actv1_2 = Activation('relu')(drp1_2)
glmp1_2 = GlobalMaxPooling1D()(actv1_2)

conv1_3 = Conv1D(filters=conv_filters, kernel_size=5)(emb)
btch1_3 = BatchNormalization()(conv1_3)
drp1_3  = Dropout(0.2)(btch1_3)
actv1_3 = Activation('relu')(drp1_3)
glmp1_3 = GlobalMaxPooling1D()(actv1_3)

conv1_4 = Conv1D(filters=conv_filters, kernel_size=6)(emb)
btch1_4 = BatchNormalization()(conv1_4)
drp1_4  = Dropout(0.2)(btch1_4)
actv1_4 = Activation('relu')(drp1_4)
glmp1_4 = GlobalMaxPooling1D()(actv1_4)

# Gather all convolution layers
cnct = concatenate([glmp1_1, glmp1_2, glmp1_3, glmp1_4], axis=1)
drp1 = Dropout(0.2)(cnct)

dns1  = Dense(32, activation='relu')(drp1)
btch1 = BatchNormalization()(dns1)
drp2  = Dropout(0.2)(btch1)

out = Dense(labels_train.shape[1], activation='softmax')(drp2)

In [34]:
# Compile
model = Model(inputs=sequence_input, outputs=out)
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [40]:
model_checkpoint = ModelCheckpoint(filepath=log_dir+'/check_point.check', monitor='val_loss', mode='min', save_best_only=True, verbose=1)
csvlogger = CSVLogger(filename=log_dir+'/csvlogger.csv')

In [54]:
# Clear any logs from previous runs
!del /q ..\log\*

In [55]:
# Estimate model
model_history = model.fit(data_train_sample, labels_train_sample, 
                          validation_split=0.1, 
                          epochs=2, 
                          batch_size=512, 
                          shuffle=True, 
                          class_weight=class_weight, 
                          callbacks = [csvlogger,LossAndErrorPrintingCallback()],
                          verbose=0)

For batch 0, loss is    0.72.
For batch 1, loss is    0.69.
For batch 2, loss is    0.69.
For batch 3, loss is    0.69.
For batch 4, loss is    0.69.
For batch 5, loss is    0.70.
For batch 6, loss is    0.70.
For batch 7, loss is    0.68.
For batch 8, loss is    0.69.
For batch 9, loss is    0.68.
For batch 10, loss is    0.66.
For batch 11, loss is    0.69.
For batch 12, loss is    0.70.
For batch 13, loss is    0.68.
For batch 14, loss is    0.68.
For batch 15, loss is    0.71.
For batch 16, loss is    0.67.
For batch 17, loss is    0.67.
For batch 18, loss is    0.71.
For batch 19, loss is    0.70.
For batch 20, loss is    0.69.
For batch 21, loss is    0.66.
For batch 22, loss is    0.68.
For batch 23, loss is    0.67.
For batch 24, loss is    0.70.
For batch 25, loss is    0.65.
For batch 26, loss is    0.68.
For batch 27, loss is    0.68.
For batch 28, loss is    0.67.
For batch 29, loss is    0.68.
For batch 30, loss is    0.71.
For batch 31, loss is    0.69.
For batch 32, loss

IndexError: tuple index out of range

In [None]:
results = model.evaluate(data_test, labels_test, batch_size=128)

for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))