# Model Training
> To allow a more automated training procedure, model training is delegated to a standalone Python script that this notebook launches as a separate process.  
This avoids having to restart the notebook kernel between runs and keeps Keras' memory usage in check.

In [2]:
%load_ext autoreload
%autoreload 2
import functools
import json
import subprocess
import sys
import time # !
from importlib import reload

import numpy as np
from matplotlib import pyplot as plt

from keras.utils import to_categorical

import glove_helper
from loadutils import conll2003Data, saveProcessedData, retrieve_model
from common import vocabulary, utils


Using TensorFlow backend.


In [3]:
# Alternative CoNLL-2003 (English) input files, currently disabled:
# TRAIN_FILE = "../data/CoNLL-2003_NeuroNER/en/train.txt"
# DEV_FILE = "../data/CoNLL-2003_NeuroNER/en/valid.txt"
# TEST_FILE = "../data/CoNLL-2003_NeuroNER/en/test.txt"

# Active input files: train/dev/test splits from the "pos_tagging/es" data directory.
TRAIN_FILE = "../data/pos_tagging/es/train.txt"
DEV_FILE = "../data/pos_tagging/es/dev.txt"
TEST_FILE = "../data/pos_tagging/es/test.txt"

# out files for IPC
# (the hyper-parameter dict is written to this JSON file by trainModelSP and the
#  file name is passed to the external training script on its command line)
HYPER_PARAM_FILE = "hyper_params.json"

# Maximum vocabulary size; must agree with `global_max_features` in the data-loading cell.
VOCAB_SIZE = 20000

## Local helper utils

In [4]:
# local utils

# timeit decorator
def timeit(method):
    """
    Decorator that measures the wall-clock duration of each call to `method`.

    If the call receives a `log_time` keyword argument (a dict), the elapsed time
    is stored in it as an int number of milliseconds, keyed by the `log_name`
    keyword argument when given and by the upper-cased function name otherwise.
    Without `log_time`, the timing is printed as "'<name>'  <ms> ms".

    Note: `log_time` / `log_name` are not stripped from the call; they are
    forwarded to `method`, which therefore has to accept them.

    Returns: the wrapping function, which returns whatever `method` returns.
    """
    @functools.wraps(method)  # keep the wrapped function's __name__ and docstring
    def timed(*args, **kw):
        # perf_counter is monotonic, so the measurement is immune to clock adjustments
        ts = time.perf_counter()
        result = method(*args, **kw)
        elapsed_ms = (time.perf_counter() - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

In [5]:
def construct_embedding_matrix(embed_dim, vocab_size, vocab=None):
    """
    Construct an embedding matrix from the pre-trained word vectors loaded by
    glove_helper.Hands (reuses the glove_helper code from w266).

    embed_dim - dimensionality of the pre-trained vectors to load
    vocab_size - number of rows of the returned matrix
    vocab - vocabulary object providing `.size` and `.ids_to_words([id])`;
            defaults to the notebook-global `vocabData.vocab`, so existing
            two-argument calls behave as before

    Row i holds the vector of the word with id i. Words for which the lookup
    fails get the vector of "<unk>". Rows at or beyond `vocab.size` stay zero.

    Returns: an embedding matrix directly plugged into keras.layers.Embedding(weights=[embedding_matrix])
    """
    if vocab is None:
        # fall back to the vocabulary built in the data-loading cell
        vocab = vocabData.vocab

    reload(glove_helper)
    hands = glove_helper.Hands(ndim=embed_dim)
    embedding_matrix = np.zeros((vocab_size, embed_dim))

    for i in range(vocab.size):
        word = vocab.ids_to_words([i])[0]
        try:
            embedding_vector = hands.get_vector(word)
        except Exception:
            # Lookup failed (word not covered by the pre-trained vectors).
            # `Exception` rather than a bare except, so KeyboardInterrupt /
            # SystemExit still stop this long-running loop.
            embedding_vector = hands.get_vector("<unk>")
        embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [6]:
def plot_history( history):
    """
    Plot the accuracy and loss curves stored in a Keras History object.

    history - object whose `.history` dict maps metric names to per-epoch value
              lists; must contain 'loss', 'val_loss' and an accuracy metric
              recorded as either 'acc'/'val_acc' or 'accuracy'/'val_accuracy'
              (the key depends on how the metric was named at compile time)

    Shows two figures (accuracy, then loss). Returns None.
    """
    def _plot_metric(key, title, ylabel):
        # one figure per metric: training curve first, validation curve second
        plt.plot(history.history[key])
        plt.plot(history.history['val_' + key])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        # the second curve is the validation metric; the 'test' label is kept from the original
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

    # summarize history for accuracy
    acc_key = 'acc' if 'acc' in history.history else 'accuracy'
    _plot_metric(acc_key, 'model accuracy', 'accuracy')

    # summarize history for loss
    _plot_metric('loss', 'model loss', 'loss')

## Load the Data

In [7]:
# Data-loading settings
global_max_features = VOCAB_SIZE  # vocabulary cap; reuses the config constant (20000) instead of repeating the literal
windowLength = 11  # passed to formatWindowedData as the window length for every split
#testNumSents = 20000

# Use training set to build vocab here
vocabData = conll2003Data(TRAIN_FILE)
vocabData.buildVocab( vocabSize=global_max_features)

# Format training data
trainX, trainX_pos, trainX_capitals, trainY  = vocabData.formatWindowedData( 
                                                  vocabData.train_sentences, 
                                                  windowLength=windowLength,
                                                  verbose=False)

# read in dev data (formatted with the vocabulary built from the training set)
devSents = vocabData.readFile( DEV_FILE)
devX, devX_pos, devX_capitals, devY = vocabData.formatWindowedData( 
                                              devSents, 
                                              windowLength=windowLength,
                                              verbose=False)

# read in the test data (formatted with the vocabulary built from the training set)
testSents = vocabData.readFile( TEST_FILE)
testX, testX_pos, testX_capitals, testY = vocabData.formatWindowedData( 
                                                testSents, 
                                                windowLength=windowLength,
                                                verbose=False)

----------------------------------------------------
reading file from path ../data/pos_tagging/es/train.txt
'readFile'  4540.66 ms
----------------------------------------------------
building vocabulary from TRAINING data...
'buildVocab'  3321.62 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  6987.33 ms
----------------------------------------------------
reading file from path ../data/pos_tagging/es/dev.txt
'readFile'  457.22 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  639.20 ms
----------------------------------------------------
reading file from path ../data/pos_tagging/es/test.txt
'readFile'  538.28 ms
----------------------------------------------------
formatting sentences into input windows...
'formatWindowedData'  983.70 ms


In [8]:
# Build the pre-trained embedding matrix for the vocabulary
# (the recorded run of this cell loaded the wiki.es vectors -- see the output below).

# The embedding size lives in a notebook-level global rather than in the
# hyper-parameter dictionaries, because it has to be fixed before the data
# for the decoder output is derived from the embedding matrix.
# global_embed_dim = 50
global_embed_dim = 300

embedding_matrix = construct_embedding_matrix(embed_dim=global_embed_dim,
                                              vocab_size=global_max_features)

Loading vectors from data/es/wiki.es.zip
Parsing file: data/es/wiki.es.zip:wiki.es.vec
Found 985,668 words.
Parsing vectors... Done! (W.shape = (985671, 300))


In [46]:
# AZ: devY and testY have different shapes here
# Sanity check: number of one-hot columns when testY is encoded on its own,
# i.e. without forcing num_classes (211 in the recorded run).

to_categorical(testY.astype('float32')).shape[1]

211

In [32]:
# Get Y

# cat train/dev/test to make sure we have all labels
# Y_cat_all 

# One-hot encode the targets. dev/test are forced to the training set's class
# count so that all three splits end up with the same number of columns.
trainY_cat = to_categorical(trainY.astype('float32'))
devY_cat = to_categorical(devY.astype('float32'), num_classes=trainY_cat.shape[1])
testY_cat = to_categorical(testY.astype('float32'), num_classes=trainY_cat.shape[1])

# Drop the first 3 one-hot columns (presumably reserved/special ids -- TODO confirm).
# A single vectorised slice replaces the per-row map/lambda rebuild, and
# np.float64 replaces the `np.float` alias that was removed in NumPy 1.24.
trainY_cat = trainY_cat[:, 3:].astype(np.float64)
devY_cat = devY_cat[:, 3:].astype(np.float64)
testY_cat = testY_cat[:, 3:].astype(np.float64)

In [45]:
# AZ: after changing their shape to match trainY, they all have shape (X, 216)
# (X is the number of rows of the split; the recorded run shows (44563, 216) for test)

testY_cat.shape

(44563, 216)

In [9]:
# Get decoder Y -- embedding vector (global_embed_dim wide) of each window's center word

# Derive the center position from the window width instead of hard-coding 4:
# 4 is the middle of a 9-wide window only, while windowLength is now 11.
# Assumes trainX/devX/testX are (examples, window) id arrays with the target
# word in the middle of the window -- TODO confirm against formatWindowedData.
center_idx = trainX.shape[1] // 2

train_decoderY = embedding_matrix[trainX[:, center_idx]]
dev_decoderY = embedding_matrix[devX[:, center_idx]]
test_decoderY = embedding_matrix[testX[:, center_idx]]

In [10]:
# Get X pos tags

# One-hot encode the POS tags of every window position. dev/test are forced to
# the training set's class count so the last axis matches across splits.
trainX_pos_cat = to_categorical(trainX_pos.astype('float32'))
devX_pos_cat = to_categorical(devX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2]) 
testX_pos_cat = to_categorical(testX_pos.astype('float32'), num_classes=trainX_pos_cat.shape[2])

# Drop the first 3 one-hot columns on the last axis (presumably reserved/special
# ids -- TODO confirm). A single vectorised slice replaces the per-example
# map/lambda rebuild, and np.float64 replaces the `np.float` alias removed in NumPy 1.24.
trainX_pos_cat = trainX_pos_cat[:, :, 3:].astype(np.float64)
devX_pos_cat = devX_pos_cat[:, :, 3:].astype(np.float64)
testX_pos_cat = testX_pos_cat[:, :, 3:].astype(np.float64)

In [11]:
# Get X capitalization

# encoding 1-hot for capitalization info  ("allCaps", "upperInitial", "lowercase", "mixedCaps", "noinfo")
# dev/test are forced to the training set's class count so the last axis matches across splits.
trainX_capitals_cat = to_categorical(trainX_capitals.astype('float32'))
devX_capitals_cat = to_categorical(devX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2]) 
testX_capitals_cat = to_categorical(testX_capitals.astype('float32'), num_classes=trainX_capitals_cat.shape[2])

# Drop the first 3 one-hot columns on the last axis (presumably reserved/special
# ids -- TODO confirm). A single vectorised slice replaces the per-example
# map/lambda rebuild, and np.float64 replaces the `np.float` alias removed in NumPy 1.24.
trainX_capitals_cat = trainX_capitals_cat[:, :, 3:].astype(np.float64)
devX_capitals_cat = devX_capitals_cat[:, :, 3:].astype(np.float64)
testX_capitals_cat = testX_capitals_cat[:, :, 3:].astype(np.float64)

## Set up model parameters

In [12]:
# define hyper parameters for model
# CAPSNET
# This dict is copied and tweaked by the training cells, then serialised to JSON
# for the external training script, so every value must be JSON serialisable.
hyper_param_caps = {
    
    'max_features' : global_max_features,  # vocabulary size (20000)
    'maxlen' : trainX.shape[1],  # window size (original note: 9; windowLength is now 11 -- TODO confirm)
    'poslen' : trainX_pos_cat.shape[2],  # POS one-hot width after dropping the first 3 columns (original note: 45 -- TODO confirm)
    'capitallen' : trainX_capitals_cat.shape[2],  # capitalization one-hot width after dropping the first 3 columns (original note: 5)
    'ner_classes' : trainY_cat.shape[1],  # number of target classes (original note: 8; 216 in the recorded run)
    'embed_dim' : global_embed_dim,  # word embedding size
    'num_routing' : 3, 

    'use_glove' : True,
    'allow_glove_retrain' : False,
    'use_pos_tags' : True,
    'use_capitalization_info' : True,    
    
    'conv1_filters' : 256,
    'conv1_kernel_size' : 3,
    'conv1_strides' : 1,
    'conv1_padding' : 'valid',
    
    'use_2D_primarycaps' : False,
    'primarycaps_dim_capsule' : 8,
    'primarycaps_n_channels' : 32,
    'primarycaps_kernel_size' : 3,
    'primarycaps_strides' : 1,
    'primarycaps_padding' : 'valid',

    'ner_capsule_dim' : 16,
    
    # NOTE(review): 'num_routing' above carries the same value; which of the two
    # keys the training script actually reads is not visible from this notebook.
    'num_dynamic_routing_passes' : 3,
    
    # decoder is still work in progress
    'use_decoder' : False,
    'decoder_feed_forward_1' : 100,
    'decoder_feed_forward_2' : 100, 
    'decoder_dropout' : 0.3,
    
    'save_dir' : './result',
    'batch_size' : 100,
    'debug' : 2,
    'epochs' : 5,
    'stopping_patience' : 3, 
    'dropout_p' : 0.25,
    'embed_dropout' : 0.25,
    'lam_recon' : 0.0005,
    
    'optimizer' : 'Adam', #or 'SGD'
    'loss_function' : 'custom_cosine', # alternative noted by the author: mean_squared_error
}

In [13]:
# define hyper parameters for model
# CNN
# Not referenced by the training cells visible in this notebook (they copy hyper_param_caps).
hyper_param_cnn = {
    
    'max_features' : global_max_features,  # vocabulary size (20000)
    'maxlen' : trainX.shape[1],  # window size (original note: 9; windowLength is now 11 -- TODO confirm)
    'poslen' : trainX_pos_cat.shape[2],  # POS one-hot width after dropping the first 3 columns (original note: 45 -- TODO confirm)
    'capitallen' : trainX_capitals_cat.shape[2],  # capitalization one-hot width after dropping the first 3 columns (original note: 5)
    'ner_classes' : trainY_cat.shape[1],  # number of target classes (original note: 8; 216 in the recorded run)
    'embed_dim' : global_embed_dim,  # word embedding size
    'num_routing' : 3, 

    'use_glove' : True,
    'allow_glove_retrain' : False,
    'use_pos_tags' : True,
    'use_capitalization_info' : True,    
    
    'conv1_filters' : 256,
    'conv1_kernel_size' : 3,
    'conv1_strides' : 1,
    'conv1_padding' : 'valid',
    
    'conv2_filters' : 256,
    'conv2_kernel_size' : 3,
    'conv2_strides' : 1,
    'conv2_padding' : 'valid',
    
    'conv3_filters' : 128,
    'conv3_kernel_size' : 3,
    'conv3_strides' : 1,
    'conv3_padding' : 'valid',
    
    'max_pooling_size' : 3,
    'max_pooling_strides' : 1,
    'max_pooling_padding' : 'valid',
    'maxpool_dropout' : 0.3,
    
    'feed_forward_1' : 328,
    'ff1_dropout' : 0.3,
    'feed_forward_2' : 192,
    'ff2_dropout' : 0.3,
    
    'save_dir' : './result',
    'batch_size' : 100,
    'debug' : 2,
    'epochs' : 5,
    'stopping_patience' : 5, # default to same as epochs, ie don't use
    'dropout_p' : 0.25,
    'embed_dropout' : 0.25,  # set to 0 to disable dropout
    'lam_recon' : 0.0005,
    
    'optimizer' : 'Adam', #or 'SGD'
}

## Save All Data to Disk

In [14]:
# save all loaded data for use by training process
# (training runs in a separate Python process, so the arrays are handed over via disk).
# Only the train/dev arrays, the embedding matrix and the train/dev decoder
# targets are passed here -- none of the test-split arrays are part of this call.
saveProcessedData( trainX, trainX_capitals_cat, trainX_pos_cat, devX, devX_capitals_cat,
                   devX_pos_cat, trainY_cat, devY_cat, embedding_matrix, train_decoderY, dev_decoderY)

## Model Training Functions

In [15]:
@timeit 
def trainModelSP( testFunc, modelName, hyper_params, embed_matrix=None, verbose=False):
    """
    Train one model in a separate Python process.

    The hyper parameters are written to HYPER_PARAM_FILE as JSON, then the
    script is run as `<python> testFunc modelName HYPER_PARAM_FILE` and its
    combined stdout/stderr is echoed line by line while it runs.

    testFunc - the name of the python file to run
    modelName - the internal name (ID) of the model to train
    hyper_params - a dict of hyper parameters (must be JSON serialisable)
    embed_matrix - unused; kept so existing callers keep working
    verbose - unused; kept so existing callers keep working

    Returns: None. A non-zero exit status of the script is reported with a
    printed warning instead of an exception, so a batch of runs keeps going
    (as it did with the previous shell escape).
    """
    # save the hyperparams
    with open(HYPER_PARAM_FILE, mode='w') as fp:
        json.dump( hyper_params, fp)

    # Run the training script with an argument list and no shell: the names are
    # passed verbatim (no quoting/injection issues) and plain Python replaces the
    # IPython-only `!` escape. sys.executable selects the kernel's own interpreter;
    # -u keeps the child's output unbuffered so progress shows up as it happens.
    cmd = [sys.executable, '-u', str(testFunc), str(modelName), HYPER_PARAM_FILE]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            universal_newlines=True, bufsize=1)
    with proc.stdout:
        for line in proc.stdout:
            print(line, end='')
    returncode = proc.wait()

    if returncode != 0:
        print("WARNING: '%s' exited with status %d" % (' '.join(cmd), returncode))


In [16]:
@timeit 
def testFeatures( testFunc, modelName, hyper_params):
    """
    Train the "<modelName>_base" model for the configuration in hyper_params.

    Only the base run is performed: the hyper parameters are used exactly as
    passed in (working on a shallow copy). The per-feature variants that used
    to follow it ("_pos", "_caps", "_pos_caps", which toggled 'use_pos_tags' /
    'use_capitalization_info' before each extra run) are disabled.

    testFunc - the name of the python file to run
    modelName - the model name to use for labeling
    hyper_params - a dict of hyper parameters
    """
    # shallow copy, so the caller's dict is never the object handed on
    run_params = hyper_params.copy()

    # base configuration
    base_name = modelName + "_base"
    trainModelSP( testFunc, base_name, run_params )
    

##  Training
> The output isn't pretty, but we don't really need it, since everything is stored in the history log; it mainly serves as a sign of life.  
> * The cells below are just an example of how to set hyper parameters and train multiple models.

In [17]:
# capsnet training function
testFunc = "trainCapsModel.py"

# Start from the CapsNet defaults: a single epoch, without the POS and
# capitalization input features.
hypers = hyper_param_caps.copy()
hypers.update({
    'epochs' : 1,
    'stopping_patience' : 3,
    'use_pos_tags' : False,
    'use_capitalization_info' : False,
})

# Embedding variants. Only "glove_nolearn" is active; to run another one, set
# the three keys as listed and call testFeatures( testFunc, <model name>, hypers).
#
#   model name                use_glove   allow_glove_retrain   embed_dropout
#   "learn"                   False       (not set)             0.0
#   "learn_dropout"           False       (not set)             0.25
#   "glove_nolearn"           True        False                 0.0     <- active
#   "glove_nolearn_dropout"   True        False                 0.25
#   "glove_learn"             True        True                  0.0
#   "glove_learn_dropout"     True        True                  0.25

# use glove, no learn
print("\n\nGlove Embeddings")
hypers.update({
    'use_glove' : True,
    'allow_glove_retrain' : False,
    'embed_dropout' : 0.0,
})
testFeatures( testFunc, "glove_nolearn", hypers)




Glove Embeddings
Using TensorFlow backend.
fargs:
 ['trainCapsModel.py', 'glove_nolearn_base', 'hyper_params.json']
W0326 10:59:42.862789 140220505089856 module_wrapper.py:139] From /home/andrew/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0326 10:59:42.864777 140220505089856 module_wrapper.py:139] From /home/andrew/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0326 10:59:42.865197 140220505089856 module_wrapper.py:139] From /home/andrew/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0326 10:59:42.871878 140220505089856 module_wrapper.py:139] From /home/andrew/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174:

making model prediction on dev set... 
using trained model as is because decoder is DISabled.
y_true shape from devY_cat (44563,)
prediction on dev set finished. raw 1-hot prediction has shape (44563, 216)
prediction converted to class idx has shape (44563,)
decoder embedding prediction has shape (0,)
precision on dev = 0.932286476149433
recall on dev = 0.9301819354491951
f1 score on dev = 0.9312330167612795
debugging use
type y_pred <class 'numpy.ndarray'>
type raw_y_pred <class 'numpy.ndarray'>
type raw_y_pred_decoder_embeddings <class 'numpy.ndarray'>
saving prediction data under directory: dev_Predictions
please use loadutils.loadDevPredictionsData(modelName, modelsDir='dev_Predictions') to load :
 raw_y_pred, raw_y_pred_decoder_embeddings, y_pred
'trainModelSP'  8343604.34 ms
'testFeatures'  8343604.48 ms


In [18]:
# SGD testing
# capsnet training function
# NOTE: the recorded run of this cell reached an f1 score of only ~0.002 on dev,
# versus ~0.93 for the Adam run recorded in the previous training cell.
testFunc = "trainCapsModel.py"

# Start from the CapsNet defaults and switch the optimizer.
hypers = hyper_param_caps.copy()
hypers['optimizer'] = "SGD"
print("Training with SGD - Nesterov Momentum Optimizer")

# A single epoch, without the POS and capitalization input features.
hypers.update({
    'epochs' : 1,
    'stopping_patience' : 3,
    'use_pos_tags' : False,
    'use_capitalization_info' : False,
})

# Embedding variants. Only "SGD_primcaps_glove_nolearn" is active; to run another
# one, set the three keys as listed and call testFeatures( testFunc, <model name>, hypers).
#
#   model name                             use_glove   allow_glove_retrain   embed_dropout
#   "SGD_primcaps_learn"                   False       (not set)             0.0
#   "SGD_primcaps_learn_dropout"           False       (not set)             0.25
#   "SGD_primcaps_glove_nolearn"           True        False                 0.0     <- active
#   "SGD_primcaps_glove_nolearn_dropout"   True        False                 0.25
#   "SGD_primcaps_glove_learn"             True        True                  0.0
#   "SGD_primcaps_glove_learn_dropout"     True        True                  0.25

# use glove, no learn
print("\n\nGlove Embeddings")
hypers.update({
    'use_glove' : True,
    'allow_glove_retrain' : False,
    'embed_dropout' : 0.0,
})
testFeatures( testFunc, "SGD_primcaps_glove_nolearn", hypers)

Training with SGD - Nesterov Momentum Optimizer


Glove Embeddings
Using TensorFlow backend.
fargs:
 ['trainCapsModel.py', 'SGD_primcaps_glove_nolearn_base', 'hyper_params.json']
W0326 13:18:47.349478 140258463278912 module_wrapper.py:139] From /home/andrew/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0326 13:18:47.351441 140258463278912 module_wrapper.py:139] From /home/andrew/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0326 13:18:47.351813 140258463278912 module_wrapper.py:139] From /home/andrew/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0326 13:18:47.359259 140258463278912 module_wrapper.py:139] From /home/andrew/.local/lib/pyt

making model prediction on dev set... 
using trained model as is because decoder is DISabled.
y_true shape from devY_cat (44563,)
prediction on dev set finished. raw 1-hot prediction has shape (44563, 216)
prediction converted to class idx has shape (44563,)
decoder embedding prediction has shape (0,)
precision on dev = 0.0017976720147409105
recall on dev = 0.0021499019107253233
f1 score on dev = 0.001958072766879199
debugging use
type y_pred <class 'numpy.ndarray'>
type raw_y_pred <class 'numpy.ndarray'>
type raw_y_pred_decoder_embeddings <class 'numpy.ndarray'>
saving prediction data under directory: dev_Predictions
please use loadutils.loadDevPredictionsData(modelName, modelsDir='dev_Predictions') to load :
 raw_y_pred, raw_y_pred_decoder_embeddings, y_pred
'trainModelSP'  10894136.82 ms
'testFeatures'  10894138.75 ms
