In [10]:
# default_exp trainer

# Trainer
> Code to do model training using Keras and Estimator APIs

In [11]:
import gcp_runner.core
# NOTE(review): judging by its name, this exports the notebooks to modules and
# reloads them without printing progress — TODO confirm against gcp_runner.core.
gcp_runner.core.export_and_reload_all(silent=True)

In [12]:
#export

from criteo_nbdev.constants import *
from gcp_runner.ai_platform_constants import *

In [13]:
%matplotlib inline

In [14]:
#export

import datetime
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
from IPython.display import clear_output

class TrainTimeCallback(tf.keras.callbacks.Callback):
    """Keras callback that logs how long each training epoch takes.

    At the end of every epoch it logs the elapsed wall-clock time and, when
    the callback params carry a non-zero 'steps' entry, the average time per
    step and (if BATCH_SIZE is set) per example.
    """

    def on_epoch_begin(self, epoch, logs=None):
        """Remember the wall-clock time at which the epoch started."""
        self.epoch_start_time = datetime.datetime.now()

    def on_epoch_end(self, epoch, logs=None):
        """Log the epoch duration and the derived per-step/per-example cost."""
        # Read the clock once so every reported figure is derived from the
        # same elapsed time.
        elapsed = datetime.datetime.now() - self.epoch_start_time
        logging.info('\nepoch train time: (hh:mm:ss.ms) {}'.format(elapsed))

        params = self.params
        if params is None or 'steps' not in params or not params['steps']:
            # Without a known, non-zero step count there is nothing to average over.
            return

        steps = params['steps']
        epoch_milliseconds = elapsed.total_seconds() * 1000
        logging.info('{} ms/step'.format(epoch_milliseconds / steps))
        # Truthiness check skips both an unset (None) and a zero batch size,
        # the latter of which would otherwise raise ZeroDivisionError.
        if BATCH_SIZE:
            logging.info('{} microseconds/example'.format(
                1000 * epoch_milliseconds / steps / BATCH_SIZE))
                    


class PlotLossesCallback(tf.keras.callbacks.Callback):
    """Keras callback that redraws the training/validation loss curves after each epoch.

    Intended for interactive use: the cell output is cleared and the plot is
    re-rendered at the end of every epoch.
    """

    def on_train_begin(self, logs=None):
        """Reset the recorded history and open a new matplotlib figure."""
        self.i = 0
        self.x = []
        self.losses = []
        self.val_losses = []
        self.fig = plt.figure()
        self.logs = []

    def on_epoch_end(self, epoch, logs=None):
        """Record this epoch's losses and redraw both curves."""
        # `logs=None` replaces the mutable `{}` default; normalise it here so
        # the lookups below keep working when no logs are supplied.
        if logs is None:
            logs = {}

        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.i += 1

        # wait=True delays clearing until the new plot is ready to be shown.
        clear_output(wait=True)
        plt.plot(self.x, self.losses, label="loss")
        plt.plot(self.x, self.val_losses, label="val_loss")
        plt.legend()
        plt.show()

In [15]:
#export
import tensorflow as tf

def create_categorical_feature_column_with_hash_bucket(vocabulary_size_dict, key):
    """Build a string hash-bucket categorical column for feature `key`.

    The number of buckets is the size recorded for `key` in
    `vocabulary_size_dict`, capped at 100000.

    Returns:
        The categorical feature column.
    """
    bucket_count = min(vocabulary_size_dict[key], 100000)
    column = tf.feature_column.categorical_column_with_hash_bucket(
        key, bucket_count, dtype=tf.dtypes.string)
    logging.info('categorical column %s hash_bucket_size %d',
                 key, bucket_count)
    return column

def create_categorical_feature_column_with_vocabulary_list(corpus_dict, vocabulary_size_dict, key):
    """Build a string vocabulary-list categorical column for feature `key`.

    The vocabulary is the key set of `corpus_dict[key]`. The number of
    out-of-vocabulary buckets is the gap between the size recorded in
    `vocabulary_size_dict` and the vocabulary length, with a minimum of one.

    Returns:
        The categorical feature column.
    """
    declared_size = vocabulary_size_dict[key]
    known_values = corpus_dict[key]
    oov_bucket_count = max(1, declared_size - len(known_values))
    column = tf.feature_column.categorical_column_with_vocabulary_list(
        key,
        list(known_values.keys()),
        dtype=tf.dtypes.string,
        num_oov_buckets=oov_bucket_count)
    logging.info(
        'categorical column with vocabular %s corpus_size %d', key, declared_size)
    return column

def create_embedding(vocabulary_size_dict, key, categorical_feature_column):
    """Wrap a categorical column in a dense representation.

    Features whose recorded vocabulary size is below 10 become indicator
    columns; all others become embedding columns whose dimension is
    floor(6 * size ** 0.25), capped at 50.

    Returns:
        The indicator or embedding feature column.
    """
    size = vocabulary_size_dict[key]
    if size < 10:
        logging.info(
            'categorical column %s vocabulary_size %d - creating indicator column', key, size)
        return tf.feature_column.indicator_column(categorical_feature_column)

    uncapped_dimension = math.floor(6 * size**0.25)
    dimension = int(min(50, uncapped_dimension))
    return tf.feature_column.embedding_column(
        categorical_feature_column, dimension)

def create_linear_feature_columns():
    """Return a float32 numeric column for every INTEGER field of CSV_SCHEMA except 'label'."""
    numeric_columns = []
    for field in CSV_SCHEMA:
        # Skip non-integer fields and the label itself.
        if field.field_type != 'INTEGER' or field.name == 'label':
            continue
        numeric_columns.append(
            tf.feature_column.numeric_column(field.name, dtype=tf.dtypes.float32))
    return numeric_columns

def create_categorical_embeddings_feature_columns(vocabulary_size_dict, embeddings_mode: EMBEDDINGS_MODE_TYPE):
    """Create the dense columns for all categorical features.

    Args:
        vocabulary_size_dict: mapping from feature key to its vocabulary size.
        embeddings_mode: how categorical values are mapped to ids:
            `none` yields no columns, `hashbucket` hashes values into buckets,
            `vocabular` loads the corpus dictionary and uses explicit
            vocabulary lists.

    Returns:
        A list of indicator/embedding feature columns (empty for `none`).

    Raises:
        ValueError: if `embeddings_mode` is not one of the modes above.
    """
    if embeddings_mode == EMBEDDINGS_MODE_TYPE.none:
        return []

    if embeddings_mode == EMBEDDINGS_MODE_TYPE.hashbucket:
        return [
            create_embedding(
                vocabulary_size_dict,
                key,
                create_categorical_feature_column_with_hash_bucket(vocabulary_size_dict, key))
            for key, _ in vocabulary_size_dict.items()
        ]

    if embeddings_mode == EMBEDDINGS_MODE_TYPE.vocabular:
        logging.info('loading corpus dictionary')
        corpus_dict = criteo_nbdev.data_reader.get_corpus_dict()
        # Columns are created for the features present in the corpus
        # dictionary; their sizes still come from vocabulary_size_dict.
        return [
            create_embedding(
                vocabulary_size_dict,
                key,
                create_categorical_feature_column_with_vocabulary_list(corpus_dict, vocabulary_size_dict, key))
            for key, _ in corpus_dict.items()
        ]

    # The original formatted the undefined name `embedding_mode` here, which
    # raised NameError instead of the intended ValueError.
    raise ValueError('invalid embedding_mode: {}'.format(embeddings_mode))

def create_feature_columns(embedding_mode: EMBEDDINGS_MODE_TYPE):
    """Assemble the full feature-column list: numeric columns first, then categorical ones.

    The vocabulary size dictionary is loaded through the data reader and
    passed, together with `embedding_mode`, to the categorical column factory.
    """
    logging.info('loading vocabulary size dictionary')
    vocabulary_sizes = criteo_nbdev.data_reader.get_vocabulary_size_dict()
    numeric_columns = create_linear_feature_columns()
    categorical_columns = create_categorical_embeddings_feature_columns(
        vocabulary_sizes, embedding_mode)
    return [*numeric_columns, *categorical_columns]

In [16]:
#export

import criteo_nbdev.data_reader
import nbdev.imports
import tensorflow as tf
import logging
import math
import os

def create_keras_model_sequential(embeddings_mode: EMBEDDINGS_MODE_TYPE):
    """Build and compile the sequential Keras binary classifier.

    The network is a DenseFeatures input layer, three ReLU hidden layers of
    598 units each, and a single sigmoid output unit. It is compiled with SGD
    (learning rate 0.05), binary cross-entropy loss and the accuracy metric.

    Args:
        embeddings_mode: forwarded to `create_feature_columns` to choose how
            categorical features are represented.

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    feature_columns = create_feature_columns(embeddings_mode)

    Dense = tf.keras.layers.Dense
    hidden_layer_widths = [598, 598, 598]

    layers = [tf.keras.layers.DenseFeatures(feature_columns, name="feature_layer")]
    layers.extend(Dense(width, activation=tf.nn.relu)
                  for width in hidden_layer_widths)
    layers.append(Dense(1, activation=tf.nn.sigmoid))
    model = tf.keras.Sequential(layers)

    logging.info('compiling sequential keras model')
    model.compile(
        optimizer=tf.optimizers.SGD(learning_rate=0.05),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'])
    return model

def train_and_evaluate_keras_model(
    model,
    model_dir,
    epochs = 3,
    dataset_source: DATASET_SOURCE_TYPE = DATASET_SOURCE_TYPE.bq,
    dataset_size: DATASET_SIZE_TYPE = DATASET_SIZE_TYPE.tiny,
    embeddings_mode: EMBEDDINGS_MODE_TYPE = EMBEDDINGS_MODE_TYPE.hashbucket,
    distribution_strategy: DistributionStrategyType = None):
    """Train a compiled Keras model, evaluate it and return the final metrics.

    Args:
        model: compiled Keras model. Its `evaluate` call must yield exactly
            (loss, accuracy), as is the case for metrics=['accuracy'].
        model_dir: base directory; TensorBoard logs go to `<model_dir>/logs`
            and the checkpoint directory is `<model_dir>/checkpoints`.
        epochs: number of training epochs.
        dataset_source: where the data reader loads the datasets from.
        dataset_size: which dataset size to train and validate on.
        embeddings_mode: categorical feature mode, forwarded to the data reader.
        distribution_strategy: strategy type the model is trained under; only
            used here to pick the callback/checkpoint configuration.

    Returns:
        dict with keys 'accuracy' and 'loss' from the final evaluation.
    """
    log_dir = os.path.join(model_dir, "logs")
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,
        embeddings_freq=1)

    checkpoints_dir = os.path.join(model_dir, "checkpoints")
    # crashing https://github.com/tensorflow/tensorflow/issues/27688
    if not os.path.exists(checkpoints_dir):
        os.makedirs(checkpoints_dir)

    train_time_callback = TrainTimeCallback()

    # Compare the *argument*, not the enum class: the original tested
    # `DistributionStrategyType == DistributionStrategyType.TPU_STRATEGY`,
    # which is always False and left the TPU branch unreachable.
    if distribution_strategy == DistributionStrategyType.TPU_STRATEGY:
        # epoch and accuracy constants are not supported when training on TPU.
        checkpoints_file_path = checkpoints_dir + "/epochs_tpu.hdf5"
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            checkpoints_file_path, verbose=1, mode='max')
        callbacks = [tensorboard_callback, checkpoint_callback, train_time_callback]
    else:
        if embeddings_mode == EMBEDDINGS_MODE_TYPE.manual or distribution_strategy == DistributionStrategyType.MULTI_WORKER_MIRRORED_STRATEGY:
            # accuracy fails for adagrad
            # for some reason accuracy is not available for EMBEDDINGS_MODE_TYPE.manual
            # for some reason accuracy is not available for MultiWorkerMirroredStrategy
            checkpoints_file_path = checkpoints_dir + \
                "/epochs:{epoch:03d}.hdf5"
        else:
            checkpoints_file_path = checkpoints_dir + \
                "/epochs:{epoch:03d}-accuracy:{accuracy:.3f}.hdf5"
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            checkpoints_file_path, verbose=1, mode='max')
        # The checkpoint callback is configured but intentionally not
        # registered on this path; add it to the list to re-enable it.
        callbacks = [tensorboard_callback, train_time_callback]

    in_notebook = nbdev.imports.in_ipython()
    # Progress bar in notebooks, one line per epoch elsewhere.
    verbosity = 1 if in_notebook else 2
    if in_notebook:
        callbacks.append(PlotLossesCallback())

    logging.info('training keras model')
    training_ds = criteo_nbdev.data_reader.get_dataset(
        dataset_source, dataset_size, DATASET_TYPE.training, embeddings_mode).repeat(epochs)
    eval_ds = criteo_nbdev.data_reader.get_dataset(
        dataset_source, dataset_size, DATASET_TYPE.validation, embeddings_mode).repeat(epochs)

    training_steps = criteo_nbdev.data_reader.get_steps_per_epoch(
        dataset_size, DATASET_TYPE.training)
    validation_steps = criteo_nbdev.data_reader.get_steps_per_epoch(
        dataset_size, DATASET_TYPE.validation)

    # steps_per_epoch and validation_steps are required for MultiWorkerMirroredStrategy
    model.fit(
        training_ds,
        epochs=epochs,
        verbose=verbosity,
        callbacks=callbacks,
        steps_per_epoch=training_steps,
        validation_data=eval_ds,
        validation_steps=validation_steps)

    logging.info("done training keras model, evaluating model")
    loss, accuracy = model.evaluate(
        eval_ds,
        verbose=verbosity,
        steps=validation_steps,
        callbacks=[tensorboard_callback])
    logging.info("Eval - Loss: {}, Accuracy: {}".format(loss, accuracy))
    # model.summary() prints and returns None, so logging its return value
    # only logged "None"; route the summary lines through the logger instead.
    model.summary(print_fn=logging.info)
    logging.info("done evaluating keras model")
    return {'accuracy': accuracy, 'loss': loss}

def train_and_evaluate_keras(
    model_dir,
    epochs = 3,
    dataset_source: DATASET_SOURCE_TYPE = DATASET_SOURCE_TYPE.bq,
    dataset_size: DATASET_SIZE_TYPE = DATASET_SIZE_TYPE.tiny,
    embeddings_mode: EMBEDDINGS_MODE_TYPE = EMBEDDINGS_MODE_TYPE.hashbucket,
    distribution_strategy: DistributionStrategyType = None):
    """Build the default sequential Keras model, then train and evaluate it.

    All arguments are forwarded unchanged to `train_and_evaluate_keras_model`;
    `embeddings_mode` additionally selects the model's feature columns.

    Returns:
        Whatever `train_and_evaluate_keras_model` returns.
    """
    training_options = dict(
        epochs=epochs,
        dataset_source=dataset_source,
        dataset_size=dataset_size,
        embeddings_mode=embeddings_mode,
        distribution_strategy=distribution_strategy)
    keras_model = create_keras_model_sequential(embeddings_mode)
    return train_and_evaluate_keras_model(keras_model, model_dir, **training_options)


In [24]:
#export
import criteo_nbdev.data_reader
import nbdev.imports
import tensorflow as tf
import logging
import math
import os
from kerastuner.tuners import RandomSearch
from criteo_nbdev.constants import *
from gcp_runner.ai_platform_constants import *

def keras_hp_search(
    model_dir,
    epochs = 3,
    dataset_source: DATASET_SOURCE_TYPE = DATASET_SOURCE_TYPE.gcs,
    dataset_size: DATASET_SIZE_TYPE = DATASET_SIZE_TYPE.tiny,
    embeddings_mode: EMBEDDINGS_MODE_TYPE = EMBEDDINGS_MODE_TYPE.hashbucket,
    distribution_strategy: DistributionStrategyType = None):
    """Run a random hyper-parameter search over the two hidden layer widths.

    Each trial builds a sequential model with two L2-regularised ReLU layers
    whose widths are chosen from {50, 100, 200}, and is scored on validation
    loss. Up to 30 trials are run, one execution each.

    Args:
        model_dir: directory the tuner writes its results to.
        epochs: number of epochs each trial is trained for.
        dataset_source: where the data reader loads the datasets from.
        dataset_size: which dataset size to train and validate on.
        embeddings_mode: categorical feature mode for columns and datasets.
        distribution_strategy: accepted for signature parity with the other
            trainers; not used by this function.
    """

    def build_model(hp):
        """Create and compile one candidate model for the given hyper-parameters."""
        feature_columns = create_feature_columns(embeddings_mode)
        feature_layer = tf.keras.layers.DenseFeatures(feature_columns, name="feature_layer")
        Dense = tf.keras.layers.Dense
        kernel_regularizer = tf.keras.regularizers.l2(0.001)
        model = tf.keras.Sequential()
        model.add(feature_layer)
        model.add(Dense(hp.Choice('layer1', values=[50, 100, 200]), activation=tf.nn.relu, kernel_regularizer=kernel_regularizer))
        model.add(Dense(hp.Choice('layer2', values=[50, 100, 200]), activation=tf.nn.relu, kernel_regularizer=kernel_regularizer))
        model.add(Dense(1, activation=tf.nn.sigmoid, kernel_regularizer=kernel_regularizer))

        logging.info('compiling sequential keras model')
        model.compile(
            optimizer=tf.optimizers.SGD(learning_rate=0.05),
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=['accuracy'])
        return model

    training_ds = criteo_nbdev.data_reader.get_dataset(
        dataset_source, dataset_size, DATASET_TYPE.training, embeddings_mode).repeat(epochs)
    eval_ds = criteo_nbdev.data_reader.get_dataset(
        dataset_source, dataset_size, DATASET_TYPE.validation, embeddings_mode).repeat(epochs)

    tuner = RandomSearch(
        build_model,
        objective='val_loss',
        max_trials=30,
        executions_per_trial=1,
        directory=model_dir)

    tuner.search_space_summary()
    # Extra arguments to search() are forwarded to model.fit. Honour the
    # caller's `epochs` (previously hard-coded to 3) and bound each epoch by
    # the per-epoch step counts, as train_and_evaluate_keras_model does, since
    # the datasets are already repeated `epochs` times.
    tuner.search(training_ds,
                 validation_data=eval_ds,
                 epochs=epochs,
                 steps_per_epoch=criteo_nbdev.data_reader.get_steps_per_epoch(
                     dataset_size, DATASET_TYPE.training),
                 validation_steps=criteo_nbdev.data_reader.get_steps_per_epoch(
                     dataset_size, DATASET_TYPE.validation),
                 verbose=2)


In [25]:
# export

def train_and_evaluate_estimator(
    model_dir,
    epochs = 3,
    dataset_source: DATASET_SOURCE_TYPE = DATASET_SOURCE_TYPE.bq,
    dataset_size: DATASET_SIZE_TYPE = DATASET_SIZE_TYPE.tiny,
    embeddings_mode: EMBEDDINGS_MODE_TYPE = EMBEDDINGS_MODE_TYPE.hashbucket,
    distribution_strategy: DistributionStrategyType = None):
    """Train, evaluate and export a DNNClassifier estimator.

    The estimator has three hidden layers of 598 units, uses SGD with
    learning rate 0.05 and predicts two classes. After training it is
    exported as a SavedModel under `<model_dir>/from_estimator`.

    Args:
        model_dir: estimator model directory and base of the export path.
        epochs: number of passes over the data; also scales `max_steps`.
        dataset_source: where the data reader loads the datasets from.
        dataset_size: which dataset size to train and validate on.
        embeddings_mode: categorical feature mode for columns and datasets.
        distribution_strategy: passed to RunConfig as both the train and
            eval distribution.
    """
    steps_per_epoch = criteo_nbdev.data_reader.get_steps_per_epoch(
        dataset_size, DATASET_TYPE.training)
    logging.info('dataset size: {}'.format(dataset_size))
    logging.info('training for {} steps'.format(steps_per_epoch))
    config = tf.estimator.RunConfig(
        train_distribute=distribution_strategy,
        eval_distribute=distribution_strategy)

    feature_columns = create_feature_columns(embeddings_mode)
    estimator = tf.estimator.DNNClassifier(
        optimizer=tf.optimizers.SGD(learning_rate=0.05),
        feature_columns=feature_columns,
        hidden_units=[598, 598, 598],
        model_dir=model_dir,
        config=config,
        n_classes=2)
    logging.info('training and evaluating estimator model')

    def make_input_fn(dataset_type):
        """Return an input_fn that builds the repeated dataset lazily."""
        return lambda: criteo_nbdev.data_reader.get_dataset(
            dataset_source, dataset_size, dataset_type, embeddings_mode).repeat(epochs)

    # Need to specify both max_steps and epochs. Each worker will go through epoch separately.
    # see https://www.tensorflow.org/api_docs/python/tf/estimator/train_and_evaluate?version=stable
    # The datasets are only created inside the input functions; the eagerly
    # built copies the original kept in unused locals have been dropped.
    tf.estimator.train_and_evaluate(
        estimator,
        train_spec=tf.estimator.TrainSpec(
            input_fn=make_input_fn(DATASET_TYPE.training),
            max_steps=steps_per_epoch * epochs),
        eval_spec=tf.estimator.EvalSpec(
            input_fn=make_input_fn(DATASET_TYPE.validation)))
    logging.info('done evaluating estimator model')

    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(
        tf.feature_column.make_parse_example_spec(feature_columns))
    estimator_base_path = os.path.join(model_dir, 'from_estimator')
    estimator_path = estimator.export_saved_model(estimator_base_path, serving_input_fn)
    # Surface the export directory so it can be found without listing the
    # base path.
    logging.info('exported estimator saved model to {}'.format(estimator_path))

In [None]:
# Smoke run: one epoch on the tiny dataset with vocabulary-list embeddings,
# using ./tmp as the model directory.
train_and_evaluate_estimator('./tmp', epochs=1, dataset_size=DATASET_SIZE_TYPE.tiny, embeddings_mode = EMBEDDINGS_MODE_TYPE.vocabular)

In [28]:
"""Imports a protobuf model as a graph in Tensorboard."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

from tensorflow.python.client import session
from tensorflow.python.framework import importer
from tensorflow.python.framework import ops
from tensorflow.python.platform import app
from tensorflow.python.summary import summary
from tensorflow.python.tools import saved_model_utils

# Try importing TensorRT ops if available
# TODO(aaroey): ideally we should import everything from contrib, but currently
# tensorrt module would cause build errors when being imported in
# tensorflow/contrib/__init__.py. Fix it.
# pylint: disable=unused-import,g-import-not-at-top,wildcard-import
try:
    from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
except ImportError:
    pass
# pylint: enable=unused-import,g-import-not-at-top,wildcard-import

def import_to_tensorboard(model_dir, log_dir, tag_set):
  """Write the graph of a SavedModel to a TensorBoard log directory.

  Args:
    model_dir: Directory of the SavedModel whose graph should be visualized.
    log_dir: Directory the TensorBoard event file is written to.
    tag_set: Group of tag(s) of the MetaGraphDef to load, in string format,
        separated by ','. For a tag-set that contains multiple tags, all tags
        must be passed in.
  Usage:
    Call this function with your model location and desired log directory.
    Launch Tensorboard by pointing it to the log directory.
    View the imported model as a graph.
  """
  with session.Session(graph=ops.Graph()) as sess:
    input_graph_def = saved_model_utils.get_meta_graph_def(
        model_dir, tag_set).graph_def
    importer.import_graph_def(input_graph_def)

    pb_visual_writer = summary.FileWriter(log_dir)
    try:
      pb_visual_writer.add_graph(sess.graph)
    finally:
      # close() flushes the pending graph event to disk and releases the
      # event file; the writer was previously left open.
      pb_visual_writer.close()
    print("Model Imported. Visualize by running: "
          "tensorboard --logdir={}".format(log_dir))
    
# NOTE(review): the export directory name (1593547921) is hard-coded; it
# presumably comes from an earlier estimator export under ./tmp/from_estimator
# and must be updated after re-running the export — TODO confirm.
import_to_tensorboard('./tmp/from_estimator/1593547921', './tmp/estimator_vocabular_logs', 'serve')

Model Imported. Visualize by running: tensorboard --logdir=./tmp/estimator_vocabular_logs
