In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.python.keras.datasets import mnist
from tensorflow.contrib.eager.python import tfe

  from ._conv import register_converters as _register_converters


In [2]:
# enable eager mode
tf.enable_eager_execution()
tf.set_random_seed(0)
np.random.seed(0)

In [3]:
# constants
batch_size = 128
epochs = 8
num_classes = 10

In [4]:
# dataset loading
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.
x_train = x_train.reshape((-1, 28, 28, 1))
x_test = x_test.reshape((-1, 28, 28, 1))

# one hot encode the labels. convert back to numpy as we cannot use a combination of numpy
# and tensors as input to keras
y_train_ohe = tf.one_hot(y_train, depth=num_classes).numpy()
y_test_ohe = tf.one_hot(y_test, depth=num_classes).numpy()

print('x train', x_train.shape)
print('y train', y_train_ohe.shape)
print('x test', x_test.shape)
print('y test', y_test_ohe.shape)

x train (60000, 28, 28, 1)
y train (60000, 10)
x test (10000, 28, 28, 1)
y test (10000, 10)


# Create a Model to build a Convolution2D-BatchNormalization-Relu "Block"

Decomposing the `Conv-BatchNorm-Relu` pattern into a separate Model itself allows us to simply call it as if it was just another Keras Layer. This is recommended for complex networks like Inception and ResNet and when designing one's own models.

In [5]:
class ConvBnReluBlock(tf.keras.Model):
    def __init__(self, filters, kernel, strides):
        super(ConvBnReluBlock, self).__init__()
        self.cnn = tf.keras.layers.Conv2D(filters, (kernel, kernel), strides=(strides, strides), kernel_initializer='he_normal')
        self.bn = tf.keras.layers.BatchNormalization()

    def call(self, inputs, training=None, mask=None):
        x = self.cnn(inputs)
        x = self.bn(x)
        x = tf.nn.relu(x)
        return x

# Model as a Layer
Keras Model extends Keras Layer internally, and therefore can be a drop in replacement for a Keras Layer, as shown below.

## Why not use tf.keras.Sequential ?
Sequential is a special version of Model, which chains layers linearly together. If you see the above `Conv-BatchNorm-Relu` block, it is a prime example of something that can be done with Sequential. So why did I bother with subclassing Model again and defining the chain explicitely?

Simple. Sequential is somewhat painful to work with in Eager. It requires that the first layer added to it has its `batch_input_shape` property set. If it isn't, then it complains and crashes.

Model, on the other hand, defers the shape information calculation to the first `call` operation or when we explicitly call `Model._set_input(dummy_x)`. Simply put, unless you want to worry about knowing the input shape when building the model, I suggest simply extending Model to do even linear layer chains and hope that TF Eager matures quickly to not require the input shape when using Sequential. 

In [6]:
# model definition using the above "Block" model as components
class CNN(tf.keras.Model):
    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.block1 = ConvBnReluBlock(16, kernel=5, strides=2)
        self.block2 = ConvBnReluBlock(32, kernel=5, strides=2)
        self.pool = tf.keras.layers.GlobalAveragePooling2D()
        self.classifier = tf.keras.layers.Dense(num_classes)

    def call(self, inputs, training=None, mask=None):
        x = self.block1(inputs)
        x = self.block2(x)
        x = self.pool(x)
        output = self.classifier(x)

        # softmax op does not exist on the gpu, so always use cpu
        with tf.device('/cpu:0'):
            output = tf.nn.softmax(output)

        return output

In [7]:
device = '/cpu:0' if tfe.num_gpus() == 0 else '/gpu:0'

with tf.device(device):
    # build model and optimizer
    model = CNN(num_classes)
    model.compile(optimizer=tf.train.AdamOptimizer(0.001), loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # TF Keras tries to use entire dataset to determine shape without this step when using .fit()
    # Fix = Use exactly one sample from the provided input dataset to determine input/output shape/s for the model
    dummy_x = tf.zeros((1, 28, 28, 1))
    model._set_inputs(dummy_x)

    # train
    model.fit(x_train, y_train_ohe, batch_size=batch_size, epochs=epochs,
              validation_data=(x_test, y_test_ohe), verbose=1)

    # evaluate on test set
    scores = model.evaluate(x_test, y_test_ohe, batch_size, verbose=1)
    print("Final test loss and accuracy :", scores)

Train on 60000 samples, validate on 10000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Final test loss and accuracy : [0.08478006159067154, 0.9748]
