In [1]:
from utils import *

In [2]:
# Load the MNIST dataset from the local tmp/ directory.

from tensorflow.examples.tutorials.mnist import input_data
# one_hot=False (the default) keeps labels as integer class ids.
mnist = input_data.read_data_sets(train_dir="tmp/", one_hot=False)

Extracting tmp/train-images-idx3-ubyte.gz
Extracting tmp/train-labels-idx1-ubyte.gz
Extracting tmp/t10k-images-idx3-ubyte.gz
Extracting tmp/t10k-labels-idx1-ubyte.gz


Batch Normalization is another common approach to the vanishing/exploding gradient problem. Again, the trick is to ensure that the outputs of the neurons in a given layer are centered on a mean of 0 and have a standard deviation of 1. Batch Normalization also adds two learned parameters per normalized output, which scale and shift the outputs once they've been normalized.

Means and standard deviations are calculated per mini-batch, hence "Batch" Normalization. During testing or deployment, the means and standard deviations of the entire training set are used instead. Because the behavior differs between training and testing, the network needs to know which it's doing. The mean and standard deviation for *all* the training data - which are needed during testing - can be computed efficiently *during* training: TensorFlow does this by adding new operations that keep a running mean and standard deviation. It's important to be aware of this - these ops need to be run during training.

In [3]:
# Build the graph: a two-hidden-layer dense network for MNIST with batch
# normalization after every dense layer.

# Helper from utils; presumably resets the default graph -- confirm in utils.
reset_graph()

# Layer sizes: flattened 28x28 inputs, two hidden layers, 10 output classes.
n_inputs = 28*28
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

learning_rate = 0.01

# decay for calculation of running averages
# more 9s for bigger datasets
batch_norm_momentum = 0.9

X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# `(None)` without a trailing comma is just `None`, which leaves the shape of
# the placeholder completely unconstrained. `(None,)` declares what is meant:
# a 1-D vector of integer class labels, one per example.
y = tf.placeholder(tf.int64, shape=(None,), name="y")
# Batch normalization acts differently when training and when deploying.
# `training` is a boolean placeholder that defaults to False, so it only has
# to be fed (as True) in the feed_dict of the training steps.
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope("dnn"):
    he_init = tf.contrib.layers.variance_scaling_initializer()

    # The dense layers have no activation: batch normalization is applied to
    # their raw output first, and the intended activation (ELU) comes after.
    hidden1 = tf.layers.dense(X, n_hidden1, kernel_initializer=he_init, name="hidden1")
    bn1 = tf.nn.elu(tf.layers.batch_normalization(hidden1, training=training, momentum=batch_norm_momentum))
    hidden2 = tf.layers.dense(bn1, n_hidden2, kernel_initializer=he_init, name="hidden2")
    bn2 = tf.nn.elu(tf.layers.batch_normalization(hidden2, training=training, momentum=batch_norm_momentum))
    # The output layer is batch-normalized too, but gets no activation here:
    # the softmax is applied inside the loss below.
    logits_before_bn = tf.layers.dense(bn2, n_outputs, name="outputs")
    logits = tf.layers.batch_normalization(logits_before_bn, training=training, momentum=batch_norm_momentum, name="logits")

with tf.name_scope("loss"):
    # Sparse variant: labels are integer class ids, not one-hot vectors.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # training_op has no dependency on the batch-norm running-average ops;
    # those are collected in tf.GraphKeys.UPDATE_OPS and must be run alongside
    # training_op at every training step.
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    # A prediction is correct when the true label is the top-1 logit.
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

In [4]:
# Train the network, then report accuracy on the test set.

n_epochs = 40
batch_size = 50

# Here are the running-average ops added by batch normalization. training_op
# does not depend on them, so they are fetched explicitly at every step.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

# Number of mini-batches in one pass over the training set.
n_batches = mnist.train.num_examples // batch_size

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for iteration in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # Feeding training=True switches batch norm to mini-batch
            # statistics; training_op AND extra_update_ops are both evaluated.
            train_feed = {training: True, X: X_batch, y: y_batch}
            sess.run([training_op, extra_update_ops], feed_dict=train_feed)

        # Report every fifth epoch and on the final one.
        is_report_epoch = (epoch % 5 == 0) or (epoch == n_epochs - 1)
        if not is_report_epoch:
            continue
        # `training` is not fed here, so it takes its default value (False).
        train_eval_feed = {X: mnist.train.images, y: mnist.train.labels}
        val_eval_feed = {X: mnist.validation.images, y: mnist.validation.labels}
        acc_train = sess.run(accuracy, feed_dict=train_eval_feed)
        acc_val = sess.run(accuracy, feed_dict=val_eval_feed)
        print(epoch, "train acc:", acc_train, "val acc:", acc_val)

    # Now we're testing. `training` defaults to False, so it is left out of
    # the feed dict and batch normalization is calculated accordingly.
    test_eval_feed = {X: mnist.test.images, y: mnist.test.labels}
    acc_test = sess.run(accuracy, feed_dict=test_eval_feed)
    print("Test acc:", acc_test)

0 train acc: 0.9162 val acc: 0.9152
5 train acc: 0.971091 val acc: 0.9688
10 train acc: 0.984273 val acc: 0.9772
15 train acc: 0.990546 val acc: 0.9786
20 train acc: 0.993237 val acc: 0.981
25 train acc: 0.996 val acc: 0.9812
30 train acc: 0.996582 val acc: 0.9778
35 train acc: 0.997964 val acc: 0.9798
39 train acc: 0.998709 val acc: 0.9822
Test acc: 0.9811
