** Building a Multilayer Model for MNIST in TensorFlow **

Task: Create an MNIST digit reader

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


The performance of deep neural networks very much depends on an effective initialization of its parameters. There are many features of the error surfaces of deep neural networks that make optimization using vanilla stochastic gradient descent very difficult. This problem is exacerbated as the number of layers in the model (and thus the complexity of the error surface) increases. Smart initialization is one way to mitigate this issue.

In [3]:
def layer(input, weight_shape, bias_shape):
    weight_stddev = (2.0/weight_shape[0])**0.5
    #weight_init = tf.random_normal_initializer(stddev=0.5) # performance is very bad
    weight_init = tf.random_normal_initializer(stddev=weight_stddev)
    """
    if we use tanh neuron, we may initialize our parameters as following:
    epsilon = np.sqrt(6) / np.sqrt(sum(weight_shape))
    for sigmoid neuron, epsilon = 4*epsilon
    weight_init = tf.random_uniform_initializer(minval=-epsilon, maxval=epsilon)
    W = tf.get_variable('W', weight_shape, initializer=weight_init)
    """
    #?# bias_init = tf.zeros_initializer(bias_shape) # difference with followling?
    bias_init = tf.constant_initializer(value=0)
    W = tf.get_variable('W', weight_shape, initializer=weight_init)
    b = tf.get_variable('b', bias_shape, initializer=bias_init)
    output = tf.nn.relu(tf.matmul(input, W) + b)
    return output
    

In [4]:
def inference(x):
    with tf.variable_scope('hidden_1'):
        hidden1 = layer(x, [784, 256], [256])
    with tf.variable_scope('hidden_2'):
        hidden2 = layer(hidden1, [256,256], [256])
    with tf.variable_scope('output'):
        output = layer(hidden2, [256,10],[10])
    return output

Finally, for slightly better performance, we perform the softmax while computing the loss instead of during the inference phase of the network. This results in the modification below:

In [5]:
def loss(output, y):
    xentropy = tf.nn.softmax_cross_entropy_with_logits(output, y)
    loss = tf.reduce_mean(xentropy)
    return loss

In [6]:
def training(cost, global_step):
    tf.scalar_summary("cost",cost)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(cost, global_step=global_step)
    return train_op

In [7]:
def evaluate(output, y):
    correct_prediction = tf.equal(tf.argmax(output,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy

In [8]:
learning_rate = 0.01
train_epochs = 1000
batch_size = 100
display_step = 1

In [9]:
with tf.Graph().as_default():
    x = tf.placeholder("float", [None, 784])
    y = tf.placeholder("float", [None, 10])
    
    output = inference(x)
    cost = loss(output, y)
    global_step = tf.Variable(0, name='global_step', trainable=False) # global_step as variable, can be updated while training
    train_op = training(cost, global_step)
    eval_op = evaluate(output, y)
    summary_op = tf.merge_all_summaries()
    saver = tf.train.Saver()
    sess = tf.Session()
    summary_writer = tf.train.SummaryWriter("multilayer_logs/", graph=sess.graph)
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    
    for epoch in range(train_epochs):
        avg_cost = 0.0
        total_batch = int(mnist.train.num_examples/batch_size)
        for i in range(total_batch):
            minibatch_x, minibatch_y = mnist.train.next_batch(batch_size)
            feed_dict = {x : minibatch_x, y : minibatch_y}
            sess.run(train_op, feed_dict = feed_dict)
            minibatch_cost = sess.run(cost, feed_dict=feed_dict)
            avg_cost += minibatch_cost/total_batch
        if epoch % display_step == 0:
            val_feed_dict = {
                x : mnist.validation.images,
                y : mnist.validation.labels
            }
            accuracy = sess.run(eval_op, feed_dict=val_feed_dict) # Actually, eval_op is NOT an operation, it's a tensor.
            print("Validation Error:", (1 - accuracy))
            summary_str = sess.run(summary_op, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, sess.run(global_step))
            saver.save(sess, "multilayer_logs/model-checkpoint", global_step=global_step)
    
    print("optimization Finished!")
    
    test_feed_dict = {
        x : mnist.test.images,
        y : mnist.test.labels
    }
    
    accuracy = sess.run(eval_op, feed_dict=test_feed_dict)
    print("Test Accuarcy:", accuracy)
    

Validation Error: 0.308200001717
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.280799984932
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.182799994946
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.174399971962
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.170599997044
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.166800022125
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.163600027561
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.161000013351
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.159799993038
summary_op: Tensor("MergeSummary/MergeSummary:0", shape=(), dtype=string)
Validation Error: 0.15600001812
summa