In [1]:
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf

# Download MNIST data
# (read into /tmp/data/ with one-hot encoded labels)
mnist = input_data.read_data_sets('/tmp/data/', one_hot=True)
# Interactive session: the bare .run() / .eval() calls in train_mnist rely on it.
sess = tf.InteractiveSession()
# Placeholders shared by every model builder and by train_mnist.
x = tf.placeholder(tf.float32, [None, 784])  # input rows, 784 features each
y_ = tf.placeholder(tf.float32, [None, 10])  # target rows, 10 values each
keep_prob = tf.placeholder(tf.float32)  # fed as 0.5 while training and 1.0 while evaluating

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [2]:
def train_mnist(y, iterations=1000, batch_size=100):
    """Train the model that produces `y` and report its accuracy.

    The loss and optimizer ops are added to the default graph, every variable
    in that graph is (re-)initialised, and the model is then trained on
    minibatches drawn from `mnist.train`.

    Args:
        y: output tensor of the model, holding per-class probabilities for the
            rows fed through the shared placeholder `x`.
        iterations: number of minibatch training steps to run.
        batch_size: number of examples requested from `mnist.train` per step.

    Returns:
        A `(validation_accuracy, test_accuracy)` tuple, measured on
        `mnist.validation` and `mnist.test` with `keep_prob` fed as 1.0.
    """
    # Define loss and optimizer
    # Clip the probabilities away from zero before taking the log: a softmax
    # output that underflows to exactly 0 would otherwise yield log(0) = -inf
    # and 0 * -inf = NaN, which poisons the loss and every later update.
    log_y = tf.log(tf.clip_by_value(y, 1e-10, 1.0))
    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * log_y, reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer().minimize(cross_entropy)

    # Train the model
    # The initializer covers every variable in the graph, so models trained by
    # earlier calls are reset as well.
    tf.initialize_all_variables().run()
    for i in range(iterations):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)
        train_step.run({x: batch_xs, y_: batch_ys, keep_prob: 0.5})

    # Test the model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    validation_accuracy = accuracy.eval({x: mnist.validation.images, y_: mnist.validation.labels, keep_prob: 1.0})
    test_accuracy = accuracy.eval({x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0})
    return validation_accuracy, test_accuracy

In [3]:
# Grid search over hyperparameters
import itertools
import sys

def optimize_mnist(get_model, *hyperparameters):
    """Grid-search a model builder and print the accuracy of every setting.

    Every combination drawn from the given value lists is passed to
    `get_model`, the resulting model is trained with `train_mnist`, and one
    line per combination is printed. The combination with the highest
    validation accuracy is printed last; on ties the earliest one is kept.

    Args:
        get_model: callable that takes one value from each list in
            `hyperparameters` (in order) and returns a model output tensor.
        *hyperparameters: one iterable of candidate values per argument of
            `get_model`.

    Returns:
        None. Results are only printed.
    """
    print('validation, test, hyperparameter')
    best = None

    for hyperparameter in itertools.product(*hyperparameters):
        model = get_model(*hyperparameter)
        validation_accuracy, test_accuracy = train_mnist(model)
        print(validation_accuracy, test_accuracy, hyperparameter)
        # Flush so progress shows up while the (slow) search is still running.
        sys.stdout.flush()

        if best is None or validation_accuracy > best[0]:
            best = (validation_accuracy, test_accuracy, hyperparameter)

    if best is None:
        # An empty candidate list produces no combinations at all; report that
        # instead of failing on `print(*None)`.
        print('no hyperparameter settings to evaluate')
        return

    print('best setting')
    print(*best)

In [4]:
# Linear softmax classifier
def linear_model(stddev, b_init):
    """Build a single-layer softmax classifier on the shared placeholder `x`.

    Args:
        stddev: standard deviation handed to `tf.truncated_normal` for the
            initial weights.
        b_init: constant that every bias entry starts from.

    Returns:
        The softmax output tensor of the layer.
    """
    weights_init = tf.truncated_normal([784, 10], stddev=stddev)
    weights = tf.Variable(weights_init)
    bias_init = tf.constant(b_init, shape=[10])
    bias = tf.Variable(bias_init)
    logits = tf.matmul(x, weights) + bias
    return tf.nn.softmax(logits)

# Candidate values for the grid search. They get their own names so that the
# module-level `stddev` / `b_init` scalars read by the later model builders are
# not rebound to lists whenever this cell is re-run.
stddev_grid = [0.01, 0.1, 1.0]
b_init_grid = [0.0, 0.01, 0.1, 1.0]
optimize_mnist(linear_model, stddev_grid, b_init_grid)

validation, test, hyperparameter
0.9158 0.9129 (0.01, 0.0)
0.9148 0.9141 (0.01, 0.01)
0.9162 0.9146 (0.01, 0.1)
0.9156 0.9135 (0.01, 1.0)
0.9134 0.9111 (0.1, 0.0)
0.9144 0.9126 (0.1, 0.01)
0.9152 0.9096 (0.1, 0.1)
0.911 0.9081 (0.1, 1.0)
0.7544 0.7555 (1.0, 0.0)
0.7462 0.7448 (1.0, 0.01)
0.742 0.7302 (1.0, 0.1)
0.7778 0.7843 (1.0, 1.0)
best setting
0.9162 0.9146 (0.01, 0.1)


In [5]:
# neural network with one hidden layer
# Module-level initialisation settings: the model builders defined from this
# cell onwards read these as globals instead of taking them as arguments.
stddev = 0.01
b_init = 0.01

def neural_network(h_size):
    """Build a one-hidden-layer ReLU network ending in a softmax.

    Reads the module-level `stddev` and `b_init` for initialisation and the
    shared placeholder `x` as input.

    Args:
        h_size: number of units in the hidden layer.

    Returns:
        The softmax output tensor.
    """
    in_dim, out_dim = 784, 10

    # Parameters are created first, in the order hidden -> output.
    hidden_w = tf.Variable(tf.truncated_normal([in_dim, h_size], stddev=stddev))
    hidden_b = tf.Variable(tf.constant(b_init, shape=[h_size]))
    out_w = tf.Variable(tf.truncated_normal([h_size, out_dim], stddev=stddev))
    out_b = tf.Variable(tf.constant(b_init, shape=[out_dim]))

    hidden = tf.nn.relu(tf.matmul(x, hidden_w) + hidden_b)
    logits = tf.matmul(hidden, out_w) + out_b
    return tf.nn.softmax(logits)

# Hidden-layer widths to try; optimize_mnist builds and trains one model per value.
h_size = [100, 1000, 5000, 10000]
optimize_mnist(neural_network, h_size)

validation, test, hyperparameter
0.951 0.9454 (100,)
0.9686 0.9693 (1000,)
0.9792 0.9747 (5000,)
0.9754 0.9718 (10000,)
best setting
0.9792 0.9747 (5000,)


In [6]:
# neural network with 2 hidden layers
def deep_network(h1_size, h2_size):
    """Build a two-hidden-layer ReLU network ending in a softmax.

    Reads the module-level `stddev` and `b_init` for initialisation and the
    shared placeholder `x` as input.

    Args:
        h1_size: number of units in the first hidden layer.
        h2_size: number of units in the second hidden layer.

    Returns:
        The softmax output tensor.
    """
    widths = [784, h1_size, h2_size, 10]

    # Create all (weights, biases) pairs up front, layer by layer.
    params = []
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        weights = tf.Variable(tf.truncated_normal([n_in, n_out], stddev=stddev))
        biases = tf.Variable(tf.constant(b_init, shape=[n_out]))
        params.append((weights, biases))

    # Forward pass: ReLU on the hidden layers, softmax on the last one.
    activation = x
    for weights, biases in params[:-1]:
        activation = tf.nn.relu(tf.matmul(activation, weights) + biases)
    out_weights, out_biases = params[-1]
    return tf.nn.softmax(tf.matmul(activation, out_weights) + out_biases)

# Widths to try for the first and second hidden layer; optimize_mnist
# evaluates every (h1_size, h2_size) combination.
h1_size = [1000, 5000]
h2_size = [1000, 5000]

optimize_mnist(deep_network, h1_size, h2_size)

validation, test, hyperparameter
0.9762 0.9738 (1000, 1000)
0.975 0.9746 (1000, 5000)
0.976 0.9733 (5000, 1000)
0.9742 0.9734 (5000, 5000)
best setting
0.9762 0.9738 (1000, 1000)


In [7]:
# neural network with one hidden layer and dropout
def dropout(h_size):
    """Build a one-hidden-layer ReLU network with dropout on the hidden units.

    Reads the module-level `stddev` and `b_init` for initialisation, the
    shared placeholder `x` as input and the shared placeholder `keep_prob`
    as the dropout keep probability.

    Args:
        h_size: number of units in the hidden layer.

    Returns:
        The softmax output tensor.
    """
    w_hidden = tf.Variable(tf.truncated_normal([784, h_size], stddev=stddev))
    b_hidden = tf.Variable(tf.constant(b_init, shape=[h_size]))
    w_out = tf.Variable(tf.truncated_normal([h_size, 10], stddev=stddev))
    b_out = tf.Variable(tf.constant(b_init, shape=[10]))

    hidden = tf.nn.relu(tf.matmul(x, w_hidden) + b_hidden)
    # Dropout sits between the hidden activation and the output layer.
    kept = tf.nn.dropout(hidden, keep_prob)
    logits = tf.matmul(kept, w_out) + b_out
    return tf.nn.softmax(logits)

# Hidden-layer widths to try for the dropout network.
h_size = [100, 1000, 5000, 10000]
optimize_mnist(dropout, h_size)

validation, test, hyperparameter
0.9436 0.9381 (100,)
0.9718 0.9686 (1000,)
0.9736 0.9711 (5000,)
0.971 0.9696 (10000,)
best setting
0.9736 0.9711 (5000,)


In [8]:
# neural network with one hidden layer and batch normalization
def batch_normalization(h_size):
    """Build a one-hidden-layer network with batch normalisation on both layers.

    Neither layer has a bias; each pre-activation is normalised with the
    moments of whatever batch is currently fed through `x`, then shifted and
    scaled by learned variables. No running averages are kept, so the same
    batch statistics are used during evaluation.

    Args:
        h_size: number of units in the hidden layer.

    Returns:
        The softmax output tensor.
    """
    def _normalize(pre_activation, width):
        # Normalise over the batch axis, with a learned offset (starting at 0)
        # and a learned scale (starting at 1).
        batch_mean, batch_variance = tf.nn.moments(pre_activation, [0])
        shift = tf.Variable(tf.zeros([width]))
        gain = tf.Variable(tf.ones([width]))
        return tf.nn.batch_normalization(
            pre_activation, batch_mean, batch_variance, shift, gain, 1e-3)

    hidden_weights = tf.Variable(tf.truncated_normal([784, h_size], stddev=stddev))
    output_weights = tf.Variable(tf.truncated_normal([h_size, 10], stddev=stddev))

    hidden = tf.nn.relu(_normalize(tf.matmul(x, hidden_weights), h_size))
    logits = _normalize(tf.matmul(hidden, output_weights), 10)
    return tf.nn.softmax(logits)

# Hidden-layer widths to try for the batch-normalised network.
h_size = [100, 500, 1000, 5000]
optimize_mnist(batch_normalization, h_size)

validation, test, hyperparameter
0.9714 0.9727 (100,)
0.98 0.9786 (500,)
0.9816 0.9794 (1000,)
0.9786 0.9765 (5000,)
best setting
0.9816 0.9794 (1000,)


In [9]:
# deep neural network with batch normalization
def deep_batch_normalization(layer_size, h_size):
    """Build a batch-normalised network with `layer_size` hidden layers.

    Every layer is a bias-free matrix product whose result is normalised with
    the moments of the batch currently fed through `x`, then shifted and
    scaled by learned variables. Hidden layers apply ReLU; the final 10-unit
    layer applies softmax. Weights are initialised from the module-level
    `stddev`.

    Args:
        layer_size: number of hidden layers. With 0 the output layer is
            attached directly to the input.
        h_size: number of units in each hidden layer.

    Returns:
        The softmax output tensor.
    """
    def _bn_dense(inputs, out_dim):
        # The input width is read from the incoming tensor rather than assumed,
        # so the output layer also fits when there are no hidden layers.
        in_dim = int(inputs.get_shape()[1])
        weights = tf.Variable(tf.truncated_normal([in_dim, out_dim], stddev=stddev))
        z = tf.matmul(inputs, weights)
        mean, variance = tf.nn.moments(z, [0])
        offset = tf.Variable(tf.zeros([out_dim]))
        scale = tf.Variable(tf.ones([out_dim]))
        return tf.nn.batch_normalization(z, mean, variance, offset, scale, 1e-3)

    h = x
    for _ in range(layer_size):
        h = tf.nn.relu(_bn_dense(h, h_size))

    y = tf.nn.softmax(_bn_dense(h, 10))
    return y

# Numbers of hidden layers and per-layer widths to try; optimize_mnist
# evaluates every (layer_size, h_size) combination.
layer_size = [2, 3, 4, 5]
h_size = [1000, 2000, 3000, 4000]
optimize_mnist(deep_batch_normalization, layer_size, h_size)

validation, test, hyperparameter
0.981 0.9785 (2, 1000)
0.9792 0.9787 (2, 2000)
0.9832 0.9802 (2, 3000)
0.9816 0.9805 (2, 4000)
0.9778 0.9775 (3, 1000)
0.98 0.9757 (3, 2000)
0.9774 0.9774 (3, 3000)
0.9794 0.9784 (3, 4000)
0.9806 0.9766 (4, 1000)
0.9798 0.9771 (4, 2000)
0.9804 0.9788 (4, 3000)
0.9804 0.9774 (4, 4000)
0.974 0.9715 (5, 1000)
0.9744 0.9718 (5, 2000)
0.975 0.9732 (5, 3000)
0.9744 0.9736 (5, 4000)
best setting
0.9832 0.9802 (2, 3000)


In [10]:
# Build a batch-normalised network with 2 hidden layers of 1000 units and train
# it for 10000 iterations with batches of 500 (instead of the 1000 / 100 defaults).
# NOTE(review): the recorded grid search reported (2, 3000) as its best setting,
# while this cell uses (2, 1000) — confirm the smaller width is intentional.
best_model = deep_batch_normalization(2, 1000)
train_mnist(best_model, 10000, 500)

(0.991, 0.9867)