# Deep Learning: MNIST
Supplemental files for the "Deep Learning" workshop, presented by the IDEA Student Center at UC San Diego.

## Requirements
- python 2.x or 3.x
- numpy
- matplotlib
- tensorflow

## Goals
In this notebook, we'll compare the performance of "classic" machine learning methods and deep learning methods, as measured by the classification accuracy on the MNIST handwritten digits data set.

In [1]:
# make the code compatible with both Python 2 and 3
# (a __future__ import has to be the first statement, otherwise this cell is a
# SyntaxError as soon as the notebook is exported or run as a plain script)
from __future__ import print_function, division

# load required packages
import tensorflow as tf
import numpy as np

In [2]:
# download MNIST data into a local folder and load it
# (one_hot=True: labels are requested one-hot encoded, which is the layout the
# [None, 10] label placeholders in the following sections are fed with)
from tensorflow.examples.tutorials.mnist import input_data

mnist_dir = "MNIST_data"
mnist = input_data.read_data_sets(mnist_dir, one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## 1) Logistic regression
We'll start with a classic ML classification method (logistic regression) to get a baseline on performance.

In [19]:
# start from an empty graph so that re-running this cell does not keep adding
# duplicate ops and variables to the default graph
tf.reset_default_graph()

# parameters
learning_rate = 0.01
training_epochs = 25
batch_size = 100
display_step = 1  # print the average cost every `display_step` epochs

# tf Graph Input
x = tf.placeholder(tf.float32, [None, 784]) # mnist data image of shape 28*28=784
y = tf.placeholder(tf.float32, [None, 10]) # 0-9 digits recognition => 10 classes

# set model weights
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# construct model: keep the raw scores (logits) around for the loss, and
# expose the class probabilities as `pred`
logits = tf.matmul(x, W) + b
pred = tf.nn.softmax(logits) # Softmax

# set the objective: minimize error using cross entropy
# The loss is computed from the logits rather than as -sum(y * log(pred)):
# log(pred) becomes -inf as soon as a probability underflows to 0, and
# 0 * -inf then turns the whole cost into NaN.
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))

# set the optimization method: gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

In [20]:
# run the graph: fit the logistic-regression model, then score it on the test set
with tf.Session() as sess:
    sess.run(init)

    # one pass over the training data per epoch
    for epoch_number in range(1, training_epochs + 1):
        # number of full mini-batches in one pass over the training set
        total_batch = mnist.train.num_examples // batch_size
        avg_cost = 0.
        for _batch in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            feed = {x: batch_xs, y: batch_ys}
            # one gradient-descent step; also fetch the cost on this batch
            _, batch_cost = sess.run([optimizer, cost], feed_dict=feed)
            # running mean of the per-batch cost over the epoch
            avg_cost += batch_cost / total_batch
        # report progress every `display_step` epochs
        if epoch_number % display_step == 0:
            print("Epoch:", '%04d' % epoch_number, "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # a prediction is correct when the most probable class matches the label
    is_correct = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))

    # accuracy = fraction of correct predictions, evaluated on the testing set
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
    test_feed = {x: mnist.test.images, y: mnist.test.labels}
    print("Accuracy:", sess.run(accuracy, feed_dict=test_feed))

Epoch: 0001 cost= 1.184079722
Epoch: 0002 cost= 0.661790305
Epoch: 0003 cost= 0.556142307
Epoch: 0004 cost= 0.498394211
Epoch: 0005 cost= 0.465580386
Epoch: 0006 cost= 0.442832884
Epoch: 0007 cost= 0.423290196
Epoch: 0008 cost= 0.413138235
Epoch: 0009 cost= 0.400863942
Epoch: 0010 cost= 0.392564389
Epoch: 0011 cost= 0.385719246
Epoch: 0012 cost= 0.379882008
Epoch: 0013 cost= 0.369777387
Epoch: 0014 cost= 0.367966681
Epoch: 0015 cost= 0.363209867
Epoch: 0016 cost= 0.357057785
Epoch: 0017 cost= 0.355990491
Epoch: 0018 cost= 0.350483525
Epoch: 0019 cost= 0.349041883
Epoch: 0020 cost= 0.347110362
Epoch: 0021 cost= 0.341326167
Epoch: 0022 cost= 0.340979889
Epoch: 0023 cost= 0.337079989
Epoch: 0024 cost= 0.335280956
Epoch: 0025 cost= 0.334955501
Optimization Finished!
Accuracy: 0.9142


## 2) Multilayer Perceptron (MLP)
Next we'll try out a "classic" neural network model (i.e. not a deep neural network).

In [21]:
# start the MLP in an empty graph: otherwise the ops and variables of the
# previous model stay in the default graph, get re-initialized by this
# section's `init`, and accumulate every time the cells are re-run
tf.reset_default_graph()

# parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 100
display_step = 1  # print the average cost every `display_step` epochs

# neural network Parameters
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)

# tf Graph input (tf.float32 spelled out, as in the other sections)
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])

In [22]:
# function to create the MLP model
def multilayer_perceptron(x, weights, biases):
    """Build a two-hidden-layer perceptron and return its output scores.

    Args:
        x: input tensor; it is matrix-multiplied with ``weights['h1']``.
        weights: dict with the keys ``'h1'``, ``'h2'`` and ``'out'``.
        biases: dict with the keys ``'b1'``, ``'b2'`` and ``'out'``.

    Returns:
        The output-layer tensor. No activation is applied to it (linear
        output), so callers get raw scores rather than probabilities.
    """
    activations = x
    # two hidden layers, each an affine map followed by a ReLU
    for weight_key, bias_key in (('h1', 'b1'), ('h2', 'b2')):
        pre_activation = tf.matmul(activations, weights[weight_key]) + biases[bias_key]
        activations = tf.nn.relu(pre_activation)
    # output layer with linear activation
    return tf.add(tf.matmul(activations, weights['out']), biases['out'])

In [23]:
# trainable parameters of the MLP; every entry starts out as a draw from
# tf.random_normal with the shape given here
weights = dict(
    h1=tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    h2=tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    out=tf.Variable(tf.random_normal([n_hidden_2, n_classes])),
)
biases = dict(
    b1=tf.Variable(tf.random_normal([n_hidden_1])),
    b2=tf.Variable(tf.random_normal([n_hidden_2])),
    out=tf.Variable(tf.random_normal([n_classes])),
)

# wire the placeholders through the network; `pred` holds the output scores
pred = multilayer_perceptron(x, weights, biases)

# loss: softmax cross entropy between scores and labels, averaged over the batch
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)
cost = tf.reduce_mean(per_example_loss)

# optimizer: Adam, minimizing the averaged loss
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = adam.minimize(cost)

# op that initializes the variables
init = tf.global_variables_initializer()

In [24]:
# Launch the graph: train the MLP, then score it on the test set
with tf.Session() as sess:
    sess.run(init)

    # number of full mini-batches per epoch (does not change between epochs)
    total_batch = mnist.train.num_examples // batch_size

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step. The test uses the 1-based epoch number
        # that is printed, so display_step=5 reports epochs 5, 10, 15 (the
        # same convention as the logistic-regression loop) instead of 1, 6, 11.
        if (epoch + 1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=",
                  "{:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Test model: a prediction is correct when the highest-scoring class
    # matches the label
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))

    # calculate accuracy on the testing set
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy:",
          accuracy.eval({x: mnist.test.images, y: mnist.test.labels})
         )

Epoch: 0001 cost= 159.799869869
Epoch: 0002 cost= 40.983336299
Epoch: 0003 cost= 25.266888553
Epoch: 0004 cost= 17.942928899
Epoch: 0005 cost= 12.705468488
Epoch: 0006 cost= 9.273931177
Epoch: 0007 cost= 6.958779475
Epoch: 0008 cost= 5.080893338
Epoch: 0009 cost= 3.673082374
Epoch: 0010 cost= 2.836607559
Epoch: 0011 cost= 2.123366972
Epoch: 0012 cost= 1.647398465
Epoch: 0013 cost= 1.150525474
Epoch: 0014 cost= 0.895786823
Epoch: 0015 cost= 0.822540025
Optimization Finished!
Accuracy: 0.9458


## 3) Convolutional Neural Network (CNN)
Now let's try out our first deep neural network: a Convolutional Neural Network (CNN).

The CNN is made up of a few core layer types, which get stacked on top of each other:
- convolutional layers (2D)
- max pooling layers (2D)
- fully connected layers (same type as in a MLP model)

In [16]:
# Start the CNN in an empty graph: otherwise the ops and variables of the
# previous models stay in the default graph, get re-initialized by this
# section's `init`, and accumulate every time the cells are re-run
tf.reset_default_graph()

# Parameters
learning_rate = 0.001
training_iters = 100000 # training stops once step * batch_size reaches this
batch_size = 128
display_step = 10 # report minibatch loss/accuracy every `display_step` steps

# Network Parameters
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
dropout = 0.75 # Dropout, probability to keep units

# tf Graph input
x = tf.placeholder(tf.float32, [None, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)

In [17]:
# Create some wrappers for simplicity
def conv2d(x, W, b, strides=1):
    """2-D convolution followed by a bias add and a ReLU.

    Args:
        x: input tensor handed to ``tf.nn.conv2d``.
        W: convolution filter tensor.
        b: bias added to the convolution result via ``tf.nn.bias_add``.
        strides: step used for both middle entries of the stride spec
            ``[1, strides, strides, 1]``.

    Returns:
        The rectified, biased convolution output ('SAME' padding).
    """
    stride_spec = [1, strides, strides, 1]
    convolved = tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')
    return tf.nn.relu(tf.nn.bias_add(convolved, b))


def maxpool2d(x, k=2):
    """Max-pool ``x`` with a k-by-k window moved in steps of k.

    Args:
        x: input tensor handed to ``tf.nn.max_pool``.
        k: used for both the window size and the stride
            (``[1, k, k, 1]`` in each case).

    Returns:
        The pooled tensor ('SAME' padding).
    """
    window = [1, k, k, 1]
    # the stride equals the window, so consecutive windows do not overlap
    return tf.nn.max_pool(x, ksize=window, strides=list(window), padding='SAME')


# Create model
def conv_net(x, weights, biases, dropout):
    """Build the CNN: two conv/max-pool stages, one dense layer, one output layer.

    Args:
        x: input tensor; it is reshaped to ``[-1, 28, 28, 1]`` first.
        weights: dict with the keys ``'wc1'``, ``'wc2'``, ``'wd1'`` and ``'out'``.
        biases: dict with the keys ``'bc1'``, ``'bc2'``, ``'bd1'`` and ``'out'``.
        dropout: forwarded as the second argument of ``tf.nn.dropout``
            (the notebook feeds the keep probability here).

    Returns:
        The output-layer tensor (class scores, no activation applied).
    """
    # restore the 28x28 single-channel image layout
    images = tf.reshape(x, shape=[-1, 28, 28, 1])

    # stage 1: convolution, then 2x2 max pooling (down-sampling)
    pooled1 = maxpool2d(conv2d(images, weights['wc1'], biases['bc1']), k=2)

    # stage 2: convolution, then 2x2 max pooling (down-sampling)
    pooled2 = maxpool2d(conv2d(pooled1, weights['wc2'], biases['bc2']), k=2)

    # fully connected layer: flatten to the input width of the dense weights
    dense_in_width = weights['wd1'].get_shape().as_list()[0]
    flat = tf.reshape(pooled2, [-1, dense_in_width])
    dense = tf.nn.relu(tf.matmul(flat, weights['wd1']) + biases['bd1'])
    # apply dropout to the dense activations
    dense = tf.nn.dropout(dense, dropout)

    # output, class prediction
    return tf.matmul(dense, weights['out']) + biases['out']

In [18]:
# Shapes of the trainable parameters; each one becomes a tf.Variable that is
# initialized from tf.random_normal
weight_shapes = [
    ('wc1', [5, 5, 1, 32]),       # 5x5 conv, 1 input, 32 outputs
    ('wc2', [5, 5, 32, 64]),      # 5x5 conv, 32 inputs, 64 outputs
    ('wd1', [7*7*64, 1024]),      # fully connected, 7*7*64 inputs, 1024 outputs
    ('out', [1024, n_classes]),   # 1024 inputs, 10 outputs (class prediction)
]
bias_shapes = [
    ('bc1', [32]),
    ('bc2', [64]),
    ('bd1', [1024]),
    ('out', [n_classes]),
]

weights = {name: tf.Variable(tf.random_normal(shape))
           for name, shape in weight_shapes}
biases = {name: tf.Variable(tf.random_normal(shape))
          for name, shape in bias_shapes}

# Construct model; the dropout keep probability is fed at run time
pred = conv_net(x, weights, biases, keep_prob)

# Loss: softmax cross entropy between scores and labels, averaged over the batch
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)
cost = tf.reduce_mean(per_example_loss)

# Optimizer: Adam, minimizing the averaged loss
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
optimizer = adam.minimize(cost)

# Evaluate model: share of examples whose top-scoring class matches the label
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Op that initializes the variables
init = tf.global_variables_initializer()

In [19]:
# Launch the graph: train the CNN, then score it on the test set
with tf.Session() as sess:
    sess.run(init)
    step = 1
    # Keep training until reach max iterations
    while step * batch_size < training_iters:
        batch_x, batch_y = mnist.train.next_batch(batch_size)
        # Run optimization op (backprop), with dropout active
        sess.run(optimizer, feed_dict={x: batch_x, y: batch_y,
                                       keep_prob: dropout})
        if step % display_step == 0:
            # Calculate batch loss and accuracy (keep_prob=1.0 turns dropout
            # off so the reported numbers are not perturbed by it)
            loss, acc = sess.run([cost, accuracy], feed_dict={x: batch_x,
                                                              y: batch_y,
                                                              keep_prob: 1.0})
            print("Iter " + str(step*batch_size)
                  + ", Minibatch Loss= " + "{:.6f}".format(loss)
                  + ", Training Accuracy= " + "{:.5f}".format(acc)
                 )
        step += 1
    print("Optimization Finished!")

    # Calculate accuracy over the whole test set, one mini-batch at a time.
    # Feeding every test image through the conv layers in a single run needs
    # a very large activation buffer and can exhaust memory.
    n_test = mnist.test.num_examples
    n_correct = 0.
    for start in range(0, n_test, batch_size):
        test_x = mnist.test.images[start:start + batch_size]
        test_y = mnist.test.labels[start:start + batch_size]
        batch_acc = sess.run(accuracy, feed_dict={x: test_x,
                                                  y: test_y,
                                                  keep_prob: 1.0})
        # weight by the batch length: the last batch may be shorter
        n_correct += batch_acc * len(test_x)
    print("Testing Accuracy:", n_correct / n_test)

Iter 1280, Minibatch Loss= 25681.146484, Training Accuracy= 0.17188
Iter 2560, Minibatch Loss= 9265.119141, Training Accuracy= 0.46094
Iter 3840, Minibatch Loss= 5808.438965, Training Accuracy= 0.64062
Iter 5120, Minibatch Loss= 5467.181641, Training Accuracy= 0.65625
Iter 6400, Minibatch Loss= 3853.803467, Training Accuracy= 0.73438
Iter 7680, Minibatch Loss= 4165.374512, Training Accuracy= 0.81250
Iter 8960, Minibatch Loss= 4282.016602, Training Accuracy= 0.78125
Iter 10240, Minibatch Loss= 3157.831543, Training Accuracy= 0.82812
Iter 11520, Minibatch Loss= 3798.208496, Training Accuracy= 0.81250
Iter 12800, Minibatch Loss= 2996.082520, Training Accuracy= 0.80469
Iter 14080, Minibatch Loss= 1633.943970, Training Accuracy= 0.88281
Iter 15360, Minibatch Loss= 1953.378662, Training Accuracy= 0.89844
Iter 16640, Minibatch Loss= 1734.333130, Training Accuracy= 0.85938
Iter 17920, Minibatch Loss= 1608.338867, Training Accuracy= 0.89062
Iter 19200, Minibatch Loss= 1340.670654, Training Accu

KeyboardInterrupt: 