Deep Learning
=============

Assignment 4
------------

Previously in `2_fullyconnected.ipynb` and `3_regularization.ipynb`, we trained fully connected networks to classify [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) characters.

The goal of this assignment is to make the neural network convolutional.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
from six.moves import range

In [2]:
pickle_file = 'notMNIST.pickle'

# Read the pickled notMNIST splits. The file handle is only needed while the
# dictionary is being deserialized, so everything else happens after it closes.
# NOTE: pickle.load can execute arbitrary code; only open pickles you trust.
with open(pickle_file, 'rb') as f:
  save = pickle.load(f)

train_dataset, train_labels = save['train_dataset'], save['train_labels']
valid_dataset, valid_labels = save['valid_dataset'], save['valid_labels']
test_dataset, test_labels = save['test_dataset'], save['test_labels']
del save  # hint to help gc free up memory

print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

('Training set', (200000, 28, 28), (200000,))
('Validation set', (10000, 28, 28), (10000,))
('Test set', (18724, 28, 28), (18724,))


Reformat into a TensorFlow-friendly shape:
- convolutions need the image data formatted as a cube (width by height by #channels)
- labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10
num_channels = 1 # grayscale

def reformat(dataset, labels):
  """Reshape images into NHWC cubes and one-hot encode the labels.

  Args:
    dataset: array-like of images holding image_size * image_size *
      num_channels values per example, e.g. shape (N, 28, 28).
    labels: 1-D array-like of integer class ids. An id outside
      [0, num_labels) produces an all-zero row.

  Returns:
    A (dataset, labels) tuple of float32 arrays with shapes
    (N, image_size, image_size, num_channels) and (N, num_labels).
  """
  # np.asarray lets callers pass plain Python sequences as well as ndarrays.
  dataset = np.asarray(dataset).reshape(
    (-1, image_size, image_size, num_channels)).astype(np.float32)
  # Comparing a row of class ids against a column of labels broadcasts to a
  # boolean (N, num_labels) one-hot matrix, which is then cast to float.
  labels = (np.arange(num_labels) == np.asarray(labels)[:, None]).astype(np.float32)
  return dataset, labels
# Convert every split to the TensorFlow-friendly layout, then report shapes.
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
for split_name, split_data, split_labels in (
    ('Training set', train_dataset, train_labels),
    ('Validation set', valid_dataset, valid_labels),
    ('Test set', test_dataset, test_labels)):
  print(split_name, split_data.shape, split_labels.shape)

('Training set', (200000, 28, 28, 1), (200000, 10))
('Validation set', (10000, 28, 28, 1), (10000, 10))
('Test set', (18724, 28, 28, 1), (18724, 10))


In [4]:
def accuracy(predictions, labels):
  """Return the percentage of rows whose arg-max prediction matches the label.

  Both arguments are 2-D arrays with one row per example; the class of a row
  is taken to be the index of its largest entry.
  """
  predicted_classes = np.argmax(predictions, 1)
  true_classes = np.argmax(labels, 1)
  num_correct = np.sum(predicted_classes == true_classes)
  return 100.0 * num_correct / predictions.shape[0]

Let's build a small network with two convolutional layers, followed by one fully connected layer. Convolutional networks are more expensive computationally, so we'll limit its depth and number of fully connected nodes.

In [29]:
batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

  # Input data. Training minibatches are fed at run time through placeholders;
  # the validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  # The two stride-2 SAME convolutions in model() shrink each spatial side by
  # a factor of 4 (28 -> 14 -> 7). Floor division keeps the size an int on
  # Python 3, where `/` would produce a float that is not a valid dimension.
  # Assumes image_size is divisible by 4.
  reduced_size = image_size // 4
  layer3_weights = tf.Variable(tf.truncated_normal(
      [reduced_size * reduced_size * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Build the forward pass for `data` and return unnormalized logits.

    Two stride-2 convolutions (each followed by a ReLU) downsample the
    input; the result is flattened and passed through one hidden fully
    connected ReLU layer and a final linear read-out layer.
    """
    conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)
    conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)
    # Flatten every example's feature maps into one row for the matmul.
    shape = hidden.get_shape().as_list()
    reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation. Keyword arguments are used because the positional
  # order of (logits, labels) is not accepted by newer TensorFlow releases.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
      labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [18]:
num_steps = 1001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  for step in range(num_steps):
    # Walk through the training set one minibatch at a time, wrapping around
    # early enough that a full batch is always available.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end]
    batch_labels = train_labels[offset:batch_end]
    # One optimizer step; also fetch the loss and the batch predictions.
    _, batch_loss, predictions = session.run(
      [optimizer, loss, train_prediction],
      feed_dict={tf_train_dataset: batch_data, tf_train_labels: batch_labels})
    if step % 50 != 0:
      continue
    # Progress report every 50 steps.
    print('Minibatch loss at step %d: %f' % (step, batch_loss))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    valid_accuracy = accuracy(valid_prediction.eval(), valid_labels)
    print('Validation accuracy: %.1f%%' % valid_accuracy)
  test_accuracy = accuracy(test_prediction.eval(), test_labels)
  print('Test accuracy: %.1f%%' % test_accuracy)

Initialized
Minibatch loss at step 0: 2.766295
Minibatch accuracy: 6.2%
Validation accuracy: 9.8%
Minibatch loss at step 50: 0.855177
Minibatch accuracy: 75.0%
Validation accuracy: 54.3%
Minibatch loss at step 100: 0.545516
Minibatch accuracy: 87.5%
Validation accuracy: 74.8%
Minibatch loss at step 150: 1.221878
Minibatch accuracy: 75.0%
Validation accuracy: 76.5%
Minibatch loss at step 200: 0.688126
Minibatch accuracy: 81.2%
Validation accuracy: 78.5%
Minibatch loss at step 250: 1.037726
Minibatch accuracy: 68.8%
Validation accuracy: 77.8%
Minibatch loss at step 300: 1.437719
Minibatch accuracy: 62.5%
Validation accuracy: 76.5%
Minibatch loss at step 350: 0.348409
Minibatch accuracy: 93.8%
Validation accuracy: 77.5%
Minibatch loss at step 400: 0.579711
Minibatch accuracy: 81.2%
Validation accuracy: 80.5%
Minibatch loss at step 450: 1.501352
Minibatch accuracy: 75.0%
Validation accuracy: 80.8%
Minibatch loss at step 500: 0.671300
Minibatch accuracy: 81.2%
Validation accuracy: 81.6%
Min

---
Problem 1
---------

The convolutional model above uses convolutions with stride 2 to reduce the dimensionality. Replace the strides by a max pooling operation (`nn.max_pool()`) of stride 2 and kernel size 2.

---

Instead of reducing the dimensionality of the image by applying a stride of 2 (a very aggressive method, because it loses a lot of information from the image), let us keep the convolution stride at 1 (as opposed to 2) and reduce the image dimensions by looking at each pixel's neighborhood and combining those pixels somehow. That is called POOLING, and the way we are going to combine the pixels is by taking their maximum value, which is called MAX POOLING.

In other words, instead of reducing the image dimensions layer by layer using convolutions with a stride of 2, we reduce the images' dimensions using pooling.

In [50]:
batch_size = 16
patch_size = 5
depth = 16
num_hidden = 64

graph = tf.Graph()

with graph.as_default():

  # Input data. Training minibatches are fed at run time through placeholders;
  # the validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  layer1_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, num_channels, depth], stddev=0.1))
  layer1_biases = tf.Variable(tf.zeros([depth]))
  layer2_weights = tf.Variable(tf.truncated_normal(
      [patch_size, patch_size, depth, depth], stddev=0.1))
  layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth]))
  # The two 2x2/stride-2 max-pooling layers in model() shrink each spatial
  # side by a factor of 4 (28 -> 14 -> 7). Floor division keeps the size an
  # int on Python 3, where `/` would produce a float that is not a valid
  # dimension. Assumes image_size is divisible by 4.
  reduced_size = image_size // 4
  layer3_weights = tf.Variable(tf.truncated_normal(
      [reduced_size * reduced_size * depth, num_hidden], stddev=0.1))
  layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden]))
  layer4_weights = tf.Variable(tf.truncated_normal(
      [num_hidden, num_labels], stddev=0.1))
  layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels]))

  # Model.
  def model(data):
    """Build the forward pass for `data` and return unnormalized logits.

    Each of the two convolutions uses stride 1 (so it keeps the spatial
    size) and is followed by a ReLU and a 2x2 max pool with stride 2, which
    does the downsampling. The pooled maps are flattened and passed through
    one hidden fully connected ReLU layer and a linear read-out layer.
    """
    # Stride-1 SAME convolution: output has the same spatial size as input.
    conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer1_biases)

    # 2x2 max pooling with stride 2 reduces the image dimension.
    max_pool_1 = tf.nn.max_pool(hidden,
                                ksize=[1, 2, 2, 1],    # window size per input dimension
                                strides=[1, 2, 2, 1],  # window stride per input dimension
                                padding='SAME')

    conv = tf.nn.conv2d(max_pool_1, layer2_weights, [1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + layer2_biases)

    # Second 2x2 max pooling with stride 2.
    max_pool_2 = tf.nn.max_pool(hidden, ksize=[1, 2, 2, 1],
                                strides=[1, 2, 2, 1], padding='SAME')

    # Flatten every example's feature maps into one row for the matmul.
    shape = max_pool_2.get_shape().as_list()
    reshape = tf.reshape(max_pool_2, [shape[0], shape[1] * shape[2] * shape[3]])
    hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)
    return tf.matmul(hidden, layer4_weights) + layer4_biases

  # Training computation. Keyword arguments are used because the positional
  # order of (logits, labels) is not accepted by newer TensorFlow releases.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
      labels=tf_train_labels, logits=logits))

  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [51]:
num_steps = 1001

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  for step in range(num_steps):
    # Pick the next minibatch window; the modulo wraps it back to the start
    # before it could run past the end of the training set.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end]
    batch_labels = train_labels[offset:batch_end]
    # One optimizer step; also fetch the loss and the batch predictions.
    _, batch_loss, predictions = session.run(
      [optimizer, loss, train_prediction],
      feed_dict={tf_train_dataset: batch_data, tf_train_labels: batch_labels})
    if step % 50 != 0:
      continue
    # Progress report every 50 steps.
    print('Minibatch loss at step %d: %f' % (step, batch_loss))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    valid_accuracy = accuracy(valid_prediction.eval(), valid_labels)
    print('Validation accuracy: %.1f%%' % valid_accuracy)
  test_accuracy = accuracy(test_prediction.eval(), test_labels)
  print('Test accuracy: %.1f%%' % test_accuracy)

Initialized
Minibatch loss at step 0: 2.705396
Minibatch accuracy: 6.2%
Validation accuracy: 10.2%
Minibatch loss at step 50: 1.200283
Minibatch accuracy: 62.5%
Validation accuracy: 38.6%
Minibatch loss at step 100: 0.760845
Minibatch accuracy: 87.5%
Validation accuracy: 72.3%
Minibatch loss at step 150: 1.322434
Minibatch accuracy: 68.8%
Validation accuracy: 75.3%
Minibatch loss at step 200: 0.705520
Minibatch accuracy: 81.2%
Validation accuracy: 78.6%
Minibatch loss at step 250: 1.041272
Minibatch accuracy: 62.5%
Validation accuracy: 77.7%
Minibatch loss at step 300: 0.965750
Minibatch accuracy: 68.8%
Validation accuracy: 74.6%
Minibatch loss at step 350: 0.380866
Minibatch accuracy: 87.5%
Validation accuracy: 80.0%
Minibatch loss at step 400: 0.486970
Minibatch accuracy: 87.5%
Validation accuracy: 80.7%
Minibatch loss at step 450: 1.175629
Minibatch accuracy: 75.0%
Validation accuracy: 81.0%
Minibatch loss at step 500: 0.524969
Minibatch accuracy: 93.8%
Validation accuracy: 82.6%
Mi

---
Problem 2
---------

Try to get the best performance you can using a convolutional net. Look for example at the classic [LeNet5](http://yann.lecun.com/exdb/lenet/) architecture, adding Dropout, and/or adding learning rate decay.

---

In [12]:
def weight_variable(shape):
  """Create a weight Variable of the given shape.

  Initial values are drawn from a truncated normal distribution with a
  standard deviation of 0.1.
  """
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
  """Create a bias Variable of the given shape, with every entry set to 0.1."""
  return tf.Variable(tf.constant(0.1, shape=shape))

def conv2d(x, W):
  """Convolve `x` with the filter bank `W` using unit strides and SAME padding.

  With a stride of 1 and SAME padding the output keeps the spatial size of
  the input.
  """
  unit_strides = [1, 1, 1, 1]
  return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
  """Max-pool `x` over 2x2 windows with a stride of 2 and SAME padding.

  The stride of 2 reduces each spatial dimension of the image by half.
  """
  window = [1, 2, 2, 1]  # size of the window for each dimension of the input
  step = [1, 2, 2, 1]    # stride of the sliding window for each dimension
  return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')


In [31]:
batch_size = 16
patch_size = 5
conv1_depth = 32   # number of feature maps produced by the first convolution
conv2_depth = 64   # number of feature maps produced by the second convolution
num_hidden = 1024

graph = tf.Graph()

with graph.as_default():

  # Input data. Training minibatches are fed at run time through placeholders;
  # the validation and test sets are embedded in the graph as constants.
  tf_train_dataset = tf.placeholder(
    tf.float32, shape=(batch_size, image_size, image_size, num_channels))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)

  # Variables.
  layer1_weights = weight_variable(
    [patch_size, patch_size, num_channels, conv1_depth])
  layer1_biases = bias_variable([conv1_depth])

  layer2_weights = weight_variable(
    [patch_size, patch_size, conv1_depth, conv2_depth])
  layer2_biases = bias_variable([conv2_depth])

  # The two 2x2/stride-2 poolings in model() shrink each spatial side by a
  # factor of 4 (28 -> 14 -> 7), so the flattened input is 7 * 7 * 64 = 3136
  # wide. Floor division keeps the size an int on Python 3, where `/` would
  # produce a float that is not a valid dimension. Assumes image_size is
  # divisible by 4.
  reduced_size = image_size // 4
  layer3_weights = weight_variable(
    [reduced_size * reduced_size * conv2_depth, num_hidden])  # [3136 x 1024]
  layer3_biases = bias_variable([num_hidden])

  layer4_weights = weight_variable([num_hidden, num_labels])
  layer4_biases = bias_variable([num_labels])

  # Dropout keep probability, fed at run time so training can use dropout
  # while evaluation feeds 1.0 to disable it.
  keep_prob = tf.placeholder(tf.float32)

  # Model.
  def model(data):
    """Build the forward pass for `data` and return unnormalized logits.

    Architecture: conv + ReLU, 2x2 max pool, conv + ReLU, 2x2 max pool,
    a fully connected ReLU layer with dropout (controlled by `keep_prob`),
    and a linear read-out layer.
    """
    # 1st layer (convolutional): stride 1 keeps the spatial size.
    h_conv1 = tf.nn.relu(conv2d(data, layer1_weights) + layer1_biases)

    # 2nd layer (subsampling): 2x2 max pooling with stride 2.
    h_pool1 = max_pool_2x2(h_conv1)

    # 3rd layer (convolutional).
    h_conv2 = tf.nn.relu(conv2d(h_pool1, layer2_weights) + layer2_biases)

    # 4th layer (subsampling): 2x2 max pooling with stride 2.
    h_pool2 = max_pool_2x2(h_conv2)

    # Flatten every example's feature maps into one row for the matmul.
    shape = h_pool2.get_shape().as_list()
    reshape = tf.reshape(h_pool2, [shape[0], shape[1] * shape[2] * shape[3]])

    # Fully connected layer, with dropout applied before the read-out layer.
    fully_connected = tf.nn.dropout(
      tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases),
      keep_prob)

    return tf.matmul(fully_connected, layer4_weights) + layer4_biases

  # Training computation. Keyword arguments are used because the positional
  # order of (logits, labels) is not accepted by newer TensorFlow releases.
  logits = model(tf_train_dataset)
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
      labels=tf_train_labels, logits=logits))

  # Learning rate decay: with staircase=True the rate is multiplied by
  # decay_rate once every decay_steps optimizer steps.
  # NOTE(review): decay_steps is compared against global_step, which counts
  # optimizer steps rather than examples, so scaling it by batch_size may not
  # be what was intended -- confirm.
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.03
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                             decay_steps=(8 * batch_size),
                                             decay_rate=0.95,
                                             staircase=True)

  # Optimizer. Passing global_step makes minimize() increment it every step.
  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    loss, global_step=global_step)

  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
  test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [32]:
num_steps = 1001

# Per-step history, kept so it can be plotted after training.
learning_rate_decay = []
loss_func = []

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  for step in range(num_steps):
    # Pick the next minibatch window; the modulo wraps it back to the start
    # before it could run past the end of the training set.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    batch_end = offset + batch_size
    batch_data = train_dataset[offset:batch_end]
    batch_labels = train_labels[offset:batch_end]

    # Train with dropout enabled (half of the hidden activations are kept).
    _, batch_loss, current_rate, predictions = session.run(
      [optimizer, loss, learning_rate, train_prediction],
      feed_dict={tf_train_dataset: batch_data,
                 tf_train_labels: batch_labels,
                 keep_prob: 0.5})

    learning_rate_decay.append(current_rate)
    loss_func.append(batch_loss)

    if step % 50 != 0:
      continue
    # Progress report every 50 steps; dropout is disabled for evaluation.
    print('Minibatch loss at step %d: %f' % (step, batch_loss))
    print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels))
    valid_accuracy = accuracy(valid_prediction.eval({keep_prob: 1.0}), valid_labels)
    print('Validation accuracy: %.1f%%' % valid_accuracy)
    print('Traning data used amount: %d : %d' % (offset, batch_end))

  test_accuracy = accuracy(test_prediction.eval({keep_prob: 1.0}), test_labels)
  print('Test accuracy: %.1f%%' % test_accuracy)

Initialized
Minibatch loss at step 0: 8.814513
Minibatch accuracy: 18.8%
Validation accuracy: 9.8%
Traning data used amount: 0 : 16
Minibatch loss at step 50: 1.206202
Minibatch accuracy: 56.2%
Validation accuracy: 53.9%
Traning data used amount: 800 : 816
Minibatch loss at step 100: 0.902726
Minibatch accuracy: 75.0%
Validation accuracy: 73.9%
Traning data used amount: 1600 : 1616
Minibatch loss at step 150: 1.683960
Minibatch accuracy: 43.8%
Validation accuracy: 75.9%
Traning data used amount: 2400 : 2416
Minibatch loss at step 200: 0.991665
Minibatch accuracy: 62.5%
Validation accuracy: 78.6%
Traning data used amount: 3200 : 3216
Minibatch loss at step 250: 1.127099
Minibatch accuracy: 68.8%
Validation accuracy: 79.7%
Traning data used amount: 4000 : 4016
Minibatch loss at step 300: 1.398275
Minibatch accuracy: 56.2%
Validation accuracy: 79.5%
Traning data used amount: 4800 : 4816
Minibatch loss at step 350: 0.732370
Minibatch accuracy: 81.2%
Validation accuracy: 81.1%
Traning data 

In [34]:
import matplotlib.pyplot as plt

# Learning rate recorded at every training step. The y-axis shows the
# learning rate itself, not a cost.
fig, ax = plt.subplots()
ax.plot(learning_rate_decay)
ax.grid(True)
ax.set_xlabel('Iterations')
ax.set_ylabel('Learning rate')
ax.set_title('Learning rate decay')
plt.show()

# Minibatch loss recorded at every training step.
fig, ax = plt.subplots()
ax.plot(loss_func)
ax.grid(True)
ax.set_xlabel('Iterations')
ax.set_ylabel('Loss')
ax.set_title('Minibatch loss during training')
plt.show()