Deep Learning
=============

Assignment 3
------------

Previously in `2_fullyconnected.ipynb`, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.

In [7]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle
import copy
import math
import gc

First reload the data we generated in `1_notmnist.ipynb`.

In [8]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:
- data as a flat matrix,
- labels as float 1-hot encodings.

In [9]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 1 to [0.0, 1.0, 0.0 ...], 2 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

nx = train_dataset.shape[1]
ny = train_labels.shape[1]

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


---
Problem 1
---------

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compute the L2 loss for a tensor `t` using `nn.l2_loss(t)`. The right amount of regularization should improve your validation / test accuracy.

---

In [32]:
class Classifier(object):
    """
    A classifier object.
    
    nx = number of features for the dataset (size of input layer)
    ny = number of features for the output (number of classes)
    batch_size = size per batch
    epochs = number of epochs to train for
    print_loss_interval = how often losses are printed to the screen during training, defined as # of training steps
    max_iter = maximum number of iterations to train for
    learning_rate = the learning rate alpha
    reg_lambda = the L2 regularization parameter lambda
    name = the name for this classifier, which is used as an unique identifier in storing the trained parameters
    """
    def __init__(self, nx, ny, layers = [], batch_size = 128, epochs = 2, print_loss_interval = 500, max_iter = None,
                 learning_rate = 0.001, learn_decay_rate = 0, learn_decay_steps = 5000,
                 reg_lambda = 0.5, dropout_rate = 0, name = 'classifier'):
        self.name = name
        self.nx = nx
        self.ny = ny
        self.savefile = './models/' + ''.join(c for c in name if c.isalnum() or c in ['_', '-']) + '.model'
        self.layers = layers
        self.learning_rate = learning_rate
        self.learn_decay_rate = learn_decay_rate
        self.learn_decay_steps = learn_decay_steps
        self.reg_lambda = reg_lambda
        self.dropout_rate = dropout_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.print_loss_interval = print_loss_interval
        self.max_iter = max_iter
        
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.tf_train_X = tf.placeholder(tf.float32, shape=(None, nx))
            self.tf_train_Y = tf.placeholder(tf.float32, shape=(None, ny))
            self.tf_valid_X = tf.placeholder(tf.float32, shape=(None, nx))
            m = tf.to_float(tf.shape(self.tf_train_X)[0])

            # Initialize weights and biases variables
            self.all_layers = copy.deepcopy(self.layers)
            self.all_layers.insert(0, nx)
            self.all_layers.append(ny)
            self.weights = [None] * len(self.all_layers)
            self.biases = [None] * len(self.all_layers)
            for i in range(1, len(self.all_layers)):
                self.weights[i] = tf.Variable(tf.random_normal((self.all_layers[i - 1], self.all_layers[i]), stddev = np.sqrt(2.0 / self.all_layers[i - 1])),
                                             name = 'W' + str(i))
                self.biases[i] = tf.Variable(tf.zeros((self.all_layers[i])), name = 'b' + str(i))

            # Training computation.
            self.logits = self.__forward_prop(self.tf_train_X)
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.tf_train_Y)) + (reg_lambda / (2 * m)) * tf.add_n([tf.nn.l2_loss(w) for w in self.weights[1:]])
            
            # Optimizer.
            if learn_decay_rate > 0 and learn_decay_steps > 0:
                global_step = tf.Variable(0, name = 'global_step', trainable = False)
                self.learn_rate_decayed = tf.train.exponential_decay(self.learning_rate, global_step, learn_decay_steps, learn_decay_rate)
                self.optimizer = tf.train.AdamOptimizer(learning_rate = self.learn_rate_decayed).minimize(self.loss, global_step = global_step)
            else:            
                self.optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate).minimize(self.loss)
            
            # Predictor.
            self.predictor = tf.nn.softmax(self.logits)
            self.valid_predictor = tf.nn.softmax(self.__forward_prop(self.tf_valid_X, is_train = False))
            
            # Saver.
            self.saver = tf.train.Saver()
    
    def __forward_prop(self, X, is_train = True):
        with self.graph.as_default():
            Z = [None] * len(self.all_layers)
            A = [None] * len(self.all_layers)
            A[0] = X

            for i in range(1, len(self.all_layers)):
                Z[i] = tf.matmul(A[i - 1], self.weights[i]) + self.biases[i]
                if i != len(self.all_layers) - 1:
                    if is_train and self.dropout_rate > 0:
                        A[i] = tf.nn.dropout(tf.nn.relu(Z[i]), 1 - self.dropout_rate)
                    else:
                        A[i] = tf.nn.relu(Z[i])
            
            return Z[-1]
    
    def predict_probs(self, X):
        with tf.Session(graph=self.graph) as session:
            saver = tf.train.Saver()
            saver.restore(session, self.savefile)
            p = tf.nn.softmax(self.__forward_prop(tf.constant(X, dtype=tf.float32), is_train = False)).eval()
            return p
    
    def predict(self, X):
        return np.argmax(self.predict_probs(X), 1)
    
    def score(self, X, Y):
        predictions = self.predict(X)
        return (100.0 * np.sum(predictions == np.argmax(Y, 1)) / predictions.shape[0])
    
    def train(self, train_X, train_Y, valid_X = None, valid_Y = None):
        m = train_X.shape[0]
        
        with tf.Session(graph=self.graph) as session:
            tf.global_variables_initializer().run()
            print("Initialized!")
            step = 0

            num_batches = int(math.floor(m / self.batch_size))
            if m % self.batch_size != 0:
                num_batches += 1

            for epoch in range(self.epochs):
                gc.collect(); indices = np.arange(m); np.random.shuffle(indices)
                for i in range(num_batches):
                    step += 1

                    if i == num_batches - 1:
                        batch_X = train_X[indices[self.batch_size * i:m]]
                        batch_Y = train_Y[indices[self.batch_size * i:m]]
                    else:
                        batch_X = train_X[indices[self.batch_size * i:self.batch_size * (i + 1)]]
                        batch_Y = train_Y[indices[self.batch_size * i:self.batch_size * (i + 1)]]
                    
                    _, l, predictions= session.run([self.optimizer, self.loss, self.predictor], 
                                                    feed_dict={self.tf_train_X: batch_X, self.tf_train_Y: batch_Y})                    
                    
                    if (step % self.print_loss_interval == 0):
                        print("Minibatch loss at step %d: %f" % (step, l))
                        print("Minibatch accuracy: %.1f%%" % 
                              (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(batch_Y, 1)) / predictions.shape[0]))
                        
                        if valid_X is not None and valid_Y is not None:
                            valid_predictions = session.run(self.valid_predictor, feed_dict={self.tf_valid_X: valid_X})
                            print("Validation accuracy: %.1f%%" %
                                  (100.0 * np.sum(np.argmax(valid_predictions, 1) == np.argmax(valid_Y, 1)) / valid_Y.shape[0]))
                        
                        # Save weights for this step
                        self.saver.save(session, self.savefile)
                    
                    # Print new learning rate
                    if self.learn_decay_rate > 0 and self.learn_decay_steps > 0 and step % self.learn_decay_steps == 0:
                        print("New learning rate:", self.learn_rate_decayed.eval())
                    
                    if self.max_iter and step >= self.max_iter:
                        # Save weights and biases early because there was a hard cap on iterations
                        self.saver.save(session, self.savefile)
                        return

            # Save weights and biases
            self.saver.save(session, self.savefile)

In [21]:
logistic_regression = Classifier(nx, ny, reg_lambda = 0, name = "logistic_regression")
logistic_regression.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % logistic_regression.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % logistic_regression.score(test_dataset, test_labels))
print("***********************************************")

Initialized!
***********************************************
Minibatch loss at step 500: 3.782097
Minibatch accuracy: 57.8%
Validation accuracy: 63.5%
Minibatch loss at step 1000: 2.666139
Minibatch accuracy: 68.8%
Validation accuracy: 70.8%
Minibatch loss at step 1500: 3.117322
Minibatch accuracy: 68.8%
Validation accuracy: 72.9%
***********************************************
Minibatch loss at step 2000: 2.023502
Minibatch accuracy: 71.1%
Validation accuracy: 74.1%
Minibatch loss at step 2500: 2.276888
Minibatch accuracy: 76.6%
Validation accuracy: 74.9%
Minibatch loss at step 3000: 1.963044
Minibatch accuracy: 74.2%
Validation accuracy: 75.0%
***********************************************
Validation accuracy: 74.9%
Test accuracy: 82.7%
***********************************************


In [6]:
one_layer_nn = Classifier(nx, ny, layers = [1024], reg_lambda = 0, print_loss_interval = 500, 
                          print_loss_interval = 100, learning_rate = 10, name = "one_layer_nn")
one_layer_nn.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % one_layer_nn.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % one_layer_nn.score(test_dataset, test_labels))
print("***********************************************")

Initialized!
***********************************************
Minibatch loss at step 500: 1433049.750000
Minibatch accuracy: 68.0%
Validation accuracy: 64.1%
Minibatch loss at step 1000: 561720.750000
Minibatch accuracy: 62.5%
Validation accuracy: 56.8%
Minibatch loss at step 1500: 241709.468750
Minibatch accuracy: 62.5%
Validation accuracy: 61.9%
***********************************************
Minibatch loss at step 2000: 77499.109375
Minibatch accuracy: 64.1%
Validation accuracy: 61.3%
Minibatch loss at step 2500: 119174.976562
Minibatch accuracy: 57.0%
Validation accuracy: 63.0%


KeyboardInterrupt: 

In [44]:
log_regression_reg = Classifier(nx, ny, reg_lambda = 0.5, name = "log_regression_reg")
log_regression_reg.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % log_regression_reg.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % log_regression_reg.score(test_dataset, test_labels))
print("***********************************************")

Initialized!
***********************************************
Minibatch loss at step 500: 10.195614
Minibatch accuracy: 61.7%
Minibatch loss at step 1000: 8.546135
Minibatch accuracy: 68.8%
Minibatch loss at step 1500: 5.837636
Minibatch accuracy: 78.1%
***********************************************
Minibatch loss at step 2000: 5.418159
Minibatch accuracy: 77.3%
Minibatch loss at step 2500: 4.613855
Minibatch accuracy: 75.0%
Minibatch loss at step 3000: 3.016995
Minibatch accuracy: 79.7%
***********************************************
Validation accuracy: 77.2%
Test accuracy: 84.5%
***********************************************


In [26]:
one_layer_nn_reg = Classifier(nx, ny, layers = [1024], reg_lambda = 0.5, name = "one_layer_nn_reg",
                             print_loss_interval = 100, epochs = 5)
one_layer_nn_reg.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % one_layer_nn_reg.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % one_layer_nn_reg.score(test_dataset, test_labels))
print("***********************************************")

Initialized!
***********************************************
Minibatch loss at step 100: 713.626343
Minibatch accuracy: 78.1%
Validation accuracy: 72.4%
Minibatch loss at step 200: 553.274231
Minibatch accuracy: 76.6%
Validation accuracy: 74.9%
Minibatch loss at step 300: 443.774170
Minibatch accuracy: 81.2%
Validation accuracy: 78.7%
Minibatch loss at step 400: 360.493225
Minibatch accuracy: 85.2%
Validation accuracy: 81.2%
Minibatch loss at step 500: 295.724182
Minibatch accuracy: 85.2%
Validation accuracy: 81.4%
Minibatch loss at step 600: 245.270111
Minibatch accuracy: 82.8%
Validation accuracy: 81.2%
Minibatch loss at step 700: 198.270386
Minibatch accuracy: 81.2%
Validation accuracy: 81.9%
Minibatch loss at step 800: 165.640488
Minibatch accuracy: 77.3%
Validation accuracy: 80.1%
Minibatch loss at step 900: 134.485794
Minibatch accuracy: 82.0%
Validation accuracy: 82.9%
Minibatch loss at step 1000: 109.733940
Minibatch accuracy: 84.4%
Validation accuracy: 82.1%
Minibatch loss at 

---
Problem 2
---------
Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

---

In [57]:
restricted_train_dataset = train_dataset[:5000]
restricted_train_labels = train_labels[:5000]
overfit_one_layer_nn = Classifier(nx, ny, layers = [1024], reg_lambda = 0, epochs = 5,
                                  print_loss_interval = 40, name = "overfit_one_layer_nn")
overfit_one_layer_nn.train(restricted_train_dataset, restricted_train_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % overfit_one_layer_nn.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % overfit_one_layer_nn.score(test_dataset, test_labels))
print("***********************************************")

Initialized!
***********************************************
Minibatch loss at step 40: 41.842270
Minibatch accuracy: 62.5%
***********************************************
Minibatch loss at step 80: 0.000000
Minibatch accuracy: 100.0%
***********************************************
Minibatch loss at step 120: 44.864929
Minibatch accuracy: 50.0%
***********************************************
Minibatch loss at step 160: 47.206039
Minibatch accuracy: 87.5%
***********************************************
Minibatch loss at step 200: 3.977848
Minibatch accuracy: 87.5%
***********************************************
Validation accuracy: 69.3%
Test accuracy: 76.2%
***********************************************


---
Problem 3
---------
Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides `nn.dropout()` for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

---

In [68]:
one_layer_nn_dropout = Classifier(nx, ny, layers = [1024], reg_lambda = 0.5, 
                                  epochs = 2, name = "one_layer_nn_dropout3")
one_layer_nn_dropout.train(train_dataset, train_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % one_layer_nn_dropout.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % one_layer_nn_dropout.score(test_dataset, test_labels))
print("***********************************************")

Initialized!
***********************************************
Minibatch loss at step 500: 575.905273
Minibatch accuracy: 85.2%
Minibatch loss at step 1000: 515.922791
Minibatch accuracy: 76.6%
Minibatch loss at step 1500: 446.372192
Minibatch accuracy: 79.7%
***********************************************
Minibatch loss at step 2000: 394.030212
Minibatch accuracy: 78.9%
Minibatch loss at step 2500: 335.208923
Minibatch accuracy: 78.9%
Minibatch loss at step 3000: 290.856323
Minibatch accuracy: 84.4%
***********************************************
Validation accuracy: 83.4%
Test accuracy: 89.8%
***********************************************


---
Problem 4
---------

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is [97.1%](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html?showComment=1391023266211#c8758720086795711595).

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
 
 ---


In [40]:
best_nn = Classifier(nx, ny, layers = [1024, 256], reg_lambda = 0.5, batch_size = 128, learning_rate = 0.001,
                     print_loss_interval = 100, epochs = 10, name = "best_nn")
#best_nn.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % best_nn.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % best_nn.score(test_dataset, test_labels))
print("***********************************************")

***********************************************
INFO:tensorflow:Restoring parameters from ./models/best_nn.model
Validation accuracy: 87.9%
INFO:tensorflow:Restoring parameters from ./models/best_nn.model
Test accuracy: 94.3%
***********************************************


In [41]:
best_nn2 = Classifier(nx, ny, layers = [1024, 256], reg_lambda = 0.5, batch_size = 128, learning_rate = 0.001,
                      learn_decay_rate = 0.6, learn_decay_steps = 2000, print_loss_interval = 100, 
                      epochs = 15, name = "best_nn2")
#best_nn2.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % best_nn2.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % best_nn2.score(test_dataset, test_labels))
print("***********************************************")

***********************************************
INFO:tensorflow:Restoring parameters from ./models/best_nn2.model
Validation accuracy: 89.4%
INFO:tensorflow:Restoring parameters from ./models/best_nn2.model
Test accuracy: 95.2%
***********************************************


In [42]:
best_nn3 = Classifier(nx, ny, layers = [1024, 256], reg_lambda = 0.5, batch_size = 128, learning_rate = 0.001,
                      learn_decay_rate = 0.75, learn_decay_steps = 2000, print_loss_interval = 1000, 
                      epochs = 15, name = "best_nn3")
#best_nn3.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % best_nn3.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % best_nn3.score(test_dataset, test_labels))
print("***********************************************")

***********************************************
INFO:tensorflow:Restoring parameters from ./models/best_nn3.model
Validation accuracy: 89.6%
INFO:tensorflow:Restoring parameters from ./models/best_nn3.model
Test accuracy: 95.3%
***********************************************


In [45]:
best_nn4 = Classifier(nx, ny, layers = [1024, 256], reg_lambda = 0.5, batch_size = 128, learning_rate = 0.001,
                      learn_decay_rate = 0.8, learn_decay_steps = 3000, print_loss_interval = 1000, 
                      epochs = 15, name = "best_nn4")
#best_nn4.train(train_dataset, train_labels, valid_dataset, valid_labels)
print("***********************************************")
print("Validation accuracy: %.1f%%" % best_nn4.score(valid_dataset, valid_labels))
print("Test accuracy: %.1f%%" % best_nn4.score(test_dataset, test_labels))
print("***********************************************")

***********************************************
INFO:tensorflow:Restoring parameters from ./models/best_nn4.model
Validation accuracy: 89.0%
INFO:tensorflow:Restoring parameters from ./models/best_nn4.model
Test accuracy: 95.1%
***********************************************
