# DeepLearning 01. Feedforward Neural Network (FNN) from Scratch

* **Source**: Michael Nielsen's book on neural nets (http://neuralnetworksanddeeplearning.com/chap1.html).
* **Contribution**: 
    * Hopefully more readable code annotation.
    * Extension to batch-vectorized implementation.

## I. Implementation 1

* Cost: Quadratic
* Library: Numpy only

In [3]:
import numpy as np
import random, os, sys

In [4]:
def sigmoid(z):
    """
    Elementwise sigmoid conversion, computed in a numerically stable way.
    
    Arguments:
    z: Scalar or array computed from (weight * x + bias).
    
    Returns sigmoid converted values of the same shape as z.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it can never overflow; the naive
    # np.exp(-z) overflows (RuntimeWarning) for large negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same function.
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays untouched, so scalar input still yields a scalar.
    return result[()]

def sigmoid_prime(z):
    """
    Derivative of the sigmoid function, evaluated elementwise.
    
    Arguments:
    z: Vector computed from (weight * x + bias).
    
    Returns sigmoid(z) * (1 - sigmoid(z)), with the same shape as z.
    """
    activation = sigmoid(z)
    return activation * (1 - activation)

In [98]:
class NNNumpy:
    """
    Batch-vectorized implementation of Michael Nielsen's FNN network.

    Every layer applies a sigmoid activation and the parameters are trained
    with minibatch stochastic gradient descent on the quadratic cost.
    """

    def __init__(self, sizes):
        """
        Initialize parameters for FNN.

        Arguments:
        sizes: List of sizes of layers of FNN, input layer first.
        """
        self.numLayers = len(sizes)
        self.sizes = sizes
        # The input layer has no parameters: one bias column vector and one
        # (fan_out, fan_in) weight matrix per subsequent layer.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def forward_propagation(self, a):
        """
        Feedforward step, feed input through FNN to obtain output.

        Arguments:
        a: Input column vector (or matrix with one column per example).

        Returns output activations of the last layer.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def SGD(self, train_data, epochs, batchSize, lr, test=None):
        """
        Stochastic Gradient Descent with configurable batch size.

        Arguments:
        train_data: Iterable of training data points, each an (x, y) pair.
        epochs:     Number of epochs to be run.
        batchSize:  Size of minibatch.
        lr:         Learning rate.
        test:       Iterable of test data points, for validation. Because no tuning of hyperparams will be performed,
                    test data points are used for validation, monitoring performance improvement at each epoch.
        """
        # Work on private list copies: shuffling must not reorder the caller's
        # data, and len()/slicing below require a real sequence (not an iterator).
        train_data = list(train_data)
        if test is not None:
            test = list(test)
        nTest = len(test) if test else 0
        n = len(train_data)
        for j in range(epochs):
            random.shuffle(train_data)
            batches = [train_data[k:k+batchSize] for k in range(0, n, batchSize)]
            for batch in batches:
                self.update_batch(batch, lr)
            # Single-argument print(...) behaves identically on Python 2 and 3.
            if test:
                print("Epoch {0}: {1} / {2}".format(j, self.evaluate(test), nTest))
            else:
                print("Epoch {0} complete".format(j))

    def update_batch(self, batch, lr):
        """
        Update weights and biases for the layers of FNN with a minibatch.

        Arguments:
        batch: List of training data points in minibatch.
        lr:    Learning rate.
        """
        # Stack the per-example vectors side by side: one column per example.
        x_batch = np.hstack([x for x, y in batch])
        y_batch = np.hstack([y for x, y in batch])

        bGradDeltas, wGrads = self.back_propagation(x_batch, y_batch)

        # Bias gradients come back per example (one column each); sum them over
        # the batch. Weight gradients are already summed by the matrix product.
        bGrads = [bGradDelta.sum(axis=1).reshape(b.shape)
                  for bGradDelta, b in zip(bGradDeltas, self.biases)]

        # float() guards against Python 2 integer division, which would turn an
        # integer learning rate into a step size of 0.
        step = float(lr) / len(batch)
        self.biases = [b - step * bGrad for b, bGrad in zip(self.biases, bGrads)]
        self.weights = [w - step * wGrad for w, wGrad in zip(self.weights, wGrads)]

    def back_propagation(self, x_batch, y_batch):
        """
        Backpropagation step, propagates errors from output end.

        Arguments:
        x_batch: Matrix of training data inputs, one column per example.
        y_batch: Matrix of training data true classes (binarized), one column per example.

        Returns (bGrads, wGrads): per-layer bias gradients with one column per
        example (not summed over the batch), and per-layer weight gradients
        summed over the batch.
        """
        # Placeholders, filled from the output layer backwards.
        bGrads = [None] * len(self.biases)
        wGrads = [None] * len(self.weights)

        # Forward pass, remembering every weighted input z and activation.
        activation = x_batch
        activations = [x_batch]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # Error at the output layer.
        delta = self.cost_derivative(activations[-1], y_batch) * sigmoid_prime(zs[-1])
        bGrads[-1] = delta
        wGrads[-1] = np.dot(delta, activations[-2].transpose())

        # Walk backwards through the remaining layers; l counts from the end,
        # so l = 2 is the second-to-last layer.
        for l in range(2, self.numLayers):
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sigmoid_prime(zs[-l])
            bGrads[-l] = delta
            wGrads[-l] = np.dot(delta, activations[-l-1].transpose())

        return (bGrads, wGrads)

    def evaluate(self, test):
        """
        Evaluate function for test data as validation.

        Arguments:
        test: List of test data points (x, y); y is compared against the
              arg-max of the network output, so it is expected to be a class index.

        Returns the number of correctly classified test data points.
        """
        testResults = [(np.argmax(self.forward_propagation(x)), y) for (x, y) in test]
        return sum(int(predicted == actual) for (predicted, actual) in testResults)

    def cost_derivative(self, outputActivations, y):
        """
        Computes error on the output end.

        Arguments:
        outputActivations: Output of FNN.
        y:                 True classes (binarized).

        Returns error matrix (derivative of the quadratic cost w.r.t. the output activations).
        """
        return (outputActivations-y)


In [145]:
# LOAD DATA
# Loads MNIST through the `mnist_loader` module located in the directory below.

# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
path = "/Users/jacobsw/Desktop/CODER/IMPLEMENTATION_CAMP/BASIC_TOPICS/NN/DATA/neural-networks-and-deep-learning-master/src"
# Switch the working directory to `path`; presumably mnist_loader resolves its
# data file relative to the working directory -- TODO confirm.
os.chdir(path)
# Put the directory on the module search path so `import mnist_loader` resolves.
sys.path.append(path)
import mnist_loader
# Three datasets are returned; the training cell below uses train_data and test_data.
train_data, dev_data, test_data = mnist_loader.load_data_wrapper()

In [104]:
%%time
# Build a network with layer sizes 784-30-10 and train it for 30 epochs with
# minibatches of 10 examples and learning rate 3.0; test_data is evaluated
# after every epoch and the number of correct predictions is printed.
fnn = NNNumpy([784,30,10])
fnn.SGD(train_data,30,10,3.0,test=test_data)

Epoch 0: 8981 / 10000
Epoch 1: 9221 / 10000
Epoch 2: 9299 / 10000
Epoch 3: 9346 / 10000
Epoch 4: 9372 / 10000
Epoch 5: 9382 / 10000
Epoch 6: 9405 / 10000
Epoch 7: 9357 / 10000
Epoch 8: 9427 / 10000
Epoch 9: 9411 / 10000
Epoch 10: 9448 / 10000
Epoch 11: 9436 / 10000
Epoch 12: 9452 / 10000
Epoch 13: 9459 / 10000
Epoch 14: 9480 / 10000
Epoch 15: 9469 / 10000
Epoch 16: 9477 / 10000
Epoch 17: 9493 / 10000
Epoch 18: 9466 / 10000
Epoch 19: 9494 / 10000
Epoch 20: 9481 / 10000
Epoch 21: 9468 / 10000
Epoch 22: 9491 / 10000
Epoch 23: 9458 / 10000
Epoch 24: 9471 / 10000
Epoch 25: 9502 / 10000
Epoch 26: 9502 / 10000
Epoch 27: 9483 / 10000
Epoch 28: 9485 / 10000
Epoch 29: 9485 / 10000
CPU times: user 1min 10s, sys: 7.05 s, total: 1min 17s
Wall time: 1min 13s


## II. Implementation 2

* Cost: Quadratic
* Library: Tensorflow

In [105]:
import tensorflow as tf
from tensorflow import placeholder, Variable
from tensorflow.examples.tutorials.mnist import input_data

In [113]:
# LOAD DATA

# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
path = "/Users/jacobsw/Desktop/UNIV/SPRING_2017/LIN389C_RSCH_COMPLING/CODE/MNIST/"
# one_hot=True: labels are loaded as one-hot vectors rather than class indices.
mnist = input_data.read_data_sets(path, one_hot=True)

Extracting /Users/jacobsw/Desktop/UNIV/SPRING_2017/LIN389C_RSCH_COMPLING/CODE/MNIST/train-images-idx3-ubyte.gz
Extracting /Users/jacobsw/Desktop/UNIV/SPRING_2017/LIN389C_RSCH_COMPLING/CODE/MNIST/train-labels-idx1-ubyte.gz
Extracting /Users/jacobsw/Desktop/UNIV/SPRING_2017/LIN389C_RSCH_COMPLING/CODE/MNIST/t10k-images-idx3-ubyte.gz
Extracting /Users/jacobsw/Desktop/UNIV/SPRING_2017/LIN389C_RSCH_COMPLING/CODE/MNIST/t10k-labels-idx1-ubyte.gz


In [130]:
# SET CONFIGURATIONS

num_train = mnist.train.num_examples
# Number of minibatches per epoch for a batch size of 100 (must match the size
# passed to mnist.train.next_batch in the training loop). Floor division gives
# the same value as `/` on Python 2 ints and stays an integer on Python 3.
num_batchs = num_train // 100
num_epochs = 30

# Layer sizes read off the data: columns of the image and label matrices.
input_size = mnist.train.images.shape[1]
output_size = mnist.train.labels.shape[1]

In [128]:
# BUILD COMPUTATIONAL GRAPH
# Network: 784 inputs -> 30 sigmoid units -> 10 softmax outputs, trained by
# plain gradient descent on a quadratic (L2) cost.

# set sizes for input and output
x = placeholder(tf.float32, [None, 784]) # None: batch dimension left unspecified, so any number of examples can be fed.
y = placeholder(tf.float32, [None, 10])

# initialize weights and biases from a truncated normal distribution (stddev 0.1)
w1 = Variable(tf.truncated_normal([784, 30], stddev=.1))
b1 = Variable(tf.truncated_normal([30], stddev=.1))
w2 = Variable(tf.truncated_normal([30, 10], stddev=.1))
b2 = Variable(tf.truncated_normal([10], stddev=.1))

# link input to activations: sigmoid hidden layer, softmax output layer
a1 = tf.nn.sigmoid(tf.add(tf.matmul(x, w1), b1))
a2 = tf.nn.softmax(tf.add(tf.matmul(a1, w2), b2))

# set cost: tf.nn.l2_loss(t) is sum(t ** 2) / 2, i.e. summed (not averaged) over the batch
cost = tf.nn.l2_loss(a2 - y)

# set optimization option: one gradient descent step on `cost` per run of `train`
train = tf.train.GradientDescentOptimizer(learning_rate=.1).minimize(cost)

# set port for computing accuracy: fraction of rows whose arg-max prediction
# equals the arg-max of the label row
correct_pred = tf.equal(tf.argmax(a2,1), tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# TRAINING

with tf.Session() as sess: # tf.Session(): the environment under which tensorflow operates (must have).
    # initialize computational graph
    sess.run(tf.global_variables_initializer())
    for i in xrange(num_epochs):
        # one epoch = num_batchs minibatch updates of 100 examples each
        for _ in xrange(num_batchs):
            batch_xs, batch_ys = mnist.train.next_batch(100)
            sess.run(train, feed_dict={x: batch_xs, y: batch_ys})
        # NOTE: despite the name, valid_acc is measured on the test split (mnist.test).
        valid_acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
        print "Epoch", i, "accuracy:", valid_acc
    # Re-evaluates test accuracy once more; no training step runs after the last epoch's evaluation.
    print "Final accuracy:", sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})

Epoch 0 accuracy: 0.9305
Epoch 1 accuracy: 0.9459
Epoch 2 accuracy: 0.9571
Epoch 3 accuracy: 0.9606
Epoch 4 accuracy: 0.9618
Epoch 5 accuracy: 0.9602
Epoch 6 accuracy: 0.9577
Epoch 7 accuracy: 0.9638
Epoch 8 accuracy: 0.9631
Epoch 9 accuracy: 0.9633
Epoch 10 accuracy: 0.9609
Epoch 11 accuracy: 0.963
Epoch 12 accuracy: 0.967
Epoch 13 accuracy: 0.9653
Epoch 14 accuracy: 0.9625
Epoch 15 accuracy: 0.9635
Epoch 16 accuracy: 0.9629
Epoch 17 accuracy: 0.9632
Epoch 18 accuracy: 0.9647
Epoch 19 accuracy: 0.9656
Epoch 20 accuracy: 0.9639
Epoch 21 accuracy: 0.9633
Epoch 22 accuracy: 0.9638
Epoch 23 accuracy: 0.9654
Epoch 24 accuracy: 0.9604
Epoch 25 accuracy: 0.9654
Epoch 26 accuracy: 0.9629
Epoch 27 accuracy: 0.9639
Epoch 28 accuracy: 0.9658
Epoch 29 accuracy: 0.9653
Final accuracy: 0.9653
