## Implement a neural network that can distinguish MNIST handwritten digits

Based on Michael Nielsen's book *Neural Networks and Deep Learning*: http://neuralnetworksanddeeplearning.com/chap1.html

In [63]:
import numpy as np
import matplotlib.pyplot as plt

import pickle
import gzip
import random

In [64]:
# Load MNIST data set
def load_data(file_loc):
    """Read the gzip-compressed pickle at ``file_loc`` and return its three parts.

    The pickled object must unpack into exactly three items; they are
    returned as the tuple ``(training_data, validation_data, test_data)``.

    NOTE: ``pickle`` can run arbitrary code while loading, so only point this
    at files that come from a trusted source.
    """
    with gzip.open(file_loc, mode='rb') as archive:
        splits = pickle.load(archive, encoding='latin1')
    train_split, validation_split, test_split = splits
    return train_split, validation_split, test_split

In [65]:
def vectorize_result(y, num_classes=10):
    """Return a one-hot column vector with a 1.0 at row ``y``.

    Args:
        y: Index of the row to set to 1.0.
        num_classes: Number of rows in the vector. Defaults to 10, which
            matches the previous hard-coded size.

    Returns:
        A ``(num_classes, 1)`` float array of zeros with 1.0 at row ``y``.
    """
    one_hot = np.zeros((num_classes, 1))
    one_hot[y] = 1.0
    return one_hot

In [66]:
def load_mnist(file_loc):
    """Load the dataset at ``file_loc`` and repackage it as lists of pairs.

    Every input is reshaped to a ``(784, 1)`` column. Training labels are
    passed through ``vectorize_result``; validation and test labels are kept
    exactly as loaded.

    Returns:
        ``(training_data, validation_data, test_data)``, each a list of
        ``(input, label)`` tuples.
    """
    raw_train, raw_val, raw_test = load_data(file_loc)

    def to_columns(inputs):
        # One (784, 1) column array per input.
        return [np.reshape(sample, (784, 1)) for sample in inputs]

    train_columns = to_columns(raw_train[0])
    train_targets = [vectorize_result(label) for label in raw_train[1]]
    training_pairs = list(zip(train_columns, train_targets))

    validation_pairs = list(zip(to_columns(raw_val[0]), raw_val[1]))
    test_pairs = list(zip(to_columns(raw_test[0]), raw_test[1]))

    return (training_pairs, validation_pairs, test_pairs)

In [67]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied element-wise."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Return the derivative of ``sigmoid`` at ``z``: sigmoid(z) * (1 - sigmoid(z))."""
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(z)
    return s * (1 - s)

In [80]:
class Network:
    """Feed-forward network of sigmoid layers trained with mini-batch SGD.

    Attributes set by ``__init__``:
        sizes: the list of layer sizes that was passed in.
        num_layers: ``len(sizes)``.
        weights: one ``(sizes[i + 1], sizes[i])`` array per pair of adjacent
            layers, drawn from ``np.random.randn``.
        biases: one ``(sizes[i + 1], 1)`` array per non-input layer, drawn
            from ``np.random.randn``.
    """

    def __init__(self, sizes):
        """Create randomly initialised weights and biases for ``sizes``."""
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]

    def feed_fwd(self, a):
        """Return the network output for input ``a``.

        Each layer computes ``sigmoid(w . a + b)`` and feeds the result to
        the next layer.
        """
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a) + b)
        return a

    def evaluate(self, test_data):
        """Count the ``(x, y)`` pairs whose predicted class equals ``y``.

        The predicted class is the index of the largest output activation.
        """
        test_results = [(np.argmax(self.feed_fwd(x)), y) for (x, y) in test_data]
        return sum(int(predicted == expected) for (predicted, expected) in test_results)

    def cost_derivative(self, a, y):
        """Return ``a - y``, the derivative of the quadratic cost
        ``0.5 * ||a - y||**2`` with respect to the output activations ``a``.
        """
        return a - y

    def SGD(self, train_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network with mini-batch stochastic gradient descent.

        Args:
            train_data: list of ``(x, y)`` pairs. It is shuffled IN PLACE at
                the start of every epoch.
            epochs: number of passes over ``train_data``.
            mini_batch_size: number of examples per gradient step; the last
                batch of an epoch may be smaller.
            eta: learning rate.
            test_data: optional list of ``(x, label)`` pairs. When truthy,
                the number of correct predictions is printed after each
                epoch.
        """
        if test_data:
            n_test = len(test_data)
        n = len(train_data)
        for j in range(epochs):
            # Shuffle the data that was actually passed in. The previous code
            # shuffled a global named `training_data`, which either raised
            # NameError or silently left `train_data` in the same order.
            random.shuffle(train_data)
            mini_batches = [
                train_data[k: k + mini_batch_size]
                for k in range(0, n, mini_batch_size)
            ]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print(f"Epoch {j}: {self.evaluate(test_data)}, {n_test}")
            else:
                print(f"Epoch {j}")

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient-descent step using the examples in ``mini_batch``.

        The per-example gradients from ``back_prop`` are summed, and the
        weights and biases are moved by ``eta / len(mini_batch)`` times that
        sum.
        """
        m = len(mini_batch)
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        for x, y in mini_batch:
            # Gradient contributed by this single training example.
            delta_nabla_w, delta_nabla_b = self.back_prop(x, y)

            # Accumulate the contributions of all examples in the mini-batch.
            nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
            nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
        self.weights = [w - (eta / m) * nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - (eta / m) * nb for b, nb in zip(self.biases, nabla_b)]

    def back_prop(self, x, y):
        """Return ``(nabla_w, nabla_b)``, the cost gradient for one example.

        Both are lists of arrays with the same shapes as ``self.weights`` and
        ``self.biases`` respectively.
        """
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        nabla_b = [np.zeros(b.shape) for b in self.biases]

        # Forward pass: keep every weighted input z and every activation.
        activations = [x]
        zs = []
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, activations[-1]) + b
            zs.append(z)
            activations.append(sigmoid(z))

        # Backward pass: error of the output layer first.
        delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        nabla_b[-1] = delta

        # `layer` counts from the end: 2 is the second-to-last layer, and so
        # on, so negative indexing walks the network backwards.
        for layer in range(2, self.num_layers):
            sp = sigmoid_prime(zs[-layer])
            delta = np.dot(self.weights[-layer + 1].transpose(), delta) * sp
            nabla_w[-layer] = np.dot(delta, activations[-layer - 1].transpose())
            nabla_b[-layer] = delta
        return (nabla_w, nabla_b)

In [81]:
# Load the data, build a 784-30-10 network and train it for 30 epochs with
# mini-batches of 10 and learning rate 3.0, printing the test score per epoch.
train_data, val_data, test_data = load_mnist('../data/mnist.pkl.gz')
net = Network(sizes=[784, 30, 10])
net.SGD(
    train_data,
    epochs=30,
    mini_batch_size=10,
    eta=3.0,
    test_data=test_data,
)

Epoch 0: 9068, 10000
Epoch 1: 9214, 10000
Epoch 2: 9260, 10000
Epoch 3: 9274, 10000
Epoch 4: 9346, 10000
Epoch 5: 9340, 10000
Epoch 6: 9378, 10000
Epoch 7: 9383, 10000
Epoch 8: 9382, 10000
Epoch 9: 9401, 10000
Epoch 10: 9409, 10000
Epoch 11: 9434, 10000
Epoch 12: 9419, 10000
Epoch 13: 9398, 10000
Epoch 14: 9430, 10000
Epoch 15: 9436, 10000
Epoch 16: 9427, 10000
Epoch 17: 9423, 10000
Epoch 18: 9408, 10000
Epoch 19: 9433, 10000
Epoch 20: 9447, 10000
Epoch 21: 9434, 10000
Epoch 22: 9469, 10000
Epoch 23: 9460, 10000
Epoch 24: 9457, 10000
Epoch 25: 9450, 10000
Epoch 26: 9463, 10000
Epoch 27: 9453, 10000
Epoch 28: 9445, 10000
Epoch 29: 9453, 10000
