# Neural network - Simple
This module implements the stochastic gradient descent learning algorithm for a feedforward neural network. Gradients are calculated using backpropagation. The code is deliberately kept simple, easily readable, and easily modifiable; it is not optimized and omits many desirable features.
## Feed forward

## Backpropagation

In [1]:
import random
import numpy as np
import tensorflow as tf

In [2]:
def vectorize(y, num_classes=10):
    """
    Return a one-hot column vector encoding the class index @y.

    @y: integer class label used as a row index into the result
    @num_classes: number of rows of the result (defaults to 10, the
        number of digit classes used in this file)

    The result is a float array of shape (@num_classes, 1) that is 0.0
    everywhere except for a 1.0 at row @y.
    """
    res = np.zeros((num_classes, 1))
    res[y] = 1.0
    return res

# Load the MNIST arrays through Keras and divide the pixel values by 255.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

# Training set: every image becomes a (784, 1) column vector and every
# label a one-hot column vector, paired up as (input, target) tuples.
x_train = [image.reshape(784, 1) for image in x_train]
y_train = [vectorize(label) for label in y_train]
data_train = list(zip(x_train, y_train))

# Test set: images are reshaped the same way, but the labels are kept
# as-is so they can be compared directly against an argmax prediction.
x_test = [image.reshape(784, 1) for image in x_test]
data_test = list(zip(x_test, y_test))

In [3]:
def sigmoid(z):
    """
    Sigmoid function: 1 / (1 + exp(-z)), applied element-wise.

    Evaluated as exp(-log(1 + exp(-z))) with np.logaddexp computing the
    inner logarithm, so large negative @z does not overflow inside
    np.exp (the direct formula triggers a RuntimeWarning there).
    """
    return np.exp(-np.logaddexp(0.0, -z))

def sigmoid_prime(z):
    """Derivative of sigmoid function: sigmoid(z) * (1 - sigmoid(z))"""
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(z)
    return s * (1 - s)

In [6]:
class Network:
    """
    Feedforward neural network with sigmoid activations, trained with
    mini-batch stochastic gradient descent on the quadratic cost.
    """

    def __init__(self, shape):
        """
        @shape: number of neurons in each layer, input layer first,
            ex: [3, 2, 1]

        Attributes set here:
            @n: number of layers (length of @shape)
            @B: list of bias column vectors, one per non-input layer,
                each of shape (y, 1)
            @W: list of weight matrices, one per pair of consecutive
                layers, each of shape (y, x) for layer sizes x -> y

        Biases and weights are drawn with np.random.randn.
        """
        self.shape = shape
        self.n = len(shape)
        self.B = [np.random.randn(y, 1) for y in shape[1:]]
        self.W = [np.random.randn(y, x) for x, y in zip(shape[:-1], shape[1:])]

    def feedforward(self, a):
        """
        Return the output of the network if @a is input
        """
        for w, b in zip(self.W, self.B):
            a = sigmoid(np.dot(w, a) + b)

        return a

    def backprop(self, x, y):
        """
        Return a tuple (delta_W, delta_B) representing the gradient
        for the cost function C_x

        @x: network input
        @y: desired output for @x

        @delta_W and @delta_B are lists of arrays laid out layer by
        layer like @W and @B.
        """
        delta_W = [np.zeros(w.shape) for w in self.W]
        delta_B = [np.zeros(b.shape) for b in self.B]

        # Feed forward, remembering every weighted input z and every
        # activation a; the backward pass needs both.
        a = x
        a_lst = [x]
        z_lst = []
        for w, b in zip(self.W, self.B):
            z = np.dot(w, a) + b
            z_lst.append(z)
            a = sigmoid(z)
            a_lst.append(a)

        # Backward pass - Quadratic cost
        delta = (a_lst[-1] - y) * sigmoid_prime(z_lst[-1])
        delta_W[-1] = np.dot(delta, a_lst[-2].transpose())
        delta_B[-1] = delta

        # Negative indices count layers from the output backwards:
        # layer == 2 is the last hidden layer, and so on.
        for layer in range(2, self.n):
            sp = sigmoid_prime(z_lst[-layer])
            delta = np.dot(self.W[-layer + 1].transpose(), delta) * sp
            delta_W[-layer] = np.dot(delta, a_lst[-layer - 1].transpose())
            delta_B[-layer] = delta

        return (delta_W, delta_B)

    def update(self, batch, eta):
        """
        Update network's weights and biases:
            1. Loop through each (x, y) ~ 1 example in the batch:
                1.1. Do 1 backprop
                1.2. Accumulate @delta_W and @delta_B
            2. Apply the update to network weights and biases

        @batch: sequence of (x, y) training examples
        @eta: learning rate

        An empty @batch leaves the network unchanged.
        """
        n_batch = len(batch)
        if n_batch == 0:
            # Averaging over zero examples would compute 0 / 0 and turn
            # every weight and bias into NaN.
            return

        delta_W_sum = [np.zeros(w.shape) for w in self.W]
        delta_B_sum = [np.zeros(b.shape) for b in self.B]

        for x, y in batch:
            delta_W, delta_B = self.backprop(x, y)
            delta_W_sum = [dw + dws for dw, dws in zip(delta_W, delta_W_sum)]
            delta_B_sum = [db + dbs for db, dbs in zip(delta_B, delta_B_sum)]

        # Step against the gradient averaged over the batch.
        self.W = [w - eta * dws / n_batch
                  for w, dws in zip(self.W, delta_W_sum)]
        self.B = [b - eta * dbs / n_batch
                  for b, dbs in zip(self.B, delta_B_sum)]

    def SGD(self, data_train, epochs, batch_size, eta, data_test=None):
        """
        Stochastic gradient descent:
            1. Loop through each epoch:
                1.1. Randomly shuffle the @data_train,
                    then partitions it into batches.
                1.2. Loop through each batch to apply a single
                    step of gradient descent to update the
                    network weights and biases.

        @data_train: iterable of (x, y) training examples
        @epochs: number of passes over @data_train
        @batch_size: number of examples per batch, must be >= 1
        @eta: learning rate
        @data_test: optional iterable of (x, label) examples; when it
            is given and non-empty, the number of correct predictions
            is printed after every epoch

        Raises ValueError if @batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        # Work on private copies: shuffling must not reorder the
        # caller's list, and a copy lets one-shot iterables such as
        # zip(...) be used (len() and repeated passes need a list).
        data_train = list(data_train)
        if data_test is not None:
            data_test = list(data_test)

        n = len(data_train)

        for i in range(epochs):
            random.shuffle(data_train)
            batches = [
                data_train[k: k + batch_size]
                for k in range(0, n, batch_size)]

            for batch in batches:
                self.update(batch, eta)

            if data_test:
                print(f'Epoch {i}: {self.evaluate(data_test)} / {len(data_test)}')
            else:
                print(f"Epoch {i} completed")

    def evaluate(self, data_test):
        """
        Return the number of examples in @data_test for which the index
        of the largest network output equals the label.

        @data_test: iterable of (x, label) pairs
        """
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in data_test]
        return sum(int(x == y) for (x, y) in test_results)

In [8]:
# 784 inputs, one hidden layer of 30 neurons, 10 outputs.
net = Network([784, 30, 10])
# Train, reporting the score on the test data after each epoch.
net.SGD(
    data_train,
    epochs=30,
    batch_size=10,
    eta=3.0,
    data_test=data_test,
)

Epoch 0: 8219 / 10000


KeyboardInterrupt: 