# In [1]:
# """
# network2.py
# ~~~~~~~~~~
#
# Code directly from Michael Nielsen - "Neural Networks and Deep Learning"
# This is the module to implement stochastic gradient descent. 
# """

# In [2]:
#### Libraries
# Standard library
import random

# Third-party libraries
import numpy as np

# In [3]:
class Network(object):
    """Fully connected feed-forward network trained with mini-batch SGD.

    Hidden layers use the ReLU activation, the output layer is linear
    (identity activation) and the cost is the quadratic cost
    ``0.5 * (output - y) ** 2``.

    Inputs are treated as column vectors; scalar and 1-D inputs are
    reshaped into a single column before being fed through the network.
    """

    def __init__(self, sizes):
        """Create a network with randomly initialised weights and biases.

        ``sizes`` lists the number of neurons in each layer, e.g.
        ``[2, 3, 1]`` is a three-layer network with 2 input neurons.
        The first layer is the input layer and therefore has no biases.
        All parameters are drawn from a standard normal distribution.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # One (y, 1) bias column per non-input layer.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # Weight matrix shape is (neurons in next layer, neurons in previous
        # layer); sizes[:-1] are the "previous" layers, sizes[1:] the "next".
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    @staticmethod
    def _as_column(a):
        # Turn scalar / 1-D input into a column; 2-D input is left untouched.
        a = np.asarray(a)
        if a.ndim < 2:
            a = a.reshape(-1, 1)
        return a

    @staticmethod
    def _relu(z):
        # Element-wise ReLU: z where z > 0, otherwise 0.0.
        return np.where(z > 0, z, 0.0)

    @staticmethod
    def _relu_prime(z):
        # Element-wise ReLU derivative: 1.0 where z > 0, otherwise 0.0.
        return np.where(z > 0, 1.0, 0.0)

    def _mean_cost(self, data):
        # Average quadratic cost of the network over (x, y) pairs in `data`.
        total = sum(0.5 * (self.feedforwardEvaluate(x) - y) ** 2
                    for (x, y) in data)
        return total / len(data)

    def feedforward(self, a):
        """Run input ``a`` through the network, recording every layer.

        Returns ``(activations, zs)`` where ``activations[0]`` is the input
        itself (as a column), ``activations[i]`` is the output of layer
        ``i`` and ``zs[i]`` is the weighted input of layer ``i + 1``.
        The last layer is linear, so its activation equals its ``z``.
        """
        a = self._as_column(a)
        zs = []
        # Store the input as-is: wrapping it in another array would add a
        # leading axis and break the first-layer gradient in backprop.
        activations = [a]
        last = self.num_layers - 2
        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = np.dot(w, a) + b
            zs.append(z)
            if i < last:
                a = self._relu(z)
                activations.append(a)
            else:
                # Output layer: identity activation.
                activations.append(z)
        return activations, zs

    def feedforwardEvaluate(self, a):
        """Return only the network output for input ``a``."""
        a = self._as_column(a)
        last = self.num_layers - 2
        for i, (w, b) in enumerate(zip(self.weights, self.biases)):
            z = np.dot(w, a) + b
            if i < last:
                a = self._relu(z)
        return z

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network with mini-batch stochastic gradient descent.

        ``training_data`` is an iterable of ``(x, y)`` pairs, ``eta`` is the
        learning rate. After every epoch the in-sample cost is printed; if
        ``test_data`` is provided (and non-empty) the out-of-sample cost is
        printed as well.

        The data is copied into private lists first, so the caller's
        sequences are not reordered by the per-epoch shuffle.
        """
        training_data = list(training_data)
        if test_data is not None:
            test_data = list(test_data)
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [
                training_data[k:k + mini_batch_size]
                for k in range(0, n, mini_batch_size)
            ]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: Insample: {1}, Outsample {2}".format(
                    j, self.in_sample_cost(training_data),
                    self.evaluate(test_data)))
            else:
                print("Epoch {0} complete: Insample: {1}".format(
                    j, self.in_sample_cost(training_data)))

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient-descent step computed from ``mini_batch``.

        The per-example gradients from ``backprop`` are summed and the
        parameters are moved by ``eta / len(mini_batch)`` times that sum.
        """
        step = eta / len(mini_batch)
        grad_b_sum = [np.zeros(b.shape) for b in self.biases]
        grad_w_sum = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            grad_b, grad_w = self.backprop(x, y)
            grad_b_sum = [s + g for s, g in zip(grad_b_sum, grad_b)]
            grad_w_sum = [s + g for s, g in zip(grad_w_sum, grad_w)]
        self.weights = [w - step * g
                        for w, g in zip(self.weights, grad_w_sum)]
        self.biases = [b - step * g
                       for b, g in zip(self.biases, grad_b_sum)]

    def backprop(self, x, y):
        """Return ``(delta_b, delta_w)``, the cost gradient for one example.

        ``x`` is the training input and ``y`` the desired output. Both
        returned lists are layer-by-layer and shaped like ``self.biases``
        and ``self.weights`` respectively.
        """
        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]
        # Forward pass, keeping every intermediate value.
        activations, zs = self.feedforward(x)
        # Backward pass. The output activation is the identity, so the
        # output-layer delta is the cost derivative times 1.
        delta = self.cost_derivative(activations[-1], y) * 1.0
        delta_b[-1] = delta
        delta_w[-1] = np.dot(delta, activations[-2].transpose())
        # l = 2 is the second-to-last layer, l = 3 the one before it, ...
        for l in range(2, self.num_layers):
            sp = self._relu_prime(zs[-l])
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            delta_b[-l] = delta
            delta_w[-l] = np.dot(delta, activations[-l - 1].transpose())
        return delta_b, delta_w

    def evaluate(self, test_data):
        """Return the mean quadratic cost over ``test_data``.

        ``test_data`` is a sized collection of ``(x, y)`` pairs. The result
        has the shape of the network output (one cost per output unit).
        """
        return self._mean_cost(test_data)

    def in_sample_cost(self, training_data):
        """Return the mean quadratic cost over ``training_data``."""
        return self._mean_cost(training_data)

    def cost_derivative(self, output_activations, y):
        """Return the derivative of ``0.5 * (output - y) ** 2`` w.r.t. output."""
        return (output_activations - y)

#### Misc. functions
def ReLu(z):
    """Rectified linear unit for a single number: ``z`` if ``z > 0`` else 0.0."""
    if z > 0:
        return z
    return 0.0


# Element-wise version of ReLu for arrays. The output type is pinned to float
# so the result dtype does not depend on the first element and so that empty
# arrays are accepted (np.vectorize cannot infer a dtype from zero elements).
vectorize_ReLu = np.vectorize(ReLu, otypes=[float])

def ReLu_prime(z):
    """Derivative of ReLu for a single number: 1.0 if ``z > 0`` else 0.0."""
    if z > 0:
        return 1.0
    return 0.0


# Element-wise version of ReLu_prime for arrays. The output type is pinned to
# float so that empty arrays are accepted (np.vectorize cannot infer a dtype
# from zero elements).
vectorize_ReLu_prime = np.vectorize(ReLu_prime, otypes=[float])