In [1]:
# """
# network.py
# ~~~~~~~~~~
#
# Code directly from Michael Nielsen - "Neural Networks and Deep Learning"
# This is the module to implement stochastic gradient descent. 
# """

In [2]:
#### Libraries
# Standard library
import random

# Third-party libraries
import numpy as np

In [3]:
class Network(object):
    """Feedforward network trained with mini-batch stochastic gradient descent.

    Every layer except the last applies the module-level ``sigmoid``
    activation; the output layer is left linear. Training minimises the
    quadratic cost ``0.5 * (output - y) ** 2``.
    """

    def __init__(self, sizes, verbose=False):
        """Create a network with randomly initialised weights and biases.

        Args:
            sizes: Number of neurons per layer, e.g. ``[2, 3, 1]`` for three
                layers. The first entry is the input layer, which has no
                biases of its own.
            verbose: When true, print the initial weights, biases and layer
                count. Defaults to False so that building a network does not
                dump every parameter to stdout.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # One (y, 1) bias column per non-input layer, drawn from N(0, 1).
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # Weight matrix shape is (neurons in next layer, neurons in previous
        # layer); sizes[:-1] supplies the "previous layer" counts.
        self.weights = [
            np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])
        ]
        if verbose:
            print(self.weights)
            print(self.biases)
            print(self.num_layers)

    def _forward(self, a):
        """Run one forward pass, keeping every intermediate value.

        Returns:
            A ``(zs, activations)`` tuple: ``zs`` holds the weighted input of
            each non-input layer and ``activations`` holds the input followed
            by each layer's output.
        """
        activations = [a]
        zs = []
        # Layers are numbered from 1 (the input), so the first weight matrix
        # feeds layer 2.
        layers = enumerate(zip(self.biases, self.weights), start=2)
        for layer, (b, w) in layers:
            # For 2-D arrays np.dot is an ordinary matrix product.
            z = np.dot(w, a) + b
            zs.append(z)
            # The output layer is deliberately left without an activation.
            a = z if layer == self.num_layers else sigmoid(z)
            activations.append(a)
        return zs, activations

    def feedforward(self, a):
        """Return the output of the network given ``a`` as the input."""
        return self._forward(a)[1][-1]

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network with mini-batch stochastic gradient descent.

        The in-sample cost is printed after every epoch; when non-empty
        ``test_data`` is supplied its cost is printed as well.

        Args:
            training_data: Iterable of ``(x, y)`` pairs. It is copied into a
                private list, so the caller's container is never reordered
                and tuples or one-shot iterables are accepted.
            epochs: Number of passes over the training data.
            mini_batch_size: Number of samples per gradient step.
            eta: Learning rate.
            test_data: Optional iterable of ``(x, y)`` pairs evaluated after
                each epoch.

        Raises:
            ValueError: If ``mini_batch_size`` is smaller than 1.
        """
        if mini_batch_size < 1:
            raise ValueError("mini_batch_size must be a positive integer")
        # Shuffle a private copy rather than mutating the caller's data.
        training_data = list(training_data)
        test_data = list(test_data) if test_data is not None else []
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            for start in range(0, n, mini_batch_size):
                self.update_mini_batch(
                    training_data[start:start + mini_batch_size], eta)
            if test_data:
                print("Epoch {0}: Insample: {1}, Outsample {2}".format(
                    j, self.in_sample_cost(training_data),
                    self.evaluate(test_data)))
            else:
                print("Epoch {0} complete: Insample: {1}".format(
                    j, self.in_sample_cost(training_data)))

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient-descent step averaged over ``mini_batch``.

        Args:
            mini_batch: Sequence of ``(x, y)`` training pairs.
            eta: Learning rate.

        An empty mini-batch is a no-op (it used to divide by zero).
        """
        batch_size = len(mini_batch)
        if batch_size == 0:
            return
        grad_b_sum = [np.zeros(b.shape) for b in self.biases]
        grad_w_sum = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            grad_b, grad_w = self.backprop(x, y)
            grad_b_sum = [acc + g for acc, g in zip(grad_b_sum, grad_b)]
            grad_w_sum = [acc + g for acc, g in zip(grad_w_sum, grad_w)]
        step = eta / batch_size
        self.weights = [w - step * gw
                        for w, gw in zip(self.weights, grad_w_sum)]
        self.biases = [b - step * gb
                       for b, gb in zip(self.biases, grad_b_sum)]

    def backprop(self, x, y):
        """Compute the cost gradient for a single training pair.

        Args:
            x: Network input from the training set.
            y: Desired output for ``x``.

        Returns:
            A ``(delta_b, delta_w)`` tuple of per-layer lists laid out like
            ``self.biases`` and ``self.weights``.
        """
        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]
        # Forward pass: backprop needs every layer's z and activation, not
        # just the final output.
        zs, activations = self._forward(x)
        # Backward pass. The output layer is linear, so its error is the cost
        # derivative itself with no activation-derivative factor.
        delta = self.cost_derivative(activations[-1], y)
        delta_b[-1] = delta
        delta_w[-1] = np.dot(delta, activations[-2].transpose())
        # l counts layers from the end: l = 2 is the second-to-last layer.
        for l in range(2, self.num_layers):
            sp = sigmoid_prime(zs[-l])
            delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
            delta_b[-l] = delta
            delta_w[-l] = np.dot(delta, activations[-l - 1].transpose())
        return (delta_b, delta_w)

    def _mean_cost(self, data):
        """Return the quadratic cost averaged over the ``(x, y)`` pairs.

        The cost is computed element-wise on the network output. ``data``
        must support ``len``; an empty ``data`` raises ZeroDivisionError.
        """
        total = sum(0.5 * (self.feedforward(x) - y) ** 2 for x, y in data)
        return total / len(data)

    def evaluate(self, test_data):
        """Return the mean quadratic cost of the network on ``test_data``."""
        return self._mean_cost(test_data)

    def in_sample_cost(self, training_data):
        """Return the mean quadratic cost on ``training_data``.

        This is the in-sample loss reported by ``SGD`` after each epoch.
        """
        return self._mean_cost(training_data)

    def cost_derivative(self, output_activations, y):
        """Return the derivative of the cost with respect to the output.

        With cost = (1/2) * (output - y)^2 the derivative is simply the
        difference between the output and the target.
        """
        return (output_activations - y)

#### Misc. functions
def sigmoid(z):
    """Activation applied to the non-output layers: currently the identity.

    The logistic form ``1.0 / (1.0 + np.exp(-z))`` is disabled, so ``z`` is
    handed back unchanged and the layers using this activation stay linear.
    """
    return z

def sigmoid_prime(z):
    """Derivative of the activation: the constant 1.0.

    The logistic derivative ``sigmoid(z) * (1 - sigmoid(z))`` is disabled.
    The scalar 1.0 is returned whatever ``z`` is, which is the derivative of
    an identity activation.
    """
    return 1.0