# Tried implementing neural net learning based on 3blue1brown video and Nielssen book code
## author: me

In [1]:
#import some data
import mnist_loader
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

In [2]:
len(training_data), len(validation_data), len(test_data)

(50000, 10000, 10000)

In [120]:
red_training_data = training_data[:1000]
red_validation_data = validation_data[:1000]
red_test_data = test_data[:1000]

In [121]:
import numpy as np

In [122]:
testimage = training_data[0][0]
testimage2 = training_data[1][0]

In [196]:
#create activation function
def ReLU(x):
    return max(0,x)
def sigmoid(x):
    return 1/(1+np.exp(-x))
def sigmoid_prime(x):
    return np.exp(-x)/(1+np.exp(-x))**2

# create network
class Network:
    def __init__(self, layers,activation_function):
        self.layers = np.append(784,layers)
        self.weights = []
        self.biases = []
        self.activations = []
        self.zvalues = []
        self.act_func = activation_function
        #initialise weights and biases randomly
        for i in range(len(self.layers)-1):
            w = np.random.rand(self.layers[i+1],self.layers[i])
            self.weights.append(w)
            b = np.random.rand(self.layers[i+1])
            self.biases.append(b)
            z = np.zeros(self.layers[i+1])
            self.zvalues.append(z)
            a = np.zeros(self.layers[i+1])
            self.activations.append(a)
            
    def forwardfeed(self,image):
        for i in range(len(self.layers)-1):
            if i == 0:
                a = np.dot(self.weights[i],image).flatten()
                a = np.divide(a,len(image)) #normalize by number of terms in sum 
                b = self.biases[i]
                self.activations[i] = [self.act_func(j) for j in np.add(a,b)]
            else:
                a = np.dot(self.weights[i],self.activations[i-1]).flatten()
                a = np.divide(a,len(self.activations[i-1])) #normalize by number of terms in sum
                b = self.biases[i]
                self.activations[i] = [self.act_func(j) for j in np.add(a,b)]
    
    def cost(self):
        costs = []
        for i in red_training_data:
            self.forwardfeed(i[0])
            produced_activations = self.activations[-1]
            cost = 0
            for j in range(0,10):
                desired_output = i[1]
                cost += (produced_activations[j]-desired_output[j])**2
            costs.append(cost)
        return np.mean(costs)
    
    def train(self,trainingdata,batchsize=50,normfactor=100):
        bs = batchsize
        td = trainingdata
        assert len(td)%bs == 0, "trainingdata module batchsize non-zero"
        for i in range(len(td)/bs):
            print("batch",i)
            training_batch = td[i*bs:(i+1)*bs]
            weights_gradient_total = np.array([np.zeros((16,784)),np.zeros((18,16)),np.zeros((10,18))])
            biases_gradient_total = np.array([np.zeros(16),np.zeros(18),np.zeros(10)])
            for j in training_batch:
                gradient = self.single_cost_gradient(j)
                weights_gradient_total = np.add(weights_gradient_total,gradient[0])
                biases_gradient_total = np.add(biases_gradient_total,gradient[1])
            weights_gradient_total = np.divide(weights_gradient_total,bs*normfactor)
            biases_gradient_total = np.divide(biases_gradient_total,bs*normfactor)
            for i in range(len(self.weights)):
                self.weights[i] = np.add(self.weights[i],weights_gradient_total[i])
                self.biases[i] = np.add(self.biases[i],biases_gradient_total[i])
                
     
    def single_cost_gradient(self,training_sample):
        ts = training_sample
        weight_nudges = np.array([np.zeros((16,784)),np.zeros((18,16)),np.zeros((10,18))])
        biases_nudges = np.array([np.zeros(16),np.zeros(18),np.zeros(10)])

        for j in range(self.layers[-1]): #L layer
            dc_daj = 2*(self.activations[-1][j]-ts[1][j])
            daj_dzj = sigmoid_prime(self.zvalues[-1][j])
            biases_nudges[-1][j] = dc_daj*daj_dzj   #L layer biases

            derivative_sum = 0
            for k in range(self.layers[-2]): #L-1 layer
                dzj_dwjk = self.activations[-2][k]
                weight_nudges[-1][j,k] = dc_daj*daj_dzj*dzj_dwjk #L layer weights

                dzj_dzk = sigmoid_prime(self.zvalues[-2][k])
                derivative_sum += dc_daj*daj_dzj*dzj_dzk

                biases_nudges[-2][k] = derivative_sum  #L-1 layer biases
                derivative_sum2 = 0
                for l in range(self.layers[-3]): #L-2 layer
                    dzk_dwkl = self.activations[-3][l]
                    weight_nudges[-2][k,l] = derivative_sum*dzk_dwkl #L-1 layer weights

                    dzk_dzl = sigmoid_prime(self.zvalues[-3][l])
                    derivative_sum2 += derivative_sum*dzk_dzl

                    biases_nudges[-3][l] = derivative_sum2 #L-2 layer biases
                    for h in range(self.layers[-4]): #L-3 layer
                        dzl_dwlm = ts[0][h]
                        weight_nudges[-3][l,h] = derivative_sum2*dzl_dwlm #L-2 layer weights

        return weight_nudges, biases_nudges

In [197]:
testnet = Network([16,18,10],sigmoid)
#testnet2 = Network([16,18,10],sigmoid) #different due to random

In [198]:
testnet.forwardfeed(testimage)

In [199]:
testnet.weights[2][0]

array([ 0.90101668,  0.6352517 ,  0.90695253,  0.7364634 ,  0.87976872,
        0.0736008 ,  0.68225777,  0.97853571,  0.27787319,  0.59951349,
        0.34411534,  0.44010823,  0.83983052,  0.23792687,  0.50625824,
        0.67196369,  0.13451844,  0.56851194])

In [200]:
#xx = testnet.single_cost_gradient(training_data[0])

In [201]:
testnet.train(training_data[:50],batchsize=5)

('batch', 0)
('batch', 1)
('batch', 2)
('batch', 3)
('batch', 4)
('batch', 5)
('batch', 6)
('batch', 7)
('batch', 8)
('batch', 9)


In [202]:
testnet.weights[2][0]

array([ 0.92371709,  0.66031504,  0.93011888,  0.76127519,  0.90333048,
        0.0991539 ,  0.70800024,  1.00172473,  0.30068109,  0.62141756,
        0.36917503,  0.46314131,  0.86004338,  0.26179935,  0.52823281,
        0.69579357,  0.15706414,  0.59443626])

In [204]:
testnet.biases[2]

array([ 0.9248293 ,  0.612632  ,  0.14186262,  0.94156069,  0.91484248,
        0.90670774,  0.86404458,  0.52250647,  0.37142391,  0.47308001])

In [208]:
testnet.activations[2]

[0.78185599031696607,
 0.72223413594703767,
 0.60840628395109531,
 0.7823576194808225,
 0.77153751174259055,
 0.77121931772663777,
 0.75271893508579268,
 0.69471531072056958,
 0.66979123067031798,
 0.67860423150551152]

In [209]:
testnet.forwardfeed(testimage)

In [210]:
testnet.activations[2]

[0.79280143955762594,
 0.73366926629805329,
 0.62065530991928342,
 0.79266229277139966,
 0.78256973338797731,
 0.78261638386403309,
 0.76345490820405992,
 0.70687120547424787,
 0.68273757841818861,
 0.68978670716227963]