https://machinelearningmastery.com/implement-backpropagation-algorithm-scratch-python/

### Single layer nerual network 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [34]:
df = pd.read_csv('seeds_dataset.csv', header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [35]:
print (df.columns)
df.iloc[:, 7].unique()

Int64Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='int64')


array([1, 2, 3])

In [36]:
# convert column names from integer to string 

df.columns = df.columns.map(str)
print (df.columns)

Index(['0', '1', '2', '3', '4', '5', '6', '7'], dtype='object')


In [37]:
df['7'].unique()

array([1, 2, 3])

### initialzied the network

In [38]:
# n_hidden: number of neurons in each hidden layer and each each hidden layer has n_input + 1 weights 
# n_outputs: number of neurons in the output layer and output layer has n_hidden + 1 weights 


def initialize_network(n_inputs, n_hidden, n_outputs):
    network = list()
    # weight matrix {num_row, num_col}
    hidden_layer = [{'weights':[random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    network.append(hidden_layer)
    
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    network.append(output_layer)
    
    return network

In [39]:
from random import seed
from random import random

seed(1)
# test initialize_network - two dimensional input data, one hidden layer with one neuron, binary output data 
network = initialize_network(2, 1, 2)

for layer in network:
    print (layer)
    
print ('==')

for layer in network:
    for neuron in layer:
        print (neuron)

[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}]
[{'weights': [0.2550690257394217, 0.49543508709194095]}, {'weights': [0.4494910647887381, 0.651592972722763]}]
==
{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}
{'weights': [0.2550690257394217, 0.49543508709194095]}
{'weights': [0.4494910647887381, 0.651592972722763]}


### forward propagation 

In [40]:
# linear weight function -- sum(weight_i * input_i) + bias

def activate(weights, inputs): 
    # assume bias is the last entry in the weightvector
    activation = weights[-1]
    
    for i in range(len(weights) - 1):
        # inputs are row data (with each data point from each attribute) in the training data - elementwise multiplicaiotn
        activation += weights[i] * inputs[i]
        
    return activation 

In [41]:
# use sigmoid function as activation function to transfer the linear weight function to a range between(0, 1)
# g(z) = 1/(1 + exp(-z)) where z is the linear weight function defined in the above cell as activate(-, -)

from math import exp

def transfer (activation):
    return 1/(1 + exp(-activation))

forward propagate each row in the dataset through the netwrok and 

save the forward propagate result in each neuron as 'output' after the neuron's weight vector and

return those activated weighted inputs through the entire netwrok, 

which is also the model prediction for a given row of data 

(haven't use softmax function on output layer yet, predicted probabilities won't sum to one)


In [42]:
def forward_propagate(network, row):
    inputs = row
    
    for layer in network:
        new_inputs = []
        for neuron in layer:
            # eadch neruon's outputs are weighted and eprocessd by the neurons in the next layer 
            activation = activate(neuron['weights'], inputs)
            neuron['output'] = transfer(activation)
            new_inputs.append(neuron['output'])  
        # to use out/activated weighted inputs from the previous layer as input for the next layer 
        inputs = new_inputs
#         print(inputs)
        
    return inputs

In [23]:
# forward propagation calculation 
# print (transfer(0.13436424411240122 + 0.763774618976614 ))
# print (transfer(0.7105668883115941 * 0.2550690257394217 + 0.49543508709194095 ))

0.7105668883115941
0.6629970129852887


In [43]:
# test forward propagate

# one example row of data
row = [1, 0, None]

seed(1)
network = initialize_network(2, 1, 2)
print(network)
print ('--')
output = forward_propagate(network, row)
print (output)
print ('--')
for layer in network:
    print (layer)

[[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614]}], [{'weights': [0.2550690257394217, 0.49543508709194095]}, {'weights': [0.4494910647887381, 0.651592972722763]}]]
--
[0.6629970129852887, 0.7253160725279748]
--
[{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614], 'output': 0.7105668883115941}]
[{'weights': [0.2550690257394217, 0.49543508709194095], 'output': 0.6629970129852887}, {'weights': [0.4494910647887381, 0.651592972722763], 'output': 0.7253160725279748}]


### back propagation

http://www.bogotobogo.com/python/scikit-learn/Artificial-Neural-Network-ANN-4-Backpropagation.php

In [44]:
# derivative of sigmoid function folows dg(z)/ d(z) = g(z) (1 - g(z)) where z is the weighted input function

# derivative of one neuron  
def transfer_derivative(output):
    return output * (1 - output)

In [45]:
# from the link above, to get the gradient for weight update -
# output layer error: delta3 = (y - yhat) * transfer_derivative(forward_propagate- output layer)
# hidden layer error: delta2 = weight_jk * delta3 * transfer_derivative(forward_propagate - hidden layer)
# error_k: error from the kth neuron in the output layer 
# weight_jk: weight that connects the jth neuron in the previous layer to the current neuron k, which is the kth neuron in the output layer 

def backward_propagate_error(network, target):
    
    # start from the last layer 
    for i in reversed(range(len(network))):
        layer = network[i]
        errors = list()
        
        if i == len(network) - 1: 
            for j in range(len(layer)): 
                # for each neuron in the output layer, get the list of errors (y - yhat) from all the neurons in the output layer
                neuron = layer[j]
                errors.append(target[j] - neuron['output'])
                                
        else:
            for j in range(len(layer)):
                error = 0
                # delta2 = error of each neuron in the output layer/delta3 weighted by the output layer weight matrix/ W_out
                for neuron in network[i + 1]:
#                     print (('delta3 {} ').format(neuron['delta']))
                    
#                     print (('W2 {} ').format(neuron['weights'][j]))
                    error += neuron['weights'][j] * neuron['delta']
                
                errors.append(error)
                
        # first goes through ouput layer then hidden layer to update delta (changes in error)
        for j in range(len(layer)):
            # first run occurs when it's the output layer weight gradient - delta3 = (y - yhat) * transfer_derivative(value from forward_propagate at the corresponding layer/neuron)
            # second run occurs when it's the output layer weight gradient - delta2 = W_out * delta3 * transfer_derivative(value from forward_propagate at the corresponding layer/neuron)
            
            neuron = layer[j]
            neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])
            
#             print (neuron['delta'])

In [46]:
# test back propagation

target = [0, 1]
backward_propagate_error(network, target)

print('--')
for layer in network:
    for neuron in layer:
        print(neuron)

--
{'weights': [0.13436424411240122, 0.8474337369372327, 0.763774618976614], 'output': 0.7105668883115941, 'delta': -0.002711797799238243}
{'weights': [0.2550690257394217, 0.49543508709194095], 'output': 0.6629970129852887, 'delta': -0.14813473120687762}
{'weights': [0.4494910647887381, 0.651592972722763], 'output': 0.7253160725279748, 'delta': 0.05472601157879688}


In [47]:
for layer in network:
    for neuron in layer:
        print(neuron['output'])

print ('--')

# delta3 calculation double checking 
print ((0 - 0.6629970129852887) * transfer_derivative(0.6629970129852887))
print ((1 - 0.7253160725279748) * transfer_derivative(0.7253160725279748))

print ('--')

# delta2 calculation double checking 
a = -0.13436424411240122 * 0.14813473120687762 
b = 0.8474337369372327 * 0.05472601157879688
c = transfer_derivative(0.7253160725279748)

print ((a + b) * c)

0.7105668883115941
0.6629970129852887
0.7253160725279748
--
-0.14813473120687762
0.05472601157879688
--
0.005274218131534226


### train network

In [76]:
# update network 'weights' with error using Stochastic Gradient Descent: weight = weight + learning rate * error * input

def update_weights(network, row, l_rate):
    for i in range(len(network)):
        inputs = row[: -1]
    
        if i != 0:
            inputs = [neuron['output'] for neuron in network [i - 1]]
        
        for neuron in network[i]:
            for j in range(len(inputs)):
                # input here is the output in each neuron from forward_propagation function
                # dL/dW2
                neuron['weights'][j] += l_rate * neuron['delta'] * inputs[j]
            # bias 
            neuron['weights'][-1] += l_rate * neuron['delta']
            

$$
\begin{aligned}
\delta_3 = (y - \hat{y})  df(z_3)\\
\frac{\partial{L}}{\partial{W_2}} = a_2^T \delta_3  \\
\frac{\partial{L}}{\partial{W_1}} = x^T \delta_2\\
\delta_2 = \delta_3 W_2^ Tdf(z_2) \\
\frac{\partial{L}}{\partial{b_2}} = \delta_3\\
\frac{\partial{L}}{\partial{b_1}} = \delta_2 \\
\end{aligned}
$$

In [77]:
def train_network(network, train, l_rate, n_epoch, n_outputs):
	for epoch in range(n_epoch):
		sum_error = 0
        
		for row in train:
            # forward_propagate function output is created based on the number of neurons in the output layer s
			outputs = forward_propagate(network, row)
            
            # output label vector size 
			target = [0 for i in range(n_outputs)]
            # create one hot encoding label(class 1: [0, 1], class 2:[1, 0]) 
			target[row[-1]] = 1
            
			sum_error += sum([(target[i] - outputs[i]) ** 2 for i in range(len(target))])
            
			backward_propagate_error(network, target)
			update_weights(network, row, l_rate)
            
		print('>epoch = %d, lrate = %.3f, error = %.3f' % (epoch, l_rate, sum_error)) 

In [81]:
# test training 
# synthetic small dataset randoms

dataset = [[2.7810836,2.550537003,0],
	[1.465489372,2.362125076,0],
	[3.396561688,4.400293529,0],
	[1.38807019,1.850220317,0],
	[3.06407232,3.005305973,0],
	[7.627531214,2.759262235,1],
	[5.332441248,2.088626775,1],
	[6.922596716,1.77106367,1],
	[8.675418651,-0.242068655,1],
	[7.673756466,3.508563011,1]]

# for mdoel parameter initialization
n_inputs = len(dataset[0]) - 1
n_outputs = len(set([row[-1] for row in dataset]))

network = initialize_network(n_inputs, 2, n_outputs)
train_network(network, dataset, 0.5, 20, n_outputs)

for layer in network:
    print(layer)

>epoch = 0, lrate = 0.500, error = 5.947
>epoch = 1, lrate = 0.500, error = 5.380
>epoch = 2, lrate = 0.500, error = 5.203
>epoch = 3, lrate = 0.500, error = 5.117
>epoch = 4, lrate = 0.500, error = 5.028
>epoch = 5, lrate = 0.500, error = 4.907
>epoch = 6, lrate = 0.500, error = 4.737
>epoch = 7, lrate = 0.500, error = 4.512
>epoch = 8, lrate = 0.500, error = 4.237
>epoch = 9, lrate = 0.500, error = 3.925
>epoch = 10, lrate = 0.500, error = 3.587
>epoch = 11, lrate = 0.500, error = 3.233
>epoch = 12, lrate = 0.500, error = 2.874
>epoch = 13, lrate = 0.500, error = 2.526
>epoch = 14, lrate = 0.500, error = 2.203
>epoch = 15, lrate = 0.500, error = 1.916
>epoch = 16, lrate = 0.500, error = 1.666
>epoch = 17, lrate = 0.500, error = 1.454
>epoch = 18, lrate = 0.500, error = 1.274
>epoch = 19, lrate = 0.500, error = 1.123
[{'weights': [-0.7946098379053232, 0.767287431991693, 0.8048366482018935], 'output': 0.08464031263028213, 'delta': -0.006089570629683444}, {'weights': [1.245334197200103,

### prediction

In [82]:
# make prediction with the network
def predict(network, row):
    outputs = forward_propagate(network, row)
    return outputs.index(max(outputs))

In [83]:
# on the same small dataset used before

for row in dataset:
	prediction = predict(network, row)
	print('Target = %d, Got = %d' % (row[-1], prediction))

Target = 0, Got = 0
Target = 0, Got = 0
Target = 0, Got = 0
Target = 0, Got = 0
Target = 0, Got = 0
Target = 1, Got = 1
Target = 1, Got = 1
Target = 1, Got = 1
Target = 1, Got = 1
Target = 1, Got = 1
