In [110]:
# ReLU activation
def activation_ReLu(value):
    """Rectified linear unit: pass `value` through when it is strictly
    positive, otherwise return 0."""
    return value if value > 0 else 0

# Derivative of ReLU, needed for the backpropagation
def activation_ReLu_part_deriv(value):
    """Slope of ReLU at `value`: 1 when `value` is strictly positive,
    otherwise 0 (the kink at 0 is assigned slope 0)."""
    return 1 if value > 0 else 0


In [111]:
# Initial network parameters, training sample and hyper-parameter

# Weights, grouped by the node they feed:
#   node 1 <- (w1, w3), node 2 <- (w2, w4), output node <- (w5, w6)
w1, w2 = 1, 0.5
w3, w4 = 1, -0.5
w5, w6 = 1, 1

# Biases of node 1, node 2 and the output node
bias1, bias2, bias3 = 0.5, 0, 0.5

# One training sample: two inputs and the value the network should predict
input1, input2 = 1, 0
true_value = 2

# Learning rate (step size of the gradient-descent update)
LR = 0.01

In [112]:
# Node 1 - forward pass
# Weighted sum of both inputs plus the node's bias, passed through ReLU
node_1_output = activation_ReLu(input1 * w1 + input2 * w3 + bias1)
node_1_output

1.5

In [113]:
# Node 2 - same computation for the second node of the hidden layer
node_2_output = activation_ReLu(input1 * w2 + input2 * w4 + bias2)
node_2_output

0.5

In [114]:
# Output node: combine the activations of node 1 and node 2, add the bias
# and apply ReLU to obtain the network's final output
node_3_output = activation_ReLu(node_1_output * w5 + node_2_output * w6 + bias3)
node_3_output

2.5

In [115]:
# Squared-error loss of this forward pass
predicted_value = node_3_output
loss = pow(predicted_value - true_value, 2)
loss

0.25

In [116]:
# Backpropagation

# Gradient of the loss with respect to w5, obtained with the chain rule:
#   L          = (prediction - true_value)^2       -> dL/dprediction  = 2 * (prediction - true_value)
#   prediction = ReLU(z3)                          -> dprediction/dz3 = ReLU'(z3)
#   z3         = node1 * w5 + node2 * w6 + bias3   -> dz3/dw5         = node1
# Thus dL/dw5 = 2 * (prediction - true_value) * ReLU'(z3) * node1.
# The ReLU'(z3) factor must be kept: when z3 <= 0 the output node is inactive
# and no gradient flows back through it. The error term uses the activated
# prediction, not the raw pre-activation z3.
deriv_L_w5 = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * node_1_output
)
deriv_L_w5

1.5

In [117]:
# Gradient-descent step for w5: move against the gradient, scaled by the learning rate
new_w5 = w5 - deriv_L_w5 * LR
new_w5

0.985

In [118]:
# Gradient of the loss with respect to w6 (chain rule through the output ReLU):
#   dL/dw6 = 2 * (prediction - true_value) * ReLU'(z3) * node2
# where z3 = node1 * w5 + node2 * w6 + bias3 is the output node's pre-activation.
# ReLU'(z3) is required so that an inactive output node (z3 <= 0) yields no gradient.
deriv_L_w6 = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * node_2_output
)
deriv_L_w6

0.5

In [119]:
# Gradient-descent step for w6, analogous to the w5 update
new_w6 = w6 - deriv_L_w6 * LR
new_w6

0.995

In [120]:
# Partial derivative of the loss function with respect to bias 3:
#   dL/dbias3 = 2 * (prediction - true_value) * ReLU'(z3) * 1
# where z3 = node1 * w5 + node2 * w6 + bias3 and dz3/dbias3 = 1.
# ReLU'(z3) is required so that an inactive output node (z3 <= 0) yields no gradient.
deriv_L_b3 = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * 1
)
deriv_L_b3

1.0

In [121]:
# Gradient-descent step for bias3
new_b3 = bias3 - deriv_L_b3 * LR
new_b3

0.49

In [122]:
# Getting the new value for weight1

# Upstream part of the chain rule, dL/d(node 1 output):
#   2 * (prediction - true_value) * ReLU'(z3) * w5
# with z3 = node1 * w5 + node2 * w6 + bias3. ReLU'(z3) is required so that an
# inactive output node (z3 <= 0) passes no gradient back to the hidden layer.
deriv_L_w1_left = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * w5
)

# Local part, d(node 1 output)/dw1 = ReLU'(z1) * input1
deriv_L_w1_right = activation_ReLu_part_deriv(input1 * w1 + input2 * w3 + bias1) * input1

# Combining both parts and taking the gradient-descent step
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - LR * deriv_L_w1
new_w1

0.99

In [123]:
# Getting the new value for weight2
# Upstream part, dL/d(node 2 output) = 2 * (prediction - true_value) * ReLU'(z3) * w6,
# with z3 = node1 * w5 + node2 * w6 + bias3. ReLU'(z3) is required so that an
# inactive output node (z3 <= 0) passes no gradient back to the hidden layer.
deriv_L_w2_left = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * w6
)
# Local part, d(node 2 output)/dw2 = ReLU'(z2) * input1
deriv_L_w2_right = activation_ReLu_part_deriv(input1 * w2 + input2 * w4 + bias2) * input1
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.49

In [124]:
# Getting the new value for weight3
# Upstream part, dL/d(node 1 output) = 2 * (prediction - true_value) * ReLU'(z3) * w5,
# with z3 = node1 * w5 + node2 * w6 + bias3. ReLU'(z3) is required so that an
# inactive output node (z3 <= 0) passes no gradient back to the hidden layer.
deriv_L_w3_left = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * w5
)
# Local part, d(node 1 output)/dw3 = ReLU'(z1) * input2
deriv_L_w3_right = activation_ReLu_part_deriv(input1 * w1 + input2 * w3 + bias1) * input2
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - LR * deriv_L_w3
new_w3

1.0

In [125]:
# Getting the new value for weight4
# Upstream part, dL/d(node 2 output) = 2 * (prediction - true_value) * ReLU'(z3) * w6,
# with z3 = node1 * w5 + node2 * w6 + bias3. ReLU'(z3) is required so that an
# inactive output node (z3 <= 0) passes no gradient back to the hidden layer.
deriv_L_w4_left = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * w6
)
# Local part, d(node 2 output)/dw4 = ReLU'(z2) * input2
deriv_L_w4_right = activation_ReLu_part_deriv(input1 * w2 + input2 * w4 + bias2) * input2
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [126]:
# Getting the new value for bias1
# Upstream part, dL/d(node 1 output) = 2 * (prediction - true_value) * ReLU'(z3) * w5,
# with z3 = node1 * w5 + node2 * w6 + bias3. ReLU'(z3) is required so that an
# inactive output node (z3 <= 0) passes no gradient back to the hidden layer.
deriv_L_b1_left = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * w5
)
# Local part, d(node 1 output)/dbias1 = ReLU'(z1) * 1
deriv_L_b1_right = activation_ReLu_part_deriv(input1 * w1 + input2 * w3 + bias1) * 1
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - LR * deriv_L_b1
new_b1

0.49

In [127]:
# Getting the new value for bias2
# Upstream part, dL/d(node 2 output) = 2 * (prediction - true_value) * ReLU'(z3) * w6,
# with z3 = node1 * w5 + node2 * w6 + bias3. ReLU'(z3) is required so that an
# inactive output node (z3 <= 0) passes no gradient back to the hidden layer.
deriv_L_b2_left = (
    2
    * (predicted_value - true_value)
    * activation_ReLu_part_deriv(node_1_output * w5 + node_2_output * w6 + bias3)
    * w6
)
# Local part, d(node 2 output)/dbias2 = ReLU'(z2) * 1
deriv_L_b2_right = activation_ReLu_part_deriv(input1 * w2 + input2 * w4 + bias2) * 1
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

-0.01

In [128]:
# Summary of all updated weights and biases, one per line
print(
    f"W1: {new_w1}",
    f"W2: {new_w2}",
    f"W3: {new_w3}",
    f"W4: {new_w4}",
    f"W5: {new_w5}",
    f"W6: {new_w6}",
    f"B1: {new_b1}",
    f"B2: {new_b2}",
    f"B3: {new_b3}",
    sep="\n",
)

W1: 0.99
W2: 0.49
W3: 1.0
W4: -0.5
W5: 0.985
W6: 0.995
B1: 0.49
B2: -0.01
B3: 0.49


: 