## Neural network, math experimentation

In [172]:
# activation functions
# ReLu is very simple, it filters out all negative values
# this is a powerful activation function in reality
def activation_ReLu(number):
    """Return `number` unchanged when it is greater than zero, otherwise 0."""
    return number if number > 0 else 0


# we also need the derivative of ReLu
# same structure as the original, but for positive inputs it returns 1 instead of the input value
def activation_ReLu_partial_derivative(number):
    """Return 1 when `number` is greater than zero, otherwise 0 (slope of ReLu)."""
    return 1 if number > 0 else 0

In [173]:
# starting weights and biases, fixed by hand for this walkthrough
# (Keras/TensorFlow/PyTorch etc. usually randomize these at the start)
w1, w2, w3 = 1, 0.5, 1
w4, w5, w6 = -0.5, 1, 1
bias1, bias2, bias3 = 0.5, 0, 0.5

# the single training example: x1 = input1, x2 = input2, y = true_value
input1, input2, true_value = 1, 0, 2

# learning rate applied in every gradient descent update below
LR = 0.01

### FORWARD PASS

In [174]:
# NODE 1 OUTPUT
# weighted sum of both inputs (w1, w3) plus bias1, passed through ReLu
node_1_output = activation_ReLu(input1 * w1 + input2 * w3 + bias1)
node_1_output

1.5

In [175]:
# NODE 2 OUTPUT
# weighted sum of both inputs (w2, w4) plus bias2, passed through ReLu
node_2_output = activation_ReLu(input1 * w2 + input2 * w4 + bias2)
node_2_output

0.5

In [176]:
# NODE 3 OUTPUT
# the outputs of node 1 and node 2 already include the first-layer
# weights, so they are used directly as the inputs of node 3
node_3_output = activation_ReLu(node_1_output * w5 + node_2_output * w6 + bias3)
node_3_output

2.5

In [177]:
# show the network's prediction next to the target value
print(
    f"Predicted: {node_3_output} "
    f"--> True value should be: {true_value}"
)

Predicted: 2.5 --> True value should be: 2


In [178]:
# LOSS FUNCTION - MSE (mean squared error)
# with a single training example this is just the squared error:
# LOSS = (predicted_value - true_value) ^ 2
predicted_value = node_3_output
loss = pow(predicted_value - true_value, 2)
loss

0.25

### BACKPROPAGATION - update the weights and biases while traversing the network BACKWARDS

In [179]:
# partial derivative of the loss function with respect to w5:
# 2 * (the value w5 multiplies) * (node 3 weighted sum - true value)
# NOTE: the weighted sum is used without node 3's ReLu; with the current
# values that sum is positive (2.5), where ReLu changes nothing
deriv_L_w5 = (
    2
    * node_1_output
    * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
)
deriv_L_w5

1.5

In [180]:
# gradient descent step (our optimizer): move w5 against its gradient,
# scaled by the learning rate -- the weight decreases a little bit
new_w5 = w5 - (LR * deriv_L_w5)
new_w5

0.985

In [181]:
# partial derivative of the loss function with respect to w6:
# same as for w5, but multiplied by node 2's output (the value w6 multiplies)
deriv_L_w6 = (
    2
    * node_2_output
    * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
)
deriv_L_w6

0.5

In [182]:
# gradient descent step for w6
new_w6 = w6 - (LR * deriv_L_w6)
new_w6

0.995

In [183]:
# partial derivative of the loss function with respect to bias3
# NOTE! the factor 1 is the derivative of the bias term itself
# (the bias enters the sum like a plain x, whose derivative is 1)
deriv_L_b3 = (
    2
    * 1
    * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
)
deriv_L_b3

1.0

In [184]:
# gradient descent step for bias3, based on the previous derivative
new_b3 = bias3 - (LR * deriv_L_b3)
new_b3

0.49

**To access the first layer, we need to use the CHAIN RULE, in order to calculate the new values for w1-w4 and bias1/2**

In [185]:
# see the materials for how this calculation is split into two parts;
# the left and right sides are solved separately and then multiplied

# left side: mostly the same as the derivatives for w5 and w6, but
# multiplied by w5, the weight that carries node 1's output to node 3
deriv_L_w1_left = (
    2
    * w5
    * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
)

# right side: ReLu derivative of the weighted sum of the node that
# the weight being derived belongs to
# COMPARE THIS TO THE ORIGINAL MATERIALS AND PICTURE
# for w1 => use w1, w3 and bias1 inside the ReLu derivative, because
# these all belong to node 1 (the node w1 feeds into)
# finally, multiply by input1, because that is the input w1 is connected to
deriv_L_w1_right = (
    activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1)
    * input1
)

# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w1 = deriv_L_w1_left * deriv_L_w1_right
new_w1 = w1 - (LR * deriv_L_w1)
new_w1

0.99

In [186]:
# same chain-rule logic as for w1, but now from the point of view of w2
# w2 feeds node 2, and node 2 reaches the output node through w6

# left side: same shape as for w1, but multiplied by w6 instead of w5
deriv_L_w2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)

# right side: ReLu derivative evaluated at node 2's OWN weighted sum
# (w2, w4 and bias2 -- not node 1's w1, w3 and bias1), multiplied by
# input1 because that is the input w2 is connected to
# with the current values both hidden nodes have a positive weighted sum,
# so the result is the same either way, but only this form stays correct
# when node 1 and node 2 end up on different sides of zero
deriv_L_w2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input1

# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w2 = deriv_L_w2_left * deriv_L_w2_right
new_w2 = w2 - LR * deriv_L_w2
new_w2

0.49

In [187]:
# same chain-rule logic as for w1, but now from the point of view of w3
# w3 connects input2 to node 1, so the left side uses w5 and the right
# side uses node 1's weighted sum (w1, w3, bias1) multiplied by input2
deriv_L_w3_left = (
    2
    * w5
    * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
)
deriv_L_w3_right = (
    activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1)
    * input2
)

# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w3 = deriv_L_w3_left * deriv_L_w3_right
new_w3 = w3 - (LR * deriv_L_w3)
new_w3

1.0

In [188]:
# same chain-rule logic as for w1, but now from the point of view of w4
# w4 connects input2 to node 2, and node 2 reaches the output node through w6

# left side: same shape as for w1, but multiplied by w6 instead of w5
deriv_L_w4_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)

# right side: ReLu derivative evaluated at node 2's OWN weighted sum
# (w2, w4 and bias2 -- not node 1's w1, w3 and bias1), multiplied by
# input2 because that is the input w4 is connected to
deriv_L_w4_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * input2

# chain rule: multiply both sides, then take the gradient descent step
deriv_L_w4 = deriv_L_w4_left * deriv_L_w4_right
new_w4 = w4 - LR * deriv_L_w4
new_w4

-0.5

In [189]:
# bias1 belongs to node 1, so the formula is the same as for w1, except
# that the right side is multiplied by 1 instead of an input
# (the bias enters the sum like a plain x, whose derivative is always 1)
deriv_L_b1_left = (
    2
    * w5
    * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)
)
deriv_L_b1_right = (
    activation_ReLu_partial_derivative(input1 * w1 + input2 * w3 + bias1)
    * 1
)

# chain rule: multiply both sides, then take the gradient descent step
deriv_L_b1 = deriv_L_b1_left * deriv_L_b1_right
new_b1 = bias1 - (LR * deriv_L_b1)
new_b1

0.49

In [190]:
# bias2 belongs to node 2, which reaches the output node through w6
# same formula as for the weights, except that the right side is multiplied
# by 1 instead of an input (the derivative of the bias term itself is 1)

# left side: multiplied by w6, the weight that carries node 2's output onward
deriv_L_b2_left = 2 * w6 * (node_1_output * w5 + node_2_output * w6 + bias3 - true_value)

# right side: ReLu derivative evaluated at node 2's OWN weighted sum
# (w2, w4 and bias2 -- not node 1's w1, w3 and bias1)
deriv_L_b2_right = activation_ReLu_partial_derivative(input1 * w2 + input2 * w4 + bias2) * 1

# chain rule: multiply both sides, then take the gradient descent step
deriv_L_b2 = deriv_L_b2_left * deriv_L_b2_right
new_b2 = bias2 - LR * deriv_L_b2
new_b2

-0.01

**Everything should be okay now... let's compare the results**

In [191]:
# parameters before the update, one "name: value" line each
print("ORIGINAL WEIGHTS AND BIASES")
print(
    "\n".join(
        f"{label}: {value}"
        for label, value in (
            ("w1", w1),
            ("w2", w2),
            ("w3", w3),
            ("w4", w4),
            ("w5", w5),
            ("w6", w6),
            ("b1", bias1),
            ("b2", bias2),
            ("b3", bias3),
        )
    )
)

print("\n\n#################################")

# parameters after one gradient descent step, in the same order
print("NEW WEIGHTS AND BIASES")
print(
    "\n".join(
        f"{label}: {value}"
        for label, value in (
            ("w1", new_w1),
            ("w2", new_w2),
            ("w3", new_w3),
            ("w4", new_w4),
            ("w5", new_w5),
            ("w6", new_w6),
            ("b1", new_b1),
            ("b2", new_b2),
            ("b3", new_b3),
        )
    )
)



ORIGINAL WEIGHTS AND BIASES
w1: 1
w2: 0.5
w3: 1
w4: -0.5
w5: 1
w6: 1
b1: 0.5
b2: 0
b3: 0.5


#################################
NEW WEIGHTS AND BIASES
w1: 0.99
w2: 0.49
w3: 1.0
w4: -0.5
w5: 0.985
w6: 0.995
b1: 0.49
b2: -0.01
b3: 0.49
