In [8]:
import numpy as np

# Show at most 4 decimal places and suppress scientific notation when printing arrays
np.set_printoptions(suppress=True, precision=4)

# Step size applied to every gradient in the parameter update
lr = 0.75

# XOR truth table: each row of X is one input pair, the same row of Y is its target
X = np.array([
    [0., 0.],
    [0., 1.],
    [1., 0.],
    [1., 1.],
])
Y = np.array([[0.], [1.], [1.], [0.]])

# Fixed starting parameters: W1/B1 belong to layer 1, W2/B2 to layer 2
W1 = np.array([[0.2985, -0.5792],
               [0.0913, 0.4234]])
W2 = np.array([[0.5266, 0.2958]])
B1 = np.array([-0.4939, 0.189])
B2 = np.array([0.6358])

# Echo everything in definition order so later cells can be checked by hand
print(f'X:\n{X}\n')
print(f'Y:\n{Y}\n')
print(f'W1:\n{W1}\n')
print(f'W2:\n{W2}\n')
print(f'B1:\n{B1}\n')
print(f'B2:\n{B2}\n')

X:
[[0. 0.]
 [0. 1.]
 [1. 0.]
 [1. 1.]]

Y:
[[0.]
 [1.]
 [1.]
 [0.]]

W1:
[[ 0.2985 -0.5792]
 [ 0.0913  0.4234]]

W2:
[[0.5266 0.2958]]

B1:
[-0.4939  0.189 ]

B2:
[0.6358]



## Forward Pass

In [9]:
# Layer 1: affine transform followed by a leaky ReLU (slope 0.5 where Z is not positive)
Z1 = np.matmul(X, W1.T)
print(f'Z1 pre bias:\n{Z1}\n')
Z1 = np.add(Z1, B1)
print(f'Z1:\n{Z1}\n')
H1 = np.where(Z1 > 0, Z1, 0.5 * Z1)
print(f'H1:\n{H1}\n')

# Layer 2: same two steps, fed by the layer-1 activations
Z2 = np.matmul(H1, W2.T)
print(f'Z2 pre bias:\n{Z2}\n')
Z2 = np.add(Z2, B2)
print(f'Z2:\n{Z2}\n')
H2 = np.where(Z2 > 0, Z2, 0.5 * Z2)
print(f'H2:\n{H2}\n')

Z1 pre bias:
[[ 0.      0.    ]
 [-0.5792  0.4234]
 [ 0.2985  0.0913]
 [-0.2807  0.5147]]

Z1:
[[-0.4939  0.189 ]
 [-1.0731  0.6124]
 [-0.1954  0.2803]
 [-0.7746  0.7037]]

H1:
[[-0.247   0.189 ]
 [-0.5366  0.6124]
 [-0.0977  0.2803]
 [-0.3873  0.7037]]

Z2 pre bias:
[[-0.0741]
 [-0.1014]
 [ 0.0315]
 [ 0.0042]]

Z2:
[[0.5617]
 [0.5344]
 [0.6673]
 [0.64  ]]

H2:
[[0.5617]
 [0.5344]
 [0.6673]
 [0.64  ]]



## Loss Calculation

In [10]:
# Mean squared error over every prediction; reported for reference only,
# the backward pass does not read this value
loss = ((Y - H2) ** 2).mean()
print(f'{loss:.4f}')

0.2631


## Backward Pass

In [11]:
# Long Chain Calculation
# Apply the chain rule node by node, from the loss back to the network input.

# The loss is np.mean over every element of (Y - H2) ** 2, so its derivative divides
# by the total element count. Y.size equals len(Y) for this single-output network
# and remains correct if Y ever has more than one output column.
dL_dH2 = 2 * (H2 - Y) / Y.size   # Derivative of loss with respect to H2
print(f'dL/dH2:\n{dL_dH2}\n')

# Derivative of the activation np.maximum(0.5 * Z, Z): 1 where Z > 0, otherwise 0.5
dL_dZ2 = dL_dH2 * np.where(Z2 > 0, 1, 0.5)
print(f'dL/dZ2:\n{dL_dZ2}\n')

# Z2 = H1 @ W2.T + B2, so the gradient reaches H1 through W2
dL_dH1 = dL_dZ2 @ W2
print(f'dL/dH1:\n{dL_dH1}\n')

# Same activation derivative, evaluated at the layer-1 pre-activations
dL_dZ1 = dL_dH1 * np.where(Z1 > 0, 1, 0.5)
print(f'dL/dZ1:\n{dL_dZ1}\n')

# This one is not needed for training
# (X is an input, not a parameter; it is shown only to complete the chain)
dL_dX = dL_dZ1 @ W1
print(f'dL/dX:\n{dL_dX}\n')

dL/dH2:
[[ 0.2808]
 [-0.2328]
 [-0.1664]
 [ 0.32  ]]

dL/dZ2:
[[ 0.2808]
 [-0.2328]
 [-0.1664]
 [ 0.32  ]]

dL/dH1:
[[ 0.1479  0.0831]
 [-0.1226 -0.0689]
 [-0.0876 -0.0492]
 [ 0.1685  0.0947]]

dL/dZ1:
[[ 0.0739  0.0831]
 [-0.0613 -0.0689]
 [-0.0438 -0.0492]
 [ 0.0843  0.0947]]

dL/dX:
[[ 0.0297 -0.0077]
 [-0.0246  0.0063]
 [-0.0176  0.0045]
 [ 0.0338 -0.0087]]



In [12]:
# Leaf Node Calculations
# Parameter gradients: each weight gradient pairs the upstream dL/dZ with that
# layer's input, and each bias gradient sums dL/dZ over the rows (axis 0).
dL_dW2 = np.matmul(dL_dZ2.T, H1)
print(f'dL/dW2:\n{dL_dW2}\n')

dL_dB2 = np.sum(dL_dZ2, axis=0)
print(f'dL/dB2:\n{dL_dB2}\n')

dL_dW1 = np.matmul(dL_dZ1.T, X)
print(f'dL/dW1:\n{dL_dW1}\n')

dL_dB1 = np.sum(dL_dZ1, axis=0)
print(f'dL/dB1:\n{dL_dB1}\n')

dL/dW2:
[[-0.0521  0.0891]]

dL/dB2:
[0.2017]

dL/dW1:
[[0.0405 0.023 ]
 [0.0454 0.0258]]

dL/dB1:
[0.0531 0.0597]



## Update Parameters

In [13]:
# One gradient-descent step: move each parameter against its gradient, scaled by lr.
# The originals are left untouched; results go to the *_new names.
W1_new = W1 - dL_dW1 * lr
B1_new = B1 - dL_dB1 * lr
W2_new = W2 - dL_dW2 * lr
B2_new = B2 - dL_dB2 * lr

print(f'W1_new:\n{W1_new}\n')
print(f'B1_new:\n{B1_new}\n')
print(f'W2_new:\n{W2_new}\n')
print(f'B2_new:\n{B2_new}\n')

W1_new:
[[ 0.2682 -0.5964]
 [ 0.0572  0.4041]]

B1_new:
[-0.5337  0.1443]

W2_new:
[[0.5657 0.229 ]]

B2_new:
[0.4846]



In [14]:
# Leaf Node Calculations (Tiling)
def apply_tiled_updates(weights, dL_dZ, layer_input, label, lr, tile_rows=2):
    """Apply a weight update one tile of rows at a time.

    For each consecutive group of `tile_rows` rows, the partial gradient
    `dL_dZ[rows].T @ layer_input[rows]` is computed, printed, and subtracted
    (scaled by `lr`) from the running weights, which are printed as well.

    Args:
        weights: starting weight matrix; it is not modified.
        dL_dZ: gradient of the loss w.r.t. the layer's pre-activation, one row per sample.
        layer_input: the layer's input, one row per sample.
        label: parameter name used in the printed headings (e.g. 'W2').
        lr: learning rate.
        tile_rows: number of rows per tile.

    Returns:
        The weights after every tile has been applied.
    """
    updated = weights
    for start in range(0, len(layer_input), tile_rows):
        rows = slice(start, start + tile_rows)
        # Local name on purpose: the full-batch gradients dL_dW1 / dL_dW2 computed
        # earlier must not be overwritten, or re-running the update cell breaks.
        tile_grad = dL_dZ[rows].T @ layer_input[rows]
        print(f'dL/d{label}:\n{tile_grad}\n')

        updated = updated - lr * tile_grad
        print(f'{label}_new:\n{updated}\n')
    return updated


# Update W2 tile by tile
W2_new_tiled = apply_tiled_updates(W2, dL_dZ2, H1, 'W2', lr)

# Update W1 tile by tile
W1_new_tiled = apply_tiled_updates(W1, dL_dZ1, X, 'W1', lr)

# The per-tile gradients sum to the full-batch gradient, so the tiled result
# must agree with the single-step update computed in the previous cell.
np.testing.assert_allclose(W2_new_tiled, W2_new)
np.testing.assert_allclose(W1_new_tiled, W1_new)

dL/dW2:
[[ 0.0556 -0.0895]]

W2_new:
[[0.4849 0.3629]]

dL/dW2:
[[-0.1077  0.1786]]

W2_new:
[[0.5657 0.229 ]]

dL/dW1:
[[ 0.     -0.0613]
 [ 0.     -0.0689]]

W1_new:
[[ 0.2985 -0.5332]
 [ 0.0913  0.475 ]]

dL/dW1:
[[0.0405 0.0843]
 [0.0454 0.0947]]

W1_new:
[[ 0.2682 -0.5964]
 [ 0.0572  0.4041]]

