In [1]:
import numpy as np

# Step size used for the gradient-descent update
lr = 0.75

# XOR truth table: one row per sample, one column per input bit
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)

# XOR targets, kept as a column so they line up with the network output
Y = np.array([0., 1., 1., 0.]).reshape(-1, 1)

# Fixed starting parameters.
# Each weight matrix has one row per output unit of its layer
# (the forward pass multiplies by the transpose).
W1 = np.array([[0.2985, -0.5792],
               [0.0913, 0.4234]])
B1 = np.array([-0.4939, 0.189])

W2 = np.array([[0.5266, 0.2958]])
B2 = np.array([0.6358])

## Forward Pass

In [2]:
# Hidden layer: affine transform, then leaky ReLU with negative slope 0.5
Z1 = X @ W1.T + B1
print(f'Z1:\n{Z1}\n')
H1 = np.where(Z1 > 0, Z1, 0.5 * Z1)  # keep positives, halve everything else
print(f'H1:\n{H1}\n')

# Output layer: same affine + leaky ReLU recipe applied to the hidden activations
Z2 = H1 @ W2.T + B2
print(f'Z2:\n{Z2}\n')
H2 = np.where(Z2 > 0, Z2, 0.5 * Z2)
print(f'H2:\n{H2}\n')

Z1:
[[-0.4939  0.189 ]
 [-1.0731  0.6124]
 [-0.1954  0.2803]
 [-0.7746  0.7037]]

H1:
[[-0.24695  0.189  ]
 [-0.53655  0.6124 ]
 [-0.0977   0.2803 ]
 [-0.3873   0.7037 ]]

Z2:
[[0.56166233]
 [0.53440069]
 [0.66726392]
 [0.64000228]]

H2:
[[0.56166233]
 [0.53440069]
 [0.66726392]
 [0.64000228]]



## Loss Calculation

In [3]:
# Mean squared error between targets and predictions.
# Reported for inspection only; the backward pass does not read this value.
loss = ((Y - H2) ** 2).mean()
print(loss)

0.2631408769381175


## Backward Pass

In [4]:
# Long Chain Calculation
# Walk the chain rule from the loss back to the first layer's pre-activations,
# printing every intermediate gradient.

# dL/dH2: the loss is np.mean((Y - H2) ** 2), which averages over every element,
# so the divisor is Y.size. This equals len(Y) only when Y has a single output
# column (as it does here), so Y.size keeps the gradient consistent with the
# loss if more output columns are ever added.
dL_dH2 = 2 * (H2 - Y) / Y.size   # Derivative of loss with respect to H2
print(f'dL/dH2:\n{dL_dH2}\n')

# dL/dZ2: multiply by the leaky-ReLU derivative, which is 1 where Z2 > 0 and
# 0.5 elsewhere, mirroring the forward activation np.maximum(0.5 * Z2, Z2).
dL_dZ2 = np.multiply(dL_dH2, np.where(Z2 > 0, 1, 0.5))
print(f'dL/dZ2:\n{dL_dZ2}\n')

# dL/dH1: the forward pass computed Z2 = H1 @ W2.T + B2, so the gradient flows
# back to the hidden activations through W2 (untransposed).
dL_dH1 = dL_dZ2 @ W2
print(f'dL/dH1:\n{dL_dH1}\n')

# dL/dZ1: same leaky-ReLU derivative, evaluated at the first layer's
# pre-activations.
dL_dZ1 = np.multiply(dL_dH1, np.where(Z1 > 0, 1, 0.5))
print(f'dL/dZ1:\n{dL_dZ1}\n')

dL/dH2:
[[ 0.28083117]
 [-0.23279965]
 [-0.16636804]
 [ 0.32000114]]

dL/dZ2:
[[ 0.28083117]
 [-0.23279965]
 [-0.16636804]
 [ 0.32000114]]

dL/dH1:
[[ 0.14788569  0.08306986]
 [-0.1225923  -0.06886214]
 [-0.08760941 -0.04921167]
 [ 0.1685126   0.09465634]]

dL/dZ1:
[[ 0.07394285  0.08306986]
 [-0.06129615 -0.06886214]
 [-0.0438047  -0.04921167]
 [ 0.0842563   0.09465634]]



In [5]:
# Gradients for the trainable parameters (the leaves of the graph).
# Weight gradients: transposed upstream gradient times the layer's input,
# which sums each sample's contribution.
# Bias gradients: upstream gradient summed over the sample axis.
dL_dW2 = dL_dZ2.T @ H1
dL_dB2 = np.sum(dL_dZ2, axis=0)
dL_dW1 = dL_dZ1.T @ X
dL_dB1 = np.sum(dL_dZ1, axis=0)

# Report layer 2 first, then layer 1
print(f'layer2.weight gradients:\n{dL_dW2}\n')
print(f'layer2.bias gradients:\n{dL_dB2}\n')
print(f'layer1.weight gradients:\n{dL_dW1}\n')
print(f'layer1.bias gradients:\n{dL_dB1}\n')

layer2.weight gradients:
[[-0.05212489  0.08906242]]

layer2.bias gradients:
[0.20166461]

layer1.weight gradients:
[[0.0404516  0.02296015]
 [0.04544467 0.0257942 ]]

layer1.bias gradients:
[0.05309829 0.05965239]



## Update Parameters

In [6]:
# One gradient-descent step: move every parameter against its gradient,
# scaled by the learning rate. Each name is rebound to a new array, so the
# previous arrays are not modified in place.
W1, B1 = W1 - lr * dL_dW1, B1 - lr * dL_dB1
W2, B2 = W2 - lr * dL_dW2, B2 - lr * dL_dB2

# Show the updated parameters, layer by layer
print(f'layer1.weight:\n{W1}\n')
print(f'layer1.bias:\n{B1}\n')
print(f'layer2.weight:\n{W2}\n')
print(f'layer2.bias:\n{B2}\n')

layer1.weight:
[[ 0.2681613  -0.59642011]
 [ 0.0572165   0.40405435]]

layer1.bias:
[-0.53372372  0.14426071]

layer2.weight:
[[0.56569366 0.22900318]]

layer2.bias:
[0.48455154]

