In [1]:
import numpy as np

# Fixed inputs and parameters for a small 3-3-3 fully connected network.
# The forward pass multiplies a row vector on the left of each weight matrix,
# so row i of a matrix holds the weights leaving unit i and column j holds the
# weights entering unit j of the next layer.

# Input vector (X): three input values
X = np.array((0.8, 0.6, 0.7))

# Input -> hidden weights (W1)
W1 = np.array((
    (0.2, 0.4, 0.1),
    (0.5, 0.3, 0.2),
    (0.3, 0.7, 0.8),
))

# Hidden-layer biases (B1), one per hidden unit
B1 = np.array((0.1, 0.2, 0.3))

# Hidden -> output weights (W2)
W2 = np.array((
    (0.6, 0.4, 0.5),
    (0.1, 0.2, 0.3),
    (0.3, 0.7, 0.2),
))

# Output-layer biases (B2), one per output unit.
# Same starting values as B1 but deliberately a separate array: both bias
# vectors are updated in place later, so they must not share storage.
B2 = np.array((0.1, 0.2, 0.3))

# Target vector (Y): one-hot, with the 1 at index 1
Y = np.array((0, 1, 0))

# Activation Functions
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``.

    Args:
        x: Scalar or NumPy array of pre-activation values.

    Returns:
        Values of ``x`` with every negative entry replaced by 0; the shape
        follows NumPy broadcasting of ``x`` against the scalar 0.
    """
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified

def relu_derivative(x):
    """Derivative of ReLU evaluated at the pre-activation values ``x``.

    Args:
        x: Scalar or NumPy array of pre-activation values.

    Returns:
        Integer array that is 1 where ``x > 0`` and 0 elsewhere. The slope at
        exactly ``x == 0`` is therefore taken to be 0.
    """
    is_active = x > 0  # units that let the gradient through
    return np.where(is_active, 1, 0)

def softmax(x, axis=0):
    """Numerically stable softmax along ``axis``.

    The maximum is subtracted *per slice along ``axis``* before
    exponentiating. Subtracting a single global maximum (as a naive
    implementation does) makes every entry of a slice that lies far below
    that maximum underflow to 0, and the normalisation then produces
    ``0 / 0 = nan`` for multi-dimensional input.

    Args:
        x: Array-like of scores (logits).
        axis: Axis along which the outputs sum to 1. Defaults to 0, which for
            a 1-D vector is the only axis. For a batch stored as rows, pass
            ``axis=-1``.

    Returns:
        Array of the same shape as ``x`` whose entries are non-negative and
        sum to 1 along ``axis``.
    """
    x = np.asarray(x)
    # keepdims=True keeps the reduced axis as length 1 so the result
    # broadcasts back against x for any choice of axis.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

# Step size for the single gradient-descent update performed below
learning_rate = 0.01

# ---- Forward pass ----

# Hidden layer: affine transform of the input followed by ReLU
H_pre_activation = X.dot(W1) + B1
H = relu(H_pre_activation)

# Output layer: affine transform of the hidden activations followed by softmax
O_pre_activation = H.dot(W2) + B2
O = softmax(O_pre_activation)

# ---- Backward pass ----
# All gradients are computed from the current (pre-update) parameters before
# any parameter is modified.

# Step 1: error signal at the output layer.
# O - Y is the gradient of a cross-entropy loss with respect to the
# pre-softmax scores (O_pre_activation). No loss is computed in this cell, so
# cross-entropy is presumably the intended loss -- confirm.
dLoss_dO = O - Y

# Step 2: loss gradients for the hidden -> output parameters.
# Entry [i, j] is H[i] * dLoss_dO[j], matching the layout of W2.
dO_dW2 = H[:, np.newaxis] * dLoss_dO[np.newaxis, :]
dO_dB2 = dLoss_dO  # the bias gradient is the error signal itself

# Step 3: push the error signal back through W2 and then through the ReLU.
dO_dH = W2.dot(dLoss_dO)  # loss gradient w.r.t. the hidden activations H
dH_pre_activation = relu_derivative(H_pre_activation) * dO_dH  # zeroed where ReLU was inactive

# Step 4: loss gradients for the input -> hidden parameters.
# Entry [i, j] is X[i] * dH_pre_activation[j], matching the layout of W1.
dH_dW1 = X[:, np.newaxis] * dH_pre_activation[np.newaxis, :]
dH_dB1 = dH_pre_activation

# ---- Step 5: one gradient-descent step, applied in place ----
W1 -= learning_rate * dH_dW1
B1 -= learning_rate * dH_dB1

W2 -= learning_rate * dO_dW2
B2 -= learning_rate * dO_dB2

# Report the parameters after the update
print("Updated W1:\n", W1)
print("Updated B1:\n", B1)
print("Updated W2:\n", W2)
print("Updated B2:\n", B2)

# Report the forward-pass output and the output-layer error signal
# (both were computed before the update)
print("Output after softmax:", O)
print("Gradient of loss with respect to output layer activations:", dLoss_dO)


Updated W1:
 [[0.19932867 0.39994074 0.1021325 ]
 [0.49949651 0.29995555 0.20159938]
 [0.29941259 0.69994815 0.80186594]]
Updated B1:
 [0.09916084 0.19992592 0.30266563]
Updated W2:
 [[0.59803629 0.40449781 0.49746591]
 [0.09696517 0.20695115 0.29608367]
 [0.29729671 0.70619178 0.19651151]]
Updated B2:
 [0.09744973 0.20584131 0.29670897]
Output after softmax: [0.25502746 0.41586939 0.32910315]
Gradient of loss with respect to output layer activations: [ 0.25502746 -0.58413061  0.32910315]
