In [1]:
import numpy as np

# -----------------------
# Step 1: Dataset (XOR)
# -----------------------
# X: 4 samples, each with 2 binary features (inputs)
# y: corresponding targets (XOR truth table)

In [2]:
# Inputs: every combination of two binary features, one sample per row -> shape (4, 2)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
# Targets: XOR truth table in the same row order as X, stored as a (4, 1) column vector
y = np.array([0, 1, 1, 0]).reshape(-1, 1)

# -----------------------
# Step 2: Initialize weights & biases
# -----------------------
# Seed for reproducibility: same random numbers each run

In [3]:
# Fix the state of NumPy's global RNG so the np.random draws used below for W1 and W2 repeat on every run
np.random.seed(seed=42)

# W1: weights between input layer (2 units) and hidden layer (2 units)
# shape (input_dim, hidden_dim) = (2,2)

In [4]:
# Standard-normal initial weights; rows index the 2 inputs, columns the 2 hidden units
W1 = np.random.standard_normal(size=(2, 2))

# b1: bias for hidden layer, shape (1, hidden_dim)
# using shape (1,2) so broadcasting works when adding to matrix of shape (4,2)

In [5]:
# Hidden-layer bias starts at zero; kept 2-D as a (1, 2) row so it broadcasts across samples
b1 = np.zeros(shape=(1, 2), dtype=float)

# W2: weights between hidden layer (2 units) and output layer (1 unit)
# shape (hidden_dim, output_dim) = (2,1)

In [6]:
# Standard-normal initial weights; rows index the 2 hidden units, single column for the output unit
W2 = np.random.standard_normal(size=(2, 1))

# b2: bias for output layer, shape (1,1)

In [7]:
# Output-layer bias starts at zero; kept 2-D as (1, 1) so it broadcasts across samples
b2 = np.zeros(shape=(1, 1), dtype=float)

# -----------------------
# Step 3: Activation functions and their derivatives
# -----------------------
# Sigmoid activation: maps real numbers to (0,1), used for binary output

In [8]:
def sigmoid(x):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-x)), evaluated without overflow.

    The textbook form calls ``np.exp(-x)`` directly, which overflows (and emits
    a RuntimeWarning) once a float64 ``x`` drops below roughly -709. Here the
    same value is computed as ``exp(-log(1 + exp(-x)))``, with ``np.logaddexp``
    supplying the logarithm in a numerically stable way, so the argument handed
    to the outer ``np.exp`` is never positive.

    Parameters
    ----------
    x : scalar or array_like
        Pre-activation value(s). Plain Python lists are accepted as well.

    Returns
    -------
    numpy scalar or ndarray
        Sigmoid of ``x``, same shape as ``x``, with values in [0, 1]
        (the end points are reached only through floating-point saturation).
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x)), always >= 0
    return np.exp(-np.logaddexp(0.0, np.negative(x)))

# Derivative of sigmoid given sigmoid(x) output
# If s = sigmoid(z) then derivative d/dz sigmoid(z) = s * (1 - s)
# Note: here we pass sigmoid output to this function for efficiency.

In [9]:
def sigmoid_derivative(sigmoid_output):
    """Slope of the sigmoid expressed through its own output.

    ``sigmoid_output`` must already be ``s = sigmoid(z)`` (not ``z`` itself);
    the return value is ``s * (1 - s)``, i.e. d/dz sigmoid(z), element-wise.
    """
    complement = 1 - sigmoid_output
    return complement * sigmoid_output

# -----------------------
# Step 4: Training hyperparameters
# -----------------------

In [10]:
# Step size applied to every gradient in the update rule
lr = 1e-1
# Number of passes the training loop makes over the data
epochs = 10_000

# -----------------------
# Step 5: Training loop (Gradient Descent using Backpropagation)
# -----------------------

In [11]:
# Full-batch gradient descent: every iteration runs a forward pass over all of X,
# back-propagates the squared-error loss, and updates W1, b1, W2, b2 in place.
# NOTE: because the parameters are updated in place, re-running this cell continues
# training from the current weights instead of starting over; re-run the
# initialisation cells first for a fresh start.
# NOTE(review): the output recorded for this cell shows the loss levelling off near
# 0.126 with two predictions close to 0.5, i.e. that run did not learn XOR.
# Presumably a different seed, more hidden units, or a different learning rate is
# needed -- confirm by re-running.
for epoch in range(epochs):
    # ---------- Forward pass ----------
    # Hidden layer pre-activation: Z1 = X · W1 + b1
    # X shape: (4,2); W1 shape: (2,2) -> result (4,2); b1 (1,2) broadcasts over rows
    hidden_input = np.dot(X, W1) + b1
    # Hidden layer activation H = sigmoid(Z1), shape (4,2)
    hidden_output = sigmoid(hidden_input)

    # Output layer pre-activation: Z2 = H · W2 + b2
    # hidden_output shape: (4,2); W2 shape: (2,1) -> result (4,1)
    final_input = np.dot(hidden_output, W2) + b2
    # Apply sigmoid to get predictions y_pred in (0,1)
    y_pred = sigmoid(final_input)

    # ---------- Loss computation ----------
    # Mean Squared Error (MSE): L = mean((y - y_pred)^2)
    # For binary classification, binary cross-entropy is more common,
    # but MSE is fine for an educational demonstration.
    loss = np.mean((y - y_pred) ** 2)

    # ---------- Backpropagation (compute gradients) ----------
    # Chain rule, starting from the derivative of the loss w.r.t. the predictions.
    # The exact gradient of the *mean* loss is 2 * (y_pred - y) / N. The 1/N factor
    # is deliberately left out here, so every gradient below is N times the gradient
    # of `loss`; this only rescales the step (equivalent to a learning rate of N * lr).
    d_loss_y_pred = 2 * (y_pred - y)  # shape: (4,1)

    # dy_pred/dZ2, computed from the sigmoid output itself
    d_y_pred_final = sigmoid_derivative(y_pred)  # shape: (4,1)

    # Gradient at Z2: dL/dZ2 = dL/dy_pred * dy_pred/dZ2 (element-wise)
    d_final_input = d_loss_y_pred * d_y_pred_final  # shape: (4,1)

    # Gradients for W2 and b2:
    # dL/dW2 = H^T · dZ2; hidden_output.T shape (2,4), d_final_input (4,1) -> (2,1)
    d_hidden_to_output = np.dot(hidden_output.T, d_final_input)  # shape: (2,1)

    # dL/db2 = sum over samples of dZ2 (keepdims so the shape matches b2)
    d_b2 = np.sum(d_final_input, axis=0, keepdims=True)  # shape: (1,1)

    # Backpropagate to the hidden layer:
    # dL/dZ1 = (dZ2 · W2^T) * sigmoid'(Z1), the last product being element-wise.
    # This must use W2 as it was during the forward pass, so it is computed
    # before any parameter is updated.
    # d_final_input shape: (4,1); W2.T shape: (1,2) -> np.dot gives (4,2)
    d_hidden_input = np.dot(d_final_input, W2.T) * sigmoid_derivative(hidden_output)  # shape: (4,2)

    # Gradients for W1 and b1:
    # dL/dW1 = X^T · dZ1
    d_input_to_hidden = np.dot(X.T, d_hidden_input)  # shape: (2,2)

    # dL/db1 = sum over samples of dZ1
    d_b1 = np.sum(d_hidden_input, axis=0, keepdims=True)  # shape: (1,2)

    # ---------- Parameter updates (Gradient Descent) ----------
    # Subtract learning rate * gradient from each parameter (in place)
    W2 -= lr * d_hidden_to_output
    b2 -= lr * d_b2
    W1 -= lr * d_input_to_hidden
    b1 -= lr * d_b1

    # ---------- Optional: monitor training ----------
    # Print the loss occasionally so progress is visible
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")

# ---------- Final forward pass ----------
# Inside the loop y_pred is computed *before* the parameter update, so after the
# last iteration it is one step behind the weights (and it would not exist at all
# if epochs == 0). Re-run the forward pass so that the values reported below
# really are the predictions of the final parameters.
hidden_input = np.dot(X, W1) + b1
hidden_output = sigmoid(hidden_input)
final_input = np.dot(hidden_output, W2) + b2
y_pred = sigmoid(final_input)
loss = np.mean((y - y_pred) ** 2)

# y_pred now holds the trained network's predictions on the training set
print("\nFinal Predictions (rounded):")
print(y_pred.round(3))

# Threshold the probabilities at 0.5 to obtain 0/1 class decisions
pred_binary = (y_pred >= 0.5).astype(int)
print("Predicted classes:\n", pred_binary)
print("True classes:\n", y)

Epoch 0, Loss: 0.2558
Epoch 1000, Loss: 0.2455
Epoch 2000, Loss: 0.1532
Epoch 3000, Loss: 0.1336
Epoch 4000, Loss: 0.1298
Epoch 5000, Loss: 0.1282
Epoch 6000, Loss: 0.1274
Epoch 7000, Loss: 0.1269
Epoch 8000, Loss: 0.1266
Epoch 9000, Loss: 0.1264

Final Predictions (rounded):
[[0.03 ]
 [0.498]
 [0.971]
 [0.501]]
Predicted classes:
 [[0]
 [0]
 [1]
 [1]]
True classes:
 [[0]
 [1]
 [1]
 [0]]
