***Back Propagation***

In [None]:
import numpy as np

In [None]:
# The two input features used by the sigmoid worked example
x1, x2 = 0.03, 0.23


***Sigmoid activation***

In [12]:
# Input -> hidden weights (starting values)
w1 = 0.02  # x1 -> H1
w2 = 0.20  # x2 -> H1
w3 = 0.25  # x1 -> H2
w4 = 0.70  # x2 -> H2

# Hidden -> output weights (starting values)
w5 = 0.30  # H1 -> y1
w6 = 0.45  # H2 -> y1
w7 = 0.56  # H1 -> y2
w8 = 0.55  # H2 -> y2

# One bias value is shared by both neurons of a layer in this example
b1 = 0.10  # hidden layer (H1 and H2)
b2 = 0.09  # output layer (y1 and y2)

# Desired outputs for y1 and y2
T1, T2 = 0.06, 0.88

# Gradient-descent step size
lr = 0.007

# -------------------------------
# 2) Activation functions
# -------------------------------

def sigmoid(z):
    """Numerically stable logistic sigmoid, 1 / (1 + exp(-z)).

    The naive formula evaluates exp(-z), which overflows (and emits a
    RuntimeWarning) for large negative z. This version only ever
    exponentiates -|z|, a value in (0, 1], and picks the algebraically
    equivalent branch for each sign of z.

    Parameters
    ----------
    z : scalar or array-like
        Net input(s) to the neuron.

    Returns
    -------
    NumPy scalar for scalar input, ndarray of the same shape otherwise,
    with every value in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) cannot overflow; at worst it underflows harmlessly to 0.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z))       (same expression as the naive form)
    # z <  0: exp(z) / (1 + exp(z))   (equal value, no large exponent)
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d array back into a NumPy scalar and
    # leaves real arrays untouched.
    return result[()]

def sigmoid_derivative_from_activation(a):
    """Slope of the sigmoid expressed through its own output.

    For a = sigmoid(z), d(sigmoid)/dz equals a * (1 - a), so the
    derivative can be computed from the activation alone.
    """
    one_minus_a = 1.0 - a
    return a * one_minus_a

In [13]:
def forward_pass(x1, x2, w1, w2, w3, w4, w5, w6, w7, w8, b1, b2):
    """
    Run one forward pass through the 2-input, 2-hidden, 2-output network.

    Both hidden neurons share bias b1 and both output neurons share bias b2.
    Every layer uses the sigmoid activation.

    Returns a dict holding the net input ("*_net") and the activation of
    each neuron, so the intermediate values can be printed.
    """
    def weighted_sum(a, wa, b, wb, bias):
        # Net input of one neuron with two incoming connections.
        return a * wa + b * wb + bias

    trace = {}

    # Hidden layer: net inputs first, then their sigmoid activations
    trace["H1_net"] = weighted_sum(x1, w1, x2, w2, b1)
    trace["H2_net"] = weighted_sum(x1, w3, x2, w4, b1)
    trace["H1"] = sigmoid(trace["H1_net"])
    trace["H2"] = sigmoid(trace["H2_net"])

    # Output layer: fed by the hidden activations
    trace["y1_net"] = weighted_sum(trace["H1"], w5, trace["H2"], w6, b2)
    trace["y2_net"] = weighted_sum(trace["H1"], w7, trace["H2"], w8, b2)
    trace["y1"] = sigmoid(trace["y1_net"])
    trace["y2"] = sigmoid(trace["y2_net"])

    return trace

# -------------------------------
# 4) Forward pass with the initial parameters (printed step by step)
# -------------------------------

print("\n=== FORWARD PASS (before weight update) ===")
out = forward_pass(
    x1, x2,
    w1, w2, w3, w4,
    w5, w6, w7, w8,
    b1, b2,
)

print(f"H1_net = {out['H1_net']:.7f} => H1 = sigmoid(H1_net) = {out['H1']:.9f}")
print(f"H2_net = {out['H2_net']:.7f} => H2 = sigmoid(H2_net) = {out['H2']:.9f}")
print(f"y1_net = {out['y1_net']:.9f} => y1 = sigmoid(y1_net) = {out['y1']:.9f}")
print(f"y2_net = {out['y2_net']:.9f} => y2 = sigmoid(y2_net) = {out['y2']:.9f}")

# Half squared error of each output neuron; the total is their sum
E1, E2 = (
    0.5 * (target - out[name])**2
    for target, name in ((T1, "y1"), (T2, "y2"))
)
E_total = E1 + E2
print(f"\nE1 = 0.5*(T1 - y1)^2 = {E1:.9f}")
print(f"E2 = 0.5*(T2 - y2)^2 = {E2:.9f}")
print(f"Total error E_total = E1 + E2 = {E_total:.9f}")

# -------------------------------
# 5) BACKPROPAGATION — output layer
#    deltas of y1, y2 plus gradients and updates for w5..w8
# -------------------------------

print("\n=== BACKPROP: output layer ===")

# Chain rule for output neuron i: delta_i = dE/dy_i * dy_i/dnet_i.
# With E = 1/2*(T - y)^2 the first factor is -(T_i - y_i); the second is
# the sigmoid slope computed from the activation y_i.
dE_dy1, dE_dy2 = -(T1 - out['y1']), -(T2 - out['y2'])
dy1_dnet, dy2_dnet = (
    sigmoid_derivative_from_activation(out[name]) for name in ("y1", "y2")
)
delta1, delta2 = dE_dy1 * dy1_dnet, dE_dy2 * dy2_dnet

print(f"dE/dy1 = {dE_dy1:.9f}, dy1/dnet = {dy1_dnet:.9f}, => delta1 = {delta1:.9f}")
print(f"dE/dy2 = {dE_dy2:.9f}, dy2/dnet = {dy2_dnet:.9f}, => delta2 = {delta2:.9f}")

# A hidden->output weight's gradient is the delta of the neuron it feeds
# times the hidden activation it carries:
#   w5: H1 -> y1, w6: H2 -> y1, w7: H1 -> y2, w8: H2 -> y2
dw5, dw6 = delta1 * out['H1'], delta1 * out['H2']
dw7, dw8 = delta2 * out['H1'], delta2 * out['H2']

print("\nGradients for hidden->output weights:")
print(f"dw5 (for w5) = delta1 * H1 = {dw5:.9f}")
print(f"dw6 (for w6) = delta1 * H2 = {dw6:.9f}")
print(f"dw7 (for w7) = delta2 * H1 = {dw7:.9f}")
print(f"dw8 (for w8) = delta2 * H2 = {dw8:.9f}")

# Plain gradient-descent step: w <- w - lr * dw.
# Results go to *_new names so the original w5..w8 stay available.
w5_new, w6_new, w7_new, w8_new = (
    weight - lr * grad
    for weight, grad in ((w5, dw5), (w6, dw6), (w7, dw7), (w8, dw8))
)

print("\nUpdated hidden->output weights (one step):")
print(f"w5 -> {w5_new:.9f}")
print(f"w6 -> {w6_new:.9f}")
print(f"w7 -> {w7_new:.9f}")
print(f"w8 -> {w8_new:.9f}")

# -------------------------------
# 6) BACKPROPAGATION — hidden layer
#    deltas of H1, H2 plus gradients and updates for w1..w4 and the biases
# -------------------------------

print("\n=== BACKPROP: hidden layer ===")

# Each hidden neuron collects the error of both outputs through the weights
# that leave it, then scales by its own sigmoid slope:
#   delta_H1 = (delta1*w5 + delta2*w7) * sigmoid'(H1_net)
#   delta_H2 = (delta1*w6 + delta2*w8) * sigmoid'(H2_net)
# The original w5..w8 (the forward-pass values) are used here, not w*_new.
delta_H1, delta_H2 = (
    (delta1 * to_y1 + delta2 * to_y2) * sigmoid_derivative_from_activation(out[name])
    for to_y1, to_y2, name in ((w5, w7, "H1"), (w6, w8, "H2"))
)

print(f"delta_H1 = (delta1*w5 + delta2*w7) * sigmoid'(H1) = {delta_H1:.12f}")
print(f"delta_H2 = (delta1*w6 + delta2*w8) * sigmoid'(H2) = {delta_H2:.12f}")

# An input->hidden weight's gradient is the delta of the hidden neuron it
# feeds times the input it carries:
#   w1: x1 -> H1, w2: x2 -> H1, w3: x1 -> H2, w4: x2 -> H2
dw1, dw2 = delta_H1 * x1, delta_H1 * x2
dw3, dw4 = delta_H2 * x1, delta_H2 * x2

print("\nGradients for input->hidden weights:")
print(f"dw1 (for w1) = delta_H1 * x1 = {dw1:.12f}")
print(f"dw2 (for w2) = delta_H1 * x2 = {dw2:.12f}")
print(f"dw3 (for w3) = delta_H2 * x1 = {dw3:.12f}")
print(f"dw4 (for w4) = delta_H2 * x2 = {dw4:.12f}")

# Gradient-descent step for the input->hidden weights
w1_new, w2_new, w3_new, w4_new = (
    weight - lr * grad
    for weight, grad in ((w1, dw1), (w2, dw2), (w3, dw3), (w4, dw4))
)

print("\nUpdated input->hidden weights (one step):")
print(f"w1 -> {w1_new:.9f}")
print(f"w2 -> {w2_new:.9f}")
print(f"w3 -> {w3_new:.9f}")
print(f"w4 -> {w4_new:.9f}")

# Bias updates. Each bias is shared by the two neurons of its layer, so its
# gradient is the sum of that layer's two deltas.
b1_new = b1 - lr * (delta_H1 + delta_H2)  # hidden-layer bias
b2_new = b2 - lr * (delta1 + delta2)  # output-layer bias

print(f"\nUpdated biases:")
print(f"b1 -> {b1_new:.9f}")
print(f"b2 -> {b2_new:.9f}")

# -------------------------------
# 7) Forward pass with the updated parameters (compare the error)
# -------------------------------

print("\n=== FORWARD PASS (after weight update) ===")
out_after = forward_pass(
    x1, x2,
    w1_new, w2_new, w3_new, w4_new,
    w5_new, w6_new, w7_new, w8_new,
    b1_new, b2_new,
)

print(f"H1 (after) = {out_after['H1']:.9f}")
print(f"H2 (after) = {out_after['H2']:.9f}")
print(f"y1 (after) = {out_after['y1']:.9f}")
print(f"y2 (after) = {out_after['y2']:.9f}")

# Same half-squared-error measure as before the update
E1_after, E2_after = (
    0.5 * (target - out_after[name])**2
    for target, name in ((T1, "y1"), (T2, "y2"))
)
E_total_after = E1_after + E2_after

print(f"\nE_total (before) = {E_total:.9f}")
print(f"E_total (after)  = {E_total_after:.9f}")
print("\n(You should see the total error decreased after one backprop step.)")



=== FORWARD PASS (before weight update) ===
H1_net = 0.2700000 => H1 = sigmoid(H1_net) = 0.567092905
H2_net = 0.7850000 => H2 = sigmoid(H2_net) = 0.686756727
y1_net = 0.569168398 => y1 = sigmoid(y1_net) = 0.638571265
y2_net = 0.785288226 => y2 = sigmoid(y2_net) = 0.686818727

E1 = 0.5*(T1 - y1)^2 = 0.167372355
E2 = 0.5*(T2 - y2)^2 = 0.018659502
Total error E_total = E1 + E2 = 0.186031857

=== BACKPROP: output layer ===
dE/dy1 = 0.578571265, dy1/dnet = 0.230798004, => delta1 = 0.133533093
dE/dy2 = -0.193181273, dy2/dnet = 0.215098763, => delta2 = -0.041553053

Gradients for hidden->output weights:
dw5 (for w5) = delta1 * H1 = 0.075725670
dw6 (for w6) = delta1 * H2 = 0.091704750
dw7 (for w7) = delta2 * H1 = -0.023564441
dw8 (for w8) = delta2 * H2 = -0.028536839

Updated hidden->output weights (one step):
w5 -> 0.299469920
w6 -> 0.449358067
w7 -> 0.560164951
w8 -> 0.550199758

=== BACKPROP: hidden layer ===
delta_H1 = (delta1*w5 + delta2*w7) * sigmoid'(H1) = 0.004121974146
delta_H2 = (de

***Softmax Activation Function***

In [16]:

def train_step_softmax(x1, x2, T1, T2,
                       w1, w2, w3, w4,
                       w5, w6, w7, w8,
                       b1, b2, lr):
    """Run one gradient-descent step on the 2-2-2 network and print each stage.

    The hidden layer is back-propagated with the sigmoid derivative; the two
    outputs are back-propagated through the 2-class softmax Jacobian, using
    the half squared error E = 0.5 * sum((T - y)^2) as the loss.

    Parameters
    ----------
    x1, x2 : inputs.
    T1, T2 : target values for y1 and y2.
    w1..w4 : input -> hidden weights (w1, w2 feed H1; w3, w4 feed H2).
    w5..w8 : hidden -> output weights (w5, w6 feed y1; w7, w8 feed y2).
    b1, b2 : shared hidden-layer / output-layer biases.
    lr     : learning rate.

    Returns
    -------
    tuple
        (w1_new, ..., w8_new, b1_new, b2_new) after one update.

    Notes
    -----
    Relies on `forward_pass_softmax` and `sigmoid_derivative_from_activation`
    being defined before this function is called.
    NOTE(review): `forward_pass_softmax` is not defined in the visible part
    of this notebook; confirm that an earlier cell defines it, otherwise
    Restart & Run All fails here. It is expected to return a dict with at
    least the keys "H1", "H2", "y1", "y2".
    """

    # ---------------- FORWARD PASS -------------------
    out = forward_pass_softmax(
        x1, x2,
        w1, w2, w3, w4,
        w5, w6, w7, w8,
        b1, b2
    )

    print("\n=========== FORWARD PASS (Softmax) ===========")
    print(f"H1 = {out['H1']:.6f}, H2 = {out['H2']:.6f}")
    print(f"y1 = {out['y1']:.6f}, y2 = {out['y2']:.6f}")

    # Errors BEFORE update
    E_before = 0.5 * ((T1 - out["y1"])**2 + (T2 - out["y2"])**2)
    print(f"Total Error (BEFORE) = {E_before:.9f}")

    # ---------- Softmax Backprop (with MSE) ----------
    y1, y2 = out["y1"], out["y2"]
    Y = np.array([y1, y2])
    T = np.array([T1, T2])

    # dE/dy for E = 0.5 * sum((T - y)^2)
    dE_dy = -(T - Y)

    # Softmax Jacobian dy_i/dnet_j for two classes:
    # y_i * (1 - y_i) on the diagonal, -y_i * y_j off the diagonal.
    J = np.array([
        [y1*(1-y1), -y1*y2],
        [-y1*y2,   y2*(1-y2)]
    ])

    # delta_j = sum_i dE/dy_i * dy_i/dnet_j. J is symmetric, so J @ dE_dy
    # gives the same result as J.T @ dE_dy.
    delta = J @ dE_dy
    delta1, delta2 = delta

    print("\n=========== OUTPUT LAYER DELTAS ===========")
    print(f"delta1 = {delta1:.9f}")
    print(f"delta2 = {delta2:.9f}")

    # ----------- GRADIENTS hidden→output -----------
    H1 = out["H1"]
    H2 = out["H2"]

    dw5 = delta1 * H1
    dw6 = delta1 * H2
    dw7 = delta2 * H1
    dw8 = delta2 * H2

    w5_new = w5 - lr * dw5
    w6_new = w6 - lr * dw6
    w7_new = w7 - lr * dw7
    w8_new = w8 - lr * dw8

    # ----------- BACKPROP TO HIDDEN LAYER -----------
    # Uses the pre-update w5..w8, i.e. the weights of the forward pass.
    delta_H1 = (delta1*w5 + delta2*w7) * sigmoid_derivative_from_activation(H1)
    delta_H2 = (delta1*w6 + delta2*w8) * sigmoid_derivative_from_activation(H2)

    print("\n=========== HIDDEN LAYER DELTAS ===========")
    print(f"delta_H1 = {delta_H1:.9f}")
    print(f"delta_H2 = {delta_H2:.9f}")

    dw1 = delta_H1 * x1
    dw2 = delta_H1 * x2
    dw3 = delta_H2 * x1
    dw4 = delta_H2 * x2

    w1_new = w1 - lr * dw1
    w2_new = w2 - lr * dw2
    w3_new = w3 - lr * dw3
    w4_new = w4 - lr * dw4

    # Shared biases: the gradient is the sum of the layer's two deltas.
    # If y1 + y2 == 1 (as a softmax output should satisfy), the columns of J
    # sum to zero, so delta1 + delta2 is zero and b2 stays unchanged.
    b1_new = b1 - lr*(delta_H1 + delta_H2)
    b2_new = b2 - lr*(delta1 + delta2)

    print("\n=========== UPDATED WEIGHTS ===========")
    print(f"w1={w1_new:.6f}, w2={w2_new:.6f}, w3={w3_new:.6f}, w4={w4_new:.6f}")
    print(f"w5={w5_new:.6f}, w6={w6_new:.6f}, w7={w7_new:.6f}, w8={w8_new:.6f}")
    print(f"b1={b1_new:.6f}, b2={b2_new:.6f}")

    # ---------- RUN FORWARD PASS AGAIN TO SEE NEW ERROR ----------
    out_new = forward_pass_softmax(
        x1, x2,
        w1_new, w2_new, w3_new, w4_new,
        w5_new, w6_new, w7_new, w8_new,
        b1_new, b2_new
    )

    E_after = 0.5 * ((T1 - out_new["y1"])**2 + (T2 - out_new["y2"])**2)
    print(f"\n=========== TOTAL ERROR (AFTER UPDATE) ===========")
    print(f"Total Error (AFTER) = {E_after:.9f}")

    return (w1_new, w2_new, w3_new, w4_new,
            w5_new, w6_new, w7_new, w8_new,
            b1_new, b2_new)


# The function above must be defined before it is called: on a fresh kernel
# a call placed ahead of the `def` raises NameError.

# sample inputs & targets
# NOTE: these assignments rebind the module-level x1, x2, T1, T2, w1..w8,
# b1, b2 and lr used by the sigmoid example; re-run that example's setup
# cells before re-running its computation.
x1, x2 = 0.5, 0.8
T1, T2 = 1, 0        # class 1 is the correct class

# initial weights
w1=w2=w3=w4=w5=w6=w7=w8=0.5
b1=b2=0.1

lr = 0.1

# Bind the result so the cell does not echo the returned tuple as its output.
softmax_params_new = train_step_softmax(
    x1, x2, T1, T2,
    w1, w2, w3, w4,
    w5, w6, w7, w8,
    b1, b2, lr
)



H1 = 0.679179, H2 = 0.679179
y1 = 0.500000, y2 = 0.500000
Total Error (BEFORE) = 0.250000000

delta1 = -0.250000000
delta2 = 0.250000000

delta_H1 = 0.000000000
delta_H2 = 0.000000000

w1=0.500000, w2=0.500000, w3=0.500000, w4=0.500000
w5=0.516979, w6=0.516979, w7=0.483021, w8=0.483021
b1=0.100000, b2=0.100000

Total Error (AFTER) = 0.238602894
