In [None]:
# ===============================
# Deep Neural Network from Scratch
# Dataset: Iris (Multiclass Classification)
# ===============================

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler


# =======================================
# 1. Load and Preprocess Data
# =======================================
# Load the Iris measurements; the integer class ids are reshaped into a
# single column because they are fed to OneHotEncoder below.
iris = datasets.load_iris()
X, y = iris.data, iris.target.reshape(-1, 1)

# Rescale every feature with RobustScaler before training.
# NOTE(review): the scaler is fitted on the full dataset *before* the
# train/test split, so test-set statistics influence the training inputs.
# Confirm this is acceptable for the reported accuracy.
scaler = RobustScaler()
X = scaler.fit_transform(X)

# Convert the class ids into dense one-hot rows (one column per class).
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y)

# Hold out 20% of the samples for evaluation, stratified on the labels,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# =======================================
# 2. Helper Functions
# =======================================
def sigmoid(x, is_derivative=False):
    """Evaluate the logistic sigmoid, or its derivative, element-wise.

    The computation only ever exponentiates ``-|x|``, so it cannot overflow.
    The naive ``exp(-x) / (1 + exp(-x)) ** 2`` derivative evaluates to
    ``inf / inf = nan`` for large negative inputs, which then poisons every
    gradient that flows through the layer.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return d(sigmoid)/dx evaluated at ``x``
            instead of the sigmoid value itself.

    Returns:
        Sigmoid values (or derivative values) with the same shape as ``x``.
    """
    x = np.asarray(x)
    # decay = exp(-|x|) always lies in [0, 1], so no overflow is possible.
    decay = np.exp(-np.abs(x))

    if is_derivative:
        # sigmoid'(x) is an even function, so it can be written purely in
        # terms of |x|: exp(-|x|) / (1 + exp(-|x|))^2.
        return decay / (1 + decay) ** 2

    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the algebraically equal
    # e^x / (1 + e^x). Both are expressed through ``decay``.
    values = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # ``[()]`` turns a 0-d result back into a scalar and leaves real arrays
    # untouched, matching what scalar callers received before.
    return values[()]

def relu(x, is_derivative=False):
    """Evaluate the rectified linear unit, or its derivative, element-wise.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return the ReLU slope at ``x`` (0 for
            negative inputs, 1 otherwise) instead of ``max(0, x)``.

    Returns:
        An array with the same shape as ``x``.
    """
    if is_derivative:
        # A single mask is required here. Zeroing the negatives first and
        # then mapping every entry ``>= 0`` to 1 turns the freshly written
        # zeros into ones as well, so the slope would be 1 everywhere.
        # The slope at exactly 0 is taken to be 1 (x >= 0 convention).
        return np.where(np.asarray(x) < 0, 0.0, 1.0)
    return np.maximum(0, x)

def softmax(x):
    """Apply softmax along axis 0, so every column sums to one.

    The per-column maximum is subtracted before exponentiating; this leaves
    the result unchanged mathematically but keeps ``np.exp`` from
    overflowing on large scores.

    Args:
        x: Array of scores; axis 0 indexes the classes being normalised.

    Returns:
        Array of the same shape as ``x`` holding the normalised weights.
    """
    column_peaks = np.max(x, axis=0, keepdims=True)
    exponentials = np.exp(x - column_peaks)
    normalisers = np.sum(exponentials, axis=0, keepdims=True)
    return exponentials / normalisers

def mse_loss(Y, Y_pred):
    """Return the halved mean of the summed squared errors.

    Args:
        Y: Targets with samples along axis 0.
        Y_pred: Predictions laid out as the transpose of ``Y``; they are
            transposed back before being compared.

    Returns:
        ``sum((Y - Y_pred.T) ** 2) / (2 * m)`` where ``m = Y.shape[0]``.
    """
    n_samples = Y.shape[0]
    residuals = Y - Y_pred.T
    return np.sum(np.square(residuals)) / (2 * n_samples)

def cross_entropy_loss(Y, Y_pred, eps=1e-9):
    """Return the categorical cross-entropy averaged over the samples.

    Args:
        Y: Target matrix with samples along axis 0.
        Y_pred: Predicted probabilities laid out as the transpose of ``Y``.
        eps: Small constant added to the probabilities before the logarithm
            so that a predicted probability of zero does not yield ``-inf``.

    Returns:
        ``-sum(Y * log(Y_pred.T + eps)) / m`` where ``m = Y.shape[0]``.
    """
    n_samples = Y.shape[0]
    log_probabilities = np.log(Y_pred.T + eps)
    return -np.sum(Y * log_probabilities) / n_samples


# =======================================
# 3. Parameter Initialization
# =======================================
def initialize_parameters(layer_dims):
    """Create the weight matrices and bias vectors for every layer.

    The global NumPy RNG is re-seeded with 42 on every call, so repeated
    calls with the same ``layer_dims`` return identical weights.

    Args:
        layer_dims: Layer widths, input layer first and output layer last.

    Returns:
        Dict with ``W{i}`` of shape ``(layer_dims[i], layer_dims[i-1])``,
        drawn from a standard normal and scaled by ``sqrt(1 / fan_in)``,
        and ``b{i}`` of shape ``(layer_dims[i], 1)`` filled with zeros,
        for ``i = 1 .. len(layer_dims) - 1``.
    """
    np.random.seed(42)
    params = {}
    # Walk consecutive (previous width, current width) pairs.
    width_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(width_pairs, start=1):
        params[f"W{layer}"] = np.random.randn(fan_out, fan_in) * np.sqrt(1. / fan_in)
        params[f"b{layer}"] = np.zeros((fan_out, 1))
    return params


# =======================================
# 4. Forward & Backward Propagation
# =======================================
def forward_propagation(X, params, activations):
    """Run the network forward and record every intermediate result.

    Args:
        X: Input matrix with samples along axis 0; it is transposed so the
            network works with samples along axis 1.
        params: Dict holding ``W{i}`` and ``b{i}`` for each layer ``i``.
        activations: One activation name per layer, each of ``"sigmoid"``,
            ``"relu"`` or ``"softmax"``.

    Returns:
        Tuple ``(output, cache)``: the last layer's activation and a dict
        with ``A0`` (the transposed input) plus ``Z{i}`` / ``A{i}`` for
        every layer.

    Raises:
        ValueError: If an activation name is not one of the supported ones.
    """
    cache = {"A0": X.T}
    n_layers = len(activations)

    for layer, kind in enumerate(activations, start=1):
        weights = params[f"W{layer}"]
        bias = params[f"b{layer}"]
        pre_activation = np.dot(weights, cache[f"A{layer - 1}"]) + bias

        if kind == "sigmoid":
            post_activation = sigmoid(pre_activation)
        elif kind == "relu":
            post_activation = relu(pre_activation)
        elif kind == "softmax":
            post_activation = softmax(pre_activation)
        else:
            raise ValueError(f"Unsupported activation: {kind}")

        # Keep both values: the backward pass reads Z for hidden-layer
        # derivatives and A as the input to the following layer.
        cache[f"Z{layer}"] = pre_activation
        cache[f"A{layer}"] = post_activation

    return cache[f"A{n_layers}"], cache

def backward_propagation(Y, params, cache, activations):
    """Compute weight and bias gradients for every layer.

    The output-layer error is taken as ``A_L - Y.T``; no derivative of the
    output activation is applied, so the caller is expected to pair the
    output layer with a loss for which that expression is the gradient.

    Args:
        Y: Target matrix with samples along axis 0.
        params: Dict holding ``W{i}`` for each layer ``i``.
        cache: Dict of ``A{i}`` / ``Z{i}`` values from the forward pass.
        activations: One activation name per layer; hidden layers must be
            ``"sigmoid"`` or ``"relu"``.

    Returns:
        Dict with ``dW{i}`` and ``db{i}`` for every layer.

    Raises:
        ValueError: If a hidden layer uses an unsupported activation.
    """
    grads = {}
    n_samples = Y.shape[0]
    n_layers = len(activations)

    # Error signal at the output layer.
    delta = cache[f"A{n_layers}"] - Y.T

    for layer in range(n_layers, 0, -1):
        layer_input = cache[f"A{layer - 1}"]
        weights = params[f"W{layer}"]
        grads[f"dW{layer}"] = (1 / n_samples) * np.dot(delta, layer_input.T)
        grads[f"db{layer}"] = (1 / n_samples) * np.sum(delta, axis=1, keepdims=True)

        if layer == 1:
            # Nothing precedes the first layer, so there is no error left
            # to push further back.
            continue

        pre_activation = cache[f"Z{layer - 1}"]
        hidden_kind = activations[layer - 2]
        if hidden_kind == "sigmoid":
            slope = sigmoid
        elif hidden_kind == "relu":
            slope = relu
        else:
            raise ValueError(f"Unsupported activation: {hidden_kind}")

        # Propagate the error through this layer's weights, then through the
        # previous layer's activation function.
        delta = np.dot(weights.T, delta) * slope(pre_activation, is_derivative=True)

    return grads

def update_parameters(params, grads, lr):
    """Apply one gradient-descent step to every weight and bias in place.

    Args:
        params: Dict holding ``W{i}`` and ``b{i}``; its arrays are modified
            in place.
        grads: Dict holding the matching ``dW{i}`` and ``db{i}`` gradients.
        lr: Learning rate that scales each gradient before subtraction.

    Returns:
        The same ``params`` dict that was passed in.
    """
    for name in params:
        if not name.startswith("W"):
            continue
        # Every weight key identifies a layer; update its weight and then
        # its bias together.
        layer = name[1:]
        for prefix in ("W", "b"):
            params[prefix + layer] -= lr * grads["d" + prefix + layer]
    return params


# =======================================
# 4. Training Loop
# =======================================
def train_neural_network(X, Y, hidden_layers=None, lr=0.05, epochs=5000, print_every=500, activations=None, loss_function="cross_entropy"):
    """Train a fully connected network with full-batch gradient descent.

    Args:
        X: Training inputs with samples along axis 0.
        Y: Training targets with samples along axis 0; ``Y.shape[1]`` sets
            the width of the output layer.
        hidden_layers: Sequence of hidden-layer widths. ``None`` (the
            default) means a single hidden layer of 10 units.
        lr: Learning rate handed to ``update_parameters``.
        epochs: Highest epoch index; the loop runs for indices
            ``0 .. epochs`` inclusive, i.e. ``epochs + 1`` update steps.
        print_every: Print the loss whenever the epoch index is a multiple
            of this value. ``0`` or ``None`` disables printing.
        activations: One activation name per layer (hidden layers followed
            by the output layer). Defaults to sigmoid hidden layers and a
            softmax output.
        loss_function: ``"cross_entropy"`` or ``"mse"``. This selects only
            the loss value that is reported; the gradients always come from
            ``backward_propagation``, which receives no loss information.

    Returns:
        Dict of trained ``W{i}`` / ``b{i}`` parameters.

    Raises:
        ValueError: If ``loss_function`` is not supported, or if
            ``activations`` does not contain exactly one entry per layer.
    """
    # A None default avoids a mutable default argument, and copying into a
    # list lets callers pass tuples or other sequences.
    hidden_layers = [10] if hidden_layers is None else list(hidden_layers)

    input_dim, output_dim = X.shape[1], Y.shape[1]
    layer_dims = [input_dim] + hidden_layers + [output_dim]
    n_layers = len(layer_dims) - 1

    if activations is None:
        activations = ["sigmoid"] * len(hidden_layers) + ["softmax"]
    elif len(activations) != n_layers:
        # The forward pass only visits len(activations) layers, so a
        # mismatch would train a truncated network or fail later with an
        # unrelated KeyError or shape error.
        raise ValueError(
            f"Expected {n_layers} activations (one per layer), got {len(activations)}"
        )

    # Reject a bad loss name up front instead of after the first forward pass.
    if loss_function not in ("mse", "cross_entropy"):
        raise ValueError("Unsupported loss function")
    compute_loss = mse_loss if loss_function == "mse" else cross_entropy_loss

    params = initialize_parameters(layer_dims)

    for epoch in range(epochs + 1):
        A_out, cache = forward_propagation(X, params, activations)
        loss = compute_loss(Y, A_out)

        grads = backward_propagation(Y, params, cache, activations)
        params = update_parameters(params, grads, lr)

        # A falsy print_every turns logging off rather than dividing by zero.
        if print_every and epoch % print_every == 0:
            print(f"Epoch {epoch:5d} | Loss: {loss:.4f}")

    return params


# =======================================
# 5. Prediction & Evaluation
# =======================================
def predict(X, params, activations):
    """Return the index of the highest-scoring output unit for each sample.

    Args:
        X: Input matrix with samples along axis 0.
        params: Trained ``W{i}`` / ``b{i}`` parameters.
        activations: Activation names, one per layer, as used in training.

    Returns:
        1-D array of predicted class indices (argmax over axis 0 of the
        network output, whose columns correspond to samples).
    """
    output_scores = forward_propagation(X, params, activations)[0]
    return np.argmax(output_scores, axis=0)

def accuracy_score(y_true, y_pred):
    """Return the percentage of positions where the two label arrays agree.

    Args:
        y_true: Array of reference labels.
        y_pred: Array of predicted labels, compared element-wise.

    Returns:
        Accuracy as a percentage in the range 0-100.
    """
    matches = y_true == y_pred
    return np.mean(matches) * 100


# =======================================
# 6. Run Training and Evaluate
# =======================================
# One activation list shared by training and inference, so the two calls
# always describe the same architecture: three sigmoid hidden layers and a
# softmax output.
layer_activations = ["sigmoid", "sigmoid", "sigmoid", "softmax"]

params = train_neural_network(
    X_train,
    y_train,
    hidden_layers=[10, 10, 10],
    lr=0.05,
    epochs=5000,
    print_every=500,
    activations=layer_activations,
)

# The held-out labels are one-hot rows; collapse them to class indices so
# they can be compared with the predicted indices.
y_true = np.argmax(y_test, axis=1)
y_pred = predict(X_test, params, layer_activations)
acc = accuracy_score(y_true, y_pred)

print(f"\nTest Set Accuracy: {acc:.2f}%")

Epoch     0 | Loss: 1.3451
Epoch   500 | Loss: 1.0717
Epoch  1000 | Loss: 0.9548
Epoch  1500 | Loss: 0.5260
Epoch  2000 | Loss: 0.3474
Epoch  2500 | Loss: 0.2406
Epoch  3000 | Loss: 0.1657
Epoch  3500 | Loss: 0.1208
Epoch  4000 | Loss: 0.0961
Epoch  4500 | Loss: 0.0820
Epoch  5000 | Loss: 0.0730

Test Set Accuracy: 96.67%
