In [None]:
# ===============================
# Deep Neural Network from Scratch adding hyperparameters
# Dataset: Iris (Multiclass Classification)
# ===============================

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler


# =======================================
# 1. Load and Preprocess Data
# =======================================
iris = datasets.load_iris()
X = iris.data
y = iris.target.reshape(-1, 1)

# One-hot encode labels (one column per class) so they match the network's
# softmax output layout.
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y)

# Train-test split is done BEFORE scaling so that no statistic of the held-out
# rows can influence preprocessing (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize features for better convergence: fit the scaler on the training
# rows only, then apply the same transform to the test rows.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep the full feature matrix in the same scaled space for any later use.
X = scaler.transform(X)


# =======================================
# 2. Activations
# =======================================
def sigmoid(x, is_derivative=False):
    """Logistic sigmoid, or its derivative, evaluated element-wise.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return d(sigmoid)/dx at ``x`` instead of
            sigmoid(x).

    Returns:
        NumPy array (0-d for scalar input) with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    # exp(-|x|) is always in (0, 1], so it can never overflow; the two
    # branches below are algebraically identical to 1 / (1 + exp(-x)).
    e = np.exp(-np.abs(x))
    s = np.where(x >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    if is_derivative:
        # s * (1 - s) avoids the inf / inf = NaN that the closed form
        # exp(-x) / (1 + exp(-x))**2 produces for large negative x.
        return s * (1.0 - s)
    return s

def tanh(x, is_derivative=False):
    """Hyperbolic tangent of ``x``, or its derivative 1 - tanh(x)^2.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return the derivative instead of the value.
    """
    if not is_derivative:
        return np.tanh(x)
    return 1 - np.tanh(x) ** 2

def relu(x, is_derivative=False):
    """Rectified linear unit max(0, x), or its derivative, element-wise.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return the derivative instead of the value.

    Returns:
        For the derivative: 0.0 where ``x < 0`` and 1.0 elsewhere (the
        sub-gradient at exactly 0 is taken as 1, matching ``leaky_relu`` in
        this file). Otherwise ``np.maximum(0, x)``.
    """
    if is_derivative:
        # A single np.where is required here: chaining two np.where calls on
        # the already-zeroed array turns every entry into 1.
        return np.where(x < 0, 0.0, 1.0)
    return np.maximum(0, x)

def leaky_relu(x, is_derivative=False, alpha=0.01):
    """Leaky ReLU max(alpha * x, x), or its derivative, element-wise.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return the slope (``alpha`` where ``x < 0``,
            1 elsewhere) instead of the activation value.
        alpha: Slope used on the negative side.
    """
    if not is_derivative:
        return np.maximum(alpha * x, x)
    slope = np.where(x < 0, alpha, 1)
    return slope

def softmax(x):
    """Softmax along axis 0, so each column of the result sums to 1.

    The per-column maximum is subtracted before exponentiating, which leaves
    the result unchanged but keeps the exponentials from overflowing.
    """
    column_max = np.max(x, axis=0, keepdims=True)
    numerators = np.exp(x - column_max)
    denominators = np.sum(numerators, axis=0, keepdims=True)
    return numerators / denominators


# =======================================
# 3. Loss Functions
# =======================================
def mse_loss(Y, Y_pred):
    """Halved mean squared error: sum of squared errors / (2 * Y.shape[0]).

    Args:
        Y: Target array; its first dimension is used as the sample count.
        Y_pred: Prediction array laid out as the transpose of ``Y``.
    """
    n_samples = Y.shape[0]
    residual = Y - Y_pred.T
    squared_error_total = np.sum(residual ** 2)
    return squared_error_total / (2 * n_samples)

def cross_entropy_loss(Y, Y_pred, eps=1e-9):
    """Cross-entropy between targets ``Y`` and predictions, averaged over Y.shape[0].

    Args:
        Y: Target array; its first dimension is used as the sample count.
        Y_pred: Prediction array laid out as the transpose of ``Y``.
        eps: Small constant added inside the log so log(0) is never taken.
    """
    n_samples = Y.shape[0]
    log_probs = np.log(Y_pred.T + eps)
    return -np.sum(Y * log_probs) / n_samples


# =======================================
# 4. Initialization
# =======================================
def initialize_parameters(layer_dims):
    """Build the weight/bias dictionary for a fully connected network.

    Reseeds NumPy's global random generator with 42 on every call, so the
    returned weights are the same for the same ``layer_dims``.

    Args:
        layer_dims: Layer sizes, input layer first.

    Returns:
        Dict with ``W{i}`` of shape (layer_dims[i], layer_dims[i-1]) drawn
        from a standard normal scaled by sqrt(1 / layer_dims[i-1]), and
        ``b{i}`` zero columns of shape (layer_dims[i], 1), for i = 1..len-1.
    """
    np.random.seed(42)
    params = {}
    layer_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
        scale = np.sqrt(1. / fan_in)
        params[f"W{idx}"] = np.random.randn(fan_out, fan_in) * scale
        params[f"b{idx}"] = np.zeros((fan_out, 1))
    return params


# =======================================
# 5. Forward
# =======================================
def forward_propagation(X, params, activations, dropout_rates=None, training=True):
    """Run a forward pass and cache every intermediate needed for backprop.

    Args:
        X: Input batch; it is transposed so that columns are samples.
        params: Dict holding ``W{i}`` / ``b{i}`` for each layer i = 1..L.
        activations: One activation name per layer; supported names are
            "sigmoid", "relu", "tanh", "leaky_relu" and "softmax".
        dropout_rates: Optional drop probabilities, entry i-1 for layer i.
            Only hidden layers (i < L) are dropped: the output layer is never
            masked, because ``backward_propagation`` seeds its gradient from
            the raw output and has no step that undoes an output mask.
        training: Dropout is applied only when this is True.

    Returns:
        Tuple ``(A_L, cache)`` where ``cache`` maps "A0", and for every layer
        "Z{i}", "A{i}" (post-dropout) and, where dropout ran, the mask "D{i}".

    Raises:
        ValueError: On an unsupported activation name, or on a dropout rate
            outside [0, 1) for a layer where dropout would be applied.
    """
    L = len(activations)

    # Validate configuration up front so bad input fails before any compute.
    supported = ("sigmoid", "relu", "tanh", "leaky_relu", "softmax")
    for name in activations:
        if name not in supported:
            raise ValueError(f"Unsupported activation: {name}")

    use_dropout = bool(training and dropout_rates)
    if use_dropout:
        for rate in dropout_rates[:max(L - 1, 0)]:
            # rate == 1 would divide by zero in the inverted-dropout rescale.
            if not 0 <= rate < 1:
                raise ValueError(f"Dropout rate must be in [0, 1), got {rate}")

    activation_fns = {
        "sigmoid": sigmoid,
        "relu": relu,
        "tanh": tanh,
        "leaky_relu": leaky_relu,
        "softmax": softmax,
    }

    cache = {"A0": X.T}

    for i in range(1, L + 1):
        W, b = params[f"W{i}"], params[f"b{i}"]
        A_prev = cache[f"A{i-1}"]
        Z = np.dot(W, A_prev) + b
        A = activation_fns[activations[i-1]](Z)

        # Inverted dropout on hidden layers only: zero a random subset of
        # units and rescale the survivors by 1 / (1 - rate).
        if use_dropout and i < L and i <= len(dropout_rates):
            rate = dropout_rates[i-1]
            D = (np.random.rand(*A.shape) > rate).astype(float)
            A = (A * D) / (1 - rate)
            cache[f"D{i}"] = D

        cache[f"Z{i}"], cache[f"A{i}"] = Z, A

    return cache[f"A{L}"], cache


# =======================================
# 6. Backpropagation
# =======================================
def backward_propagation(Y, params, cache, activations, dropout_rates=None, weight_decay=0.0):
    """Compute parameter gradients from a cached forward pass.

    The output-layer error is seeded as ``A_L - Y.T`` and then propagated
    backwards layer by layer.

    Args:
        Y: Targets; ``Y.shape[0]`` is used as the sample count and ``Y`` is
            transposed to match the cached activations.
        params: Dict holding ``W{i}`` for each layer.
        cache: Cache returned by the forward pass ("A{i}", "Z{i}", and "D{i}"
            for layers where a dropout mask was stored).
        activations: One activation name per layer.
        dropout_rates: Drop probabilities used in the forward pass, if any.
        weight_decay: Coefficient of the ``weight_decay * W`` term added to
            every weight gradient.

    Returns:
        Dict with ``dW{i}`` and ``db{i}`` for every layer.

    Raises:
        ValueError: If a hidden layer uses an activation with no derivative
            implemented here (previously this silently skipped the
            ``W.T @ dZ`` step and produced wrong or mis-shaped gradients).
    """
    grads = {}
    m = Y.shape[0]
    L = len(activations)
    Y = Y.T
    dZ = cache[f"A{L}"] - Y

    for i in reversed(range(1, L + 1)):
        A_prev = cache[f"A{i-1}"]
        W = params[f"W{i}"]

        grads[f"dW{i}"] = (1 / m) * np.dot(dZ, A_prev.T) + weight_decay * W
        grads[f"db{i}"] = (1 / m) * np.sum(dZ, axis=1, keepdims=True)

        if i > 1:
            Z_prev = cache[f"Z{i-1}"]
            prev_activation = activations[i - 2]
            if prev_activation == "sigmoid":
                local_grad = sigmoid(Z_prev, is_derivative=True)
            elif prev_activation == "relu":
                local_grad = relu(Z_prev, is_derivative=True)
            elif prev_activation == "tanh":
                local_grad = tanh(Z_prev, is_derivative=True)
            elif prev_activation == "leaky_relu":
                local_grad = leaky_relu(Z_prev, is_derivative=True)
            else:
                raise ValueError(f"Unsupported hidden activation: {prev_activation}")

            dZ = np.dot(W.T, dZ) * local_grad

            # Apply dropout backward only where the forward pass actually
            # stored a mask; rescaling without a mask would inflate gradients.
            mask = cache.get(f"D{i-1}")
            if mask is not None and dropout_rates and (i - 2) < len(dropout_rates):
                rate = dropout_rates[i-2]
                dZ = (dZ * mask) / (1 - rate)

    return grads


# =======================================
# 7. Optimizers
# =======================================
'''
    Stochastic Gradient Descent (SGD):
    θ^(t+1) <- θ^t - η∇L(y, ŷ)

    Momentum:
    v^(t+1) <- βv^t + (1-β)∇L(y, ŷ)^t
    θ^(t+1) <- θ^t - ηv^(t+1)
'''
def update_parameters(params, grads, lr, optimizer_type="sgd", velocity=None, beta=0.9):
    """Apply one optimizer step to ``params`` in place.

    Args:
        params: Dict of ``W{i}`` / ``b{i}`` arrays; updated in place.
        grads: Dict of matching ``dW{i}`` / ``db{i}`` gradients.
        lr: Learning rate.
        optimizer_type: "sgd" or "momentum".
        velocity: Momentum state from the previous call; ``None`` starts from
            zeros. Ignored (and returned unchanged) for "sgd".
        beta: Momentum coefficient; the velocity is the exponential moving
            average ``beta * v + (1 - beta) * grad``.

    Returns:
        Tuple ``(params, velocity)``.

    Raises:
        ValueError: If ``optimizer_type`` is not recognised (previously this
            was a silent no-op, leaving the parameters untouched).
    """
    # params holds one W and one b per layer.
    n_layers = len(params) // 2

    if optimizer_type == "sgd":
        for i in range(1, n_layers + 1):
            params[f"W{i}"] -= lr * grads[f"dW{i}"]
            params[f"b{i}"] -= lr * grads[f"db{i}"]
    elif optimizer_type == "momentum":
        if velocity is None:
            velocity = {}
            for i in range(1, n_layers + 1):
                velocity[f"dW{i}"] = np.zeros_like(grads[f"dW{i}"])
                velocity[f"db{i}"] = np.zeros_like(grads[f"db{i}"])
        for i in range(1, n_layers + 1):
            for key, name in ((f"dW{i}", f"W{i}"), (f"db{i}", f"b{i}")):
                velocity[key] = beta * velocity[key] + (1 - beta) * grads[key]
                params[name] -= lr * velocity[key]
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_type}")

    return params, velocity


# =======================================
# 8. Training Loop
# =======================================
def train_neural_network(
    X, Y,
    hidden_layers=None,
    lr=0.05,
    epochs=5000,
    batch_size=None,
    print_every=500,
    activations=None,
    loss_function="cross_entropy",
    optimizer_type="sgd",
    dropout_rates=None,
    weight_decay=0.0,
    early_stopping_patience=200
):
    """Train a fully connected network with (mini-batch) gradient descent.

    Args:
        X: Feature matrix, one row per sample.
        Y: Target matrix, one row per sample (``Y.shape[1]`` output units).
        hidden_layers: Sizes of the hidden layers; defaults to ``[10]``.
            Any sequence (list or tuple) is accepted.
        lr: Learning rate.
        epochs: Maximum number of passes over the data.
        batch_size: Mini-batch size; ``None`` uses the whole set per step.
        print_every: Print the epoch loss every this many epochs; 0 or
            ``None`` disables progress printing.
        activations: Activation name per layer; defaults to "relu" for every
            hidden layer followed by "softmax".
        loss_function: "cross_entropy" selects cross-entropy; any other value
            selects the MSE loss. This only affects the reported/monitored
            loss value.
        optimizer_type: Passed through to ``update_parameters``.
        dropout_rates: Passed through to the forward and backward passes.
        weight_decay: Passed through to ``backward_propagation``.
        early_stopping_patience: Training stops once the epoch loss has failed
            to improve for more than this many consecutive epochs.

    Returns:
        The parameter dict as of the last completed update.

    Raises:
        ValueError: If ``batch_size`` is given but not positive.
    """
    # Fail early with a clear message instead of an opaque range() error or a
    # division by zero on an empty batch list.
    if batch_size is not None and batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # None sentinel replaces the former mutable default; list() also lets
    # callers pass a tuple.
    hidden_layers = [10] if hidden_layers is None else list(hidden_layers)

    input_dim, output_dim = X.shape[1], Y.shape[1]
    layer_dims = [input_dim] + hidden_layers + [output_dim]
    if activations is None:
        activations = ["relu"] * len(hidden_layers) + ["softmax"]

    # The choice is loop-invariant, so resolve it once.
    loss_fn = cross_entropy_loss if loss_function == "cross_entropy" else mse_loss

    params = initialize_parameters(layer_dims)
    velocity = None
    best_loss = np.inf
    patience_counter = 0

    for epoch in range(1, epochs + 1):
        # Shuffle
        perm = np.random.permutation(X.shape[0])
        X_shuffled, Y_shuffled = X[perm], Y[perm]

        # Mini-batch iteration
        if batch_size is None:
            batches = [(X_shuffled, Y_shuffled)]
        else:
            batches = [
                (X_shuffled[i:i+batch_size], Y_shuffled[i:i+batch_size])
                for i in range(0, X.shape[0], batch_size)
            ]

        epoch_loss = 0
        for X_batch, Y_batch in batches:
            A_out, cache = forward_propagation(X_batch, params, activations, dropout_rates, training=True)
            loss = loss_fn(Y_batch, A_out)
            grads = backward_propagation(Y_batch, params, cache, activations, dropout_rates, weight_decay)
            params, velocity = update_parameters(params, grads, lr, optimizer_type, velocity)
            epoch_loss += loss

        # Unweighted mean of the per-batch losses.
        epoch_loss /= len(batches)

        # Early stopping check
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter > early_stopping_patience:
                print(f"Early stopping at epoch {epoch}")
                break

        # Guard against print_every == 0 (modulo by zero) / None.
        if print_every and epoch % print_every == 0:
            print(f"Epoch {epoch:5d} | Loss: {epoch_loss:.4f}")

    return params


# =======================================
# 9. Prediction & Evaluation
# =======================================
def predict(X, params, activations):
    """Return, per sample, the index of the largest output unit.

    Runs the forward pass with ``training=False`` and takes the argmax along
    axis 0 of the network output.
    """
    outputs = forward_propagation(X, params, activations, training=False)[0]
    predicted_classes = np.argmax(outputs, axis=0)
    return predicted_classes

def accuracy_score(y_true, y_pred):
    """Percentage (0-100) of positions where ``y_true`` equals ``y_pred``."""
    fraction_correct = np.mean(y_true == y_pred)
    return fraction_correct * 100


# =======================================
# 10. Run Training and Evaluate
# =======================================
# Train a 1-hidden-layer network (10 sigmoid units, softmax output) using
# mini-batch SGD with dropout on the hidden layer and L2 weight decay.
params = train_neural_network(
    X_train,
    y_train,
    hidden_layers=[10],
    activations=["sigmoid", "softmax"],
    optimizer_type="sgd",
    lr=0.05,
    batch_size=16,
    epochs=5000,
    dropout_rates=[0.1],
    weight_decay=1e-3,
    early_stopping_patience=500,
    print_every=200,
)

# Labels were one-hot encoded, so recover the class index of each test row.
y_true = np.argmax(y_test, axis=1)
y_pred = predict(X_test, params, ["sigmoid", "softmax"])
acc = accuracy_score(y_true, y_pred)

print(f"\nTest Set Accuracy: {acc:.2f}%")

Epoch   200 | Loss: 0.3335
Epoch   400 | Loss: 0.2115
Epoch   600 | Loss: 0.1330
Epoch   800 | Loss: 0.1448
Epoch  1000 | Loss: 0.1023
Epoch  1200 | Loss: 0.1071
Epoch  1400 | Loss: 0.1232
Epoch  1600 | Loss: 0.0836
Epoch  1800 | Loss: 0.1065
Epoch  2000 | Loss: 0.0812
Epoch  2200 | Loss: 0.1190
Early stopping at epoch 2382

Test Set Accuracy: 96.67%
