In [None]:
# ===============================
# Shallow Neural Network from Scratch with K-Fold Cross-Validation
# Dataset: Iris (Multiclass Classification)
# ===============================

import numpy as np
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, RobustScaler


# =======================================
# 1. Load and Preprocess Data
# =======================================
# Load the Iris dataset: X is the feature matrix, y the integer class labels
# reshaped into a single column so they can be passed to the encoder below.
iris = datasets.load_iris()
X = iris.data
y = iris.target.reshape(-1, 1)

# Scale features for better convergence.
# NOTE(review): the scaler is fit on the full dataset here, before the K-fold
# split performed later, so validation rows contribute to the scaling
# statistics. Fit the scaler per training fold if leakage-free estimates matter.
scaler = RobustScaler()
X = scaler.fit_transform(X)

# One-hot encode labels (sparse_output=False requests a dense array, which the
# NumPy-based loss and gradient code below operates on directly).
encoder = OneHotEncoder(sparse_output=False)
y = encoder.fit_transform(y)


# =======================================
# 2. Helper Functions
# =======================================
def sigmoid(x, is_derivative=False):
    """Logistic sigmoid, or its derivative, evaluated element-wise.

    Both branches are expressed through ``exp(-|x|)``, which never exceeds 1,
    so large-magnitude inputs cannot overflow. The naive derivative
    ``exp(-x) / (1 + exp(-x)) ** 2`` evaluates to ``inf / inf = nan`` for very
    negative ``x``, which would propagate NaNs into every gradient.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return d(sigmoid)/dx evaluated at ``x``
            instead of sigmoid(x).

    Returns:
        The sigmoid (or its derivative) with the same shape as ``x``.
    """
    decay = np.exp(-np.abs(x))
    if is_derivative:
        # sigmoid'(x) = e^-x / (1 + e^-x)^2 is an even function of x,
        # so it can be evaluated at |x| without changing the result.
        return decay / (1 + decay) ** 2
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x).
    # exp(min(x, 0)) yields the matching numerator for both cases.
    return np.exp(np.minimum(x, 0)) / (1 + decay)

def tanh(x, is_derivative=False):
    """Hyperbolic tangent, or its derivative ``1 - tanh(x)^2``, element-wise.

    Args:
        x: Scalar or array of pre-activation values.
        is_derivative: When True, return the derivative evaluated at ``x``.

    Returns:
        tanh(x), or its derivative, with the same shape as ``x``.
    """
    activated = np.tanh(x)
    if not is_derivative:
        return activated
    return 1 - activated ** 2

def relu(x, is_derivative=False):
    """Rectified linear unit, or its derivative, evaluated element-wise.

    The derivative is 0 for negative inputs and 1 otherwise (the value at
    exactly 0 is taken as 1, matching ``leaky_relu``). It is computed with a
    single mask: chaining two ``np.where`` calls first turns negatives into 0
    and then turns every value ``>= 0`` (including those zeros) into 1, which
    makes the derivative 1 everywhere.

    Args:
        x: Array of pre-activation values.
        is_derivative: When True, return the derivative evaluated at ``x``.

    Returns:
        max(0, x), or the 0/1 derivative mask, with the same shape as ``x``.
    """
    if is_derivative:
        return np.where(x < 0, 0.0, 1.0)
    return np.maximum(0, x)

def leaky_relu(x, is_derivative=False, alpha=0.01):
    """Leaky ReLU, or its derivative, evaluated element-wise.

    Args:
        x: Array of pre-activation values.
        is_derivative: When True, return the slope at ``x`` (``alpha`` where
            ``x < 0``, 1 elsewhere).
        alpha: Slope applied to negative inputs.

    Returns:
        ``max(alpha * x, x)``, or the slope array, with the same shape as ``x``.
    """
    if not is_derivative:
        scaled = alpha * x
        return np.maximum(scaled, x)
    negative = x < 0
    return np.where(negative, alpha, 1)

def softmax(x):
    """Column-wise softmax (normalises along axis 0).

    The per-column maximum is subtracted before exponentiating so that the
    largest exponent is 0, which keeps ``np.exp`` from overflowing.

    Args:
        x: Array of scores; each column is normalised independently.

    Returns:
        Array of the same shape whose columns sum to 1.
    """
    stabilised = x - np.max(x, axis=0, keepdims=True)
    weights = np.exp(stabilised)
    normaliser = np.sum(weights, axis=0, keepdims=True)
    return weights / normaliser

def mse_loss(Y, Y_pred):
    """Halved mean squared error between targets and predictions.

    Args:
        Y: Targets with samples along axis 0.
        Y_pred: Predictions laid out as the transpose of ``Y`` (it is
            transposed before the comparison).

    Returns:
        Sum of squared residuals divided by ``2 * Y.shape[0]``.
    """
    n_samples = Y.shape[0]
    residuals = Y - Y_pred.T
    return np.sum(residuals ** 2) / (2 * n_samples)

def cross_entropy_loss(Y, Y_pred, eps=1e-9):
    """Mean categorical cross-entropy between targets and predictions.

    Args:
        Y: Targets with samples along axis 0.
        Y_pred: Predicted probabilities laid out as the transpose of ``Y``.
        eps: Small constant added inside the logarithm so that a predicted
            probability of 0 does not produce ``log(0)``.

    Returns:
        ``-sum(Y * log(Y_pred.T + eps))`` divided by ``Y.shape[0]``.
    """
    n_samples = Y.shape[0]
    log_probs = np.log(Y_pred.T + eps)
    total = np.sum(Y * log_probs)
    return -total / n_samples


# =======================================
# 3. Parameter Initialization
# =======================================
def initialize_parameters(layer_dims):
    """Create weights and biases for a fully connected network.

    Weights are drawn from a standard normal distribution scaled by
    ``sqrt(1 / fan_in)``; biases start at zero. A private
    ``RandomState(42)`` is used instead of ``np.random.seed(42)``: it yields
    the same weights on every call but leaves the global NumPy random state
    untouched, so calling this function no longer silently reseeds unrelated
    random code.

    Args:
        layer_dims: Sequence of layer sizes, input layer first.

    Returns:
        Dict with ``W{i}`` of shape ``(layer_dims[i], layer_dims[i-1])`` and
        ``b{i}`` of shape ``(layer_dims[i], 1)`` for ``i = 1 .. len - 1``.
    """
    rng = np.random.RandomState(42)
    params = {}
    for i, (fan_in, fan_out) in enumerate(zip(layer_dims, layer_dims[1:]), start=1):
        params[f"W{i}"] = rng.randn(fan_out, fan_in) * np.sqrt(1. / fan_in)
        params[f"b{i}"] = np.zeros((fan_out, 1))
    return params


# =======================================
# 4. Forward & Backward Propagation
# =======================================
def forward_propagation(X, params, activations):
    """Run a forward pass through every layer named in ``activations``.

    Args:
        X: Input array with samples along axis 0; it is transposed so that
            samples become columns.
        params: Dict holding ``W{i}`` / ``b{i}`` for each layer ``i``.
        activations: Activation name per layer; one of "sigmoid", "relu",
            "softmax", "tanh", "leaky_relu".

    Returns:
        Tuple ``(output, cache)`` where ``output`` is the last layer's
        activation and ``cache`` maps ``A0``, ``Z{i}``, ``A{i}`` to the
        intermediate arrays.

    Raises:
        ValueError: If an activation name is not supported.
    """
    activation_fns = {
        "sigmoid": sigmoid,
        "relu": relu,
        "softmax": softmax,
        "tanh": tanh,
        "leaky_relu": leaky_relu,
    }

    cache = {"A0": X.T}
    current = cache["A0"]

    for layer, name in enumerate(activations, start=1):
        weights, bias = params[f"W{layer}"], params[f"b{layer}"]
        pre_activation = np.dot(weights, current) + bias

        if name not in activation_fns:
            raise ValueError(f"Unsupported activation: {name}")
        current = activation_fns[name](pre_activation)

        cache[f"Z{layer}"] = pre_activation
        cache[f"A{layer}"] = current

    return current, cache

def backward_propagation(Y, params, cache, activations):
    """Compute weight and bias gradients for every layer.

    The output-layer error is taken as ``A_L - Y`` (prediction minus target)
    and is then pushed back through the hidden layers using each layer's
    activation derivative.

    Args:
        Y: Targets with samples along axis 0.
        params: Dict holding ``W{i}`` for each layer.
        cache: Dict of ``A{i}`` / ``Z{i}`` arrays produced by the forward pass.
        activations: Activation name per layer; hidden layers must be one of
            "sigmoid", "relu", "tanh", "leaky_relu".

    Returns:
        Dict mapping ``dW{i}`` and ``db{i}`` to the gradients of layer ``i``.

    Raises:
        ValueError: If a hidden layer uses an unsupported activation.
    """
    n_samples = Y.shape[0]
    n_layers = len(activations)
    targets = Y.T

    # Error signal at the output layer.
    delta = cache[f"A{n_layers}"] - targets

    grads = {}
    for layer in range(n_layers, 0, -1):
        layer_input = cache[f"A{layer-1}"]
        weights = params[f"W{layer}"]
        grads[f"dW{layer}"] = (1 / n_samples) * np.dot(delta, layer_input.T)
        grads[f"db{layer}"] = (1 / n_samples) * np.sum(delta, axis=1, keepdims=True)

        # The first layer has no preceding activation to differentiate.
        if layer == 1:
            continue

        pre_activation = cache[f"Z{layer-1}"]
        name = activations[layer - 2]
        if name not in ("sigmoid", "relu", "tanh", "leaky_relu"):
            raise ValueError(f"Unsupported activation: {name}")
        derivative = {
            "sigmoid": sigmoid,
            "relu": relu,
            "tanh": tanh,
            "leaky_relu": leaky_relu,
        }[name]
        delta = np.dot(weights.T, delta) * derivative(pre_activation, is_derivative=True)

    return grads

def update_parameters(params, grads, lr):
    """Apply one gradient-descent step to every layer, in place.

    Args:
        params: Dict of ``W{i}`` / ``b{i}`` arrays; the arrays are modified
            in place.
        grads: Dict of matching ``dW{i}`` / ``db{i}`` gradients.
        lr: Learning rate multiplying each gradient.

    Returns:
        The same ``params`` dict that was passed in.
    """
    layer_ids = [name[1:] for name in params if name.startswith("W")]
    for layer_id in layer_ids:
        for prefix in ("W", "b"):
            params[f"{prefix}{layer_id}"] -= lr * grads[f"d{prefix}{layer_id}"]
    return params


# =======================================
# 5. Training Loop
# =======================================
def train_neural_network(X, Y, hidden_layers=None, lr=0.05, epochs=5000, print_every=500, activations=None, loss_function="cross_entropy"):
    """Train a fully connected network with full-batch gradient descent.

    Args:
        X: Training inputs with samples along axis 0.
        Y: One-hot targets with samples along axis 0.
        hidden_layers: Sizes of the hidden layers; defaults to ``[10]``.
            ``None`` is used as the default instead of a list literal so no
            mutable object is shared between calls.
        lr: Learning rate.
        epochs: Number of update steps (the loop runs ``epochs + 1`` times so
            that epoch ``epochs`` itself is reported).
        print_every: Report the loss every this many epochs; a falsy value
            (``0`` / ``None``) disables reporting.
        activations: Activation name per layer; defaults to sigmoid for every
            hidden layer followed by softmax.
        loss_function: "cross_entropy" or "mse". This only selects the loss
            that is *reported*: the gradients come from
            ``backward_propagation``, which always uses ``A_L - Y`` as the
            output error.

    Returns:
        Dict of trained ``W{i}`` / ``b{i}`` parameters.

    Raises:
        ValueError: If ``loss_function`` is not supported or ``activations``
            does not have exactly one entry per layer.
    """
    loss_fns = {"mse": mse_loss, "cross_entropy": cross_entropy_loss}
    if loss_function not in loss_fns:
        raise ValueError("Unsupported loss function")
    compute_loss = loss_fns[loss_function]

    hidden_layers = [10] if hidden_layers is None else list(hidden_layers)
    input_dim, output_dim = X.shape[1], Y.shape[1]
    layer_dims = [input_dim] + hidden_layers + [output_dim]

    if activations is None:
        activations = ["sigmoid"] * len(hidden_layers) + ["softmax"]
    elif len(activations) != len(layer_dims) - 1:
        # A mismatch would otherwise surface later as an opaque KeyError or
        # shape error inside the forward/backward pass.
        raise ValueError(
            f"Expected {len(layer_dims) - 1} activations, got {len(activations)}"
        )

    params = initialize_parameters(layer_dims)

    for i in range(epochs + 1):
        A_out, cache = forward_propagation(X, params, activations)

        # The loss is only needed for reporting, so skip it on silent epochs.
        if print_every and i % print_every == 0:
            loss = compute_loss(Y, A_out)
            print(f"Epoch {i:5d} | Loss: {loss:.4f}")

        grads = backward_propagation(Y, params, cache, activations)
        params = update_parameters(params, grads, lr)

    return params


# =======================================
# 6. Prediction & Evaluation
# =======================================
def predict(X, params, activations):
    """Return the predicted class index for each sample in ``X``.

    Args:
        X: Inputs with samples along axis 0.
        params: Trained ``W{i}`` / ``b{i}`` parameters.
        activations: Activation name per layer, as used during training.

    Returns:
        1-D array holding, for each sample, the index of the largest output.
    """
    outputs = forward_propagation(X, params, activations)[0]
    # Samples are columns in the network output, so reduce over axis 0.
    predicted_classes = np.argmax(outputs, axis=0)
    return predicted_classes

def accuracy_score(y_true, y_pred):
    """Percentage (0-100) of positions where the two label arrays agree.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.

    Returns:
        Mean of the element-wise equality, multiplied by 100.
    """
    matches = y_true == y_pred
    fraction_correct = np.mean(matches)
    return fraction_correct * 100


# =======================================
# 7. K-Fold Cross Validation
# =======================================
def cross_validate_model(X, Y, k=5, **train_kwargs):
    """Estimate accuracy with shuffled K-fold cross-validation.

    A fresh network is trained on each training split and scored on the
    held-out split.

    Args:
        X: Feature array with samples along axis 0.
        Y: One-hot targets with samples along axis 0.
        k: Number of folds.
        **train_kwargs: Forwarded unchanged to ``train_neural_network``.

    Returns:
        Tuple ``(mean_accuracy, std_accuracy)`` in percent across the folds.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    accuracies = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
        print(f"\n=== Fold {fold}/{k} ===")
        X_train, X_val = X[train_idx], X[val_idx]
        Y_train, Y_val = Y[train_idx], Y[val_idx]
        params = train_neural_network(X_train, Y_train, **train_kwargs)

        # When the caller did not pass `activations`, training falls back to
        # sigmoid hidden layers with a softmax output. Prediction must use the
        # same layout; passing None through would crash in the forward pass.
        activations = train_kwargs.get("activations")
        if activations is None:
            n_layers = sum(1 for key in params if key.startswith("W"))
            activations = ["sigmoid"] * (n_layers - 1) + ["softmax"]

        y_pred = predict(X_val, params, activations)
        y_true = np.argmax(Y_val, axis=1)
        acc = accuracy_score(y_true, y_pred)
        accuracies.append(acc)
        print(f"Validation Accuracy: {acc:.2f}%")

    mean_acc = np.mean(accuracies)
    std_acc = np.std(accuracies)
    print(f"\nK-Fold Mean Accuracy: {mean_acc:.2f}% ± {std_acc:.2f}%")
    return mean_acc, std_acc


# =======================================
# Run K-Fold Cross Validation
# =======================================

# Evaluate a network with three 10-unit sigmoid hidden layers and a softmax
# output using 5-fold cross-validation. The call is the cell's final
# expression, so its returned (mean, std) tuple is echoed after the logs.
cross_validate_model(
    X, y,
    k=5,
    hidden_layers=[10, 10, 10],
    lr=0.05,
    epochs=3000,
    print_every=1000,  # report the loss at epochs 0, 1000, 2000 and 3000
    activations=["sigmoid", "sigmoid", "sigmoid", "softmax"]  # one entry per layer
)


=== Fold 1/5 ===
Epoch     0 | Loss: 1.3303
Epoch  1000 | Loss: 0.9697
Epoch  2000 | Loss: 0.3686
Epoch  3000 | Loss: 0.1729
Validation Accuracy: 96.67%

=== Fold 2/5 ===
Epoch     0 | Loss: 1.3744
Epoch  1000 | Loss: 0.9500
Epoch  2000 | Loss: 0.3627
Epoch  3000 | Loss: 0.1692
Validation Accuracy: 96.67%

=== Fold 3/5 ===
Epoch     0 | Loss: 1.3643
Epoch  1000 | Loss: 0.9365
Epoch  2000 | Loss: 0.3511
Epoch  3000 | Loss: 0.1584
Validation Accuracy: 96.67%

=== Fold 4/5 ===
Epoch     0 | Loss: 1.3263
Epoch  1000 | Loss: 0.9604
Epoch  2000 | Loss: 0.3504
Epoch  3000 | Loss: 0.1578
Validation Accuracy: 93.33%

=== Fold 5/5 ===
Epoch     0 | Loss: 1.3318
Epoch  1000 | Loss: 0.9355
Epoch  2000 | Loss: 0.3407
Epoch  3000 | Loss: 0.1747
Validation Accuracy: 96.67%

K-Fold Mean Accuracy: 96.00% ± 1.33%


(np.float64(96.0), np.float64(1.333333333333337))