Importing Dataset and Preprocessing 

load_breast_cancer(): Loads the dataset (30 features, binary labels: 0/1).

StandardScaler: Normalizes the input features to zero mean and unit variance.

OneHotEncoder: Converts labels from shape (n, 1) to one-hot vectors like [1, 0] and [0, 1].

In [3]:
# Import Dataset and Preprocess
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import precision_score, recall_score, f1_score

# Load the breast-cancer dataset and pull out features and binary targets.
data = load_breast_cancer()
X = data.data
y = data.target

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then reuse those statistics for
# the test split so no test information leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# The encoder expects 2-D input, so each label vector is reshaped into a
# single column before encoding; the encoder is fitted on training labels.
encoder = OneHotEncoder(sparse_output=False)
y_train = encoder.fit_transform(y_train.reshape(-1, 1))
y_test = encoder.transform(y_test.reshape(-1, 1))


Activation and Loss Functions

ReLU: max(0, x)

Sigmoid: Maps to range (0, 1)

Tanh: Maps to range (-1, 1)

Softmax: Converts logits to probabilities over multiple classes

Cross-entropy loss: For one-hot encoded labels

In [4]:
# Activation and Loss Functions
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    return np.maximum(0, x)
def relu_derivative(x):
    """Derivative of ReLU: 1.0 where ``x`` is strictly positive, else 0.0."""
    is_positive = x > 0
    return is_positive.astype(float)
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    The direct formula overflows inside ``exp`` for large negative inputs
    and emits a RuntimeWarning. Here only non-positive exponents are ever
    passed to ``exp``, so any finite (or infinite) input is handled cleanly.

    Args:
        x: Scalar or array-like of pre-activations.

    Returns:
        Element-wise sigmoid of ``x``; a scalar when ``x`` is a scalar.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in [0, 1], so it can never overflow.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x).  x < 0: e^x / (1 + e^x).  Both are the same
    # function; each branch is written in terms of the safe exponent.
    result = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result so scalar input yields a scalar;
    # for arrays of higher rank it just returns a view of the same data.
    return result[()]


def sigmoid_derivative(x):
    """Derivative of the sigmoid with respect to its input ``x``.

    Uses ``s * (1 - s)`` with ``s = sigmoid(x)`` computed once.
    """
    s = sigmoid(x)
    return s * (1 - s)
def tanh(x):
    """Hyperbolic tangent activation (thin wrapper around ``np.tanh``)."""
    return np.tanh(x)
def tanh_derivative(x):
    """Derivative of tanh with respect to its input: ``1 - tanh(x)**2``."""
    squared = np.tanh(x) ** 2
    return 1 - squared
def softmax(x):
    """Row-wise softmax of a 2-D array of logits (normalizes along axis 1)."""
    row_max = np.max(x, axis=1, keepdims=True)
    # Shifting every row by its own maximum leaves the softmax unchanged but
    # keeps the largest exponent at 0, so np.exp cannot overflow.
    exps = np.exp(x - row_max)
    return exps / np.sum(exps, axis=1, keepdims=True)
def cross_entropy(y_true, y_pred):
    """Mean cross-entropy between one-hot targets and predicted probabilities.

    The sum over all entries of ``y_true * log(y_pred)`` is negated and
    divided by the number of rows in ``y_true``.
    """
    n_samples = y_true.shape[0]
    # The small constant keeps log() finite when a predicted value is 0.
    log_pred = np.log(y_pred + 1e-9)
    summed = np.sum(y_true * log_pred)
    return -summed / n_samples
def cross_entropy_derivative(y_true, y_pred):
    """Return the prediction error ``y_pred - y_true``.

    backward_pass uses this value directly as the output-layer delta.
    """
    error = y_pred - y_true
    return error



Model Initialization

Creating weight matrices and bias vectors for each layer: weights use He initialization (for both the ReLU and tanh configurations) and biases start at zero.

In [5]:

# Model Initialization
def initialize_parameters(layer_sizes, seed=None):
    """Create He-initialized weights and zero biases for a dense network.

    Args:
        layer_sizes: Sequence of layer widths, input first and output last,
            e.g. ``[30, 10, 2]``. Fewer than two entries yields empty lists.
        seed: Optional seed (or ``numpy.random.Generator``) for reproducible
            initialization. When ``None`` (the default) values are drawn
            from NumPy's global random state, so ``np.random.seed`` set by
            the caller is still honoured.

    Returns:
        ``(weights, biases)``: two lists with one entry per layer transition.
        ``weights[i]`` has shape ``(layer_sizes[i], layer_sizes[i+1])`` and
        ``biases[i]`` has shape ``(1, layer_sizes[i+1])``.
    """
    rng = None if seed is None else np.random.default_rng(seed)
    weights, biases = [], []
    for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]):
        if rng is None:
            draw = np.random.randn(fan_in, fan_out)
        else:
            draw = rng.standard_normal((fan_in, fan_out))
        # He scaling: standard deviation sqrt(2 / fan_in).
        weights.append(draw * np.sqrt(2. / fan_in))
        biases.append(np.zeros((1, fan_out)))
    return weights, biases

Forward Pass

Performs a weighted sum followed by an activation at each layer

Supports relu, sigmoid, and tanh

Final layer uses sigmoid

In [7]:
# Forward Pass 
def forward_pass(X, weights, biases, activation_hidden):
    """Run the network forward and record every layer's values.

    Args:
        X: Input batch; it is stored as the first entry of ``activations``.
        weights: List of weight matrices, one per layer.
        biases: List of bias rows, one per layer.
        activation_hidden: Hidden-layer activation name: ``"relu"``,
            ``"sigmoid"`` or ``"tanh"``.

    Returns:
        ``(activations, zs)``. ``activations`` holds the input followed by
        the output of every layer; ``zs`` holds each layer's pre-activation.
        The final layer always applies an element-wise sigmoid.

    Raises:
        ValueError: If the network has hidden layers and
            ``activation_hidden`` is not one of the supported names.
    """
    n_hidden = len(weights) - 1
    # Reject unknown names up front instead of failing later on an
    # unassigned local inside the loop.
    if n_hidden > 0 and activation_hidden not in ("relu", "sigmoid", "tanh"):
        raise ValueError(
            f"Unsupported hidden activation {activation_hidden!r}; "
            "expected 'relu', 'sigmoid' or 'tanh'"
        )

    activations, zs = [X], []
    for i in range(n_hidden):
        z = np.dot(activations[-1], weights[i]) + biases[i]
        zs.append(z)
        if activation_hidden == "relu":
            a = relu(z)
        elif activation_hidden == "sigmoid":
            a = sigmoid(z)
        else:  # "tanh" -- guaranteed by the validation above
            a = tanh(z)
        activations.append(a)

    # Output layer: affine transform followed by sigmoid.
    z = np.dot(activations[-1], weights[-1]) + biases[-1]
    zs.append(z)
    activations.append(sigmoid(z))
    return activations, zs

Backward Pass

Implements backpropagation layer by layer, from the output back to the input

Use derivative of selected activation (relu, sigmoid, or tanh)

Computes gradients of loss with respect to weights and biases

In [8]:
# Backward Pass
def backward_pass(activations, zs, y_true, weights, activation_hidden):
    """Backpropagate the output error and return per-layer gradients.

    Args:
        activations: Layer outputs from forward_pass (input first).
        zs: Pre-activations from forward_pass, one per layer.
        y_true: Target array; its first dimension is used as the batch size
            that every gradient is averaged over.
        weights: List of weight matrices, one per layer.
        activation_hidden: Hidden-layer activation name: ``"relu"``,
            ``"sigmoid"`` or ``"tanh"``.

    Returns:
        ``(grad_weights, grad_biases)``: lists aligned with ``weights``.

    Raises:
        ValueError: If the network has hidden layers and
            ``activation_hidden`` is not one of the supported names.
    """
    n_layers = len(weights)
    # Without this check an unknown name would skip every branch below and
    # silently reuse the output-layer delta for the hidden layers.
    if n_layers > 1 and activation_hidden not in ("relu", "sigmoid", "tanh"):
        raise ValueError(
            f"Unsupported hidden activation {activation_hidden!r}; "
            "expected 'relu', 'sigmoid' or 'tanh'"
        )

    n_samples = y_true.shape[0]
    grad_weights, grad_biases = [None] * n_layers, [None] * n_layers

    # Output layer: the prediction error is used directly as the delta.
    delta = cross_entropy_derivative(y_true, activations[-1])
    grad_weights[-1] = np.dot(activations[-2].T, delta) / n_samples
    grad_biases[-1] = np.sum(delta, axis=0, keepdims=True) / n_samples

    # Hidden layers, last to first: push delta back through the next layer's
    # weights, then scale by this layer's activation derivative.
    for layer in range(n_layers - 2, -1, -1):
        if activation_hidden == "relu":
            local_grad = relu_derivative(zs[layer])
        elif activation_hidden == "sigmoid":
            local_grad = sigmoid_derivative(zs[layer])
        else:  # "tanh" -- guaranteed by the validation above
            local_grad = tanh_derivative(zs[layer])
        delta = np.dot(delta, weights[layer + 1].T) * local_grad
        grad_weights[layer] = np.dot(activations[layer].T, delta) / n_samples
        grad_biases[layer] = np.sum(delta, axis=0, keepdims=True) / n_samples
    return grad_weights, grad_biases

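Gradient descent update: param -= learning_rate * gradient. (The training loop computes each gradient on the full training set, so this is batch gradient descent rather than stochastic.)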

In [9]:
# Gradient Descent
def update_parameters(weights, biases, grad_weights, grad_biases, lr):
    """Apply one gradient-descent step to every layer, in place.

    Each entry of ``weights`` and ``biases`` is decreased by ``lr`` times
    its gradient. The same (mutated) lists are returned for convenience.
    """
    n_layers = len(weights)
    for idx in range(n_layers):
        weight_step = lr * grad_weights[idx]
        weights[idx] -= weight_step
        bias_step = lr * grad_biases[idx]
        biases[idx] -= bias_step
    return weights, biases

Evaluation Utilities

predict(): Uses np.argmax() to get class labels from sigmoid outputs

accuracy(): Direct class comparison

count_learnable_params(): Weight + bias count

estimate_memory_usage(): Memory used by weights + biases in MB

In [10]:
# Evaluation Utilities
def predict(X, weights, biases, activation_hidden):
    """Return the predicted class index for each row of ``X``.

    Runs forward_pass and takes the arg-max over the output layer's units.
    """
    layer_outputs, _ = forward_pass(X, weights, biases, activation_hidden)
    output_scores = layer_outputs[-1]
    return np.argmax(output_scores, axis=1)

def accuracy(y_true, y_pred):
    """Fraction of positions where ``y_true`` and ``y_pred`` are equal."""
    matches = y_true == y_pred
    return np.mean(matches)

def count_learnable_params(layers):
    """Count the weights and biases of a dense network.

    ``layers`` lists the layer widths, input first. Each consecutive pair
    ``(n_in, n_out)`` contributes ``n_in * n_out`` weights plus ``n_out``
    biases.
    """
    return sum(
        n_in * n_out + n_out
        for n_in, n_out in zip(layers, layers[1:])
    )

def estimate_memory_usage(weights, biases):
    """Return the combined size of all weight and bias arrays in MiB.

    Sizes come from each array's ``nbytes``; the total is divided by
    ``1024 ** 2``.
    """
    total_bytes = 0
    for layer_w, layer_b in zip(weights, biases):
        total_bytes += layer_w.nbytes + layer_b.nbytes
    bytes_per_mb = 1024 ** 2
    return total_bytes / bytes_per_mb

Experimental Configuration and Grid Search

Running 6 configurations:

Different layer widths (5, 10, 10/10)

Different depths (2-layer, 3-layer)

Activation functions (tanh, relu)

Each runs for 10,000 epochs

In [11]:
# Experimental Configuration and Grid Search
# Experiment grid: each architecture is trained once per hidden activation,
# numbered V1..V6 in (architecture, activation) order. Learning rate and
# epoch count are the same for every run.
custom_configs = [
    {
        "name": f"V{number}",
        "layers": list(layer_sizes),  # fresh list per config, nothing shared
        "activation": hidden_activation,
        "lr": 0.01,
        "epochs": 10000,
    }
    for number, (layer_sizes, hidden_activation) in enumerate(
        (
            (sizes, act)
            for sizes in ([30, 5, 2], [30, 10, 2], [30, 10, 10, 2])
            for act in ("tanh", "relu")
        ),
        start=1,
    )
]

# The training loop appends one tuple per trained configuration here.
results = []


Main Training Loop

For each configuration:

Initialize weights/biases

Train using forward/backward passes

Predict test set

Evaluate: accuracy, precision, recall, F1

Log results to a list of tuples

In [12]:
# Main Training Loop
# Trains one network per entry in custom_configs, evaluates it on the test
# split, and appends one result tuple per configuration to `results`.
for config in custom_configs:
    # Fresh, randomly initialized parameters for this architecture.
    weights, biases = initialize_parameters(config["layers"])
    # Every epoch runs forward/backward over the whole training set
    # (full-batch gradient descent, no mini-batching).
    for epoch in range(config["epochs"]):
        activations_list, zs = forward_pass(X_train, weights, biases, config["activation"])
        # NOTE(review): loss is recomputed every epoch but never read or
        # logged in this cell -- confirm whether it is needed.
        loss = cross_entropy(y_train, activations_list[-1])
        grad_weights, grad_biases = backward_pass(activations_list, zs, y_train, weights, config["activation"])
        weights, biases = update_parameters(weights, biases, grad_weights, grad_biases, config["lr"])
    # predict() returns class indices, so the one-hot test targets are
    # converted to indices as well before computing the metrics.
    y_pred_test = predict(X_test, weights, biases, config["activation"])
    y_true_test = np.argmax(y_test, axis=1)
    acc = accuracy(y_true_test, y_pred_test)
    prec = precision_score(y_true_test, y_pred_test)
    rec = recall_score(y_true_test, y_pred_test)
    f1 = f1_score(y_true_test, y_pred_test)
    # Model-size figures reported alongside the quality metrics.
    n_params = count_learnable_params(config["layers"])
    mem_mb = estimate_memory_usage(weights, biases)
    # Field order here must match the column list used when the results
    # DataFrame is built.
    results.append((
        config["name"], config["layers"], config["activation"], config["lr"],
        config["epochs"], acc, prec, rec, f1, n_params, mem_mb
    ))

In [13]:

# Output Results
# Build a table with one row per trained configuration; the column order
# mirrors the tuples appended to `results` by the training loop.
df = pd.DataFrame(results, columns=[
    "Config", "Layers", "Activation", "Learning Rate", "Epochs",
    "Accuracy", "Precision", "Recall", "F1 Score",
    "Learnable Parameters", "Estimated RAM (MB)"
])

# Remove pandas' display limits so the printed table is not truncated.
# These are global options and stay in effect after this cell.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

print(df)


  Config           Layers Activation  Learning Rate  Epochs  Accuracy  \
0     V1       [30, 5, 2]       tanh           0.01   10000  0.982456   
1     V2       [30, 5, 2]       relu           0.01   10000  0.982456   
2     V3      [30, 10, 2]       tanh           0.01   10000  0.982456   
3     V4      [30, 10, 2]       relu           0.01   10000  0.973684   
4     V5  [30, 10, 10, 2]       tanh           0.01   10000  0.973684   
5     V6  [30, 10, 10, 2]       relu           0.01   10000  0.973684   

   Precision    Recall  F1 Score  Learnable Parameters  Estimated RAM (MB)  
0   0.985915  0.985915  0.985915                   167            0.001274  
1   0.985915  0.985915  0.985915                   167            0.001274  
2   0.985915  0.985915  0.985915                   332            0.002533  
3   0.972222  0.985915  0.979021                   332            0.002533  
4   0.985714  0.971831  0.978723                   442            0.003372  
5   0.985714  0.971831  0.