In [None]:
# Three Layers, 64 neurons, ReLU in hidden layers, softmax in output layers
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import numpy as np
import pandas as pd
import tracemalloc

tracemalloc.start()

data = fetch_covtype()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

encoder = OneHotEncoder(sparse_output=False)
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

layers = [X_train.shape[1], 64, 64, y_train.shape[1]]

# WEIGHTS AND BIAS ------------------------------

def weight_and_bias(layers):
    weight = []
    bias = []
    np.random.seed(0)
    for i in range(len(layers)-1):
        W = np.random.randn(layers[i], layers[i+1]) * np.sqrt(1. / layers[i])
        weight.append(W)
        b = np.zeros((1, layers[i+1]))
        bias.append(b)
    return weight, bias

weight, bias = weight_and_bias(layers)

# ACTIVATION FUNCTIONS --------------------------

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

def tanh(x):
    x = np.clip(x, -100, 100)
    return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))

def tanh_derivative(x):
    t = tanh(x)
    return 1 - t**2

def ReLU(x):
    return np.maximum(0, x)

def ReLU_derivative(x):
    return (x > 0).astype(float)

def softmax(z):
    z_exp = np.exp(z - np.max(z, axis=1, keepdims=True))
    return z_exp / np.sum(z_exp, axis=1, keepdims=True)

def categorical_cross_entropy(y_true, y_pred):
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=1))

def softmax_cross_entropy_derivative(y_true, y_pred):
    return y_pred - y_true

# FORWARD PROPAGATION ----------------------------

def forward_propagation(X, weight, bias):
    Z = []
    A = [X]
    for i in range(len(layers) - 1):
        Z_cur = np.dot(A[i], weight[i]) + bias[i]
        if i < len(layers) - 2:
            A_cur = ReLU(Z_cur)
        else:
            A_cur = softmax(Z_cur)
        Z.append(Z_cur)
        A.append(A_cur)
    return Z, A

# BACKWARD PROPAGATION ---------------------------

def backward_propagation(layers, y_true, A, Z, weight, bias, learning_rate):
    n_layer = len(layers) - 1
    dA = [0] * (n_layer - 1)
    dZ = [0] * n_layer
    dW = [0] * n_layer
    db = [0] * n_layer

    dZ[n_layer - 1] = softmax_cross_entropy_derivative(y_true, A[n_layer])
    dW[n_layer - 1] = np.dot(A[n_layer - 1].T, dZ[n_layer - 1])
    db[n_layer - 1] = np.sum(dZ[n_layer - 1], axis=0, keepdims=True)

    for i in reversed(range(len(layers) - 2)):
        dA[i] = np.dot(dZ[i + 1], weight[i + 1].T)
        dZ[i] = dA[i] * ReLU_derivative(Z[i])
        dW[i] = np.dot(A[i].T, dZ[i])
        db[i] = np.sum(dZ[i], axis=0, keepdims=True)

    for i in reversed(range(len(layers) - 1)):
        weight[i] -= learning_rate * dW[i]
        bias[i] -= learning_rate * db[i]

    return weight, bias

# TRAINING  --------------------------

learning_rate = 0.005
n_epoch = 500
batch_size = 128

n_samples = X_train.shape[0]
n_batches = n_samples // batch_size

for epoch in range(n_epoch):

    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    X_train = X_train[indices]
    y_train = y_train[indices]

    epoch_loss = 0

    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size
        X_batch = X_train[start:end]
        y_batch = y_train[start:end]

        Z, A = forward_propagation(X_batch, weight, bias)
        loss = categorical_cross_entropy(y_batch, A[-1])
        weight, bias = backward_propagation(layers, y_batch, A, Z, weight, bias, learning_rate)
        epoch_loss += loss

    if epoch % 100 == 0:
        print(f"Epoch: {epoch} \t Avg Loss: {epoch_loss / n_batches:.4f}")

Z_train, A_train = forward_propagation(X_train, weight, bias)
y_pred_train = np.argmax(A_train[-1], axis=1)
y_true_train = np.argmax(y_train, axis=1)

print("TRAINING RESULTS: \n")
print(f"Accuracy:  {accuracy_score(y_true_train, y_pred_train):.4f}")
print(f"Precision: {precision_score(y_true_train, y_pred_train, average='weighted'):.4f}")
print(f"Recall:    {recall_score(y_true_train, y_pred_train, average='weighted'):.4f}")
print(f"F1 score:  {f1_score(y_true_train, y_pred_train, average='weighted'):.4f}")

# TEST  --------------------------------

Z_test, A_test = forward_propagation(X_test, weight, bias)
loss = categorical_cross_entropy(y_test, A_test[-1])
y_pred_test = np.argmax(A_test[-1], axis=1)
y_true_test = np.argmax(y_test, axis=1)

print("\nTEST RESULTS: \n")
print(f"Loss:      {loss:.4f}")
print(f"Accuracy:  {accuracy_score(y_true_test, y_pred_test):.4f}")
print(f"Precision: {precision_score(y_true_test, y_pred_test, average='weighted'):.4f}")
print(f"Recall:    {recall_score(y_true_test, y_pred_test, average='weighted'):.4f}")
print(f"F1 score:  {f1_score(y_true_test, y_pred_test, average='weighted'):.4f}")

# Total Learnable Parameters --------------------

def count_parameters(weights, biases):
    total = 0
    for w, b in zip(weights, biases):
        total += np.prod(w.shape) + np.prod(b.shape)
    return total

print("\nTotal Learnable Parameters:", count_parameters(weight, bias))

current, peak = tracemalloc.get_traced_memory()
print(f"Peak RAM usage: {peak / 1024 / 1024:.2f} MB")

tracemalloc.stop()


Epoch: 0 	 Avg Loss: 0.5417
Epoch: 100 	 Avg Loss: 0.3094
Epoch: 200 	 Avg Loss: 1.2059
Epoch: 300 	 Avg Loss: 1.2059
Epoch: 400 	 Avg Loss: 1.2060
TRAINING RESULTS: 

Accuracy:  0.4879
Precision: 0.2381
Recall:    0.4879
F1 score:  0.3200

TEST RESULTS: 

Loss:      1.2061
Accuracy:  0.4862
Precision: 0.2364
Recall:    0.4862
F1 score:  0.3181

Total Learnable Parameters: 8135
Peak RAM usage: 1733.68 MB


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
