# Neural Network from Scratch with NumPy

This notebook implements and trains a fully-connected feedforward neural network from scratch using only NumPy.

## Project Overview
- Implementing forward and backward propagation manually
- Training on Fashion-MNIST and CIFAR-10 datasets
- Experiment tracking with Weights & Biases (WandB)
- Comparing different architectures, optimizers, and hyperparameters



In [None]:
# Import required libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import our custom modules from the refactored package
from src.data.loaders import load_dataset, one_hot_encode, train_val_split
from src.evaluation.metrics import accuracy, confusion_matrix_counts, classification_report_dict
from src.models.feedforward import FeedForwardNN, NetworkConfig
from src.optimizers import Optimizer, OptimizerConfig
from src.training.trainer import TrainingConfig, train_model

# Set random seed for reproducibility.
# NOTE(review): this only helps if the src package draws from NumPy's global
# RNG — confirm against src.models / src.training.
np.random.seed(42)

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so a hard-coded name raises OSError
# on some installs. Use whichever variant this Matplotlib ships; if neither is
# present, keep the default style (the choice is purely cosmetic).
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if _seaborn_style is not None:
    plt.style.use(_seaborn_style)
sns.set_palette("husl")

print("Libraries imported successfully!")



## 1. Load Dataset

We'll start with the Fashion-MNIST dataset, which is a good starting point for testing our implementation.



In [None]:
# Fetch Fashion-MNIST through the project loader (train and test splits)
print("Loading Fashion-MNIST dataset...")
X_train_raw, y_train_raw, X_test, y_test = load_dataset("fashion_mnist", source="keras")

# Carve a validation split out of the raw training data (validation_split=0.1)
X_train, y_train, X_val, y_val = train_val_split(
    X_train_raw,
    y_train_raw,
    validation_split=0.1,
)

# Problem dimensions come from the data itself rather than hard-coded constants:
# feature count from the second axis, class count from the largest label.
input_size = X_train.shape[1]
num_classes = int(np.max(y_train_raw)) + 1

print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")
print(f"Input size: {input_size}")
print(f"Number of classes: {num_classes}")

# One-hot encode every label split in a single pass
y_train_onehot, y_val_onehot, y_test_onehot = (
    one_hot_encode(labels, num_classes) for labels in (y_train, y_val, y_test)
)

print("\nLabels one-hot encoded!")



## 2. Initialize and Train Model

Let's create a simple neural network and train it on Fashion-MNIST.



In [None]:
# Baseline hyperparameters: architecture, optimizer and training loop settings
# are declared together so they can be tuned in one place.
network_config = NetworkConfig(
    input_size=input_size, hidden_sizes=[128, 64], output_size=num_classes,
    activation="relu", output_activation="softmax",
    weight_init="he", l2_coeff=1e-4,
)
optimizer_config = OptimizerConfig(optimizer_type="adam", learning_rate=1e-3, weight_decay=0.0)
train_config = TrainingConfig(
    batch_size=32,
    num_epochs=20,
    loss="cross_entropy",
    use_wandb=False,
)

# Instantiate the model first, then the optimizer that will update it
model = FeedForwardNN(network_config)
optimizer = Optimizer(optimizer_config)

print("Model created successfully!")
print(f"Number of layers: {model.num_layers}")
print(f"Layer sizes: {model.layer_sizes}")



In [None]:
# Fit the baseline model; `history` keeps the per-epoch curves plotted later.
# Integer labels (y_train / y_val) are passed here, not the one-hot versions.
print("Starting training...")
history = train_model(model, optimizer, X_train, y_train, X_val, y_val, config=train_config)

print("\nTraining completed!")



## 3. Evaluate Model

Let's evaluate the trained model on the test set.



In [None]:
# Evaluate on test set
test_probabilities = model.predict_proba(X_test)
test_accuracy = accuracy(y_test_onehot, test_probabilities)

print(f"Test Accuracy: {test_accuracy:.4f}")

# Human-readable Fashion-MNIST class names, listed in label-index order (0-9).
# Defined before plotting so the confusion matrix can use them as tick labels.
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Confusion matrix
cm = confusion_matrix_counts(y_test_onehot, test_probabilities)

# Label the axes with class names so the figure can be read on its own.
# Fall back to seaborn's automatic index ticks if the matrix size does not
# match the name list (e.g. the cell is reused with another dataset).
tick_labels = class_names if len(cm) == len(class_names) else 'auto'

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Classification report (only the overall accuracy entry is printed here)
report = classification_report_dict(y_test_onehot, test_probabilities, class_names=class_names)
print("\nClassification Report:")
print(f"Overall Accuracy: {report['accuracy']:.4f}")



## 4. Plot Training Curves

Visualize the training and validation loss/accuracy over epochs.



In [None]:
# Two side-by-side panels: loss on the left, accuracy on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One row per panel:
# (train key, train legend, validation key, validation legend, y-label, title)
curve_panels = [
    ('train_loss', 'Train Loss', 'val_loss', 'Validation Loss',
     'Loss', 'Training and Validation Loss'),
    ('train_accuracy', 'Train Accuracy', 'val_accuracy', 'Validation Accuracy',
     'Accuracy', 'Training and Validation Accuracy'),
]

for ax, (train_key, train_label, val_key, val_label, y_label, panel_title) in zip(axes, curve_panels):
    ax.plot(history[train_key], label=train_label, linewidth=2)
    ax.plot(history[val_key], label=val_label, linewidth=2)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(panel_title, fontsize=14)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()



## 5. Experiment with Different Hyperparameters

Now let's experiment with different configurations to see how they affect performance.

### Experiment 1: Different Optimizers



In [None]:
# Compare different optimizers on an otherwise identical setup.
# Each run gets a fresh model and optimizer so no state carries over.
optimizers = ['sgd', 'adam', 'rmsprop']
optimizer_results = {}

for opt in optimizers:
    print(f"\nTraining with {opt.upper()} optimizer...")

    # Reset the RNG before every run (same seed as the setup cell) so each
    # optimizer starts from the same random state and re-running this cell
    # reproduces the same numbers. Without this, differences between
    # optimizers are confounded with different weight initialisations.
    # NOTE(review): assumes the src package uses NumPy's global RNG — confirm.
    np.random.seed(42)

    net_cfg = NetworkConfig(
        input_size=input_size,
        hidden_sizes=[128, 64],
        output_size=num_classes,
        activation='relu',
        output_activation='softmax',
        weight_init='he',
        l2_coeff=1e-4,
    )
    model_opt = FeedForwardNN(net_cfg)
    opt_cfg = OptimizerConfig(optimizer_type=opt, learning_rate=1e-3)
    optimizer_instance = Optimizer(opt_cfg)
    train_cfg = TrainingConfig(batch_size=32, num_epochs=10, loss='cross_entropy', use_wandb=False)

    history_opt = train_model(model_opt, optimizer_instance, X_train, y_train, X_val, y_val, config=train_cfg)

    test_prob = model_opt.predict_proba(X_test)
    test_acc = accuracy(y_test_onehot, test_prob)

    # Keep both the curves (for plotting) and the final test score
    optimizer_results[opt] = {
        'history': history_opt,
        'test_accuracy': test_acc
    }

    print(f"{opt.upper()} - Test Accuracy: {test_acc:.4f}")



In [None]:
# Validation curves for every optimizer, one metric per panel
plt.figure(figsize=(12, 5))

# (subplot position, history key, y-axis label, panel title)
comparison_panels = (
    (1, 'val_loss', 'Validation Loss', 'Validation Loss Comparison'),
    (2, 'val_accuracy', 'Validation Accuracy', 'Validation Accuracy Comparison'),
)

for position, metric, y_label, panel_title in comparison_panels:
    plt.subplot(1, 2, position)
    for opt in optimizers:
        curve = optimizer_results[opt]['history'][metric]
        plt.plot(curve, label=f'{opt.upper()}', linewidth=2)
    plt.xlabel('Epoch', fontsize=12)
    plt.ylabel(y_label, fontsize=12)
    plt.title(panel_title, fontsize=14)
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summarise the held-out test score of each run
print("\nFinal Test Accuracies:")
for opt in optimizers:
    final_accuracy = optimizer_results[opt]['test_accuracy']
    print(f"{opt.upper()}: {final_accuracy:.4f}")



### Experiment 2: Different Activation Functions



In [None]:
# Compare different hidden-layer activation functions with Adam held fixed.
# Each run gets a fresh model and optimizer so no state carries over.
activations = ['relu', 'sigmoid', 'tanh']
activation_results = {}

for act in activations:
    print(f"\nTraining with {act.upper()} activation...")

    # Reset the RNG before every run (same seed as the setup cell) so each
    # activation starts from the same random state and re-running this cell
    # reproduces the same numbers. Without this, differences between
    # activations are confounded with different random draws.
    # NOTE(review): assumes the src package uses NumPy's global RNG — confirm.
    np.random.seed(42)

    # Pair the initialiser with the activation: He for ReLU, Xavier otherwise
    weight_init = 'he' if act == 'relu' else 'xavier'
    net_cfg = NetworkConfig(
        input_size=input_size,
        hidden_sizes=[128, 64],
        output_size=num_classes,
        activation=act,
        output_activation='softmax',
        weight_init=weight_init,
        l2_coeff=1e-4,
    )
    model_act = FeedForwardNN(net_cfg)
    opt_cfg = OptimizerConfig(optimizer_type='adam', learning_rate=1e-3)
    optimizer_instance = Optimizer(opt_cfg)
    train_cfg = TrainingConfig(batch_size=32, num_epochs=10, loss='cross_entropy', use_wandb=False)

    history_act = train_model(model_act, optimizer_instance, X_train, y_train, X_val, y_val, config=train_cfg)

    test_prob = model_act.predict_proba(X_test)
    test_acc = accuracy(y_test_onehot, test_prob)

    # Keep both the curves and the final test score for later comparison
    activation_results[act] = {
        'history': history_act,
        'test_accuracy': test_acc
    }

    print(f"{act.upper()} - Test Accuracy: {test_acc:.4f}")



## 6. WandB Integration for Experiment Tracking

To use WandB for comprehensive experiment tracking, uncomment and modify the code below.



In [None]:
# Example WandB run configuration (a single tracked run, not a sweep).
# Uncomment and run this cell to log one training run to WandB.
#
# The example uses the same refactored API as the rest of this notebook
# (NetworkConfig / FeedForwardNN / OptimizerConfig / Optimizer / TrainingConfig).
# It builds its own model, optimizer and training config under wandb_* names so
# it neither reuses an already-trained optimizer nor overwrites `model` and
# `history` from the baseline cells.

# import wandb
#
# wandb.init(project="numpy-neural-network",
#            name="fashion-mnist-experiment",
#            config={
#                "input_size": input_size,
#                "hidden_sizes": [128, 64],
#                "output_size": num_classes,
#                "activation": "relu",
#                "loss": "cross_entropy",
#                "learning_rate": 0.001,
#                "optimizer": "adam",
#                "l2_coeff": 0.0001,
#                "weight_init": "he",
#                "batch_size": 32,
#                "num_epochs": 20,
#                "dataset": "fashion-mnist"
#            })
#
# wandb_network_config = NetworkConfig(
#     input_size=input_size,
#     hidden_sizes=[128, 64],
#     output_size=num_classes,
#     activation="relu",
#     output_activation="softmax",
#     weight_init=wandb.config.weight_init,
#     l2_coeff=wandb.config.l2_coeff,
# )
# wandb_model = FeedForwardNN(wandb_network_config)
# wandb_optimizer = Optimizer(OptimizerConfig(
#     optimizer_type=wandb.config.optimizer,
#     learning_rate=wandb.config.learning_rate,
# ))
# # NOTE(review): use_wandb=True is assumed to make train_model log per-epoch
# # metrics to the active run — confirm against src.training.trainer.
# wandb_train_config = TrainingConfig(
#     batch_size=wandb.config.batch_size,
#     num_epochs=wandb.config.num_epochs,
#     loss="cross_entropy",
#     use_wandb=True,
# )
#
# wandb_history = train_model(wandb_model, wandb_optimizer, X_train, y_train, X_val, y_val, config=wandb_train_config)
#
# # Log final test accuracy
# wandb_test_prob = wandb_model.predict_proba(X_test)
# wandb_test_acc = accuracy(y_test_onehot, wandb_test_prob)
# wandb.log({"test_accuracy": wandb_test_acc})
#
# wandb.finish()

print("WandB integration example (commented out). Uncomment to use.")



## 7. CIFAR-10 Experiment (Optional)

You can also experiment with the CIFAR-10 dataset. Note that it requires more computational resources.



In [None]:
# Uncomment to load and train on CIFAR-10
# Note: CIFAR-10 has larger images (32x32x3 = 3072 features) and will take longer to train
#
# The pipeline mirrors the Fashion-MNIST cells above: load -> validation split ->
# configure -> train -> evaluate. Every variable carries a *_cifar suffix so that
# running this cell does not overwrite the Fashion-MNIST data, model or history.

# print("Loading CIFAR-10 dataset...")
# X_train_cifar_raw, y_train_cifar_raw, X_test_cifar, y_test_cifar = load_dataset("cifar10", source="keras")
# X_train_cifar, y_train_cifar, X_val_cifar, y_val_cifar = train_val_split(X_train_cifar_raw, y_train_cifar_raw, validation_split=0.1)
# 
# num_classes_cifar = int(np.max(y_train_cifar_raw)) + 1
# input_size_cifar = X_train_cifar.shape[1]
# 
# print(f"CIFAR-10 Training set: {X_train_cifar.shape}")
# print(f"CIFAR-10 Input size: {input_size_cifar}")
# 
# # One-hot encode labels for evaluation convenience
# # (only the test labels are encoded; train_model below receives integer labels)
# y_test_cifar_onehot = one_hot_encode(y_test_cifar, num_classes_cifar)
# 
# # Create and train model for CIFAR-10
# network_config_cifar = NetworkConfig(
#     input_size=input_size_cifar,
#     hidden_sizes=[256, 128, 64],  # Larger network for CIFAR-10
#     output_size=num_classes_cifar,
#     activation='relu',
#     output_activation='softmax',
#     weight_init='he',
#     l2_coeff=1e-4,
# )
# model_cifar = FeedForwardNN(network_config_cifar)
# optimizer_config_cifar = OptimizerConfig(optimizer_type='adam', learning_rate=1e-3)
# optimizer_cifar = Optimizer(optimizer_config_cifar)
# train_config_cifar = TrainingConfig(batch_size=64, num_epochs=30, loss='cross_entropy', use_wandb=False)
# 
# print("Training on CIFAR-10...")
# history_cifar = train_model(model_cifar, optimizer_cifar, X_train_cifar, y_train_cifar, X_val_cifar, y_val_cifar, config=train_config_cifar)
# 
# # Evaluate
# test_prob_cifar = model_cifar.predict_proba(X_test_cifar)
# test_acc_cifar = accuracy(y_test_cifar_onehot, test_prob_cifar)
# print(f"CIFAR-10 Test Accuracy: {test_acc_cifar:.4f}")

print("CIFAR-10 experiment code (commented out). Uncomment to run.")

