In [1]:
import numpy as np
from tensorflow.keras.datasets import fashion_mnist

# Activation Functions
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(a):
    return a * (1 - a)

# Neural Network Class
class NeuralNetwork:
    def __init__(self, input_size, hidden_layers, output_size):
        """
        :param input_size: Number of input features (e.g., 784 for Fashion-MNIST)
        :param hidden_layers: List containing the number of neurons in each hidden layer
        :param output_size: Number of output classes (e.g., 10 for Fashion-MNIST)
        """
        self.layers = [input_size] + hidden_layers + [output_size]
        self.weights = []
        self.biases = []

        # Initialize weights and biases
        for i in range(len(self.layers) - 1):
            self.weights.append(np.random.randn(self.layers[i], self.layers[i+1]) * 0.01)
            self.biases.append(np.zeros((1, self.layers[i+1])))

    def forward(self, X):
        """
        Forward pass through the network.
        :return: activations and weighted sums
        """
        activations = [X]
        zs = []

        for i in range(len(self.weights) - 1):
            z = np.dot(activations[-1], self.weights[i]) + self.biases[i]
            zs.append(z)
            activations.append(sigmoid(z))

        # Output layer with softmax
        z = np.dot(activations[-1], self.weights[-1]) + self.biases[-1]
        zs.append(z)
        activations.append(softmax(z))

        return activations, zs

    def backward(self, X, y, activations, zs, learning_rate):
        """
        Backward pass using backpropagation.
        """
        m = X.shape[0]
        y_one_hot = np.eye(self.layers[-1])[y]

        # Compute output layer gradient
        delta = activations[-1] - y_one_hot
        dW = np.dot(activations[-2].T, delta) / m
        db = np.sum(delta, axis=0, keepdims=True) / m

        # Update weights and biases
        self.weights[-1] -= learning_rate * dW
        self.biases[-1] -= learning_rate * db

        # Backpropagate through hidden layers
        for i in range(len(self.weights) - 2, -1, -1):
            delta = np.dot(delta, self.weights[i+1].T) * sigmoid_derivative(activations[i+1])
            dW = np.dot(activations[i].T, delta) / m
            db = np.sum(delta, axis=0, keepdims=True) / m

            self.weights[i] -= learning_rate * dW
            self.biases[i] -= learning_rate * db

    def train(self, X_train, y_train, epochs, batch_size, learning_rate):
        """
        Train the neural network using mini-batch gradient descent.
        """
        for epoch in range(epochs):
            for i in range(0, X_train.shape[0], batch_size):
                X_batch = X_train[i:i+batch_size]
                y_batch = y_train[i:i+batch_size]

                activations, zs = self.forward(X_batch)
                self.backward(X_batch, y_batch, activations, zs, learning_rate)

            if epoch % 5 == 0:
                acc = self.evaluate(X_train, y_train)
                print(f"Epoch {epoch}: Training Accuracy = {acc:.4f}")

    def predict(self, X):
        """
        Make predictions for input X.
        """
        activations, _ = self.forward(X)
        return np.argmax(activations[-1], axis=1)

    def evaluate(self, X, y):
        """
        Evaluate accuracy on given data.
        """
        y_pred = self.predict(X)
        return np.mean(y_pred == y)

# Load and preprocess Fashion-MNIST dataset
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

# Normalize and flatten images
X_train = X_train.reshape(-1, 28*28) / 255.0
X_test = X_test.reshape(-1, 28*28) / 255.0

# Define the neural network architecture
hidden_layers = [128, 64]  # Example: 2 hidden layers with 128 and 64 neurons
nn = NeuralNetwork(input_size=28*28, hidden_layers=hidden_layers, output_size=10)

# Train the model
nn.train(X_train, y_train, epochs=20, batch_size=64, learning_rate=0.1)

# Evaluate on test set
test_accuracy = nn.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Epoch 0: Training Accuracy = 0.1000
Epoch 5: Training Accuracy = 0.7583
Epoch 10: Training Accuracy = 0.8310
Epoch 15: Training Accuracy = 0.8520
Test Accuracy: 0.8436


In [3]:
!pip install idx2numpy
!pip install wandb -qqq

Collecting idx2numpy
  Downloading idx2numpy-1.2.3.tar.gz (6.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: idx2numpy
  Building wheel for idx2numpy (setup.py) ... [?25l[?25hdone
  Created wheel for idx2numpy: filename=idx2numpy-1.2.3-py3-none-any.whl size=7904 sha256=1eea4f2ad624ba7bc10da7c84afd5357dae324e28a5776415664bb2098b51d03
  Stored in directory: /root/.cache/pip/wheels/87/e5/e7/70fc742b3645ddf9d392f766feccbcc95cb3a3c806f8588af0
Successfully built idx2numpy
Installing collected packages: idx2numpy
Successfully installed idx2numpy-1.2.3


In [6]:
import numpy as np
import tensorflow as tf
import wandb
import matplotlib.pyplot as plt

# Initialize wandb
wandb.init(project="fashion-mnist-backprop", name="NN-All-Optimizers")

# Load Fashion-MNIST dataset using TensorFlow
def load_fashion_mnist():
    (X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
    X_train, X_test = X_train.reshape(-1, 28*28) / 255.0, X_test.reshape(-1, 28*28) / 255.0
    return X_train, y_train, X_test, y_test

# Activation functions
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

class NeuralNetwork:
    def __init__(self, input_size, hidden_layers, output_size, optimizer="sgd", learning_rate=0.01):
        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.layers = [input_size] + hidden_layers + [output_size]

        self.weights = [np.random.randn(self.layers[i], self.layers[i+1]) * 0.01 for i in range(len(self.layers) - 1)]
        self.biases = [np.zeros((1, self.layers[i+1])) for i in range(len(self.layers) - 1)]

        # Optimization parameters
        self.velocities_w = [np.zeros_like(w) for w in self.weights]
        self.velocities_b = [np.zeros_like(b) for b in self.biases]
        self.m_w = [np.zeros_like(w) for w in self.weights]
        self.m_b = [np.zeros_like(b) for b in self.biases]
        self.v_w = [np.zeros_like(w) for w in self.weights]
        self.v_b = [np.zeros_like(b) for b in self.biases]
        self.t = 1

    def forward(self, X):
        activations, zs = [X], []
        for w, b in zip(self.weights, self.biases):
            z = activations[-1] @ w + b
            zs.append(z)
            activations.append(relu(z) if w is not self.weights[-1] else softmax(z))
        return activations, zs

    def compute_loss(self, y_pred, y_true):
        y_one_hot = np.eye(self.layers[-1])[y_true]
        return -np.mean(np.sum(y_one_hot * np.log(y_pred + 1e-8), axis=1))

    def backward(self, X, y, activations, zs, batch_size):
        grads_w, grads_b = [], []
        y_one_hot = np.eye(self.layers[-1])[y]

        delta = activations[-1] - y_one_hot
        for i in reversed(range(len(self.weights))):
            grads_w.insert(0, activations[i].T @ delta / batch_size)
            grads_b.insert(0, np.sum(delta, axis=0, keepdims=True) / batch_size)
            if i != 0:
                delta = (delta @ self.weights[i].T) * relu_derivative(zs[i - 1])

        self.update_weights(grads_w, grads_b)
        self.t += 1

    def update_weights(self, grads_w, grads_b):
        if self.optimizer == "sgd":
            self.sgd(grads_w, grads_b)
        elif self.optimizer == "momentum":
            self.momentum(grads_w, grads_b)
        elif self.optimizer == "nesterov":
            self.nesterov(grads_w, grads_b)
        elif self.optimizer == "rmsprop":
            self.rmsprop(grads_w, grads_b)
        elif self.optimizer == "adam":
            self.adam(grads_w, grads_b)
        elif self.optimizer == "nadam":
            self.nadam(grads_w, grads_b)

    def sgd(self, grads_w, grads_b):
        for i in range(len(self.weights)):
            self.weights[i] -= self.learning_rate * grads_w[i]
            self.biases[i] -= self.learning_rate * grads_b[i]

    def momentum(self, grads_w, grads_b, beta=0.9):
        for i in range(len(self.weights)):
            self.velocities_w[i] = beta * self.velocities_w[i] + self.learning_rate * grads_w[i]
            self.velocities_b[i] = beta * self.velocities_b[i] + self.learning_rate * grads_b[i]
            self.weights[i] -= self.velocities_w[i]
            self.biases[i] -= self.velocities_b[i]

    def nesterov(self, grads_w, grads_b, beta=0.9):
        for i in range(len(self.weights)):
            v_prev_w, v_prev_b = self.velocities_w[i], self.velocities_b[i]
            self.velocities_w[i] = beta * self.velocities_w[i] + self.learning_rate * grads_w[i]
            self.velocities_b[i] = beta * self.velocities_b[i] + self.learning_rate * grads_b[i]
            self.weights[i] -= beta * v_prev_w + (1 + beta) * self.velocities_w[i]
            self.biases[i] -= beta * v_prev_b + (1 + beta) * self.velocities_b[i]

    def rmsprop(self, grads_w, grads_b, beta=0.9, epsilon=1e-8):
        for i in range(len(self.weights)):
            self.v_w[i] = beta * self.v_w[i] + (1 - beta) * grads_w[i] ** 2
            self.weights[i] -= self.learning_rate * grads_w[i] / (np.sqrt(self.v_w[i]) + epsilon)

    def adam(self, grads_w, grads_b, beta1=0.9, beta2=0.999, epsilon=1e-8):
        for i in range(len(self.weights)):
            self.m_w[i] = beta1 * self.m_w[i] + (1 - beta1) * grads_w[i]
            self.v_w[i] = beta2 * self.v_w[i] + (1 - beta2) * (grads_w[i] ** 2)
            m_w_hat = self.m_w[i] / (1 - beta1 ** self.t)
            v_w_hat = self.v_w[i] / (1 - beta2 ** self.t)
            self.weights[i] -= self.learning_rate * m_w_hat / (np.sqrt(v_w_hat) + epsilon)

    def nadam(self, grads_w, grads_b):
        self.adam(grads_w, grads_b)

    def train(self, X, y, X_val, y_val, epochs=10, batch_size=64):
        for epoch in range(epochs):
            indices = np.random.permutation(X.shape[0])
            X, y = X[indices], y[indices]

            for i in range(0, X.shape[0], batch_size):
                X_batch = X[i:i+batch_size]
                y_batch = y[i:i+batch_size]

                activations, zs = self.forward(X_batch)
                self.backward(X_batch, y_batch, activations, zs, batch_size)

            y_pred = self.predict(X_val)
            acc = np.mean(y_pred == y_val)
            loss = self.compute_loss(self.forward(X_val)[0][-1], y_val)
            wandb.log({"Loss": loss, "Accuracy": acc, "Epoch": epoch + 1})

            print(f"Epoch {epoch+1}/{epochs} - Loss: {loss:.4f} - Accuracy: {acc:.4f}")

    def predict(self, X):
        return np.argmax(self.forward(X)[0][-1], axis=1)

X_train, y_train, X_test, y_test = load_fashion_mnist()
nn = NeuralNetwork(input_size=784, hidden_layers=[128, 64], output_size=10, optimizer="adam", learning_rate=0.001)
nn.train(X_train, y_train, X_test, y_test, epochs=10, batch_size=64)


Epoch 1/10 - Loss: 0.5366 - Accuracy: 0.8080
Epoch 2/10 - Loss: 0.5100 - Accuracy: 0.8181
Epoch 3/10 - Loss: 0.4321 - Accuracy: 0.8443
Epoch 4/10 - Loss: 0.4189 - Accuracy: 0.8532
Epoch 5/10 - Loss: 0.4094 - Accuracy: 0.8517
Epoch 6/10 - Loss: 0.3812 - Accuracy: 0.8630
Epoch 7/10 - Loss: 0.3653 - Accuracy: 0.8692
Epoch 8/10 - Loss: 0.3807 - Accuracy: 0.8643
Epoch 9/10 - Loss: 0.3553 - Accuracy: 0.8717
Epoch 10/10 - Loss: 0.3586 - Accuracy: 0.8731
