### Data


In [1]:
#import standard libraries
import torch
import sklearn
import numpy as np

In [2]:
import sklearn.datasets
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
def load_data():
    mnist = sklearn.datasets.fetch_openml('mnist_784')
    X, y = np.array(mnist.data), np.array(mnist.target).reshape(-1, 1)
    return X, encoder.fit_transform(y).toarray()

    

In [3]:
X, y = load_data()

In [4]:
X[0].shape

(784,)

### building the model

In [5]:
class Network:
    def __init__(self, sizes, X, y):
        self.X = X
        self.y = y
        np.random.seed(4)
        # The list ``sizes`` contains the number of neurons in the respective layers of the network.
        self.num_layers = len(sizes)
        self.sizes = sizes
        # This create an array of random integers of shape (y, 1), y being the number of neuron in a layer
        # We can also see another observation which is for every layer we get of length y we get a vector
        # Each entry in this vector of the bias of one neuron
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(sizes[:-1], sizes[1:])]

    def sigmoid(self, x):
        x = np.clip(x, -500, 500)
        return 1 / (1 + np.exp(-x))
    
    def Forward_propogation(self):
        self.a = self.X.T
        self.previous_activation = []
        
        for layer in range(0, len(self.sizes) - 1):
            z = np.dot(self.weights[layer], self.a) + self.biases[layer]
            if layer == len(self.sizes) - 2:
                z_max = np.max(z, axis=0, keepdims=True)
                self.a = np.exp(z - z_max) / np.sum(np.exp(z - z_max), axis=0)
            else:
                self.a = self.sigmoid(z)
            self.previous_activation.append(self.a)
        return self.a
    def Cost(self):
        # we will use categorical cross entorpy 
        y = self.y
        epsilon = 10 ** -15
        y_pred = self.a.T
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return  -1 * np.sum((y * np.log(y_pred))) / len(y)
    def backwards(self):
        delta = (self.a - self.y.T)
        weights = []
        biases = []
        for layer in range(len(self.weights) - 1, -1, -1):
            if layer > 0:
                a_prev = self.previous_activation[layer - 1]  
            if layer == 0:
                a_prev = self.X.T  
            grad_weights = np.zeros(self.weights[layer].shape)
            grad_bias = np.zeros(self.biases[layer].shape)
            grad_weights = np.dot(delta, a_prev.T) 
            grad_bias = np.sum(delta, axis=1, keepdims=True)
            if layer > 0:
                delta = np.dot(self.weights[layer].T, delta) * (a_prev * (1 - a_prev))
            weights.append(grad_weights)
            biases.append(grad_bias)
        self.grad_weights = weights[::-1]
        self.grad_biases = biases[::-1]
        return self.grad_weights, self.grad_biases

    def optimizer(self, learning_rate=0.01 / 3, training_epochs=200, beta1=0.99, beta2= 0.999):
        t = 1
        epsilon = 1e-6
        weights_moments = [np.zeros(weight.shape) for weight in self.weights]
        biases_moments = [np.zeros(weight.shape) for weight in self.biases]
        weights_velocities = [np.zeros(weight.shape) for weight in self.weights]      
        biases_velocities = [np.zeros(weight.shape) for weight in self.biases]
        for _ in range(training_epochs):
            self.Forward_propogation()
            self.backwards()
            for layer in range(len(self.weights)):
                # weights_velocities[layer] = weights_velocities[layer] + (self.grad_weights[layer] * self.grad_weights[layer])
                # biases_velocities[layer] = biases_velocities[layer]+ (self.grad_biases[layer] * self.grad_biases[layer])
                weights_moments[layer] = beta1 * weights_moments[layer] + (1 - beta1) * self.grad_weights[layer]
                biases_moments[layer] = beta1 * biases_moments[layer] + (1 - beta1) * self.grad_biases[layer]
                weights_velocities[layer] = beta2 * weights_velocities[layer] + (1 - beta2) * (self.grad_weights[layer] * self.grad_weights[layer]) 
                biases_velocities[layer] = beta2 * biases_velocities[layer]+ (1 - beta2) * (self.grad_biases[layer] * self.grad_biases[layer])
                weights_moments_prime = weights_moments[layer] / (1 - beta1 ** t)
                biases_moments_prime = biases_moments[layer] / (1 - beta1 ** t)
                weights_velocities_prime = weights_velocities[layer] / (1 - beta2 ** t)
                biases_velocities_prime = biases_velocities[layer] / (1 - beta2 ** t)
                self.weights[layer] -= learning_rate * weights_moments_prime / np.sqrt(weights_velocities_prime+ epsilon)
                # self.weights[layer] -= learning_rate * self.grad_weights[layer]
                self.biases[layer] -= learning_rate * biases_moments_prime  / np.sqrt(biases_velocities_prime  + epsilon)
            t += 1
            print(f"Epoch {_}/{training_epochs} | Iteration {_} | Cost: {self.Cost():.6f}")

        #     grad_norms = [np.linalg.norm(gw) for gw in self.grad_weights]
        #     exploading_threshold = 1.0
        #     vanishing_threshold = 1e-6
        #     if all(norm < vanishing_threshold for norm in grad_norms):
        #         learning_rate *=5 
        #     if all(norm > exploading_threshold for norm in grad_norms):
        #         learning_rate *= 0.5
        # if abs(min(grad_norms) - max(grad_norms)) < 1e-8:  # Floating-point tolerant comparison
        #     print(f"CRITICAL: All gradients have identical magnitude ({grad_norms[0]:.6e})")
        #     # Emergency measures
        #     learning_rate *= 10  # Drastic LR increase
        #     for gw in self.grad_weights:  # Add noise to break symmetry
        #         gw += np.random.normal(0, 1e-6, gw.shape)
                
        
    def train(self):
        self.Forward_propogation()
        print("Starting training...")
        print(f"Initial Cost: {self.Cost():.6f}")  
        # Apply gradient descent with the specified parameters
        self.optimizer()
        
        print(f"Final Cost: {self.Cost():.6f}")
        print("Training complete!")
    def accuracy(self):
        preds = np.argmax(self.a.T, axis=1)
        targets = np.argmax(self.y, axis=1)
        return np.mean(preds == targets)



In [6]:
X = X / 255.0  # Scale pixel values to 0-1 range

In [7]:
nn = Network([784, 256, 128, 10], X, y)  # More efficient gradient flow
#nn = Network([784, 128, 10], X, y)

In [8]:
nn.train()

Starting training...
Initial Cost: 7.749283
Epoch 0/200 | Iteration 0 | Cost: 7.749283
Epoch 1/200 | Iteration 1 | Cost: 6.248049
Epoch 2/200 | Iteration 2 | Cost: 5.343812
Epoch 3/200 | Iteration 3 | Cost: 4.782090
Epoch 4/200 | Iteration 4 | Cost: 4.394985
Epoch 5/200 | Iteration 5 | Cost: 4.097846
Epoch 6/200 | Iteration 6 | Cost: 3.846191
Epoch 7/200 | Iteration 7 | Cost: 3.618595
Epoch 8/200 | Iteration 8 | Cost: 3.404706
Epoch 9/200 | Iteration 9 | Cost: 3.197660
Epoch 10/200 | Iteration 10 | Cost: 2.992148
Epoch 11/200 | Iteration 11 | Cost: 2.785703
Epoch 12/200 | Iteration 12 | Cost: 2.580287
Epoch 13/200 | Iteration 13 | Cost: 2.381518
Epoch 14/200 | Iteration 14 | Cost: 2.196442
Epoch 15/200 | Iteration 15 | Cost: 2.031337
Epoch 16/200 | Iteration 16 | Cost: 1.890116
Epoch 17/200 | Iteration 17 | Cost: 1.773552
Epoch 18/200 | Iteration 18 | Cost: 1.679662
Epoch 19/200 | Iteration 19 | Cost: 1.604822
Epoch 20/200 | Iteration 20 | Cost: 1.544847
Epoch 21/200 | Iteration 21 | C

In [9]:
nn.accuracy()

0.9118714285714286