In [67]:
import numpy as np
import matplotlib.pyplot as plt
import struct

layer_sizes = [784, 256, 128, 64, 32, 10]
learning_rate = 0.01
epochs = 100
batch_size = 128

class NeuralNetwork:
    def __init__(self, activation_func, activation_derivative, weight_init_func, N=5, layer_sizes=[784, 256, 128, 64, 32, 10], lr=0.01, epochs=100, batch_size=128):
        self.N = N   
        self.layer_sizes = layer_sizes 
        self.lr = lr  
        self.activation_func = activation_func 
        self.activation_derivative = activation_derivative 
        self.weight_init_func = weight_init_func 
        self.epochs = epochs 
        self.batch_size = batch_size 
        self.weights = []
        self.biases = [] 
 
        self.initialize_weights()
    
    def fit(self, X, Y, X_val, Y_val):
        train_losses = []
        val_losses = []

        for epoch in range(self.epochs): 
            indices = np.random.permutation(X.shape[0])
            X, Y = X[indices], Y[indices]
 
            batch_losses = []
            for i in range(0, X.shape[0], self.batch_size):
                X_batch = X[i:i + self.batch_size]
                Y_batch = Y[i:i + self.batch_size]
                 
                activations, z_values = self.forward(X_batch)
                loss = self.cross_entropy_loss(Y_batch, activations[-1])
                batch_losses.append(loss)
 
                weight_grads, bias_grads = self.backpropagate(activations, z_values, Y_batch)
 
                for j in range(len(self.weights)):
                    self.weights[j] -= self.lr * weight_grads[j]
                    self.biases[j] -= self.lr * bias_grads[j]

            train_losses.append(np.mean(batch_losses))
 
            val_activations, _ = self.forward(X_val)
            val_loss = self.cross_entropy_loss(Y_val, val_activations[-1])
            val_losses.append(val_loss)

            # print(f"Epoch {epoch+1}/{self.epochs}, Training Loss: {train_losses[-1]}, Validation Loss: {val_losses[-1]}")

        return train_losses, val_losses
    
    def score(self, X, Y):
        predictions = self.predict(X)
        return np.mean(predictions == Y)

    def predict(self, X):
        activations, _ = self.forward(X)
        return np.argmax(activations[-1], axis=1)

    def predict_proba(self, X):
        activations, _ = self.forward(X)
        return activations[-1]
    
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def cross_entropy_loss(self, y_true, y_pred): 
        return -np.mean(np.sum(y_true * np.log(y_pred + 1e-8), axis=1))
    
    def initialize_weights(self): 
        for i in range(1, self.N):
            self.weights.append(self.weight_init_func((self.layer_sizes[i-1], self.layer_sizes[i])))
            self.biases.append(np.zeros((1, self.layer_sizes[i])))

    def forward(self, X): 
        activ = [X]
        D = []
 
        for i in range(len(self.weights) - 1):
            z = np.dot(activ[-1], self.weights[i]) + self.biases[i]
            D.append(z)
            activ.append(self.activation_func(z))
 
        z = np.dot(activ[-1], self.weights[-1]) + self.biases[-1]
        activ.append(self.softmax(z))
        D.append(z)

        return activ, D

    def backpropagate(self, activations, z_values, y_true): 
        biasGrad = [np.zeros_like(b) for b in self.biases] 
        weightGrad = [np.zeros_like(w) for w in self.weights]
        
        delta = activations[-1] - y_true
        weightGrad[-1] = np.dot(activations[-2].T, delta) / y_true.shape[0]
        biasGrad[-1] = np.sum(delta, axis=0, keepdims=True) / y_true.shape[0]

        for i in range(len(self.weights) - 2, -1, -1):
            delta = np.dot(delta, self.weights[i + 1].T) * self.activation_derivative(z_values[i])
            
            weightGrad[i] = np.dot(activations[i].T, delta) / y_true.shape[0]
            biasGrad[i] = np.sum(delta, axis=0, keepdims=True) / y_true.shape[0]
            
        return weightGrad, biasGrad

In [68]:
import numpy as np
import struct
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

def returnDataset():
    def load_Lab(filename):
        with open(filename, 'rb') as f:
            _ = struct.unpack(">II", f.read(8))
            labels = np.frombuffer(f.read(), dtype=np.uint8)
        return labels
    
    def load_Im(filename):
        with open(filename, 'rb') as f:
            _ = struct.unpack(">IIII", f.read(16))
            data = np.frombuffer(f.read(), dtype=np.uint8).reshape(-1, 28*28) / 255.0
        return data

    train_images_path = '/Users/shamiksinha/Desktop/MLassign3/train-images-idx3-ubyte/train-images-idx3-ubyte'
    train_labels_path = '/Users/shamiksinha/Desktop/MLassign3/train-labels-idx1-ubyte/train-labels-idx1-ubyte'
    X_train = load_Im(train_images_path)
    y_train = load_Lab(train_labels_path)
    
    test_images_path = '/Users/shamiksinha/Desktop/MLassign3/test-images-idx3-ubyte/t10k-images-idx3-ubyte'
    test_labels_path = '/Users/shamiksinha/Desktop/MLassign3/test-labels-idx1-ubyte/t10k-labels-idx1-ubyte'
    X_test = load_Im(test_images_path)
    y_test = load_Lab(test_labels_path)

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

    encoder = OneHotEncoder(sparse_output=False)
    y_train = encoder.fit_transform(y_train.reshape(-1, 1))
    y_val = encoder.transform(y_val.reshape(-1, 1))
    y_test = encoder.transform(y_test.reshape(-1, 1))
    
    print()
    
    return X_train, X_val, X_test, y_train, y_val, y_test

X_train, X_val, X_test, y_train, y_val, y_test = returnDataset()

print(X_train.shape) 
print(X_test.shape) 
print(X_val.shape) 
print(y_train.shape) 
print(y_test.shape) 
print(y_val.shape) 


(48000, 784)
(6000, 784)
(6000, 784)
(48000, 10)
(6000, 10)
(6000, 10)


In [69]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
 
def sigmoid(x): 
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x): 
    return sigmoid(x) * (1 - sigmoid(x))

def relu(x): 
    return np.maximum(0, x)

def relu_derivative(x): 
    return (x > 0).astype(float)

def leaky_relu(x, alpha=0.01): 
    return np.where(x > 0, x, alpha * x)

def leaky_relu_derivative(x, alpha=0.01): 
    dx = np.ones_like(x); dx[x < 0] = alpha; return dx

def tanh(x): 
    return np.tanh(x)

def tanh_derivative(x): 
    return 1 - np.tanh(x) ** 2

In [70]:
def zero_init(shape): 
    return np.zeros(shape)

def random_init(shape): 
    return np.random.uniform(-1/np.sqrt(shape[0]), 1/np.sqrt(shape[0]), shape)

def normal_init(shape):
    return np.random.normal(0, np.sqrt(2/shape[0]), shape)

In [71]:
activations = { "sigmoid": (sigmoid, sigmoid_derivative), "relu": (relu, relu_derivative), "leaky_relu": (leaky_relu, leaky_relu_derivative), "tanh": (tanh, tanh_derivative)}
initializations = { "random_init": random_init, "zero_init": zero_init, "normal_init": normal_init }

In [72]:
results = {}
 
for init_name, init_func in initializations.items():
    for act_name, (activation_func, activation_derivative) in activations.items(): 
         
        model = NeuralNetwork( 
            N=len(layer_sizes),
            activation_func=activation_func,
            activation_derivative=activation_derivative,
            weight_init_func=init_func,  
        )
        
        print(X_train.shape) 
        print(X_test.shape) 
        print(X_val.shape) 
        print(y_train.shape) 
        print(y_test.shape) 
        print(y_val.shape) 
 
        trainLosses, valLosses = model.fit(X_train, y_train, X_val, y_val)
        testPredictions = model.predict(X_test)
        testLabels = np.argmax(y_test, axis=1)
        testAccuracy = np.score(testPredictions, testLabels)
 
        results[f"{init_name}_{act_name}"] = {
            "val_losses": valLosses,
            "train_losses": trainLosses,
            "test_accuracy": testAccuracy
        }

        filename = f"model_{init_name}_{act_name}.pkl"
        with open(filename, 'wb') as f:
            pickle.dump(model, f)

(48000, 784)
(6000, 784)
(6000, 784)
(48000, 10)
(6000, 10)
(6000, 10)


KeyboardInterrupt: 

In [None]:
 
for init_name in initializations.keys():
    plt.figure(figsize=(10, 8))
    for act_name in activations.keys():
        combo_key = f"{init_name}_{act_name}"
        if combo_key in results: 
            plt.plot(results[combo_key]["train_losses"], label=f"{act_name} - Train Loss")
            plt.plot(results[combo_key]["val_losses"], linestyle="--", label=f"{act_name} - Val Loss")
    plt.ylabel('Loss')
    plt.xlabel('Epochs')
    plt.legend()
    plt.title(f'Training and Validation Loss for {init_name} Initialization')
    plt.show()

for combo, metrics in results.items():
    print(f"{combo.capitalize()} Test Accuracy: {metrics['test_accuracy'] * 100:.2f}%")


In [None]:
activations = ["sigmoid", "tanh", "relu", "leaky_relu"]
initializations = ["zero_init", "random_init", "normal_init"]

In [9]:
import pickle
 
for init in initializations:
    for act in activations:
        filename = f"model_{init}_{act}.pkl"
        try:
            with open(filename, 'rb') as f:
                model = pickle.load(f) 
            test_pred = model.predict(X_test)
            test_lab = np.argmax(y_test, axis=1)
            test_acc = np.score(test_pred, test_lab)
            print(f"Test Accuracy : {act} with {init}: {test_acc * 100:.2f}%")
        except FileNotFoundError:
            print(f"ERROR")


Test Accuracy for sigmoid with zero_init: 10.67%
Test Accuracy for tanh with zero_init: 10.67%
Test Accuracy for relu with zero_init: 10.67%
Test Accuracy for leaky_relu with zero_init: 10.67%
Test Accuracy for sigmoid with random_init: 10.67%
Test Accuracy for tanh with random_init: 96.85%
Test Accuracy for relu with random_init: 97.22%
Test Accuracy for leaky_relu with random_init: 96.80%
Test Accuracy for sigmoid with normal_init: 87.30%
Test Accuracy for tanh with normal_init: 97.25%
Test Accuracy for relu with normal_init: 97.38%
Test Accuracy for leaky_relu with normal_init: 97.23%
