**Defining OOP class for Hyper Parameters, Weight, Bias, Activation functions :**
---
---

In [1]:
import numpy as np
import pandas as pd

In [2]:
class HyperParameters:
    def __init__(self, learning_rate=0.01, epochs=10, mini_batch_size=None, beta=.9, layers=None, beta1=.9, beta2=.998, lambd=0):
        if layers is None:
            layers = [10, 20, 10]
        self.layers = layers
        self.no_l = len(layers)
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = mini_batch_size
        self.beta = beta
        self.beta1 = beta1
        self.beta2 = beta2
        self.lambd = lambd

class WeightAndBias:
    def __init__(self, number_features, layers, initialisation_type="random", weights=None, biases=None):

        self.initialisation_type = initialisation_type
        self.layers = [number_features] + layers
        if self.initialisation_type == "random":
            self.weights = [pd.DataFrame()] + [np.random.randn(self.layers[i+1], self.layers[i]) * 0.01 for i in range(len(self.layers)-1)]
            self.biases = [pd.DataFrame()] + [np.zeros([self.layers[i+1], 1]) for i in range(len(self.layers)-1)]
        elif self.initialisation_type == "manual":
            self.weights = [pd.DataFrame()] + weights
            self.biases = [pd.DataFrame()] + biases

    def update_learning_parameters(self, no_l, hp_obj, dW, db, m_training) :
        for l in range(1, no_l+1):
            self.biases[l] =  self.biases[l] - hp_obj.learning_rate * db[l]
            self.weights[l] = (1 - (hp_obj.lambd *  hp_obj.learning_rate)/m_training) * self.weights[l] - hp_obj.learning_rate * dW[l]

class ActivationFunctions:
    def __init__(self, layers, activation_functions=None) :
        if activation_functions is None:
            activation_functions= ['tanh'] * (len(layers) - 1) + ['softmax']

        self.activation_functions = [None] + [eval(f'ActivationFunctions.{activation_function}')
                                     for activation_function in activation_functions]

        self.derivative_functions = [None] + [eval(f'ActivationFunctions.{activation_function}_derivative')
                                     for activation_function in activation_functions]

    @staticmethod
    def sigmoid(z) :
        return 1 / (1 + np.exp( -z ))

    @staticmethod
    def relu(z) :
        return np.where(z>0, z, 0.0001 * z )

    @staticmethod
    def tanh(z) :
        # return np.tanh(z
        z = np.clip(z, -20, 20)
        return (np.exp(z) - np.exp(-z))/ (np.exp(z) + np.exp(-z))

    @staticmethod
    def softmax(z):
        z = np.clip(z, -20, 20)
        return np.exp(z) / np.sum(np.exp(z), axis=0)

    @staticmethod
    def softmax_derivative(y, a) :
        return a - y

    @staticmethod
    def sigmoid_derivative(z) :
        return  (1 / (1 + np.exp( -z ))) * (1 - ( 1 / (1 + np.exp( -z ))))

    @staticmethod
    def tanh_derivative(z) :
        return (1 - np.tanh(z) ** 2)

    @staticmethod
    def relu_derivative(z) :
        return (z > 0) * 1

    @staticmethod
    def calculate_loss(a, y, m, hp, lp) :
            return (-1/m * np.sum(np.multiply(y, np.log(a))), -1/m * np.sum(np.multiply(y, np.log(a))) + hp.lambd/(2 *m ) * sum(np.sum(np.square(lp.weights[i])) for i in range(1, hp.no_l+1)))

**Defining OOP class for Neural Network :**
---
---

In [3]:
class NeuralNetwork:
    def __init__(self, X_train, y_train, HyperParameters, activation_functions=None) :

        self.X_train, self.y_train = X_train, y_train
        self.n, self.m = X_train.shape

        print(f"number of training examples: {self.m}\nnumber of features: {self.n}"
              f"\nshape of y_train {self.y_train.shape}")

        #hp --> hyperparameters
        self.hp = HyperParameters
        self.layers = self.hp.layers
        self.no_l = self.hp.no_l

        self.act_function_obj = ActivationFunctions(self.layers, activation_functions=activation_functions)

        #lp --> learning parameters -> weights and biases
        self.lp = WeightAndBias(self.n, self.layers)

        if self.hp.batch_size is None:
            self.hp.batch_size = self.m


    def forward_propagation(self, X_batch) :
        self.Z, self.A = [0] + [None] * self.no_l, [X_batch ] + [None] * self.no_l
        activation_functions = self.act_function_obj.activation_functions

        for l in range(1, self.no_l + 1):
            self.Z[l] = np.dot(self.lp.weights[l], self.A[l-1]) + self.lp.biases[l]
            self.A[l] = activation_functions[l](self.Z[l])

    def back_propagation(self, y_batch) :

        derivative_functions = self.act_function_obj.derivative_functions
        batch_size = y_batch.shape[1]

        self.dZ =[None] +  [None] * self.no_l
        self.dW =[None] +  [None] * self.no_l
        self.db =[None] +  [None] * self.no_l

        self.dZ[self.no_l] = derivative_functions[self.no_l](y_batch, self.A[self.no_l])
        self.dW[self.no_l] = 1/batch_size * np.dot(self.dZ[self.no_l] , self.A[self.no_l - 1].T)
        self.db[self.no_l] = 1/batch_size * np.sum(self.dZ[self.no_l], axis=1, keepdims=True)

        assert self.dZ[self.no_l].shape == self.Z[self.no_l].shape
        assert self.db[self.no_l].shape == self.lp.biases[self.no_l].shape
        assert self.dW[self.no_l].shape == self.lp.weights[self.no_l].shape

        for l in range(self.no_l - 1, 0, -1) :

            self.dZ[l] = np.dot(self.lp.weights[l+1].T, self.dZ[l+1] )* derivative_functions[l](self.Z[l])
            self.dW[l] = 1/batch_size * np.dot(self.dZ[l], self.A[l-1].T)
            self.db[l] = 1/batch_size * np.sum(self.dZ[l], axis=1, keepdims=True)

            assert self.dZ[l].shape == self.Z[l].shape
            assert self.dW[l].shape == self.lp.weights[l].shape
            assert self.db[l].shape == self.lp.biases[l].shape


    def train_nn(self, verbose=False, per_epoch_log=100) :
        for epoch in range(self.hp.epochs):
            for batch_s in range(0, self.m, self.hp.batch_size) :

                batch_e = min(batch_s + self.hp.batch_size, self.m)

                X_batch = self.X_train[:, batch_s: batch_e]
                y_batch = self.y_train[:, batch_s: batch_e]
                m_batch_size = batch_e - batch_s

                self.forward_propagation(X_batch)
                self.back_propagation(y_batch)
                self.lp.update_learning_parameters(self.no_l, self.hp,  self.dW, self.db, m_batch_size)

            if verbose and epoch % per_epoch_log == 0:
                print(f"epochs {epoch} loss: ",ActivationFunctions.calculate_loss(self.A[self.no_l], y_batch, m_batch_size, self.hp, self.lp))

    def predict(self, X_test):
        self.forward_propagation(X_test)
        preds=  self.A[self.no_l].T
        return (preds == preds.max(axis=1)[:,None]).astype(int)

def one_hot_encode(labels, num_classes):
    return np.eye(num_classes)[labels.reshape(-1)]

**Answer-1**
---
---

In [4]:
def data_reader(path):
    data = pd.read_csv(path)
    X = np.array(data.drop(["label"], axis=1)) / 255.0 # normalizing the data to prevent overflow in np.exp
    Y = np.array(data.label).reshape(-1, 1)
    return X, Y

def train_test_split(X, Y):
    train_X = X[0:50000, :]
    test_X = X[50000:60000, :]
    train_Y = Y[0:50000, :]
    test_Y = Y[50000:60000, :]
    return train_X, train_Y, test_X, test_Y

X, Y = data_reader("mnist_train.csv")
train_X, train_Y, test_X, test_Y = train_test_split(X,Y)

**Answer-2**
---
---

In [5]:
def zero_hidden_network(X, Y, weight_matrix, bias_matrix, learning_rate):
    m = X.shape[0]
    y = one_hot_encode(Y, num_classes=10)

    layers = [10]
    activation_functions = ['softmax']
    hp = HyperParameters(layers=layers, learning_rate=learning_rate, epochs=1)
    nn = NeuralNetwork(X.T, y.T, hp, activation_functions=activation_functions)

    nn.lp = WeightAndBias(nn.n, nn.layers, initialisation_type="manual", weights=weight_matrix, biases=bias_matrix)
    nn.train_nn(verbose=True, per_epoch_log=1)

    return nn.lp.weights, nn.lp.biases

weights = np.ones((10, 784))
biases = np.ones((10, 1))

zero_hidden_network(train_X, train_Y, [weights], [biases], learning_rate=0.5)


number of training examples: 50000
number of features: 784
shape of y_train (10, 50000)
epochs 0 loss:  (2.3025850929940463, 2.3025850929940463)


([Empty DataFrame
  Columns: []
  Index: [],
  array([[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]])],
 [Empty DataFrame
  Columns: []
  Index: [],
  array([[0.99932],
         [1.00678],
         [0.99968],
         [1.00101],
         [0.99859],
         [0.99506],
         [0.99951],
         [1.00175],
         [0.99842],
         [0.99988]])])

**Answer-3**
---
---

In [6]:
def one_hidden_layer_network(X, Y, weight_matrices, bias_vectors, learning_rate):
    m = X.shape[0]
    y = one_hot_encode(Y, num_classes=10)

    layers = [10, 10]
    activation_functions = ['sigmoid', 'softmax']
    hp = HyperParameters(layers=layers, learning_rate=learning_rate, epochs=1)
    nn = NeuralNetwork(X.T, y.T, hp, activation_functions=activation_functions)

    nn.lp = WeightAndBias(nn.n, nn.layers, initialisation_type="manual", weights=weight_matrices, biases=bias_vectors)
    nn.train_nn(verbose=True, per_epoch_log=1)

    return nn.lp.weights, nn.lp.biases

weights = [np.ones((10, 784)), np.ones((10, 10))]
biases = [np.ones((10, 1)), np.ones((10, 1))]
one_hidden_layer_network(train_X, train_Y, weight_matrices=weights, bias_vectors=biases, learning_rate=0.5)

number of training examples: 50000
number of features: 784
shape of y_train (10, 50000)
epochs 0 loss:  (2.3025850929940463, 2.3025850929940463)


([Empty DataFrame
  Columns: []
  Index: [],
  array([[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]]),
  array([[0.99932, 0.99932, 0.99932, 0.99932, 0.99932, 0.99932, 0.99932,
          0.99932, 0.99932, 0.99932],
         [1.00678, 1.00678, 1.00678, 1.00678, 1.00678, 1.00678, 1.00678,
          1.00678, 1.00678, 1.00678],
         [0.99968, 0.99968, 0.99968, 0.99968, 0.99968, 0.99968, 0.99968,
          0.99968, 0.99968, 0.99968],
         [1.00101, 1.00101, 1.00101, 1.00101, 1.00101, 1.00101, 1.00101,
          1.00101, 1.00101, 1.00101],
         [0.99859, 0.99859, 0.99859, 0.99859, 0.99859, 0.99859, 0.99859,
          0.99859, 0.99859, 0.99859],
         [0.99506, 0.99506, 0.99506, 0.99506, 0.99506, 0.99506, 0.99506,
          0.99506, 0.99506, 0.99506],
         [0.99951, 0.99951, 0.999

**Answer-4**
---
---

In [7]:
def n_hidden_layer_network(X, Y, weight_matrices, bias_vectors, learning_rate):
    m = X.shape[0]
    y = one_hot_encode(Y, num_classes=10)

    layers = [10, 10, 10]
    activation_functions = ['sigmoid', 'sigmoid', 'softmax']
    hp = HyperParameters(layers=layers, learning_rate=learning_rate, epochs=1)
    nn = NeuralNetwork(X.T, y.T, hp, activation_functions=activation_functions)

    nn.lp = WeightAndBias(nn.n, nn.layers, initialisation_type="manual", weights=weight_matrices, biases=bias_vectors)
    nn.train_nn(verbose=True, per_epoch_log=1)

    return nn.lp.weights, nn.lp.biases

weights = [np.ones((10, 784)), np.ones((10, 10)), np.ones((10, 10))]
biases = [np.ones((10, 1)), np.ones((10, 1)), np.ones((10, 1))]

n_hidden_layer_network(train_X, train_Y, weights, biases, learning_rate=0.5)

number of training examples: 50000
number of features: 784
shape of y_train (10, 50000)
epochs 0 loss:  (2.3025850929940463, 2.3025850929940463)


([Empty DataFrame
  Columns: []
  Index: [],
  array([[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]]),
  array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
  array([[0.99932001, 0.99932001, 0.99932001, 0.99932001, 0.99932001,
          0.99932001, 0.99932001, 0.99932001, 0.99932001, 0.99932001],
         [1.00677989, 1.00677989, 1.00677989, 

**Answer-5**
---
---

In [8]:
def n_hidden_layer_network_with_different_activations(X, Y, weight_matrices, bias_vectors, learning_rate, activations = None):
    m = X.shape[0]
    y = one_hot_encode(Y, num_classes=10)

    layers = [10, 10, 10]
    activation_functions = activations + ['softmax']
    hp = HyperParameters(layers=layers, learning_rate=learning_rate, epochs=1)
    nn = NeuralNetwork(X.T, y.T, hp, activation_functions=activation_functions)

    nn.lp = WeightAndBias(nn.n, nn.layers, initialisation_type="manual", weights=weight_matrices, biases=bias_vectors)
    nn.train_nn(verbose=True, per_epoch_log=1)

    return nn.lp.weights, nn.lp.biases

weights = [np.ones((10, 784)), np.ones((10, 10)), np.ones((10, 10))]
biases = [np.ones((10, 1)), np.ones((10, 1)), np.ones((10, 1))]

n_hidden_layer_network_with_different_activations(train_X, train_Y, weights, biases, learning_rate=0.5, activations=['relu', 'sigmoid'])



number of training examples: 50000
number of features: 784
shape of y_train (10, 50000)
epochs 0 loss:  (2.3025850929940463, 2.3025850929940463)


([Empty DataFrame
  Columns: []
  Index: [],
  array([[1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         ...,
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.],
         [1., 1., 1., ..., 1., 1., 1.]]),
  array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]),
  array([[0.99932, 0.99932, 0.99932, 0.99932, 0.99932, 0.99932, 0.99932,
          0.99932, 0.99932, 0.99932],
         [1.00678, 1.00678, 1.00678, 1.00678, 1.00678, 1.00678, 1.00678,
   

**Answer-6**
---
---

In [9]:
def n_hidden_layer_network_with_different_activations_with_momentum(X, Y, weight_matrices, bias_vectors, learning_rate, activations, momentum):
    m = X.shape[0]
    y = one_hot_encode(Y, num_classes=10)

    layers = [10, 10, 10]
    activation_functions = activations + ['softmax']
    hp = HyperParameters(layers=layers, learning_rate=learning_rate, epochs=1, lambd=momentum)
    nn = NeuralNetwork(X.T, y.T, hp, activation_functions=activation_functions)

    nn.lp = WeightAndBias(nn.n, nn.layers, initialisation_type="manual", weights=weight_matrices, biases=bias_vectors)
    nn.train_nn(verbose=True, per_epoch_log=1)

    return nn.lp.weights, nn.lp.biases

weights = [np.ones((10, 784)), np.ones((10, 10)), np.ones((10, 10))]
biases = [np.ones((10, 1)), np.ones((10, 1)), np.ones((10, 1))]
n_hidden_layer_network_with_different_activations_with_momentum(train_X, train_Y, weights, biases, learning_rate=0.5, activations=['relu', 'sigmoid'], momentum=0.9)

number of training examples: 50000
number of features: 784
shape of y_train (10, 50000)
epochs 0 loss:  (2.3025850929940463, 2.3749437976981636)


([Empty DataFrame
  Columns: []
  Index: [],
  array([[0.999991, 0.999991, 0.999991, ..., 0.999991, 0.999991, 0.999991],
         [0.999991, 0.999991, 0.999991, ..., 0.999991, 0.999991, 0.999991],
         [0.999991, 0.999991, 0.999991, ..., 0.999991, 0.999991, 0.999991],
         ...,
         [0.999991, 0.999991, 0.999991, ..., 0.999991, 0.999991, 0.999991],
         [0.999991, 0.999991, 0.999991, ..., 0.999991, 0.999991, 0.999991],
         [0.999991, 0.999991, 0.999991, ..., 0.999991, 0.999991, 0.999991]]),
  array([[0.999991, 0.999991, 0.999991, 0.999991, 0.999991, 0.999991,
          0.999991, 0.999991, 0.999991, 0.999991],
         [0.999991, 0.999991, 0.999991, 0.999991, 0.999991, 0.999991,
          0.999991, 0.999991, 0.999991, 0.999991],
         [0.999991, 0.999991, 0.999991, 0.999991, 0.999991, 0.999991,
          0.999991, 0.999991, 0.999991, 0.999991],
         [0.999991, 0.999991, 0.999991, 0.999991, 0.999991, 0.999991,
          0.999991, 0.999991, 0.999991, 0.999991],

**Full Training on the MNIST data :**
---
---

In [11]:
y = one_hot_encode(train_Y, num_classes=10)
layers = [128, 128, 10]

activation_functions = ['relu', 'relu', 'softmax']
hp = HyperParameters(layers=layers, learning_rate=0.1, epochs=500, mini_batch_size=2048, lambd=.1)
nn = NeuralNetwork(train_X.T, y.T, hp, activation_functions=activation_functions)

nn.train_nn( verbose=True, per_epoch_log=10)

number of training examples: 50000
number of features: 784
shape of y_train (10, 50000)
epochs 0 loss:  (2.301991806586728, 2.302686457384858)
epochs 10 loss:  (2.1296114760823146, 2.1304435337149843)
epochs 20 loss:  (0.5920577935553815, 0.5948145000653382)
epochs 30 loss:  (0.4593165458278102, 0.46273964034121373)
epochs 40 loss:  (0.38272657989650166, 0.3866551688378101)
epochs 50 loss:  (0.31800784663937504, 0.32244456138854843)
epochs 60 loss:  (0.26604525139636676, 0.27097931136857667)
epochs 70 loss:  (0.22509320331415628, 0.2305137165850732)
epochs 80 loss:  (0.19502015352369334, 0.2009041024757114)
epochs 90 loss:  (0.170316661101825, 0.17664178519767426)
epochs 100 loss:  (0.14937101154373533, 0.15611714075866304)
epochs 110 loss:  (0.1317975277631281, 0.13894915143701253)
epochs 120 loss:  (0.11672276252018518, 0.12426671004961781)
epochs 130 loss:  (0.10371647946054009, 0.11164050990300521)
epochs 140 loss:  (0.09243151069399942, 0.10072291245576817)
epochs 150 loss:  (0.08

**Accuracy :**
---
---

In [15]:
print(f"The Accuracy of our MLP model is : {np.count_nonzero(nn.predict(test_X.T) == test_Y)/100}%")

The Accuracy of our MLP model is : 99.83%
