In [35]:
import numpy as np
import pandas as pd

In [36]:
class Dense:
    def __init__(self,n_inputs,n_neurons,weight_regularizer_l1=0,weight_regularizer_l2=0,bias_regularizer_l1=0,bias_regularizer_l2=0):
        self.weights=0.01*np.random.randn(n_inputs,n_neurons) # Initialize weights with small random values (scaled by 0.01) for better convergence
        self.biases=np.zeros((1,n_neurons)) # Initialize biases to zeros
        # Set regularization strengths for weights (L1 and L2)
        self.weight_regularizer_l1=weight_regularizer_l1
        self.weight_regularizer_l2=weight_regularizer_l2
        # Set regularization strengths for biases (L1 and L2)
        self.bias_regularizer_l1=bias_regularizer_l1
        self.bias_regularizer_l2=bias_regularizer_l2

    # compute the layer's output using the formula: output = inputs • weights + biases
    def forward(self, inputs):
        self.inputs=inputs
        self.output=np.dot(inputs,self.weights)+self.biases

    # compute gradients of the loss with respect to inputs, weights, and biases
    def backward(self,dvalues):
        self.dweights=np.dot(self.inputs.T,dvalues) # Gradient with respect to weights: (inputs^T x dvalues)
        self.dbiases=np.sum(dvalues,axis=0,keepdims=True) # Gradient with respect to biases: sum over dvalues

        # L1 Weight - calculate only when l1 is greater than 0
        if self.weight_regularizer_l1 >0:
            dl1=np.ones_like(self.weights) # Create a matrix of ones with the same shape as weights
            dl1[self.weights<0]=-1  # For negative weight values, the gradient is -1
            self.dweights+=self.weight_regularizer_l1 *dl1  # Add the L1 regularization term to the gradient of the weights

        # L2 Weight - calculate only when l2 is greater than 0
        if self.weight_regularizer_l2 >0:
            self.dweights += 2* self.weight_regularizer_l2*self.weights # Add the L2 regularization term (derivative of squared weights)

        # L1 Bias - calculate only when l1 is greater than 0
        if self.bias_regularizer_l1 >0:
            dl1=np.ones_like(self.biases)
            dl1[self.biases<0]=-1
            self.dbiases+=self.bias_regularizer_l1 *dl1

        # L2 Bias - calculate only when l2 is greater than 0
        if self.bias_regularizer_l2 >0:
            self.dbiases += 2* self.bias_regularizer_l2*self.biases

        # Gradient with respect to inputs
        self.dinputs=np.dot(dvalues,self.weights.T) # Compute using the chain rule: (dvalues dot weights^T)

In [37]:
class ReLU:
    def forward(self,inputs):
        self.inputs=inputs
        self.output=np.maximum(0,inputs) # It returns the maximum value between 0 and each element in the inputs

    def backward(self,dvalues):
        self.dinputs=dvalues.copy()
        self.dinputs[self.inputs<=0]=0# Zero out gradients where the input values were less than or equal to 0 because the derivative of ReLU is 0 for negative inputs (and 0 at 0)


In [38]:
class Softmax:
    def forward(self,inputs):
      # Subtract the maximum value in each row for numerical stability
      exp_values=np.exp(inputs-np.max(inputs,axis=1,keepdims=True)) # This prevents very large exponentials that could lead to overflow

      # Sum the exponentiated values along each row
      prob=exp_values/np.sum(exp_values,axis=1,keepdims=True) # This sum is used to normalize the values so that they add up to 1
      self.output=prob

In [39]:
class Loss:
  def calculate(self,output,y):
        sample_losses=self.forward(output,y) # Compute individual sample losses using the forward method (which should be defined in a subclass)
        data_loss=np.mean(sample_losses) # Compute the mean of these losses to get the overall data loss
        return data_loss

  def regularization_loss(self,layer):
      regularization_loss=0

      # If L1 regularization is applied on weights, add its contribution
      if layer.weight_regularizer_l1 >0:
          regularization_loss+= layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights)) # L1 regularization is the sum of absolute values of the weights, scaled by the L1 strength

      # If L2 regularization is applied on weights, add its contribution
      if layer.weight_regularizer_l2 > 0:
          regularization_loss+= layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights) # L2 regularization is the sum of squared weights, scaled by the L2 strength

      # If L1 regularization is applied on biases, add its contribution
      if layer.bias_regularizer_l1 > 0:
          regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases)) # L1 regularization for biases is the sum of absolute values of the biases

      # If L2 regularization is applied on biases, add its contribution
      if layer.bias_regularizer_l2 > 0:
          regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases) # L2 regularization for biases is the sum of squared biases

      return regularization_loss


In [40]:
class Loss_CategoricalCrossEntropy(Loss):
    def forward(self,y_pred,y_true):
        samples=len(y_true)
        y_pred_clipped=np.clip(y_pred,1e-7,1-1e-7)# The predictions are constrained between 1e-7 and 1 - 1e-7

        # If y_true is a 1D array (sparse labels), get the probability for the correct class for each sample
        if len(y_true.shape)==1:
            correct_confidence=y_pred_clipped[range(samples),y_true] # Using advanced indexing to select the predicted probability of the true class
        elif len(y_true.shape)==2: # If y_true is a 2D array (one-hot encoded labels), compute the confidence by element-wise multiplication
            correct_confidence=np.sum(y_pred_clipped*y_true,axis=1)  # Sum across classes to get the predicted probability corresponding to the true class.

        negative_log=-np.log(correct_confidence)
        return negative_log

    def backward(self, dvalues, y_true):
        samples = len(dvalues)
        labels = len(dvalues[0])

        # If y_true is a 1D array (sparse labels), convert it to one-hot encoding
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Calculating the gradient of the loss with respect to the predictions
        self.dinputs = -y_true / dvalues
        # Normalize the gradient by dividing by the number of samples to average the effect
        self.dinputs = self.dinputs / samples

In [41]:
class Softmax_CrossEntropy:
    def __init__(self):
        # Creating an instance of the Softmax activation function and categorical cross-entropy loss
        self.activation=Softmax()
        self.loss=Loss_CategoricalCrossEntropy()

    def forward(self,inputs,y_true):
        self.activation.forward(inputs) # Apply the softmax activation to the input values
        self.output=self.activation.output # Retrieving the computed probabilities
        return self.loss.calculate(self.output,y_true)  # Calculate and return the loss using the cross-entropy loss's calculate method, which internally calls its forward method

    def backward(self,dvalue,y_true):
        samples=len(dvalue)

        # If y_true is one-hot encoded (i.e., a 2D array), convert it to class indices
        if len(y_true.shape)==2:
            y_true=np.argmax(y_true,axis=1)

        self.dinputs=dvalue.copy()

        self.dinputs[range(samples),y_true]-=1  # For each sample, subtract 1 from the probability corresponding to the true class.This step computes the derivative of the combined softmax and cross-entropy loss
        self.dinputs=self.dinputs/samples # Normalize the gradients by dividing by the number of samples, so that the gradient is averaged over the batch

In [42]:
class Dropout:
    def __init__(self,rate):
        # The 'rate' parameter is the dropout rate (fraction of neurons to drop), We calculate the "keep probability" by subtracting the dropout rate from 1
        self.rate=1-rate

    def forward(self,inputs):
        self.inputs=inputs
        #Generate a binary mask with the same shape as 'inputs':
        # Each element is drawn from a binomial distribution (0 or 1) with probability 'self.rate', Dividing by self.rate scales the activations so that the expected value remains unchanged
        self.binary_mask=np.random.binomial(1,self.rate,size=inputs.shape)/self.rate
        self.output=inputs * self.binary_mask # Apply the dropout mask to the inputs (element-wise multiplication)

    def backward(self,dvalues):
        # During backpropagation, only the neurons that were kept (mask value 1) will receive the gradient, Multiply the incoming gradient (dvalues) by the binary mask to pass the gradient only through active neurons
        self.dinputs=dvalues*self.binary_mask

In [43]:
# momentum opti
class SGD:
    def __init__(self,learning_rate=1.,decay=0.,momentum=0.):
        self.learning_rate = learning_rate
        self.current_learning_rate =learning_rate # Initialize the current learning rate (this may change over iterations due to decay)
        self.decay=decay
        self.iterations = 0  # Initialize the iteration counter
        self.momentum =momentum

    def pre_update_param(self):
        # If decay is set, adjust the current learning rate based on the iteration number
        if self.decay:
            self.current_learning_rate=self.learning_rate*(1./(1.+self.decay *self.iterations)) # This applies a simple decay formula: lr = initial_lr / (1 + decay * iterations)

    def update_param(self,layer):
        if self.momentum:
            # If momentum is enabled, first check if the layer has momentum arrays
            if not hasattr(layer, 'weight_momentums'):
                # Initialize momentum arrays for weights and biases as zeros with the same shape as the parameters
                layer.weight_momentums=np.zeros_like(layer.weights)
                layer.bias_momentums=np.zeros_like(layer.biases)

            # Compute the update for weights using momentum:
            # new_update = (momentum * previous_momentum) - (current_learning_rate * current_gradient)
            weight_updates=self.momentum*layer.weight_momentums - self.current_learning_rate * layer.dweights
            layer.weight_momentums=weight_updates

            bias_updates=self.momentum*layer.bias_momentums - self.current_learning_rate * layer.dbiases
            layer.bias_momentums=bias_updates
        else:
            # If momentum is not used, compute the update directly as the negative gradient scaled by the learning rate using vanilla SGD
            layer.weights+= -self.current_learning_rate * layer.dweights
            layer.biases+= -self.current_learning_rate * layer.dbiases

        # Update the layer's weights and biases by adding the computed updates
        layer.weights+=weight_updates
        layer.biases+=bias_updates

    def post_update_param(self):
        self.iterations+=1 # After updating parameters, increment the iteration counter



In [44]:
class Adagrad:
    def __init__(self,learning_rate=1.,decay=0.,epsilon=1e-7):
        self.learning_rate =learning_rate
        self.current_learning_rate=learning_rate # Initialize the current learning rate (this may change over iterations due to decay)
        self.decay=decay
        self.iterations=0 # Initialize the iteration counter
        self.epsilon=epsilon

    def pre_update_param(self):
        # If decay is set, adjust the current learning rate based on the iteration number
        if self.decay:
            self.current_learning_rate=self.learning_rate*(1./(1.+self.decay *self.iterations)) # This applies a simple decay formula: lr = initial_lr / (1 + decay * iterations)

    def update_param(self,layer):
        # Check if the layer already has a cache for weights and biases; if not, initialize them
        if not  hasattr(layer,'weight_cache'):
            # Initialize the cache for weights and bias with zeros, matching the shape of the weights and biases respectively
            layer.weight_cache =np.zeros_like(layer.weights)
            layer.bias_cache =np.zeros_like(layer.biases)

        # Update the cache by accumulating the squared gradients, This cache is used to adapt the learning rate for each parameter individually
        layer.weight_cache+=layer.dweights**2
        layer.bias_cache+=layer.dbiases**2

        # Update the weights using New weight = old weight - (current learning rate * gradient) / (sqrt(weight_cache) + epsilon)
        layer.weights+= -self.current_learning_rate * layer.dweights/(np.sqrt(layer.weight_cache)+self.epsilon)
        layer.biases+= -self.current_learning_rate * layer.dbiases/(np.sqrt(layer.bias_cache)+self.epsilon)

    def post_update_param(self):
        self.iterations += 1  # Increment the iteration count after each parameter update

In [45]:
class RMSprop:
    def __init__(self,learning_rate=0.001,decay=0.,epsilon=1e-7,rho=0.9):
        self.learning_rate=learning_rate
        self.current_learning_rate = learning_rate  # Initialize the current learning rate (this may change over iterations due to decay)
        self.decay = decay
        self.iteration = 0 # Initialize the iteration counter
        self.epsilon=epsilon
        self.rho=rho

    def pre_update_param(self):
        # If decay is set, adjust the current learning rate based on the iteration number
        if self.decay:
            self.current_learning_rate=self.learning_rate*(1./(1.+self.decay*self.iteration)) # This applies a simple decay formula: lr = initial_lr / (1 + decay * iterations)

    def update_param(self,layer):
        # Check if the layer already has a cache for weights and biases; if not, initialize them
        if not hasattr(layer,'weight_cache'):
            # Initialize the cache for weights and bias with zeros, matching the shape of the weights and biases respectively
            layer.weight_cache=np.zeros_like(layer.weights)
            layer.bias_cache=np.zeros_like(layer.biases)

        # Update the cache for weights using the RMSprop formula: cache = rho * cache + (1 - rho) * (dweights)^2
        layer.weight_cache=self.rho*layer.weight_cache +(1-self.rho)*layer.dweights**2
        layer.bias_cache=self.rho*layer.bias_cache +(1-self.rho)*layer.dbiases**2

        # Update weights using the RMSprop update rule: weights = weights - lr * dweights / (sqrt(weight_cache) + epsilon)
        layer.weights+= -self.current_learning_rate* layer.dweights/(np.sqrt(layer.weight_cache)+self.epsilon)
        layer.biases+= -self.current_learning_rate* layer.dbiases/(np.sqrt(layer.bias_cache)+self.epsilon)

    def post_update_param(self):
        self.iteration+=1 # Increment the iteration count after each parameter update

In [46]:
class Adam:
    def __init__(self,learning_rate=0.001,decay=0.,epsilon=1e-7,beta_1=0.9,beta_2=0.999):
        self.learning_rate =learning_rate
        self.current_learning_rate =learning_rate # Initialize the current learning rate (this may change over iterations due to decay)
        self.decay =decay
        self.iteration =0 # Initialize the iteration counter
        self.epsilon =epsilon
        self.beta_1 =beta_1 # Exponential decay rate for the first moment(momentum) estimates
        self.beta_2 =beta_2 # Exponential decay rate for the second moment(cache) estimates

    def pre_update_param(self):
        # If decay is set, adjust the current learning rate based on the iteration number
        if self.decay:
            self.current_learning_rate = self.learning_rate*(1./(1. + self.decay*self.iteration)) # This applies a simple decay formula: lr = initial_lr / (1 + decay * iterations)

    def update_param(self,layer):
        # If the layer does not have momentum and cache variables, initialize them
        if not hasattr(layer,'weight_cache'):
            layer.weight_momentum =np.zeros_like(layer.weights)
            layer.weight_cache =np.zeros_like(layer.weights)
            layer.bias_momentum =np.zeros_like(layer.biases)
            layer.bias_cache =np.zeros_like(layer.biases)

        # momentum
        # Update the first moment estimate (momentum): m_t = beta_1 * m_(t-1) + (1 - beta_1) * current gradient
        layer.weight_momentum = self.beta_1*layer.weight_momentum + (1-self.beta_1)*layer.dweights
        layer.bias_momentum = self.beta_1*layer.bias_momentum + (1-self.beta_1)*layer.dbiases

        # Correct bias in the first moment: momentum_term = m_t / (1 - beta_1^(t)) , where t is the iteration
        weight_momentum_corrected = layer.weight_momentum / (1-self.beta_1**(self.iteration+1))
        bias_momentum_corrected = layer.bias_momentum / (1-self.beta_1**(self.iteration+1))

        # cache
        # Update the second moment estimate (cache): v_t = beta_2 * v_(t-1) + (1 - beta_2) * (current gradient)^2
        layer.weight_cache=self.beta_2 * layer.weight_cache + (1-self.beta_2)*layer.dweights**2
        layer.bias_cache=self.beta_2 * layer.bias_cache + (1-self.beta_2)*layer.dbiases**2

        # Correct bias in the second moment: cache_term = v_t / (1 - beta_2^(t))
        weight_cache_corrected= layer.weight_cache / (1-self.beta_2 **(self.iteration +1))
        bias_cache_corrected= layer.bias_cache/(1-self.beta_2**(self.iteration +1))

        ## SGD
        # Update parameters: parameter = parameter - lr * (momentum_term / (sqrt(cache_term) + epsilon))
        layer.weights+= -self.current_learning_rate * weight_momentum_corrected/ (np.sqrt(weight_cache_corrected)+self.epsilon)
        layer.biases+= -self.current_learning_rate *bias_momentum_corrected/ (np.sqrt(bias_cache_corrected)+self.epsilon)



    def post_update_param(self):
        self.iteration+=1  # Increment the iteration count after each parameter update

In [47]:
df=pd.read_csv('/content/diabetes.csv.xls')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [48]:
# Statistical summary
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [49]:
# Missing values in each column
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [50]:
# contains duplicate
df.duplicated().sum()

0

In [51]:
x=df.drop(columns=['Outcome'],axis=1)
y=df['Outcome']

In [52]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=49)

x_train = x_train.values
x_test = x_test.values
y_train = y_train.values
y_test = y_test.values

### Set up the network layers:

In [53]:
# First Dense layer with 8 input features to 64 neurons, with L2 regularization on both weights and biases
dense1=Dense(8,64,weight_regularizer_l2=5e-4,bias_regularizer_l2=5e-4)
# ReLU activation layer applied after dense1
act1=ReLU()
# Dropout layer with a dropout rate of 10% to reduce overfitting
drop1=Dropout(0.1)
# Second Dense layer with 64 inputs (from the previous layer) to 2 output neurons (for binary classification)
dense2=Dense(64,2)
# Combined Softmax activation and Categorical Cross-Entropy loss
loss_act=Softmax_CrossEntropy()

#### Adagrad Optimizer

In [61]:
opt=Adagrad(decay=1e-4)

#Training
for i in range(10001):
    # -------- Forward Pass --------
    dense1.forward(x_train) # First dense layer processing
    act1.forward(dense1.output) # ReLU activation on dense1's output
    drop1.forward(act1.output) # Dropout applied to the activated output
    dense2.forward(drop1.output) # Second dense layer processing

    # Computing the data loss using softmax and cross-entropy
    data_loss=loss_act.forward(dense2.output,y_train)

    # Computing the regularization loss from both dense layers
    regularization_loss=(loss_act.loss.regularization_loss(dense1) + loss_act.loss.regularization_loss(dense2))

    # Total loss is the sum of the data loss and the regularization loss
    loss=data_loss+regularization_loss

    # -------- Accuracy Calculation --------
    # Determine the predicted classes by taking the index of the maximum probability
    pred=np.argmax(loss_act.output,axis=1)
    # If y_train is one-hot encoded, convert it to class indices
    if len(y_train.shape)==2:
        y_train=np.argmax(y_train,axis=1)
    # Calculating the accuracy as the proportion of correct predictions
    acc=np.mean(pred==y_train)

    if not i%100:
        print(f'epoch: {i}, acc: {acc:.3f}, loss: {loss:.3f} (data_loss: {data_loss:.3f}, reg_loss: {regularization_loss:.3f}), lr: {opt.current_learning_rate} ')

    # -------- Backward Pass --------
    # Backpropagate through the loss layer to get the initial gradient
    loss_act.backward(loss_act.output,y_train)
    dense2.backward(loss_act.dinputs) # Backprop through second dense layer
    drop1.backward(dense2.dinputs) # Backprop through dropout layer
    act1.backward(drop1.dinputs) # Backprop through ReLU activation
    dense1.backward(act1.dinputs) # Backprop through first dense layer

    # -------- Parameter Update --------
    opt.pre_update_param() # Adjust the learning rate if decay is used
    opt.update_param(dense1) # Update parameters for the first dense layer
    opt.update_param(dense2) # Update parameters for the second dense layer
    opt.post_update_param() # Increment the iteration counter

epoch: 0, acc: 0.666, loss: 0.637 (data_loss: 0.637, reg_loss: 0.000), lr: 1.0 
epoch: 100, acc: 0.378, loss: 9.826 (data_loss: 9.712, reg_loss: 0.114), lr: 0.9901970492127933 
epoch: 200, acc: 0.689, loss: 0.708 (data_loss: 0.599, reg_loss: 0.109), lr: 0.9804882831650161 
epoch: 300, acc: 0.687, loss: 0.690 (data_loss: 0.587, reg_loss: 0.102), lr: 0.9709680551509855 
epoch: 400, acc: 0.694, loss: 0.671 (data_loss: 0.572, reg_loss: 0.098), lr: 0.9616309260505818 
epoch: 500, acc: 0.699, loss: 0.677 (data_loss: 0.582, reg_loss: 0.095), lr: 0.9524716639679969 
epoch: 600, acc: 0.712, loss: 0.659 (data_loss: 0.566, reg_loss: 0.092), lr: 0.9434852344560807 
epoch: 700, acc: 0.700, loss: 0.660 (data_loss: 0.570, reg_loss: 0.090), lr: 0.9346667912889054 
epoch: 800, acc: 0.707, loss: 0.657 (data_loss: 0.569, reg_loss: 0.088), lr: 0.9260116677470135 
epoch: 900, acc: 0.702, loss: 0.650 (data_loss: 0.563, reg_loss: 0.087), lr: 0.9175153683824203 
epoch: 1000, acc: 0.713, loss: 0.632 (data_loss

In [62]:
#Testing
# Forward pass on the test set
dense1.forward(x_test)
act1.forward(dense1.output)
dense2.forward(act1.output)

# Computing the loss on the test set
loss=loss_act.forward(dense2.output,y_test)

pred=np.argmax(loss_act.output,axis=1)
if len(y_test.shape)==2:
    y_test=np.argmax(y,axis=1)
acc=np.mean(pred==y_test)

print(f'Validation accuracy of Adagrad optimizer\nacc: {acc:.3f}, loss: {loss:.3f}')

Validation accuracy of Adagrad optimizer
acc: 0.623, loss: 0.786


#### SGD Optimizer

In [58]:
opt=SGD(decay=1e-3,momentum=0.9)

#Training
for i in range(10001):
    # -------- Forward Pass --------
    dense1.forward(x_train) # First dense layer processing
    act1.forward(dense1.output) # ReLU activation on dense1's output
    drop1.forward(act1.output) # Dropout applied to the activated output
    dense2.forward(drop1.output) # Second dense layer processing

    # Computing the data loss using softmax and cross-entropy
    data_loss=loss_act.forward(dense2.output,y_train)

    # Computing the regularization loss from both dense layers
    regularization_loss=(loss_act.loss.regularization_loss(dense1) + loss_act.loss.regularization_loss(dense2))

    # Total loss is the sum of the data loss and the regularization loss
    loss=data_loss+regularization_loss

    # -------- Accuracy Calculation --------
    # Determine the predicted classes by taking the index of the maximum probability
    pred=np.argmax(loss_act.output,axis=1)
    # If y_train is one-hot encoded, convert it to class indices
    if len(y_train.shape)==2:
        y_train=np.argmax(y_train,axis=1)
    # Calculating the accuracy as the proportion of correct predictions
    acc=np.mean(pred==y_train)

    if not i%100:
        print(f'epoch: {i}, acc: {acc:.3f}, loss: {loss:.3f} (data_loss: {data_loss:.3f}, reg_loss: {regularization_loss:.3f}), lr: {opt.current_learning_rate} ')

    # -------- Backward Pass --------
    # Backpropagate through the loss layer to get the initial gradient
    loss_act.backward(loss_act.output,y_train)
    dense2.backward(loss_act.dinputs) # Backprop through second dense layer
    drop1.backward(dense2.dinputs) # Backprop through dropout layer
    act1.backward(drop1.dinputs) # Backprop through ReLU activation
    dense1.backward(act1.dinputs) # Backprop through first dense layer

    # -------- Parameter Update --------
    opt.pre_update_param() # Adjust the learning rate if decay is used
    opt.update_param(dense1) # Update parameters for the first dense layer
    opt.update_param(dense2) # Update parameters for the second dense layer
    opt.post_update_param() # Increment the iteration counter

epoch: 0, acc: 0.666, loss: 0.637 (data_loss: 0.637, reg_loss: 0.000), lr: 1.0 
epoch: 100, acc: 0.666, loss: 0.637 (data_loss: 0.637, reg_loss: 0.000), lr: 0.9099181073703367 
epoch: 200, acc: 0.666, loss: 0.637 (data_loss: 0.637, reg_loss: 0.000), lr: 0.8340283569641367 
epoch: 300, acc: 0.666, loss: 0.637 (data_loss: 0.637, reg_loss: 0.000), lr: 0.7698229407236336 
epoch: 400, acc: 0.334, loss: 481.369 (data_loss: 0.715, reg_loss: 480.654), lr: 0.7147962830593281 
epoch: 500, acc: 0.666, loss: 295.821 (data_loss: 0.637, reg_loss: 295.184), lr: 0.66711140760507 
epoch: 600, acc: 0.666, loss: 74.021 (data_loss: 0.637, reg_loss: 73.384), lr: 0.6253908692933083 
epoch: 700, acc: 0.666, loss: 20.617 (data_loss: 0.637, reg_loss: 19.980), lr: 0.5885815185403178 
epoch: 800, acc: 0.666, loss: 6.529 (data_loss: 0.637, reg_loss: 5.892), lr: 0.5558643690939411 
epoch: 900, acc: 0.666, loss: 2.501 (data_loss: 0.637, reg_loss: 1.865), lr: 0.526592943654555 
epoch: 1000, acc: 0.666, loss: 1.265 (

In [60]:
#Testing
# Forward pass on the test set
dense1.forward(x_test)
act1.forward(dense1.output)
dense2.forward(act1.output)

# Computing the loss on the test set
loss=loss_act.forward(dense2.output,y_test)

pred=np.argmax(loss_act.output,axis=1)
if len(y_test.shape)==2:
    y_test=np.argmax(y,axis=1)
acc=np.mean(pred==y_test)

print(f'Validation accuracy of SGD optimizer\n acc: {acc:.3f}, loss: {loss:.3f}')

Validation accuracy of SGD optimizer
 acc: 0.591, loss: 0.689


#### RMSprop Optimizer

In [56]:
opt=RMSprop(learning_rate=0.02,decay=1e-5,rho=0.999)

#Training
for i in range(10001):
    # -------- Forward Pass --------
    dense1.forward(x_train) # First dense layer processing
    act1.forward(dense1.output) # ReLU activation on dense1's output
    drop1.forward(act1.output) # Dropout applied to the activated output
    dense2.forward(drop1.output) # Second dense layer processing

    # Computing the data loss using softmax and cross-entropy
    data_loss=loss_act.forward(dense2.output,y_train)

    # Computing the regularization loss from both dense layers
    regularization_loss=(loss_act.loss.regularization_loss(dense1) + loss_act.loss.regularization_loss(dense2))

    # Total loss is the sum of the data loss and the regularization loss
    loss=data_loss+regularization_loss

    # -------- Accuracy Calculation --------
    # Determine the predicted classes by taking the index of the maximum probability
    pred=np.argmax(loss_act.output,axis=1)
    # If y_train is one-hot encoded, convert it to class indices
    if len(y_train.shape)==2:
        y_train=np.argmax(y_train,axis=1)
    # Calculating the accuracy as the proportion of correct predictions
    acc=np.mean(pred==y_train)

    if not i%100:
        print(f'epoch: {i}, acc: {acc:.3f}, loss: {loss:.3f} (data_loss: {data_loss:.3f}, reg_loss: {regularization_loss:.3f}), lr: {opt.current_learning_rate} ')

    # -------- Backward Pass --------
    # Backpropagate through the loss layer to get the initial gradient
    loss_act.backward(loss_act.output,y_train)
    dense2.backward(loss_act.dinputs) # Backprop through second dense layer
    drop1.backward(dense2.dinputs) # Backprop through dropout layer
    act1.backward(drop1.dinputs) # Backprop through ReLU activation
    dense1.backward(act1.dinputs) # Backprop through first dense layer

    # -------- Parameter Update --------
    opt.pre_update_param() # Adjust the learning rate if decay is used
    opt.update_param(dense1) # Update parameters for the first dense layer
    opt.update_param(dense2) # Update parameters for the second dense layer
    opt.post_update_param() # Increment the iteration counter

epoch: 0, acc: 0.666, loss: 0.645 (data_loss: 0.637, reg_loss: 0.008), lr: 0.02 
epoch: 100, acc: 0.666, loss: 0.646 (data_loss: 0.637, reg_loss: 0.009), lr: 0.01998021958261321 
epoch: 200, acc: 0.666, loss: 0.645 (data_loss: 0.637, reg_loss: 0.008), lr: 0.019960279044701046 
epoch: 300, acc: 0.666, loss: 0.645 (data_loss: 0.637, reg_loss: 0.008), lr: 0.019940378268975763 
epoch: 400, acc: 0.666, loss: 0.645 (data_loss: 0.637, reg_loss: 0.008), lr: 0.01992051713662487 
epoch: 500, acc: 0.666, loss: 0.645 (data_loss: 0.637, reg_loss: 0.008), lr: 0.01990069552930875 
epoch: 600, acc: 0.666, loss: 0.644 (data_loss: 0.637, reg_loss: 0.008), lr: 0.019880913329158343 
epoch: 700, acc: 0.666, loss: 0.645 (data_loss: 0.637, reg_loss: 0.008), lr: 0.019861170418772778 
epoch: 800, acc: 0.666, loss: 0.644 (data_loss: 0.637, reg_loss: 0.007), lr: 0.019841466681217078 
epoch: 900, acc: 0.666, loss: 0.643 (data_loss: 0.637, reg_loss: 0.006), lr: 0.01982180200001982 
epoch: 1000, acc: 0.666, loss: 0

In [57]:
#Testing
# Forward pass on the test set
dense1.forward(x_test)
act1.forward(dense1.output)
dense2.forward(act1.output)

# Computing the loss on the test set
loss=loss_act.forward(dense2.output,y_test)

pred=np.argmax(loss_act.output,axis=1)
if len(y_test.shape)==2:
    y_test=np.argmax(y,axis=1)
acc=np.mean(pred==y_test)

print(f'Validation accuracy of RMSprop optimizer\nacc: {acc:.3f}, loss: {loss:.3f}')

Validation accuracy of RMSprop optimizer
acc: 0.591, loss: 0.689


#### Adam Optimizer

In [54]:
opt=Adam(learning_rate=0.02,decay=1e-5)

#Training
for i in range(10001):
    # -------- Forward Pass --------
    dense1.forward(x_train) # First dense layer processing
    act1.forward(dense1.output) # ReLU activation on dense1's output
    drop1.forward(act1.output) # Dropout applied to the activated output
    dense2.forward(drop1.output) # Second dense layer processing

    # Computing the data loss using softmax and cross-entropy
    data_loss=loss_act.forward(dense2.output,y_train)

    # Computing the regularization loss from both dense layers
    regularization_loss=(loss_act.loss.regularization_loss(dense1) + loss_act.loss.regularization_loss(dense2))

    # Total loss is the sum of the data loss and the regularization loss
    loss=data_loss+regularization_loss

    # -------- Accuracy Calculation --------
    # Determine the predicted classes by taking the index of the maximum probability
    pred=np.argmax(loss_act.output,axis=1)
    # If y_train is one-hot encoded, convert it to class indices
    if len(y_train.shape)==2:
        y_train=np.argmax(y_train,axis=1)
    # Calculating the accuracy as the proportion of correct predictions
    acc=np.mean(pred==y_train)

    if not i%100:
        print(f'epoch: {i}, acc: {acc:.3f}, loss: {loss:.3f} (data_loss: {data_loss:.3f}, reg_loss: {regularization_loss:.3f}), lr: {opt.current_learning_rate} ')

    # -------- Backward Pass --------
    # Backpropagate through the loss layer to get the initial gradient
    loss_act.backward(loss_act.output,y_train)
    dense2.backward(loss_act.dinputs) # Backprop through second dense layer
    drop1.backward(dense2.dinputs) # Backprop through dropout layer
    act1.backward(drop1.dinputs) # Backprop through ReLU activation
    dense1.backward(act1.dinputs) # Backprop through first dense layer

    # -------- Parameter Update --------
    opt.pre_update_param() # Adjust the learning rate if decay is used
    opt.update_param(dense1) # Update parameters for the first dense layer
    opt.update_param(dense2) # Update parameters for the second dense layer
    opt.post_update_param() # Increment the iteration counter

epoch: 0, acc: 0.666, loss: 0.680 (data_loss: 0.680, reg_loss: 0.000), lr: 0.02 
epoch: 100, acc: 0.669, loss: 0.590 (data_loss: 0.588, reg_loss: 0.002), lr: 0.01998021958261321 
epoch: 200, acc: 0.668, loss: 0.601 (data_loss: 0.598, reg_loss: 0.003), lr: 0.019960279044701046 
epoch: 300, acc: 0.612, loss: 0.641 (data_loss: 0.636, reg_loss: 0.005), lr: 0.019940378268975763 
epoch: 400, acc: 0.668, loss: 0.592 (data_loss: 0.586, reg_loss: 0.007), lr: 0.01992051713662487 
epoch: 500, acc: 0.666, loss: 0.594 (data_loss: 0.586, reg_loss: 0.008), lr: 0.01990069552930875 
epoch: 600, acc: 0.669, loss: 0.600 (data_loss: 0.590, reg_loss: 0.010), lr: 0.019880913329158343 
epoch: 700, acc: 0.669, loss: 0.687 (data_loss: 0.675, reg_loss: 0.012), lr: 0.019861170418772778 
epoch: 800, acc: 0.669, loss: 0.642 (data_loss: 0.633, reg_loss: 0.009), lr: 0.019841466681217078 
epoch: 900, acc: 0.668, loss: 0.641 (data_loss: 0.635, reg_loss: 0.006), lr: 0.01982180200001982 
epoch: 1000, acc: 0.668, loss: 0

In [55]:
#Testing
# Forward pass on the test set
dense1.forward(x_test)
act1.forward(dense1.output)
dense2.forward(act1.output)

# Computing the loss on the test set
loss=loss_act.forward(dense2.output,y_test)

pred=np.argmax(loss_act.output,axis=1)
if len(y_test.shape)==2:
    y_test=np.argmax(y,axis=1)
acc=np.mean(pred==y_test)

print(f'Validation accuracy of Adam optimizer\nacc: {acc:.3f}, loss: {loss:.3f}')

Validation accuracy of Adam optimizer
acc: 0.591, loss: 0.689


##### Based on the experimental results in this notebook, Adagrad demonstrated superior performance compared to the other optimizers. Specifically, Adagrad achieved a validation accuracy of approximately 62.3%, while SGD with momentum, RMSprop, and Adam each reached around 59.1% accuracy. This improvement suggests that Adagrad's adaptive learning rate, which adjusts individually for each parameter based on the historical accumulation of squared gradients, is particularly effective for this classification task. The ability to dynamically adapt the step size appears to help the model converge more efficiently on the diabetes dataset, making Adagrad a promising choice for similar optimization problems.