<a href="https://colab.research.google.com/github/vksgm/ML_hands_on/blob/main/Neural_network_MINST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import kagglehub
import os
path = kagglehub.dataset_download("oddrationale/mnist-in-csv")
#Get the file name from the directory path
for filename in os.listdir(path):
    if filename.endswith(".csv"):
        filepath = os.path.join(path, filename)  # Construct the full file path
        break  # Stop searching after finding the first CSV file

data = pd.read_csv(filepath)
data.head()


Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
#convert pandas to numpy array
data = data.to_numpy()
#shape of the data
m,n = data.shape
#shuffle the data to split training set and avoding overfitting
np.random.shuffle(data)

#test data
data_dev = data[0:1000].T # transpose as we will pass 284 pixels as input to input neuron
#Input
X_dev = data_dev[1:n]
#labels or output
Y_dev = data_dev[0]
#train data & transposing
data_train = data[1000:m].T
#training inputs (784pixels * 9000 entries)
X_train = data_train[1:n]
#training lables
Y_train = data_train[0]




In [28]:
#printing 784 pixel values for the first input ( note: already transposed)
print(X_train[:,1].shape)
print(Y_train.shape)

(784, 2)
(9000,)


In [35]:
#initialize the params for weights and bias
def init_params():
  #Assign random weights initially
  #epresents the weights connecting the input layer (784 pixels of an image)
  #to the next layer (likely a hidden layer with 10 neurons).
    W12 = np.random.rand(10,784) - 0.5
    b12 = np.random.rand(10,1) - 0.5
    W23 = np.random.rand(10,10) - 0.5
    b23 = np.random.rand(10,1) - 0.5
    return W12, b12, W23, b23

def sigmoid(Z):
  return 1/(1+np.exp(-Z))

def softmax(Z):
  return np.exp(Z)/sum(np.exp(Z))

def forward_prop(W12, b12, W23, b23, X):
  #Activation fn for the first layer to second layer
  #Z12 = W.X + b
  Z12 = W12.dot(X) + b12
  A12 = sigmoid(Z12)
  #Note at the output layer we always use softmax function
  #Z23 = W.A12 + b
  Z23 = W23.dot(A12) + b23
  A23 = softmax(Z23)
  return Z12, A12, Z23, A23

#One hot Encoding for the output
def one_hot(Y):
  #creating matrix of 2000 * 10 for ( 2000 inputs with 10 columns each )
  one_hot_Y = np.zeros((Y.size, Y.max() + 1))
  # make 1 for the index pos at Y column and rest unchanged(0)
  # eg. for 3-> 0 0 0 1 0 0 0 0 0 0
  one_hot_Y[np.arange(Y.size), Y] = 1
  #transposing
  return one_hot_Y.T

def deriv_sigmoid(Z):
  #derivative function
  return sigmoid(Z) * (1 - sigmoid(Z))

def back_prop(Z12, A12, Z23, A23, W23, X, Y):
  one_hot_Y = one_hot(Y)
  m = Y.size
#error fn for output layer
  dZ23 = A23 - one_hot_Y #predicted act.fn - actual label
  #based on cross entropy cost function
  dW23 = 1/m * dZ23.dot(A12.T) #new weight with previous activation fn (A12)
  db23 = 1/m * np.sum(dZ23) #new bias

# previous layer
  dZ12 = W23.T.dot(dZ23) * deriv_sigmoid(Z12) #derivative of sigmoid fn
  dW12 = 1/m * dZ12.dot(X.T) # with input vector
  db12 = 1/m * np.sum(dZ12)

  return dW12, db12, dW23, db23

def update_params(W12, b12, W23, b23, dW12, db12, dW23, db23, alpha):
  W12 = W12 - alpha * dW12
  b12 = b12 - alpha * db12
  W23 = W23 - alpha * dW23
  b23 = b23 - alpha * db23

  return W12, b12, W23, b23

def get_accuracy(predictions, Y):
  print(predictions, Y)
  return np.sum(predictions == Y) / Y.size

def get_predictions(A23):
  return np.argmax(A23, 0)

def gradient_descent(X, Y, iterations, alpha):
  #initialize random weights and bias
  W12, b12, W23, b23 = init_params()
  for i in range(iterations):
    #forward prop
    Z12, A12, Z23, A23 = forward_prop(W12, b12, W23, b23, X)
    #backward prop
    dW12, db12, dW23, db23 = back_prop(Z12, A12, Z23, A23, W23, X, Y)
    #update params
    W12, b12, W23, b23 =  update_params(W12, b12, W23, b23, dW12, db12, dW23, db23, alpha)
    if i % 10 == 0:
      print("Iteration: ", i)
      print(get_accuracy(get_predictions(A23), Y))
  return W12, b12, W23, b23

In [38]:
#Calling the flow of gradient descent
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 500, 0.459)

  return 1/(1+np.exp(-Z))


Iteration:  0
[7 7 7 ... 7 7 4] [8 2 1 ... 1 1 6]
0.09122222222222222
Iteration:  10
[1 5 1 ... 8 1 4] [8 2 1 ... 1 1 6]
0.24222222222222223
Iteration:  20
[7 2 1 ... 1 1 4] [8 2 1 ... 1 1 6]
0.37544444444444447
Iteration:  30
[9 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.4271111111111111
Iteration:  40
[9 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.4831111111111111
Iteration:  50
[1 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.5296666666666666
Iteration:  60
[1 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.5682222222222222
Iteration:  70
[9 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.633
Iteration:  80
[9 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.6556666666666666
Iteration:  90
[9 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.687
Iteration:  100
[8 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.7131111111111111
Iteration:  110
[8 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.73
Iteration:  120
[8 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.745
Iteration:  130
[8 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.7558888888888889
Iteration:  140
[8 2 1 ... 1 1 6] [8 2 1 ... 1 1 6]
0.7622222222222