In [61]:
import numpy as np
import pandas as pd

In [62]:
# Read the training CSV into a DataFrame, then preview its first rows and its
# overall (rows, columns) shape.
data = pd.read_csv('train.csv')
print(data.head(), data.shape, sep='\n')

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

In [63]:
# Drop down to a plain ndarray and transpose it so each original CSV column
# becomes a row (row 0 is the `label` column).
# Note: this rebinds `data` from a DataFrame to an ndarray, so re-running this
# cell without re-running the load cell fails (ndarrays have no `.values`).
data = data.values.transpose()
print(data.shape)

(785, 42000)


In [64]:
# Row 0 of the transposed array holds the labels; the remaining rows are the
# pixel columns, divided by 255 and transposed back to one sample per row.
y, x = data[0], (data[1:] / 255.0).T

print(y.shape, x.shape, sep='\n')

(42000,)
(42000, 784)


In [65]:
# Hold out the trailing 20% of the samples (in file order, no shuffling) for testing.
splitRatio = 0.8
splitIndex = int(len(x) * splitRatio)  # len(x) is the number of samples (rows)

xTrain, yTrain = x[:splitIndex], y[:splitIndex]
xTest, yTest = x[splitIndex:], y[splitIndex:]


In [66]:
# Scratch check of what a zero bias vector looks like; `b1` is reassigned
# later when the result of params() is unpacked.
b1 = np.zeros(16)
print(b1)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [67]:
def params():
    """Create the initial weights and biases of the 784-16-16-10 network.

    Each weight matrix has shape (fan_in, fan_out) and is drawn from a standard
    normal scaled by sqrt(1 / fan_in); each bias is a (1, fan_out) row of zeros.

    Returns:
        tuple: (w1, w2, w3, b1, b2, b3).
    """
    layerShapes = ((784, 16), (16, 16), (16, 10))

    weights = []
    biases = []
    # Draw the layers in order so the sequence of RNG calls is w1, w2, w3.
    for fanIn, fanOut in layerShapes:
        weights.append(np.random.randn(fanIn, fanOut) * np.sqrt(1 / fanIn))
        biases.append(np.zeros((1, fanOut)))

    return (*weights, *biases)


In [68]:
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def softmax(x):
    """Row-wise softmax of a 2-D array (normalises along axis 1).

    Args:
        x: array of scores; reductions use ``axis=1``, so it must be at least 2-D.

    Returns:
        Array of the same shape whose rows sum to 1.
    """
    # Shift every row so its largest entry is 0; exp() then cannot overflow.
    shifted = x - x.max(axis=1, keepdims=True)
    # Floor very negative entries at -100 before exponentiating (for finite
    # inputs the shifted values are <= 0, so the upper bound is not reached).
    exps = np.exp(np.clip(shifted, -100, 100))
    return exps / exps.sum(axis=1, keepdims=True)



def Loss(x, realY):
    """Cross-entropy between predictions ``x`` and targets ``realY``, averaged over rows.

    Args:
        x: predicted probabilities.
        realY: target array with the same shape as ``x`` (one-hot in this notebook).

    Returns:
        The summed ``-realY * log(x + 1e-8)`` divided by the number of rows.

    Raises:
        AssertionError: if the two shapes differ.
    """
    assert x.shape == realY.shape, f"Shape mismatch: x={x.shape}, realY={realY.shape}"
    batchSize = realY.shape[0]
    # The 1e-8 offset keeps log() finite when a predicted probability is exactly 0.
    logProbs = np.log(x + 1e-8)
    return -np.sum(realY * logProbs) / batchSize

def reluDerivative(x):
    """Derivative of ReLU: 1.0 where ``x`` is strictly positive, 0.0 elsewhere."""
    isPositive = np.greater(x, 0)
    return isPositive.astype(float)

In [69]:
def forward(w1, w2, w3, b1, b2, b3, x):
    """Run the three-layer network on ``x`` (ReLU, ReLU, softmax).

    Args:
        w1, w2, w3: weight matrices of layers 1-3.
        b1, b2, b3: bias rows of layers 1-3.
        x: input batch, one sample per row.

    Returns:
        tuple: (a3, a2, a1, z1, z2, z3) -- the softmax output first, then the
        two hidden activations, then the three pre-activations.
    """
    # Hidden layer 1
    preAct1 = np.dot(x, w1) + b1
    hidden1 = relu(preAct1)

    # Hidden layer 2
    preAct2 = np.dot(hidden1, w2) + b2
    hidden2 = relu(preAct2)

    # Output layer
    logits = np.dot(hidden2, w3) + b3
    probs = softmax(logits)

    return probs, hidden2, hidden1, preAct1, preAct2, logits

In [70]:
def backProp(a3, w3, a2, z2, a1, w2, z1, inputt, oneHotY):
    """Backpropagate the softmax cross-entropy error through the three layers.

    Note: the gradients are sums over the batch (they are not divided by the
    batch size), whereas the returned loss is averaged over the batch by Loss().

    Args:
        a3: softmax output of the network.
        w3, w2: weights of layers 3 and 2, used to push the error backwards.
        a2, a1: activations of hidden layers 2 and 1.
        z2, z1: pre-activations of hidden layers 2 and 1.
        inputt: the input batch that produced the activations.
        oneHotY: one-hot targets with the same shape as ``a3``.

    Returns:
        tuple: (dw3, db3, dw2, db2, dw1, db1, logLoss).
    """
    logLoss = Loss(a3, oneHotY)

    # Output layer: softmax combined with cross-entropy gives (prediction - target).
    delta3 = a3 - oneHotY
    dw3 = np.dot(a2.T, delta3)
    db3 = delta3.sum(axis=0, keepdims=True)

    # Hidden layer 2: propagate through w3, then gate by the ReLU derivative.
    delta2 = np.dot(delta3, w3.T) * reluDerivative(z2)
    dw2 = np.dot(a1.T, delta2)
    db2 = delta2.sum(axis=0, keepdims=True)

    # Hidden layer 1: propagate through w2, then gate by the ReLU derivative.
    delta1 = np.dot(delta2, w2.T) * reluDerivative(z1)
    dw1 = np.dot(inputt.T, delta1)
    db1 = delta1.sum(axis=0, keepdims=True)

    return dw3, db3, dw2, db2, dw1, db1, logLoss


In [71]:
def OneHotEncoder(y, numClasses=10):
    """One-hot encode a 1-D sequence of integer class labels.

    Args:
        y: integer labels, one per sample (ndarray or any array-like).
        numClasses: width of the encoding; defaults to 10, the value that was
            previously hard-coded.

    Returns:
        Float array of shape (len(y), numClasses) with a single 1 per row.

    Raises:
        ValueError: if any label is negative. Negative values would otherwise
            be treated as from-the-end indices and mark the wrong column.
        IndexError: if a label is >= numClasses (raised by NumPy indexing).
    """
    labels = np.asarray(y)
    if labels.size and labels.min() < 0:
        raise ValueError(f"labels must be non-negative, got minimum {labels.min()}")

    numSamples = labels.shape[0]
    oneHot = np.zeros((numSamples, numClasses))
    # Row i gets a 1 in column labels[i].
    oneHot[np.arange(numSamples), labels] = 1
    return oneHot

In [72]:
# Seed NumPy's global RNG so params() draws the same initial weights on every
# fresh run; without a seed each Restart & Run All starts from different weights.
np.random.seed(42)
w1, w2, w3, b1, b2, b3 = params()

# One-hot targets for the training labels, computed once and reused every iteration.
oneHotedY = OneHotEncoder(yTrain)

In [79]:
# Full-batch gradient descent on the training set.
# Note: the weights are not re-initialised here, so re-running this cell
# continues training from the current values of w1..b3.
iterations = 50
alpha = 0.0005

print("updations")
for i in range(iterations):
    # Forward pass over the whole training set, reusing forward() rather than
    # repeating the layer computations inline.
    a3, a2, a1, z1, z2, z3 = forward(w1, w2, w3, b1, b2, b3, xTrain)

    if np.isnan(a3).any():
        # Report the iteration that actually failed (1-based, matching the
        # progress line below), not the configured total number of iterations.
        print("NaN detected in softmax output at iteration: ", i + 1)
        break

    # backProp also returns the loss for this forward pass, so it is not recomputed.
    dw3, db3, dw2, db2, dw1, db1, logLoss = backProp(a3, w3, a2, z2, a1, w2, z1, xTrain, oneHotedY)

    # Clip every gradient entry to [-1, 1] before the update.
    dw3 = np.clip(dw3, -1, 1)
    db3 = np.clip(db3, -1, 1)
    dw2 = np.clip(dw2, -1, 1)
    db2 = np.clip(db2, -1, 1)
    dw1 = np.clip(dw1, -1, 1)
    db1 = np.clip(db1, -1, 1)

    w3 = w3 - alpha * dw3
    b3 = b3 - alpha * db3
    w2 = w2 - alpha * dw2
    b2 = b2 - alpha * db2
    w1 = w1 - alpha * dw1
    b1 = b1 - alpha * db1

    # Loss and accuracy describe the forward pass above, i.e. the weights
    # as they were before this iteration's update.
    predictions = np.argmax(a3, axis=1)  # Get class with highest probability
    accuracy = np.mean(predictions == yTrain) * 100  # Compare with true labels
    print(f"Iteration {i+1}: Loss = {logLoss:.4f}, Accuracy = {accuracy:.2f}%")

updations
Iteration 1: Loss = 1.0883, Accuracy = 75.25%
Iteration 2: Loss = 1.0786, Accuracy = 75.55%
Iteration 3: Loss = 1.0689, Accuracy = 75.86%
Iteration 4: Loss = 1.0593, Accuracy = 76.13%
Iteration 5: Loss = 1.0498, Accuracy = 76.38%
Iteration 6: Loss = 1.0404, Accuracy = 76.65%
Iteration 7: Loss = 1.0310, Accuracy = 76.88%
Iteration 8: Loss = 1.0217, Accuracy = 77.15%
Iteration 9: Loss = 1.0125, Accuracy = 77.39%
Iteration 10: Loss = 1.0034, Accuracy = 77.75%
Iteration 11: Loss = 0.9943, Accuracy = 77.95%
Iteration 12: Loss = 0.9854, Accuracy = 78.26%
Iteration 13: Loss = 0.9765, Accuracy = 78.44%
Iteration 14: Loss = 0.9677, Accuracy = 78.72%
Iteration 15: Loss = 0.9590, Accuracy = 78.92%
Iteration 16: Loss = 0.9503, Accuracy = 79.20%
Iteration 17: Loss = 0.9418, Accuracy = 79.38%
Iteration 18: Loss = 0.9333, Accuracy = 79.55%
Iteration 19: Loss = 0.9249, Accuracy = 79.76%
Iteration 20: Loss = 0.9165, Accuracy = 80.03%
Iteration 21: Loss = 0.9083, Accuracy = 80.16%
Iteration 22

In [80]:
# Score the trained network on the held-out split. Only the softmax output is
# needed; note that this overwrites `a3` from the training cell.
a3, _, _, _, _, _ = forward(w1, w2, w3, b1, b2, b3, xTest)
test_predictions = a3.argmax(axis=1)
test_accuracy = (test_predictions == yTest).mean() * 100

print(f"Test Accuracy: {test_accuracy:.2f}%")

Test Accuracy: 83.96%
