In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('train.csv') # Data is "MNIST Digit Recognizer" by ANIMATRONBOT from Kaggle
df

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Activation Functions**

In [12]:
def softmax(Z):
    Z -= np.max(Z, axis=1, keepdims=True) # Softmax normalization
    A = np.exp(Z) / sum(np.exp(Z))
    return A

def ReLU(Z):
    return np.maximum(0, Z)

def ReLU_derivative(Z):
    return Z > 0

**Generate Parameters W, b**

In [4]:
def params(X):
    m, n = X.shape
    W1 = np.random.randn(n, 25) # First hidden layer has 25 neurons
    b1 = np.random.randn(1, 25)
    W2 = np.random.randn(25, 15) # Second hidden layer 15 neurons
    b2 = np.random.randn(1, 15)
    W3 = np.random.randn(15, 10) # Output layer
    b3 = np.random.randn(1, 10)
    return W1, b1, W2, b2, W3, b3

**Forward Backward Propagation**

In [5]:
def forward(X, W1, b1, W2, b2, W3, b3):
    Z1 = np.dot(X, W1) + b1
    A1 = ReLU(Z1)
    Z2 = np.dot(A1, W2) + b2
    A2 = ReLU(Z2)
    Z3 = np.dot(A2, W3) + b3
    A3 = softmax(Z3)
    return Z1, A1, Z2, A2, Z3, A3

In [6]:
def backward(X, Y, Z1, A1, Z2, A2, Z3, A3, W1, b1, W2, b2, W3, b3):
    m = X.shape[0]
    dZ3 = A3 - Y
    dW3 = 1 / m * np.dot(A2.T, dZ3)
    db3 = 1 / m * np.sum(dZ3, axis=0)
    dA2 = np.dot(dZ3, W3.T)
    dZ2 = dA2 * ReLU_derivative(Z2)
    dW2 = 1 / m * np.dot(A1.T, dZ2)
    db2 = 1 / m * np.sum(dZ2, axis=0)
    dA1 = np.dot(dZ2, W2.T)
    dZ1 = dA1 * ReLU_derivative(Z1)
    dW1 = 1 / m * np.dot(X.T, dZ1)
    db1 = 1 / m * np.sum(dZ1, axis=0)
    return dW1, db1, dW2, db2, dW3, db3

**Gradient Descent**

In [7]:
def gradient_descent(X, y, alpha, iterations):
    W1, b1, W2, b2, W3, b3 = params(X)
    for i in range(iterations):
        Z1, A1, Z2, A2, Z3, A3 = forward(X, W1, b1, W2, b2, W3, b3)
        dW1, db1, dW2, db2, dW3, db3 = backward(X, y, Z1, A1, Z2, A2, Z3, A3, W1, b1, W2, b2, W3, b3)
        W1, W2, W3 = (W1 - alpha * dW1, W2 - alpha * dW2, W3 - alpha * dW3)
        b1, b2, b3 = (b1 - alpha * db1, b2 - alpha * db2, b3 - alpha * db3)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = np.argmax(A3, axis=1).reshape([-1,1])
            pre = np.sum(predictions == y) / y.size
            print(f"Accuracy: {pre}")
    print("Iteration: ", iterations)
    print(f"Accuracy: {pre}")
    return W1, b1, W2, b2, W3, b3

In [8]:
X = df.drop(['label'], axis=1)
X = np.array(X)

y = df['label']
y = np.array(y).reshape([-1,1])

np.random.shuffle(X)
np.random.shuffle(y)

X_train = X[:38000]
X_test = X[38000:40000]
X_cv = X[40000:]

y_train = y[:38000]
y_test = y[38000:40000]
y_cv = y[40000:]

alpha = 0.001
iterations = 50

In [13]:
W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, y_train, alpha, iterations)

Iteration:  0
Accuracy: 0.09923684210526315


  A = np.exp(Z) / sum(np.exp(Z))


Iteration:  10
Accuracy: 0.0983421052631579
Iteration:  20
Accuracy: 0.0983421052631579
Iteration:  30
Accuracy: 0.0983421052631579
Iteration:  40
Accuracy: 0.0983421052631579
Iteration:  50
Accuracy: 0.0983421052631579
