In [54]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [55]:
# Get full dataset
data = pd.read_csv('mnist.csv')
data.head(8)

Unnamed: 0,5,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,...,0.608,0.609,0.610,0.611,0.612,0.613,0.614,0.615,0.616,0.617
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
# Separate X and Y from data
data = np.array(data).T
Y = data[0]
X = data[1:] / 255
print(Y.shape, X.shape)

(59999,) (784, 59999)


In [57]:
# Split train and test datasets
train_units = int(np.floor(0.8 * X.shape[1]))

Y_train = Y[0:train_units]
Y_test = Y[train_units:]

X_train = X[:, 0:train_units]
X_test = X[:, train_units:]

print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(784, 47999) (47999,) (784, 12000) (12000,)


In [58]:
# Start the parameters for a NN with input layer + 2 hidden layers
#   n_inputs  -> Number of layers of the input layer
#   n_first   -> Number of layers of the first hidden layer
#   n_second  -> Number of layers of the second hidden layer
def init_params(n_inputs, n_first, n_second):
  W1 = np.random.rand(n_first, n_inputs) - 0.5
  b1 = np.random.rand(n_first, 1) - 0.5

  W2 = np.random.rand(n_second, n_first) - 0.5
  b2 = np.random.rand(n_second, 1) - 0.5

  return W1, b1, W2, b2

# Basic ReLU function
def ReLU(Z):
  return np.maximum(0, Z)

# Softmax function
def softmax(Z):
  return np.exp(Z) / sum(np.exp(Z))

# Forward propagation function
def forward_prop(W1, b1, W2, b2, X):
  Z1 = W1.dot(X) + b1
  A1 = ReLU(Z1)

  Z2 = W2.dot(A1) + b2
  A2 = softmax(Z2)

  return Z1, A1, Z2, A2

# One-hot encoding function
def one_hot(Y):
  one_hot_Y = np.zeros((Y.size, Y.max() + 1))
  one_hot_Y[np.arange(Y.size), Y] = 1
  one_hot_Y = one_hot_Y.T
  return one_hot_Y

# Derivative of ReLU
def deriv_ReLU(Z):
  return Z > 0

# Backwards propagation function using basic loss function
def back_prop(Z1, A1, Z2, A2, W2, X, Y):
  m = Y.size
  one_hot_Y = one_hot(Y)

  dZ2 = A2 - one_hot_Y          # Diff from preds and correct Y
  dW2 = 1 / m * dZ2.dot(A1.T)
  db2 = 1 / m * np.sum(dZ2)

  dZ1 = W2.T.dot(dZ2) * deriv_ReLU(Z1)
  dW1 = 1 / m * dZ1.dot(X.T)
  db1 = 1 / m * np.sum(dZ1)

  return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, lr=1e-3):
  W1 = W1 - lr * dW1
  b1 = b1 - lr * db1
  W2 = W2 - lr * dW2
  b2 = b2 - lr * db2

  return W1, b1, W2, b2


In [59]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

# NN training, takes in X and Y data, number of iterations and learning rate
# and returns the weights and biases of the neural network after training
def gradient_descent(X, Y, iterations, lr=1e-3):
  output_layer_neurons = len(np.unique(Y))
  W1, b1, W2, b2 = init_params(X.shape[0], 10, output_layer_neurons)    # Choose 10 neurons for hidden layer
  for i in range(iterations):
    Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
    dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W2, X, Y)
    W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, lr)

    if (i % 20) == 0:
      print(f"Iteration {i+1}")
      print(f"Accuracy: {get_accuracy(get_predictions(A2), Y)}")

  print(f"Final result")
  print(f"Accuracy: {get_accuracy(get_predictions(A2), Y)}")

  return W1, b1, W2, b2

In [60]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 500, 0.1)

Iteration 1
Accuracy: 0.06654305298027043
Iteration 21
Accuracy: 0.22106710556469927
Iteration 41
Accuracy: 0.3552782349632284
Iteration 61
Accuracy: 0.4381966290964395
Iteration 81
Accuracy: 0.49930206879309985
Iteration 101
Accuracy: 0.5560532511093981
Iteration 121
Accuracy: 0.6039709160607513
Iteration 141
Accuracy: 0.6433050688556011
Iteration 161
Accuracy: 0.6741182107960583
Iteration 181
Accuracy: 0.7030771474405717
Iteration 201
Accuracy: 0.7291193566532637
Iteration 221
Accuracy: 0.7483697577032855
Iteration 241
Accuracy: 0.7653076105752203
Iteration 261
Accuracy: 0.7798287464322173
Iteration 281
Accuracy: 0.7908289756036584
Iteration 301
Accuracy: 0.8004541761286693
Iteration 321
Accuracy: 0.8080168336840351
Iteration 341
Accuracy: 0.8145378028708932
Iteration 361
Accuracy: 0.821308777266193
Iteration 381
Accuracy: 0.826183878830809
Iteration 401
Accuracy: 0.8311214816975354
Iteration 421
Accuracy: 0.8355382403800079
Iteration 441
Accuracy: 0.8388508093918623
Iteration 461
Ac

In [61]:
def make_predictions(X, W1, b1, W2, b2):
    _, _, _, A2 = forward_prop(W1, b1, W2, b2, X)
    predictions = get_predictions(A2)
    return predictions

# Finally, compute the total accuracy on the test set
test_preds = make_predictions(X_test, W1, b1, W2, b2)
get_accuracy(test_preds, Y_test)

np.float64(0.8549166666666667)