In [1]:
import numpy as np
import pandas as pd
import gzip
import tensorflow as tf
import matplotlib.pyplot as plt
import math
from sklearn.metrics import accuracy_score

## Loading the Dataset

In [2]:
from tensorflow.contrib.learn.python.learn.datasets.mnist import extract_images, extract_labels
with open('./train-images-idx3-ubyte.gz', 'rb') as f:
    train_images = extract_images(f)
with open('./train-labels-idx1-ubyte.gz', 'rb') as f:
    train_labels = extract_labels(f)
with open('./t10k-images-idx3-ubyte.gz', 'rb') as f:
    test_images = extract_images(f)
with open('./t10k-labels-idx1-ubyte.gz', 'rb') as f:
    test_labels = extract_labels(f)

W0718 10:48:48.332733 140151043381056 deprecation.py:323] From <ipython-input-2-71bdf6b57526>:3: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.


Extracting ./train-images-idx3-ubyte.gz


W0718 10:48:48.585269 140151043381056 deprecation.py:323] From <ipython-input-2-71bdf6b57526>:5: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.


Extracting ./train-labels-idx1-ubyte.gz
Extracting ./t10k-images-idx3-ubyte.gz
Extracting ./t10k-labels-idx1-ubyte.gz


## Sizes of our test and train sets

In [3]:
print(train_images.shape)
print(train_labels.shape)
print(test_images.shape)
print(test_labels.shape)

(60000, 28, 28, 1)
(60000,)
(10000, 28, 28, 1)
(10000,)


In [4]:
m_train = train_images.shape[0]
num_px = train_images.shape[1]
m_test = test_images.shape[0]

print ("Number of training examples: " + str(m_train))
print ("Number of testing examples: " + str(m_test))
print ("Each image is of size: (" + str(num_px) + ", " + str(num_px) + ", 1)")
print ("train_images shape: " + str(train_images.shape))
print ("train_labels shape: " + str(train_labels.shape))
print ("test_images shape: " + str(test_images.shape))
print ("test_labels shape: " + str(test_labels.shape))

Number of training examples: 60000
Number of testing examples: 10000
Each image is of size: (28, 28, 1)
train_images shape: (60000, 28, 28, 1)
train_labels shape: (60000,)
test_images shape: (10000, 28, 28, 1)
test_labels shape: (10000,)


## Flattening the image and reshaping

In [5]:
train_images_flatten = train_images.reshape(train_images.shape[0],-1).T
test_images_flatten = test_images.reshape(test_images.shape[0],-1).T
train_x = train_images_flatten/255
test_x = test_images_flatten/255

In [6]:
print(train_images_flatten.shape)
print(test_images_flatten.shape)
print("***************")
print(train_x.shape)
print(test_x.shape)

(784, 60000)
(784, 10000)
***************
(784, 60000)
(784, 10000)


In [7]:
from sklearn.datasets import fetch_mldata
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

## converting label data inti one hot encoding

In [8]:
digits = 10
examples = train_labels.shape[0]
train_labels = train_labels.reshape(1, examples)
Y_new = np.eye(digits)[train_labels.astype('int32')]
Y_new = Y_new.T.reshape(digits, examples)

examples = test_labels.shape[0]
test_labels = test_labels.reshape(1,examples)
Y_new_test = np.eye(digits)[test_labels.astype('int32')]
Y_new_test = Y_new_test.T.reshape(digits, examples)

In [9]:
X_train = train_x
X_test = test_x
Y_train = Y_new
Y_test = Y_new_test

## Train and test data size after preprocessing

In [10]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(784, 60000)
(784, 10000)
(10, 60000)
(10, 10000)


In [11]:
"""
sigmoid function
"""
def sigmoid(z):
    s = 1. / (1. + np.exp(-z))
    return s
"""
loss function
"""
def compute_loss(Y, Y_hat):

    L_sum = np.sum(np.multiply(Y, np.log(Y_hat)))
    m = Y.shape[1]
    L = -(1./m) * L_sum

    return L
"""
Forward prop
"""
def feed_forward(X, params):

    cache = {}

    cache["Z1"] = np.matmul(params["W1"], X) + params["b1"]
    cache["A1"] = sigmoid(cache["Z1"])
    cache["Z2"] = np.matmul(params["W2"], cache["A1"]) + params["b2"]
    cache["A2"] = np.exp(cache["Z2"]) / np.sum(np.exp(cache["Z2"]), axis=0)

    return cache
"""
Backward prop
"""
def back_propagate(X, Y, params, cache):

    dZ2 = cache["A2"] - Y
    dW2 = (1./m_batch) * np.matmul(dZ2, cache["A1"].T)
    db2 = (1./m_batch) * np.sum(dZ2, axis=1, keepdims=True)

    dA1 = np.matmul(params["W2"].T, dZ2)
    dZ1 = dA1 * sigmoid(cache["Z1"]) * (1 - sigmoid(cache["Z1"]))
    dW1 = (1./m_batch) * np.matmul(dZ1, X.T)
    db1 = (1./m_batch) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return grads

In [12]:
np.random.seed(138)
m = 60000
# hyperparameters
n_x = X_train.shape[0]
n_h = 64
"""
Defining learning rate to be 4 as we are using momentum instead of gradient descent. 
with learning rate as 4 we are able to achieve 97% accuracy in 9 iterations
"""
learning_rate = 4
beta = .9
"""
Defining batch size to be 128
Advantages of using Mini batch gradient descent
--> you have the vectorization advantage
--> make progress without waiting to process the entire training set
Disadvantages
--> Doesn't always exactly converge (oscillates in a very small region, but you can reduce learning rate)
"""
batch_size = 128
batches = -(-m // batch_size)

# initialization
params = { "W1": np.random.randn(n_h, n_x) * np.sqrt(1. / n_x),
           "b1": np.zeros((n_h, 1)) * np.sqrt(1. / n_x),
           "W2": np.random.randn(digits, n_h) * np.sqrt(1. / n_h),
           "b2": np.zeros((digits, 1)) * np.sqrt(1. / n_h) }


"""
Intializing params for Gradient descent with momentum

"""
V_dW1 = np.zeros(params["W1"].shape)
V_db1 = np.zeros(params["b1"].shape)
V_dW2 = np.zeros(params["W2"].shape)
V_db2 = np.zeros(params["b2"].shape)

# train
for i in range(9):

    permutation = np.random.permutation(X_train.shape[1])
    X_train_shuffled = X_train[:, permutation]
    Y_train_shuffled = Y_train[:, permutation]

    for j in range(batches):

        begin = j * batch_size
        end = min(begin + batch_size, X_train.shape[1] - 1)
        X = X_train_shuffled[:, begin:end]
        Y = Y_train_shuffled[:, begin:end]
        m_batch = end - begin

        cache = feed_forward(X, params)
        grads = back_propagate(X, Y, params, cache)

        V_dW1 = (beta * V_dW1 + (1. - beta) * grads["dW1"])
        V_db1 = (beta * V_db1 + (1. - beta) * grads["db1"])
        V_dW2 = (beta * V_dW2 + (1. - beta) * grads["dW2"])
        V_db2 = (beta * V_db2 + (1. - beta) * grads["db2"])

        params["W1"] = params["W1"] - learning_rate * V_dW1
        params["b1"] = params["b1"] - learning_rate * V_db1
        params["W2"] = params["W2"] - learning_rate * V_dW2
        params["b2"] = params["b2"] - learning_rate * V_db2

    cache = feed_forward(X_train, params)
    train_cost = compute_loss(Y_train, cache["A2"])
    cache = feed_forward(X_test, params)
    test_cost = compute_loss(Y_test, cache["A2"])
    print("Epoch {}: training cost = {}, test cost = {}".format(i+1 ,train_cost, test_cost))

print("Done.")

Epoch 1: training cost = 0.15247891859267396, test cost = 0.1547560029527509
Epoch 2: training cost = 0.10002037166708286, test cost = 0.1143751924400732
Epoch 3: training cost = 0.07345533436456224, test cost = 0.09667640947427097
Epoch 4: training cost = 0.06631645063042363, test cost = 0.09893790330148221
Epoch 5: training cost = 0.05696481716210169, test cost = 0.09308700623494924
Epoch 6: training cost = 0.043058871259209014, test cost = 0.08732451509837161
Epoch 7: training cost = 0.04041933333490396, test cost = 0.08644605473893195
Epoch 8: training cost = 0.03311140650773119, test cost = 0.0813292627297561
Epoch 9: training cost = 0.02904318969196912, test cost = 0.08464942196754249
Done.


In [13]:
cache = feed_forward(X_test, params)
predictions = np.argmax(cache["A2"], axis=0)
labels = np.argmax(Y_test, axis=0)

print(classification_report(predictions, labels))
print(accuracy_score(predictions,labels))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       992
           1       0.99      0.99      0.99      1132
           2       0.99      0.95      0.97      1069
           3       0.98      0.97      0.97      1020
           4       0.97      0.98      0.98       964
           5       0.97      0.99      0.98       874
           6       0.97      0.96      0.97       967
           7       0.96      0.98      0.97      1014
           8       0.96      0.98      0.97       952
           9       0.97      0.96      0.97      1016

   micro avg       0.97      0.97      0.97     10000
   macro avg       0.97      0.97      0.97     10000
weighted avg       0.97      0.97      0.97     10000

0.9746
