<a href="https://colab.research.google.com/github/woodRock/grokking-deep-learning/blob/main/chapter_8_three_layer_network_on_mnist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 8 | Learning signal and ignoring noise

In [17]:
import sys
import numpy as np
from keras.datasets import mnist

# Create the dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()
images, labels = (X_train[0:1000]).reshape(1000,28*28) / 255, y_train[0:1000]

one_hot_labels = np.zeros((len(labels),10))

for i,l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = X_test.reshape(len(X_test),28*28) / 255
test_labels = np.zeros((len(y_test ),10))
for i,l in enumerate(y_test):
    test_labels[i][l] = 1

# Freeze the seed for reproducability.
np.random.seed(1)

# Activation function
relu = lambda x: (x >= 0) * x
relu2deriv = lambda x: x >= 0

# Hyperparameters
alpha = 0.005
iterations = 300
hidden_size = 100
pixels_per_image = 784
num_labels = 10

# Intitialize the weights.
weights_0_1 = 0.2 * np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, num_labels)) - 0.1

# Training loop
for j in range(iterations):
    error, correct_cnt = (0.0, 0)

    for i in range(len(images)):
        input, target = images[i:i+1], labels[i:i+1]

        # Forward pass
        layer_0 = input
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        layer_2 = np.dot(layer_1, weights_1_2)
        prediction = layer_2

        error += np.sum((target - prediction) ** 2)
        correct_cnt += int(np.argmax(prediction) == np.argmax(target))

        # Back propgation
        layer_2_delta = (target - prediction)
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)

        # Update weights
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if (j % 10 == 0 or j == iterations -1):
        test_error, test_correct_cnt = (0.0, 0)

        for i in range(len(test_images)):
            input, target = test_images[i:i+1], test_labels[i:i+1]

            # Forward pass
            layer_0 = input
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)
            prediction = layer_2

            test_error += np.sum((target - prediction) ** 2)
            test_correct_cnt += int(np.argmax(prediction) == np.argmax(target))
        message=f"I:{j} Training Error: {error/float(len(images)):.4f} Correct: {correct_cnt/float(len(images))} \t Test Error: {test_error/float(len(test_images)):.4f} Correct: {test_correct_cnt/float(len(test_images))}"
        print(message)

I:0 Training Error: 0.6633 Correct: 0.642 	 Test Error: 0.5853 Correct: 0.6926
I:10 Training Error: 0.2724 Correct: 0.921 	 Test Error: 0.3983 Correct: 0.8369
I:20 Training Error: 0.2231 Correct: 0.949 	 Test Error: 0.3918 Correct: 0.837
I:30 Training Error: 0.2000 Correct: 0.964 	 Test Error: 0.4017 Correct: 0.8289
I:40 Training Error: 0.1839 Correct: 0.973 	 Test Error: 0.4127 Correct: 0.8186
I:50 Training Error: 0.1719 Correct: 0.976 	 Test Error: 0.4237 Correct: 0.8106
I:60 Training Error: 0.1628 Correct: 0.981 	 Test Error: 0.4348 Correct: 0.803
I:70 Training Error: 0.1559 Correct: 0.982 	 Test Error: 0.4456 Correct: 0.7958
I:80 Training Error: 0.1504 Correct: 0.988 	 Test Error: 0.4564 Correct: 0.7907
I:90 Training Error: 0.1460 Correct: 0.988 	 Test Error: 0.4667 Correct: 0.7863
I:100 Training Error: 0.1426 Correct: 0.99 	 Test Error: 0.4765 Correct: 0.782
I:110 Training Error: 0.1400 Correct: 0.991 	 Test Error: 0.4854 Correct: 0.7781
I:120 Training Error: 0.1381 Correct: 0.991

Dropout (Srivastava 2014)

References:
1. Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R. (2014). Dropout: a simple way to prevent neural networks from overfitting. The journal of machine learning research, 15(1), 1929-1958.

In [30]:
import sys
import numpy as np
from keras.datasets import mnist

# Create the dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()
images, labels = (X_train[0:1000]).reshape(1000,28*28) / 255, y_train[0:1000]

one_hot_labels = np.zeros((len(labels),10))

for i,l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = X_test.reshape(len(X_test),28*28) / 255
test_labels = np.zeros((len(y_test ),10))
for i,l in enumerate(y_test):
    test_labels[i][l] = 1

# Freeze the seed for reproducability.
np.random.seed(1)

# Activation function
relu = lambda x: (x >= 0) * x
relu2deriv = lambda x: x >= 0

# Hyperparameters
alpha = 0.005
iterations = 300
hidden_size = 100
pixels_per_image = 784
num_labels = 10

# Intitialize the weights.
weights_0_1 = 0.2 * np.random.random((pixels_per_image, hidden_size)) - 0.1
weights_1_2 = 0.2 * np.random.random((hidden_size, num_labels)) - 0.1

# Training loop
for j in range(iterations):
    error, correct_cnt = (0.0, 0)

    for i in range(len(images)):
        input, target = images[i:i+1], labels[i:i+1]

        # Forward pass
        layer_0 = input
        layer_1 = relu(np.dot(layer_0, weights_0_1))
        # Dropout
        dropout_mask = np.random.randint(2, size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = np.dot(layer_1, weights_1_2)
        prediction = layer_2

        error += np.sum((target - prediction) ** 2)
        correct_cnt += int(np.argmax(prediction) == np.argmax(target))

        # Back propgation
        layer_2_delta = (target - prediction)
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)
        layer_1_delta *= dropout_mask

        # Update weights
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if (j % 10 == 0 or j == iterations -1):
        test_error, test_correct_cnt = (0.0, 0)

        for i in range(len(test_images)):
            input, target = test_images[i:i+1], test_labels[i:i+1]

            # Forward pass
            layer_0 = input
            layer_1 = relu(np.dot(layer_0, weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)
            prediction = layer_2

            test_error += np.sum((target - prediction) ** 2)
            test_correct_cnt += int(np.argmax(prediction) == np.argmax(target))
        message=f"I:{j} Training Error: {error/float(len(images)):.4f} Correct: {correct_cnt/float(len(images))} \t Test Error: {test_error/float(len(test_images)):.4f} Correct: {test_correct_cnt/float(len(test_images))}"
        print(message)

I:0 Training Error: 0.8912 Correct: 0.413 	 Test Error: 0.6414 Correct: 0.6333
I:10 Training Error: 0.4721 Correct: 0.764 	 Test Error: 0.4585 Correct: 0.787
I:20 Training Error: 0.4309 Correct: 0.809 	 Test Error: 0.4156 Correct: 0.8133
I:30 Training Error: 0.4157 Correct: 0.811 	 Test Error: 0.4213 Correct: 0.8114
I:40 Training Error: 0.4132 Correct: 0.827 	 Test Error: 0.4200 Correct: 0.8112
I:50 Training Error: 0.3922 Correct: 0.836 	 Test Error: 0.4099 Correct: 0.8133
I:60 Training Error: 0.4025 Correct: 0.836 	 Test Error: 0.4127 Correct: 0.8236
I:70 Training Error: 0.3833 Correct: 0.857 	 Test Error: 0.4121 Correct: 0.8033
I:80 Training Error: 0.3867 Correct: 0.854 	 Test Error: 0.4107 Correct: 0.8054
I:90 Training Error: 0.3769 Correct: 0.868 	 Test Error: 0.4111 Correct: 0.8144
I:100 Training Error: 0.3697 Correct: 0.864 	 Test Error: 0.4112 Correct: 0.7903
I:110 Training Error: 0.3717 Correct: 0.868 	 Test Error: 0.4110 Correct: 0.8003
I:120 Training Error: 0.3531 Correct: 0.

# Batch gradient descent

In [35]:
import numpy as np

# Create the dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()
images, labels = (X_train[0:1000]).reshape(1000,28*28) / 255, y_train[0:1000]

one_hot_labels = np.zeros((len(labels),10))

for i,l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = X_test.reshape(len(X_test),28*28) / 255
test_labels = np.zeros((len(y_test ),10))
for i,l in enumerate(y_test):
    test_labels[i][l] = 1

# Freeze the seed for reproducability.
np.random.seed(1)

# Activation function
def relu(x):
    return (x >= 0) * x # returns x if x > 0

def relu2deriv(output):
    return output >= 0 # returns 1 for input > 0

# Hyperparameters
batch_size = 100
alpha, iterations = (0.1, 1_000)
pixels_per_image, num_labels, hidden_size = (784, 10, 100)

# Initialize the network.
weights_0_1 = 0.2*np.random.random((pixels_per_image,hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size,num_labels)) - 0.1

# Training loop
for j in range(iterations):
    error, correct_cnt = (0.0, 0)
    for i in range(int(len(images) / batch_size)):
        batch_start, batch_end = ((i * batch_size),((i+1)*batch_size))
        input, target = images[batch_start:batch_end], labels[batch_start:batch_end]

        # Forward pass
        layer_0 = input
        layer_1 = relu(np.dot(layer_0,weights_0_1))
        # Dropout
        dropout_mask = np.random.randint(2,size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        layer_2 = np.dot(layer_1,weights_1_2)
        prediction = layer_2

        error += np.sum((target - layer_2) ** 2)
        for k in range(batch_size):
            correct_cnt += int(np.argmax(layer_2[k:k+1]) == np.argmax(labels[batch_start+k:batch_start+k+1]))

        # Back propagation
        layer_2_delta = (target-prediction)/batch_size
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)* relu2deriv(layer_1)
        layer_1_delta *= dropout_mask

        # Update weights
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    if(j % 10 == 0 or j == iterations - 1):
        test_error = 0.0
        test_correct_cnt = 0

        for i in range(len(test_images)):
            layer_0 = test_images[i:i+1]
            layer_1 = relu(np.dot(layer_0,weights_0_1))
            layer_2 = np.dot(layer_1, weights_1_2)

            test_error += np.sum((test_labels[i:i+1] - layer_2) ** 2)
            test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))

        sys.stdout.write("\n" + \
                         "I:" + str(j) + \
                         " Test-Err:" + str(test_error/ float(len(test_images)))[0:5] +\
                         " Test-Acc:" + str(test_correct_cnt/ float(len(test_images)))+\
                         " Train-Err:" + str(error/ float(len(images)))[0:5] +\
                         " Train-Acc:" + str(correct_cnt/ float(len(images))))


I:0 Test-Err:0.815 Test-Acc:0.3832 Train-Err:1.272 Train-Acc:0.161
I:10 Test-Err:0.569 Test-Acc:0.7183 Train-Err:0.591 Train-Acc:0.672
I:20 Test-Err:0.508 Test-Acc:0.7577 Train-Err:0.530 Train-Acc:0.727
I:30 Test-Err:0.483 Test-Acc:0.7815 Train-Err:0.497 Train-Acc:0.758
I:40 Test-Err:0.464 Test-Acc:0.7915 Train-Err:0.486 Train-Acc:0.75
I:50 Test-Err:0.453 Test-Acc:0.7978 Train-Err:0.462 Train-Acc:0.784
I:60 Test-Err:0.446 Test-Acc:0.8015 Train-Err:0.445 Train-Acc:0.801
I:70 Test-Err:0.437 Test-Acc:0.8054 Train-Err:0.444 Train-Acc:0.807
I:80 Test-Err:0.440 Test-Acc:0.807 Train-Err:0.450 Train-Acc:0.803
I:90 Test-Err:0.437 Test-Acc:0.8059 Train-Err:0.444 Train-Acc:0.798
I:100 Test-Err:0.437 Test-Acc:0.8029 Train-Err:0.436 Train-Acc:0.805
I:110 Test-Err:0.431 Test-Acc:0.8024 Train-Err:0.420 Train-Acc:0.818
I:120 Test-Err:0.433 Test-Acc:0.8006 Train-Err:0.419 Train-Acc:0.824
I:130 Test-Err:0.432 Test-Acc:0.8003 Train-Err:0.424 Train-Acc:0.826
I:140 Test-Err:0.440 Test-Acc:0.7998 Train-Err

In [38]:
np.random.randint(2, size=10)

array([0, 1, 1, 1, 0, 1, 0, 1, 1, 0])