In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# get the data
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

In [9]:
def preprocess_images(images):
    """Flatten images to rows of 784 values and rescale them by 1/255.

    Args:
        images: NumPy array whose total size is a multiple of 784.

    Returns:
        float32 array of shape (-1, 784) holding ``images / 255``.
    """
    flat_pixels = images.reshape(-1, 784)
    as_float = flat_pixels.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Flatten the label array to one dimension and cast it to int32.

    Args:
        labels: NumPy array of class labels, any shape.

    Returns:
        1-D int32 array containing the same labels.
    """
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)

In [10]:
# This cell rebinds train_images / test_images, so it must be safe to run
# more than once. preprocess_images returns float32, so an integer dtype
# means the array is still raw; a second pass would divide the already
# scaled pixels by 255 again and shrink the inputs towards zero.
if np.issubdtype(train_images.dtype, np.integer):
    train_images = preprocess_images(train_images)
if np.issubdtype(test_images.dtype, np.integer):
    test_images = preprocess_images(test_images)

# preprocess_labels only reshapes and casts, so repeating it is harmless.
train_labels = preprocess_labels(train_labels)
test_labels = preprocess_labels(test_labels)

# Shuffle with a buffer covering the whole training set, batch into 128
# examples, and repeat indefinitely; the training loop decides when to stop.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(len(train_images))
    .batch(128)
    .repeat()
)
# Unused: the evaluation cell runs on the full test arrays at once.
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)

In [11]:
# Model definition, built from the input side towards the output.

# Hyperparameters: width of each hidden layer, number of hidden layers,
# and the half-width of the uniform range used to initialise weights.
n_units = 100
n_layers = 2
w_range = 0.1

# The model is kept as a plain list of layers; every layer is itself a
# list [weights, bias] of tf.Variables, chained in order.

# Input layer: maps the 784 flattened pixel values to n_units activations.
w_input = tf.Variable(
    tf.random.uniform(shape=[784, n_units], minval=-w_range, maxval=w_range),
    name="w0",
)
b_input = tf.Variable(tf.zeros([n_units]), name="b0")
layers = [[w_input, b_input]]

In [12]:
# Keep only the input layer before appending. Without this reset, running
# the cell a second time would stack duplicate hidden and output layers onto
# `layers`, and model_forward would then chain mismatched shapes.
del layers[1:]

# All remaining hidden layers map n_units -> n_units.
for hidden_idx in range(1, n_layers):
    w = tf.Variable(tf.random.uniform([n_units, n_units], -w_range, w_range),
                    name="w" + str(hidden_idx))
    b = tf.Variable(tf.zeros(n_units), name="b" + str(hidden_idx))
    layers.append([w, b])

# Output layer: maps n_units to the 10 class logits.
w_out = tf.Variable(tf.random.uniform([n_units, 10], -w_range, w_range),
                    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# Flatten the per-layer [w, b] pairs into one list of variables so gradients
# can be requested for all of them in a single call.
all_variables = [variable for layer in layers for variable in layer]

In [13]:
def model_forward(inputs):
    """Run `inputs` through every layer in the module-level `layers` list.

    All layers except the last apply an affine map followed by ReLU; the
    last layer is affine only.

    Args:
        inputs: batch of input rows compatible with the first layer's weights.

    Returns:
        The unnormalised logits produced by the last layer.
    """
    activations = inputs
    for weights, bias in layers[:-1]:
        pre_activation = tf.matmul(activations, weights) + bias
        activations = tf.nn.relu(pre_activation)

    out_weights, out_bias = layers[-1]
    return tf.matmul(activations, out_weights) + out_bias


# Plain SGD: step size and number of the last training step.
lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly. Steps 0..train_steps
    # inclusive are executed, i.e. train_steps + 1 updates.
    if step > train_steps:
        break

    # I hear that adding noise to the inputs improves generalization!
    #img_batch += tf.random.normal(tf.shape(img_batch), stddev=4.)

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Manual gradient-descent update of every variable.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100 steps, report loss and accuracy on the current batch
    # (both computed from the logits taken before this step's update).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        hits = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(hits)
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 2.302609920501709 Accuracy: 0.0703125
Loss: 2.297576904296875 Accuracy: 0.140625
Loss: 2.2948856353759766 Accuracy: 0.15625
Loss: 2.304457902908325 Accuracy: 0.0859375
Loss: 2.2927143573760986 Accuracy: 0.1484375
Loss: 2.3073549270629883 Accuracy: 0.1171875
Loss: 2.305847644805908 Accuracy: 0.09375
Loss: 2.30051589012146 Accuracy: 0.09375
Loss: 2.2985267639160156 Accuracy: 0.140625
Loss: 2.305119037628174 Accuracy: 0.125
Loss: 2.297802209854126 Accuracy: 0.1484375
Loss: 2.2981455326080322 Accuracy: 0.1328125
Loss: 2.3044142723083496 Accuracy: 0.1015625
Loss: 2.301720142364502 Accuracy: 0.1640625
Loss: 2.2975637912750244 Accuracy: 0.125
Loss: 2.2974495887756348 Accuracy: 0.1328125
Loss: 2.3013200759887695 Accuracy: 0.0859375
Loss: 2.287970542907715 Accuracy: 0.2109375
Loss: 2.300555467605591 Accuracy: 0.1171875
Loss: 2.2937002182006836 Accuracy: 0.15625
Loss: 2.305666446685791 Accuracy: 0.078125


Diagnosed problem: the noise added to the inputs. Input noise makes the network less able to memorize training samples because they change all the time, which results in smaller network weights and a more robust network with lower generalization error. Here, however, the noise (`stddev=4.`) is much larger than the pixel values, which are scaled to [0, 1], so it was indirectly hurting the accuracy of the model.

In [14]:
# Evaluate on the whole test set in a single forward pass.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)

# Accuracy is the fraction of predictions that match the labels.
is_correct = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(is_correct)
print("Final test accuracy: {}".format(acc))

Final test accuracy: 0.11349999904632568
