In [1]:
import tensorflow as tf
import numpy as np

In [19]:
# Fetch MNIST through the Keras dataset helper, then unpack the
# (images, labels) pairs for the training and the test split.
mnist_splits = tf.keras.datasets.mnist.load_data()
(train_images, train_labels), (test_images, test_labels) = mnist_splits

In [20]:
def preprocess_images(images):
    """Flatten images to rows of 784 values and rescale them by 1/255.

    Args:
        images: numpy array whose total size is a multiple of 784.

    Returns:
        A float32 array of shape (-1, 784) holding the input values divided
        by 255.
    """
    flat_pixels = images.reshape(-1, 784)
    as_float = flat_pixels.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Flatten the label array to one dimension and cast it to int32.

    Args:
        labels: numpy array of labels, of any shape.

    Returns:
        A 1-D int32 array containing the same values in the same order.
    """
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)


In [21]:
# Run both splits through the preprocessing helpers.
# NOTE: the raw arrays are rebound to their processed versions, so running
# this cell a second time would divide the pixel values by 255 again.
train_images, test_images = map(preprocess_images, (train_images, test_images))
train_labels, test_labels = map(preprocess_labels, (train_labels, test_labels))

In [22]:
# Training input pipeline: shuffle the examples, group them into mini-batches
# of 128 and repeat indefinitely (the training loop decides when to stop).
# The shuffle buffer is sized from the data itself, so it always spans the
# whole training set instead of relying on a hard-coded 60000.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(len(train_images))
    .batch(128)
    .repeat()
)
# No batched test pipeline is built: the evaluation cell feeds the full test
# arrays to the model in a single pass.

In [23]:
# Architecture hyper-parameters; the model is built from input to output
# in the cells that follow.
w_range = 0.1   # weights are drawn uniformly from [-w_range, w_range)
n_layers = 8    # hidden layers: one input->hidden plus (n_layers - 1) hidden->hidden
n_units = 100   # width of every hidden layer

In [24]:
# The network is kept as a plain Python list called `layers`: one entry per
# layer, each entry being the pair [weights, bias] of that layer.

# Input layer: maps the 784 flattened pixels onto n_units hidden units.
w_input = tf.Variable(
    initial_value=tf.random.uniform(
        shape=[784, n_units], minval=-w_range, maxval=w_range),
    name="w0",
)
b_input = tf.Variable(initial_value=tf.zeros([n_units]), name="b0")
layers = [[w_input, b_input]]

In [25]:
# Remaining hidden layers: (n_layers - 1) square n_units x n_units layers,
# named w1/b1, w2/b2, ...
# NOTE: this cell appends to the existing `layers` list, so re-running it on
# its own adds further layers; re-run from the cell that creates `layers`.
for layer_idx in range(1, n_layers):
    w = tf.Variable(
        initial_value=tf.random.uniform(
            shape=[n_units, n_units], minval=-w_range, maxval=w_range),
        name="w{}".format(layer_idx),
    )
    b = tf.Variable(initial_value=tf.zeros([n_units]),
                    name="b{}".format(layer_idx))
    layers.append([w, b])

In [26]:
# Output layer: maps the last hidden layer onto the 10 class logits.
# NOTE: appended to `layers`, so re-running this cell alone adds a second
# output layer; re-run from the cell that creates `layers`.
w_out = tf.Variable(
    initial_value=tf.random.uniform(
        shape=[n_units, 10], minval=-w_range, maxval=w_range),
    name="wout",
)
b_out = tf.Variable(initial_value=tf.zeros([10]), name="bout")
layers.append([w_out, b_out])


In [27]:
# Collect every weight and bias into one flat list, layer by layer, so the
# gradients can be requested for all of them at once.
all_variables = []
for layer_vars in layers:
    all_variables.extend(layer_vars)

In [28]:
def model_forward(inputs):
    """Run `inputs` through every layer in the global `layers` list.

    All layers except the last apply an affine map followed by ReLU; the
    last layer applies only the affine map, so the result is raw logits.

    Args:
        inputs: batch of input rows compatible with the first weight matrix.

    Returns:
        The logits produced by the final layer.
    """
    hidden_layers = layers[:-1]
    output_layer = layers[-1]

    activations = inputs
    for weights, bias in hidden_layers:
        pre_activation = tf.matmul(activations, weights) + bias
        activations = tf.nn.relu(pre_activation)

    return tf.matmul(activations, output_layer[0]) + output_layer[1]

Problem diagnosed: the hidden layers now use ReLU instead of sigmoid. Before the fix, the sigmoid activation function was applied. Its output is confined to the range (0, 1) and its gradient is never larger than 0.25, so across this many layers the gradient shrank toward zero and training got stuck, as if gradient descent were trapped in a local minimum.
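ReLU, by contrast, returns max(0, x): it outputs 0 when the input is negative and passes positive inputs through unchanged, so its output has no maximum value. Its gradient is 1 for positive inputs, so the gradient does not shrink layer after layer, which helps gradient descent keep minimising the loss and move toward a good minimum.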

In [29]:
# Plain gradient descent settings.
lr = 0.1
train_steps = 2000

for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so the loop has to end itself. Steps
    # 0..train_steps inclusive are executed before the break triggers.
    if step > train_steps:
        break

    # Record the forward pass so the loss can be differentiated with respect
    # to every model variable.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # Vanilla SGD update: var <- var - lr * grad.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100th step, report loss and accuracy on the current batch. Both
    # come from the logits computed before this step's update.
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        is_correct = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(is_correct)
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 2.302622079849243 Accuracy: 0.078125
Loss: 2.3007431030273438 Accuracy: 0.109375
Loss: 2.304184913635254 Accuracy: 0.09375
Loss: 2.2920308113098145 Accuracy: 0.1640625
Loss: 2.3046836853027344 Accuracy: 0.1015625
Loss: 2.30397891998291 Accuracy: 0.1015625
Loss: 2.3020431995391846 Accuracy: 0.046875
Loss: 2.290890693664551 Accuracy: 0.140625
Loss: 2.3018527030944824 Accuracy: 0.1171875
Loss: 2.263622283935547 Accuracy: 0.140625
Loss: 1.8177757263183594 Accuracy: 0.2421875
Loss: 1.4711976051330566 Accuracy: 0.390625
Loss: 1.0134949684143066 Accuracy: 0.5859375
Loss: 0.8483679890632629 Accuracy: 0.703125
Loss: 0.6489059925079346 Accuracy: 0.8203125
Loss: 0.6033318638801575 Accuracy: 0.8515625
Loss: 0.4619532525539398 Accuracy: 0.875
Loss: 0.790960967540741 Accuracy: 0.7734375
Loss: 0.3580614924430847 Accuracy: 0.9140625
Loss: 0.26347577571868896 Accuracy: 0.9296875
Loss: 0.3106400966644287 Accuracy: 0.9140625


In [31]:
# Evaluate on the whole test set in one forward pass: take the arg-max class
# per row and measure the fraction that matches the labels.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))

Final test accuracy: 0.9272000193595886
