In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Fetch the MNIST train/test splits through Keras, then unpack each split
# into its (images, labels) pair.
mnist_splits = tf.keras.datasets.mnist.load_data()
(train_images, train_labels), (test_images, test_labels) = mnist_splits


def preprocess_images(images):
    """Flatten images into rows of 784 values and rescale them.

    Args:
        images: array whose total size is a multiple of 784 (e.g. a stack of
            28x28 images).

    Returns:
        A float32 array of shape (-1, 784) holding the input values divided
        by 255 (for 8-bit pixel values this yields numbers in [0, 1]).
    """
    flat = images.reshape(-1, 784)
    as_float = flat.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Flatten the labels to one dimension and cast them to int32.

    Args:
        labels: array of class labels of any shape.

    Returns:
        A 1-D int32 array containing the same values.
    """
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)


# Seed TensorFlow's global RNG before anything random is created, so that a
# fresh "Restart & Run All" reproduces the same shuffle order here and the
# same random weight initialisation in the cells that follow.
SEED = 0
tf.random.set_seed(SEED)

# Flatten / rescale the images and flatten / cast the labels of both splits.
train_images = preprocess_images(train_images)
test_images = preprocess_images(test_images)
train_labels = preprocess_labels(train_labels)
test_labels = preprocess_labels(test_labels)

# Training pipeline: shuffle with a buffer covering the whole training set,
# cut into mini-batches, and repeat without end (the training loop decides
# when to stop). No batched test dataset is built; evaluation runs on the
# full test arrays in a single forward pass.
BATCH_SIZE = 128
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(len(train_images))
    .batch(BATCH_SIZE)
    .repeat()
)

In [7]:
# Model definition, built from the input side towards the output.

# Hyperparameters: width of every hidden layer, number of hidden layers
# (2 again), and the half-width of the uniform weight initialisation.
n_units = 100
n_layers = 2
w_range = 0.1

# The network is kept as a plain "chain": a list with one entry per layer,
# where each entry is itself a list [weights, bias] of that layer's variables.

# Input layer: 784 flattened pixel values -> n_units hidden units.
# Weights are drawn uniformly from [-w_range, w_range); biases start at zero.
w0_init = tf.random.uniform([784, n_units], -w_range, w_range)
b0_init = tf.zeros(n_units)
w_input = tf.Variable(w0_init, name="w0")
b_input = tf.Variable(b0_init, name="b0")
layers = [[w_input, b_input]]

In [8]:
# Build the rest of the network on top of the input layer.
#
# This cell appends to `layers`, so first cut the list back to the input
# layer only. Without this reset, re-running the cell would stack duplicate
# hidden/output layers and the forward pass would fail with a shape mismatch.
layers = layers[:1]

# all other hidden layers go from n_units to n_units
for hidden_idx in range(1, n_layers):
    w = tf.Variable(tf.random.uniform([n_units, n_units], -w_range, w_range),
                    name="w" + str(hidden_idx))
    b = tf.Variable(tf.zeros(n_units), name="b" + str(hidden_idx))
    layers.append([w, b])

# finally add the output layer: n_units -> 10 outputs (one per class)
w_out = tf.Variable(tf.random.uniform([n_units, 10], -w_range, w_range),
                    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# flatten the layers to get a single list of variables (w, b, w, b, ...),
# which is what the gradient computation is taken with respect to
all_variables = [variable for layer in layers for variable in layer]

In [9]:
def model_forward(inputs):
    """Run `inputs` through every layer in the global `layers` list.

    All layers except the last apply an affine map followed by ReLU. The
    last layer applies only the affine map, so the result is raw logits:
    no softmax is applied here.

    Args:
        inputs: batch of inputs compatible with the first layer's weights.

    Returns:
        The output layer's pre-softmax activations (logits).
    """
    *hidden_layers, (w_final, b_final) = layers

    activations = inputs
    for weights, bias in hidden_layers:
        activations = tf.nn.relu(tf.matmul(activations, weights) + bias)

    # Deliberately NOT wrapped in tf.nn.softmax: the values are returned as
    # plain logits.
    logits = tf.matmul(activations, w_final) + b_final
    return logits


# Plain gradient descent: learning rate and step budget.
lr = 0.1
train_steps = 2000

for step, (img_batch, lbl_batch) in enumerate(train_data):
    # NOTE: the bound is inclusive -- steps 0..train_steps all run, i.e.
    # train_steps + 1 updates, which is also why step `train_steps` itself
    # still gets logged below.
    if step > train_steps:
        break

    # Forward pass under the tape so the loss can be differentiated.
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # One gradient-descent update per variable: var <- var - lr * grad.
    grads = tape.gradient(xent, all_variables)
    for var, grad in zip(all_variables, grads):
        var.assign_sub(lr * grad)

    # Every 100 steps report the loss and accuracy of the current batch
    # (both computed from the logits obtained before this step's update).
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        is_correct = tf.equal(preds, lbl_batch)
        acc = tf.reduce_mean(tf.cast(is_correct, tf.float32))
        print("Loss: {} Accuracy: {}".format(xent, acc))

Loss: 2.310793876647949 Accuracy: 0.0546875
Loss: 0.600801944732666 Accuracy: 0.8203125
Loss: 0.4528772532939911 Accuracy: 0.859375
Loss: 0.34800082445144653 Accuracy: 0.890625
Loss: 0.4200986623764038 Accuracy: 0.8671875
Loss: 0.28800642490386963 Accuracy: 0.9296875
Loss: 0.1946699023246765 Accuracy: 0.9296875
Loss: 0.32868507504463196 Accuracy: 0.890625
Loss: 0.3479222059249878 Accuracy: 0.9296875
Loss: 0.22616975009441376 Accuracy: 0.9609375
Loss: 0.24219101667404175 Accuracy: 0.9375
Loss: 0.22506286203861237 Accuracy: 0.9453125
Loss: 0.1369064599275589 Accuracy: 0.96875
Loss: 0.2659686207771301 Accuracy: 0.9296875
Loss: 0.14065642654895782 Accuracy: 0.9609375
Loss: 0.12561121582984924 Accuracy: 0.9609375
Loss: 0.1326696276664734 Accuracy: 0.953125
Loss: 0.2205049991607666 Accuracy: 0.9296875
Loss: 0.10521896183490753 Accuracy: 0.9609375
Loss: 0.16275766491889954 Accuracy: 0.9609375
Loss: 0.12569619715213776 Accuracy: 0.9375


The problem was diagnosed and fixed by removing the softmax from the model output, i.e. at the variable named `logits` above. The loss function used here, `tf.nn.sparse_softmax_cross_entropy_with_logits`, expects raw (unnormalized) logits and applies the softmax internally, so you do not need to apply softmax yourself unless you need probabilities; working with raw logits also leads to more numerically stable code. During evaluation, if you are only interested in the highest-probability class, you can simply take `argmax` of the logits. If you want a probability distribution over the classes, you need to exponentiate the logits and normalize them so they sum to 1 — that is what softmax does. After this fix, the loss decreases further and the accuracy improves.

In [10]:
# Evaluate on the whole test set in one forward pass: the predicted class is
# the argmax of the logits, and accuracy is the mean of the per-example hits.
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
test_hits = tf.cast(tf.equal(test_preds, test_labels), tf.float32)
acc = tf.reduce_mean(test_hits)
print("Final test accuracy: {}".format(acc))

Final test accuracy: 0.9610000252723694
