## Anurag Nagarkoti (239426), Wahab Haseeb Bhatti (239978), Suyash Gawandi (239716)

In [264]:
# set up log dir and file writer(s)
import os
from datetime import datetime

# Imported here (not only in a later cell) so this cell also works on a fresh
# kernel when the notebook is run top to bottom.
import tensorflow as tf

# Timestamp the run directory so repeated runs do not overwrite each other.
# The compact format avoids the spaces and colons of str(datetime.now()),
# which are not valid in directory names on every platform.
logdir = os.path.join("logs", "fail" + datetime.now().strftime("%Y%m%d-%H%M%S"))
train_writer = tf.summary.create_file_writer(os.path.join(logdir, "train"))


In [265]:
import tensorflow as tf
import numpy as np


# get the data: load MNIST through tf.keras; the call result is unpacked into
# a (images, labels) pair for training and another one for testing
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()


def preprocess_images(images):
    """Flatten every image into a 784-element row and divide by 255, as float32."""
    flat_pixels = images.reshape(-1, 784)
    as_float = flat_pixels.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Return the labels as a flat (1-D) int32 array."""
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)


# Flatten/scale the images and cast the labels with the helpers defined above.
train_images, test_images = preprocess_images(train_images), preprocess_images(test_images)
train_labels, test_labels = preprocess_labels(train_labels), preprocess_labels(test_labels)

# Shuffled mini-batches of 128 that repeat forever; the training loop decides
# when to stop.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(60000)
    .batch(128)
    .repeat()
)
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

n_units = 100   # width of every hidden layer
n_layers = 8    # number of hidden layers (a deliberately deep model)

# weights are drawn from U(-w_range, w_range); reduced from 0.4 to 0.1
w_range = 0.1

# The model is a list with one [weights, bias] pair per layer, in forward order.
layers = []

# input layer: 784 -> n_units
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers.append([w_input, b_input])

# remaining hidden layers: n_units -> n_units
for layer in range(n_layers - 1):
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w{}".format(layer + 1))
    b = tf.Variable(tf.zeros(n_units), name="b{}".format(layer + 1))
    layers.append([w, b])

# output layer: n_units -> 10 classes
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# Flat list [w0, b0, w1, b1, ..., wout, bout] used for gradients and updates.
all_variables = [var for pair in layers for var in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer except the last applies a ReLU to its affine output; the last
    layer is purely affine, so the returned values are raw logits.
    """
    w_last, b_last = layers[-1]
    activations = inputs
    for weights, bias in layers[:-1]:
        pre_activation = tf.matmul(activations, weights) + bias
        activations = tf.nn.relu(pre_activation)
    return tf.matmul(activations, w_last) + b_last


lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly (runs steps 0..train_steps)
    if step > train_steps:
        break

    with tf.GradientTape() as tape:
        # here we just run all the layers in sequence via a for-loop
        logits = model_forward(img_batch)
        xent = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=lbl_batch))

    # plain gradient-descent update of every variable
    grads = tape.gradient(xent, all_variables)
    for grad, var in zip(grads, all_variables):
        var.assign_sub(lr*grad)

    # Compute the batch accuracy *before* logging it. Previously `acc` was only
    # assigned inside the every-100-steps branch further down, so the summary
    # below raised a NameError at step 0 on a fresh kernel and otherwise logged
    # a value that was up to 100 steps old.
    preds = tf.argmax(logits, axis=1, output_type=tf.int32)
    acc = tf.reduce_mean(tf.cast(tf.equal(preds, lbl_batch), tf.float32))

    # per-step TensorBoard summaries for this training batch
    with train_writer.as_default():
        tf.summary.scalar("accuracy", acc, step=step)
        tf.summary.scalar("loss", xent, step=step)
        tf.summary.histogram("logits", (logits), step=step)
        tf.summary.histogram("weights", (w_input), step=step)

    # console progress every 100 steps
    if not step % 100:
        print("Loss: {} Accuracy: {}".format(xent, acc))


# final evaluation on the whole test set in a single forward pass
test_preds = model_forward(test_images)
test_preds = tf.argmax(test_preds, axis=1, output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds, test_labels), tf.float32))
print("Final test accuracy: {}".format(acc))


Loss: 2.302590847015381 Accuracy: 0.0703125
Loss: 2.3047566413879395 Accuracy: 0.0859375
Loss: 2.3044583797454834 Accuracy: 0.1015625
Loss: 2.304006576538086 Accuracy: 0.125
Loss: 2.3008244037628174 Accuracy: 0.1015625
Loss: 2.298145055770874 Accuracy: 0.1484375
Loss: 2.292484998703003 Accuracy: 0.140625
Loss: 2.2876052856445312 Accuracy: 0.15625
Loss: 2.2033913135528564 Accuracy: 0.1953125
Loss: 1.8391205072402954 Accuracy: 0.234375
Loss: 1.8448164463043213 Accuracy: 0.1953125
Loss: 1.4252291917800903 Accuracy: 0.40625
Loss: 1.2602057456970215 Accuracy: 0.46875
Loss: 1.7310227155685425 Accuracy: 0.3046875
Loss: 1.1376526355743408 Accuracy: 0.6171875
Loss: 0.907447874546051 Accuracy: 0.734375
Loss: 0.6712876558303833 Accuracy: 0.765625
Loss: 0.6052484512329102 Accuracy: 0.765625
Loss: 0.7400422096252441 Accuracy: 0.7734375
Loss: 0.516584038734436 Accuracy: 0.8515625
Loss: 0.23684126138687134 Accuracy: 0.9453125
Final test accuracy: 0.9134999513626099


In [152]:
# Exploding gradient check: all_variables alternates weights and biases, so
# the even positions 0, 2, ..., 14 of `grads` are weight-matrix gradients.
for weight_idx in range(0, 16, 2):
    grad_mean = tf.reduce_mean(grads[weight_idx])
    print(grad_mean)

tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)


In [166]:
# Mean of the weight matrix (element 0 of each [w, b] pair) of the first 8 layers
for layer_idx in range(8):
    layer_weights = layers[layer_idx][0]
    print(tf.reduce_mean(layer_weights))

tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)
tf.Tensor(nan, shape=(), dtype=float32)


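# Fail 1 Solution:
- The logits did not have the correct activation function for the cross-entropy loss
- The initial weights were also too large: with 8 hidden layers the gradients exploded to NaN, so the weight range was reduced from 0.4 to 0.1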

In [261]:
# Load the TensorBoard notebook extension so the %tensorboard magic used below is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [266]:
# Show TensorBoard for everything below "logs", the parent directory of the run's logdir.
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 785), started 0:01:09 ago. (Use '!kill 785' to kill it.)

# Fail 2

In [251]:
# Remove the summaries of earlier runs. Done in Python rather than via a bare
# `rm`, which only works through IPython's automagic on a POSIX shell.
# ignore_errors=True mirrors `rm -rf`: a missing directory is not an error.
import shutil

shutil.rmtree("logs", ignore_errors=True)

In [201]:
# Fresh, timestamped run directory and summary writer for this experiment.
# The compact timestamp avoids the spaces and colons of str(datetime.now()),
# which are not valid in directory names on every platform.
logdir = os.path.join("logs", "fail" + datetime.now().strftime("%Y%m%d-%H%M%S"))
train_writer = tf.summary.create_file_writer(os.path.join(logdir, "train"))

In [202]:
import tensorflow as tf
import numpy as np


# get the data: load MNIST through tf.keras; the call result is unpacked into
# a (images, labels) pair for training and another one for testing
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()


def preprocess_images(images):
    """Flatten every image into a 784-element row and divide by 255, as float32."""
    flat_pixels = images.reshape(-1, 784)
    as_float = flat_pixels.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Return the labels as a flat (1-D) int32 array."""
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)


# Flatten/scale the images and cast the labels with the helpers defined above.
train_images, test_images = preprocess_images(train_images), preprocess_images(test_images)
train_labels, test_labels = preprocess_labels(train_labels), preprocess_labels(test_labels)

# Shuffled mini-batches of 128 that repeat forever; the training loop decides
# when to stop.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(60000)
    .batch(128)
    .repeat()
)
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

n_units = 100   # width of every hidden layer
n_layers = 8    # number of hidden layers (a deliberately deep model)
w_range = 0.1   # weights are drawn from U(-w_range, w_range)

# The model is a list with one [weights, bias] pair per layer, in forward order.
layers = []

# input layer: 784 -> n_units
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers.append([w_input, b_input])

# remaining hidden layers: n_units -> n_units
for layer in range(n_layers - 1):
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w{}".format(layer + 1))
    b = tf.Variable(tf.zeros(n_units), name="b{}".format(layer + 1))
    layers.append([w, b])

# output layer: n_units -> 10 classes
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# Flat list [w0, b0, w1, b1, ..., wout, bout] used for gradients and updates.
all_variables = [var for pair in layers for var in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer except the last applies a sigmoid to its affine output; the
    last layer is purely affine, so the returned values are raw logits.
    """
    w_last, b_last = layers[-1]
    activations = inputs
    for weights, bias in layers[:-1]:
        pre_activation = tf.matmul(activations, weights) + bias
        activations = tf.nn.sigmoid(pre_activation)
    return tf.matmul(activations, w_last) + b_last


lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly (runs steps 0..train_steps)
    if step > train_steps:
        break

    # forward pass and mean cross-entropy, recorded on the tape
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # plain gradient-descent update of every variable
    grads = tape.gradient(xent, all_variables)
    for grad, var in zip(grads, all_variables):
        var.assign_sub(lr * grad)

    # every 100 steps: report batch accuracy and write TensorBoard summaries
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        is_correct = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(is_correct)
        print("Loss: {} Accuracy: {}".format(xent, acc))

        with train_writer.as_default():
            tf.summary.scalar("accuracy", acc, step=step)
            tf.summary.scalar("loss", xent, step=step)
            tf.summary.histogram("logits", logits, step=step)
            tf.summary.histogram("weights", w_input, step=step)
            # one gradient histogram per variable, to see where gradients vanish
            for grad, var in zip(grads, all_variables):
                tf.summary.histogram(var.name + '_grad', grad, step=step)


# final evaluation on the whole test set in a single forward pass
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds, test_labels), tf.float32))
print("Final test accuracy: {}".format(acc))


Loss: 2.3205761909484863 Accuracy: 0.0859375
Loss: 2.3008761405944824 Accuracy: 0.125
Loss: 2.2974891662597656 Accuracy: 0.078125
Loss: 2.307187557220459 Accuracy: 0.0546875
Loss: 2.2981297969818115 Accuracy: 0.1484375
Loss: 2.2920022010803223 Accuracy: 0.1171875
Loss: 2.3081936836242676 Accuracy: 0.0859375
Loss: 2.3023176193237305 Accuracy: 0.1015625
Loss: 2.2901558876037598 Accuracy: 0.140625
Loss: 2.303689956665039 Accuracy: 0.125
Loss: 2.30786395072937 Accuracy: 0.125
Loss: 2.3128044605255127 Accuracy: 0.09375
Loss: 2.3132457733154297 Accuracy: 0.078125
Loss: 2.3168492317199707 Accuracy: 0.09375
Loss: 2.304708480834961 Accuracy: 0.109375
Loss: 2.312185764312744 Accuracy: 0.0625
Loss: 2.306908130645752 Accuracy: 0.09375
Loss: 2.3023087978363037 Accuracy: 0.15625
Loss: 2.30953311920166 Accuracy: 0.046875
Loss: 2.297300338745117 Accuracy: 0.1015625
Loss: 2.300079345703125 Accuracy: 0.0859375
Final test accuracy: 0.09799999743700027


In [203]:
# Vanishing gradient check: all_variables alternates weights and biases, so
# the even positions 0, 2, ..., 14 of `grads` are weight-matrix gradients.
for weight_idx in range(0, 16, 2):
    grad_mean = tf.reduce_mean(grads[weight_idx])
    print(grad_mean)

tf.Tensor(7.712398e-11, shape=(), dtype=float32)
tf.Tensor(-8.9114036e-11, shape=(), dtype=float32)
tf.Tensor(4.444703e-09, shape=(), dtype=float32)
tf.Tensor(2.9256881e-08, shape=(), dtype=float32)
tf.Tensor(-2.7893222e-07, shape=(), dtype=float32)
tf.Tensor(-2.359258e-06, shape=(), dtype=float32)
tf.Tensor(-7.1668983e-06, shape=(), dtype=float32)
tf.Tensor(1.3815185e-05, shape=(), dtype=float32)


In [204]:
%load_ext tensorboard
# The summary writer for this experiment writes below "logs" (see `logdir`),
# so TensorBoard has to be pointed there; "logs_fail2" is never written to.
%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 96972), started 0:02:42 ago. (Use '!kill 96972' to kill it.)

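# Fail 2: Vanishing Gradient Problem
- With sigmoid activations the gradients shrink from layer to layer (mean weight gradient of about 1e-11 in the first layer versus about 1e-5 in the last hidden layer above), so the network does not learn
- Fixed by changing the activation function from sigmoid to ReLU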

# Fail 3

In [233]:
import tensorflow as tf
import numpy as np


# get the data: load MNIST through tf.keras; the call result is unpacked into
# a (images, labels) pair for training and another one for testing
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()


def preprocess_images(images):
    """Flatten every image into a 784-element row and divide by 255, as float32."""
    flat_pixels = images.reshape(-1, 784)
    as_float = flat_pixels.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Return the labels as a flat (1-D) int32 array."""
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)


# Flatten/scale the images and cast the labels with the helpers defined above.
train_images, test_images = preprocess_images(train_images), preprocess_images(test_images)
train_labels, test_labels = preprocess_labels(train_labels), preprocess_labels(test_labels)

# Shuffled mini-batches of 128 that repeat forever; the training loop decides
# when to stop.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(60000)
    .batch(128)
    .repeat()
)
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

n_units = 100   # width of every hidden layer
n_layers = 2    # only two hidden layers this time
w_range = 0.1   # weights are drawn from U(-w_range, 0), i.e. all non-positive

# The model is a list with one [weights, bias] pair per layer, in forward order.
layers = []

# input layer: 784 -> n_units
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=0.),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers.append([w_input, b_input])

# remaining hidden layers: n_units -> n_units
for layer in range(n_layers - 1):
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=0.),
        name="w{}".format(layer + 1))
    b = tf.Variable(tf.zeros(n_units), name="b{}".format(layer + 1))
    layers.append([w, b])

# output layer: n_units -> 10 classes
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=0.),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# Flat list [w0, b0, w1, b1, wout, bout] used for gradients and updates.
all_variables = [var for pair in layers for var in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer except the last applies a leaky ReLU to its affine output; the
    last layer is purely affine, so the returned values are raw logits.
    """
    w_last, b_last = layers[-1]
    activations = inputs
    for weights, bias in layers[:-1]:
        pre_activation = tf.matmul(activations, weights) + bias
        activations = tf.nn.leaky_relu(pre_activation)
    return tf.matmul(activations, w_last) + b_last


lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly (runs steps 0..train_steps)
    if step > train_steps:
        break

    # forward pass and mean cross-entropy, recorded on the tape
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # plain gradient-descent update of every variable
    grads = tape.gradient(xent, all_variables)
    for grad, var in zip(grads, all_variables):
        var.assign_sub(lr * grad)

    # every 100 steps: report loss and accuracy of the current batch
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        is_correct = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(is_correct)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# final evaluation on the whole test set in a single forward pass
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds, test_labels), tf.float32))
print("Final test accuracy: {}".format(acc))


Loss: 3.4199376106262207 Accuracy: 0.09375
Loss: 2.246652126312256 Accuracy: 0.1328125
Loss: 2.3341312408447266 Accuracy: 0.09375
Loss: 1.8768280744552612 Accuracy: 0.2890625
Loss: 1.7602627277374268 Accuracy: 0.375
Loss: 1.4352123737335205 Accuracy: 0.5078125
Loss: 1.2670220136642456 Accuracy: 0.5078125
Loss: 0.9759520292282104 Accuracy: 0.671875
Loss: 0.7710796594619751 Accuracy: 0.8125
Loss: 0.7946479320526123 Accuracy: 0.7421875
Loss: 0.5637328624725342 Accuracy: 0.84375
Loss: 0.5424056649208069 Accuracy: 0.859375
Loss: 0.6422869563102722 Accuracy: 0.796875
Loss: 0.4663069546222687 Accuracy: 0.8828125
Loss: 0.43985897302627563 Accuracy: 0.859375
Loss: 0.4139168858528137 Accuracy: 0.890625
Loss: 0.5024144649505615 Accuracy: 0.84375
Loss: 0.2682867646217346 Accuracy: 0.9296875
Loss: 0.32801398634910583 Accuracy: 0.90625
Loss: 0.5202920436859131 Accuracy: 0.859375
Loss: 0.5544601082801819 Accuracy: 0.8359375
Final test accuracy: 0.8889999985694885


In [223]:
# Analysing the gradients of the weights: all_variables alternates weights
# and biases, so positions 0, 2 and 4 of `grads` belong to the weight matrices.
for weight_idx in range(0, 6, 2):
    grad_mean = tf.reduce_mean(grads[weight_idx])
    print(grad_mean)

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)


In [234]:
# Same check after switching to leaky ReLU: mean gradient of each weight
# matrix (positions 0, 2 and 4 of `grads`).
for weight_idx in range(0, 6, 2):
    grad_mean = tf.reduce_mean(grads[weight_idx])
    print(grad_mean)

tf.Tensor(-2.7846037e-05, shape=(), dtype=float32)
tf.Tensor(0.0006243043, shape=(), dtype=float32)
tf.Tensor(-7.450581e-11, shape=(), dtype=float32)


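# Fail 3: Dying ReLU problem: all weights are initialised with negative values, so the ReLU units output 0 and the mean of the gradients is 0; switching to leaky ReLU restores non-zero gradients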

# Fail 4

In [230]:
import tensorflow as tf
import numpy as np


# get the data: load MNIST through tf.keras; the call result is unpacked into
# a (images, labels) pair for training and another one for testing
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()


def preprocess_images(images):
    """Flatten every image into a 784-element row and divide by 255, as float32."""
    flat_pixels = images.reshape(-1, 784)
    as_float = flat_pixels.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Return the labels as a flat (1-D) int32 array."""
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)


# Flatten/scale the images and cast the labels with the helpers defined above.
train_images, test_images = preprocess_images(train_images), preprocess_images(test_images)
train_labels, test_labels = preprocess_labels(train_labels), preprocess_labels(test_labels)

# Shuffled mini-batches of 128 that repeat forever; the training loop decides
# when to stop.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(60000)
    .batch(128)
    .repeat()
)
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

n_units = 100   # width of every hidden layer
n_layers = 2    # two hidden layers again
w_range = 0.1   # weights are drawn from U(-w_range, w_range)

# The model is a list with one [weights, bias] pair per layer, in forward order.
layers = []

# input layer: 784 -> n_units
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers.append([w_input, b_input])

# remaining hidden layers: n_units -> n_units
for layer in range(n_layers - 1):
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w{}".format(layer + 1))
    b = tf.Variable(tf.zeros(n_units), name="b{}".format(layer + 1))
    layers.append([w, b])

# output layer: n_units -> 10 classes
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# Flat list [w0, b0, w1, b1, wout, bout] used for gradients and updates.
all_variables = [var for pair in layers for var in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer except the last applies a ReLU to its affine output; the last
    layer is purely affine, so the returned values are raw logits.
    """
    w_last, b_last = layers[-1]
    activations = inputs
    for weights, bias in layers[:-1]:
        pre_activation = tf.matmul(activations, weights) + bias
        activations = tf.nn.relu(pre_activation)
    return tf.matmul(activations, w_last) + b_last

lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly (runs steps 0..train_steps)
    if step > train_steps:
        break

    # Input noise with stddev=1 is intentionally left disabled (see the Fail 4
    # note below this cell):
    #img_batch += tf.random.normal(tf.shape(img_batch), stddev=1)

    # forward pass and mean cross-entropy, recorded on the tape
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # plain gradient-descent update of every variable
    grads = tape.gradient(xent, all_variables)
    for grad, var in zip(grads, all_variables):
        var.assign_sub(lr * grad)

    # every 100 steps: report loss and accuracy of the current batch
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        is_correct = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(is_correct)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# final evaluation on the whole test set in a single forward pass
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds, test_labels), tf.float32))
print("Final test accuracy: {}".format(acc))


Loss: 2.2976608276367188 Accuracy: 0.1328125
Loss: 0.7102692127227783 Accuracy: 0.78125
Loss: 0.41992825269699097 Accuracy: 0.8515625
Loss: 0.30277734994888306 Accuracy: 0.8984375
Loss: 0.28708428144454956 Accuracy: 0.9140625
Loss: 0.2686609625816345 Accuracy: 0.921875
Loss: 0.3374359607696533 Accuracy: 0.8984375
Loss: 0.22945359349250793 Accuracy: 0.921875
Loss: 0.3092650771141052 Accuracy: 0.890625
Loss: 0.38359490036964417 Accuracy: 0.890625
Loss: 0.3187181055545807 Accuracy: 0.9296875
Loss: 0.1970217227935791 Accuracy: 0.9296875
Loss: 0.14141243696212769 Accuracy: 0.9375
Loss: 0.1671806275844574 Accuracy: 0.9375
Loss: 0.21383097767829895 Accuracy: 0.953125
Loss: 0.19383956491947174 Accuracy: 0.9453125
Loss: 0.16973355412483215 Accuracy: 0.953125
Loss: 0.2713935673236847 Accuracy: 0.9296875
Loss: 0.13440853357315063 Accuracy: 0.953125
Loss: 0.13311249017715454 Accuracy: 0.9453125
Loss: 0.12100504338741302 Accuracy: 0.96875
Final test accuracy: 0.9558999538421631


# Fail 4: Removing the input noise gave a better generalisation error in this case (noise with a standard deviation of 1 is large compared to inputs scaled to [0, 1])

In [228]:
import tensorflow as tf
import numpy as np


# get the data: load MNIST through tf.keras; the call result is unpacked into
# a (images, labels) pair for training and another one for testing
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()


def preprocess_images(images):
    """Flatten every image into a 784-element row and divide by 255, as float32."""
    flat_pixels = images.reshape(-1, 784)
    as_float = flat_pixels.astype(np.float32)
    return as_float / 255


def preprocess_labels(labels):
    """Return the labels as a flat (1-D) int32 array."""
    flat_labels = labels.reshape(-1)
    return flat_labels.astype(np.int32)


# Flatten/scale the images and cast the labels with the helpers defined above.
train_images, test_images = preprocess_images(train_images), preprocess_images(test_images)
train_labels, test_labels = preprocess_labels(train_labels), preprocess_labels(test_labels)

# Shuffled mini-batches of 128 that repeat forever; the training loop decides
# when to stop.
train_data = (
    tf.data.Dataset.from_tensor_slices((train_images, train_labels))
    .shuffle(60000)
    .batch(128)
    .repeat()
)
#test_data = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(128)


# ---- model definition, from input to output ----

n_units = 100   # width of every hidden layer
n_layers = 2    # two hidden layers again
w_range = 0.1   # weights are drawn from U(-w_range, w_range)

# The model is a list with one [weights, bias] pair per layer, in forward order.
layers = []

# input layer: 784 -> n_units
w_input = tf.Variable(
    tf.random.uniform([784, n_units], minval=-w_range, maxval=w_range),
    name="w0")
b_input = tf.Variable(tf.zeros(n_units), name="b0")
layers.append([w_input, b_input])

# remaining hidden layers: n_units -> n_units
for layer in range(n_layers - 1):
    w = tf.Variable(
        tf.random.uniform([n_units, n_units], minval=-w_range, maxval=w_range),
        name="w{}".format(layer + 1))
    b = tf.Variable(tf.zeros(n_units), name="b{}".format(layer + 1))
    layers.append([w, b])

# output layer: n_units -> 10 classes
w_out = tf.Variable(
    tf.random.uniform([n_units, 10], minval=-w_range, maxval=w_range),
    name="wout")
b_out = tf.Variable(tf.zeros(10), name="bout")
layers.append([w_out, b_out])

# Flat list [w0, b0, w1, b1, wout, bout] used for gradients and updates.
all_variables = [var for pair in layers for var in pair]


def model_forward(inputs):
    """Run `inputs` through the network stored in the global `layers` list.

    Every layer except the last applies a ReLU to its affine output. The
    output layer is purely affine: no softmax is applied here, so the
    returned values are raw logits.
    """
    w_last, b_last = layers[-1]
    activations = inputs
    for weights, bias in layers[:-1]:
        pre_activation = tf.matmul(activations, weights) + bias
        activations = tf.nn.relu(pre_activation)
    # output layer: raw logits, softmax is deliberately not applied
    return tf.matmul(activations, w_last) + b_last


lr = 0.1
train_steps = 2000
for step, (img_batch, lbl_batch) in enumerate(train_data):
    # train_data repeats forever, so stop explicitly (runs steps 0..train_steps)
    if step > train_steps:
        break

    # forward pass and mean cross-entropy, recorded on the tape
    with tf.GradientTape() as tape:
        logits = model_forward(img_batch)
        per_example_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=lbl_batch, logits=logits)
        xent = tf.reduce_mean(per_example_xent)

    # plain gradient-descent update of every variable
    grads = tape.gradient(xent, all_variables)
    for grad, var in zip(grads, all_variables):
        var.assign_sub(lr * grad)

    # every 100 steps: report loss and accuracy of the current batch
    if step % 100 == 0:
        preds = tf.argmax(logits, axis=1, output_type=tf.int32)
        is_correct = tf.cast(tf.equal(preds, lbl_batch), tf.float32)
        acc = tf.reduce_mean(is_correct)
        print("Loss: {} Accuracy: {}".format(xent, acc))


# final evaluation on the whole test set in a single forward pass
test_logits = model_forward(test_images)
test_preds = tf.argmax(test_logits, axis=1, output_type=tf.int32)
acc = tf.reduce_mean(tf.cast(tf.equal(test_preds, test_labels), tf.float32))
print("Final test accuracy: {}".format(acc))


Loss: 2.2788195610046387 Accuracy: 0.2265625
Loss: 0.7461198568344116 Accuracy: 0.765625
Loss: 0.45109856128692627 Accuracy: 0.859375
Loss: 0.4517952799797058 Accuracy: 0.8359375
Loss: 0.16791829466819763 Accuracy: 0.9609375
Loss: 0.2859359681606293 Accuracy: 0.921875
Loss: 0.28815436363220215 Accuracy: 0.890625
Loss: 0.28165754675865173 Accuracy: 0.9140625
Loss: 0.22864007949829102 Accuracy: 0.9453125
Loss: 0.15202617645263672 Accuracy: 0.9453125
Loss: 0.2008635401725769 Accuracy: 0.9453125
Loss: 0.1860160082578659 Accuracy: 0.9453125
Loss: 0.1460484266281128 Accuracy: 0.9453125
Loss: 0.15097488462924957 Accuracy: 0.953125
Loss: 0.46626636385917664 Accuracy: 0.8671875
Loss: 0.13847529888153076 Accuracy: 0.9609375
Loss: 0.18282972276210785 Accuracy: 0.96875
Loss: 0.1205345094203949 Accuracy: 0.9765625
Loss: 0.07197416573762894 Accuracy: 0.984375
Loss: 0.15321514010429382 Accuracy: 0.96875
Loss: 0.14835239946842194 Accuracy: 0.953125
Final test accuracy: 0.9560999870300293


# Fail 5: `tf.nn.sparse_softmax_cross_entropy_with_logits` applies softmax to the logits internally, so the model must output the raw logits, i.e. the values before softmax