In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train.reshape(-1, 784).astype(np.float32) / 255.0
x_test = x_test.reshape(-1, 784).astype(np.float32) / 255.0
y_train = tf.one_hot(y_train, depth=10)
y_test = tf.one_hot(y_test, depth=10)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [2]:
def train_model(hidden1=128, hidden2=64, activation=tf.nn.relu, lr=0.01, epochs=10, batch_size=100):
    input_size = 784
    output_size = 10

    W1 = tf.Variable(tf.random.normal([input_size, hidden1], stddev=0.1))
    b1 = tf.Variable(tf.zeros([hidden1]))
    W2 = tf.Variable(tf.random.normal([hidden1, hidden2], stddev=0.1))
    b2 = tf.Variable(tf.zeros([hidden2]))
    W3 = tf.Variable(tf.random.normal([hidden2, output_size], stddev=0.1))
    b3 = tf.Variable(tf.zeros([output_size]))

    def forward(x):
        z1 = tf.matmul(x, W1) + b1
        a1 = activation(z1)
        z2 = tf.matmul(a1, W2) + b2
        a2 = activation(z2)
        return tf.matmul(a2, W3) + b3

    def compute_loss(logits, labels):
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))

    def compute_accuracy(logits, labels):
        preds = tf.argmax(logits, axis=1)
        actual = tf.argmax(labels, axis=1)
        return tf.reduce_mean(tf.cast(tf.equal(preds, actual), tf.float32))

    num_batches = x_train.shape[0] // batch_size
    for epoch in range(epochs):
        avg_loss = 0
        for i in range(num_batches):
            start, end = i * batch_size, (i + 1) * batch_size
            x_batch = x_train[start:end]
            y_batch = y_train[start:end]
            with tf.GradientTape() as tape:
                logits = forward(x_batch)
                loss = compute_loss(logits, y_batch)
            grads = tape.gradient(loss, [W1, b1, W2, b2, W3, b3])
            for var, grad in zip([W1, b1, W2, b2, W3, b3], grads):
                var.assign_sub(lr * grad)
            avg_loss += loss.numpy()

        test_logits = forward(x_test)
        test_acc = compute_accuracy(test_logits, y_test)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Accuracy: {test_acc.numpy():.4f}")


In [3]:
# Try different activation functions and hidden layer sizes
train_model(hidden1=128, hidden2=64, activation=tf.nn.relu, lr=0.01, epochs=10, batch_size=100)
train_model(hidden1=256, hidden2=128, activation=tf.nn.relu, lr=0.005, epochs=10, batch_size=64)

Epoch 1/10, Loss: 769.3507, Accuracy: 0.8203
Epoch 2/10, Loss: 314.3485, Accuracy: 0.8794
Epoch 3/10, Loss: 241.1804, Accuracy: 0.8977
Epoch 4/10, Loss: 210.6573, Accuracy: 0.9068
Epoch 5/10, Loss: 192.2643, Accuracy: 0.9132
Epoch 6/10, Loss: 178.9829, Accuracy: 0.9187
Epoch 7/10, Loss: 168.4487, Accuracy: 0.9225
Epoch 8/10, Loss: 159.6407, Accuracy: 0.9256
Epoch 9/10, Loss: 152.0442, Accuracy: 0.9288
Epoch 10/10, Loss: 145.3033, Accuracy: 0.9308
Epoch 1/10, Loss: 930.0171, Accuracy: 0.8579
Epoch 2/10, Loss: 421.8740, Accuracy: 0.8918
Epoch 3/10, Loss: 340.5153, Accuracy: 0.9061
Epoch 4/10, Loss: 301.7886, Accuracy: 0.9122
Epoch 5/10, Loss: 276.7086, Accuracy: 0.9176
Epoch 6/10, Loss: 258.0137, Accuracy: 0.9226
Epoch 7/10, Loss: 242.9494, Accuracy: 0.9263
Epoch 8/10, Loss: 230.2112, Accuracy: 0.9303
Epoch 9/10, Loss: 219.1077, Accuracy: 0.9329
Epoch 10/10, Loss: 209.2189, Accuracy: 0.9358
