In [1]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers, models

In [2]:
(X_train, y_train), (X_test, y_test)= tf.keras.datasets.mnist.load_data()
X_train.shape, X_test.shape, y_train.shape, y_train.shape

((60000, 28, 28), (10000, 28, 28), (60000,), (60000,))

In [3]:
#Normalize the dataset to 0 and 1
X_train = X_train.astype("float32") / 255.0
X_test = X_test.astype("float32") / 255.0

In [4]:
#Add the grey scale channel (called channel 1) to the existing rows and columns.
X_train = X_train[..., None]
X_test = X_test[..., None]

In [5]:
val_size = 6000
X_val = X_train[:val_size]
y_val = y_train[:val_size]

X_tr = X_train[val_size:]
y_tr = y_train[val_size:]

In [6]:
X_val.shape, y_val.shape, X_tr.shape, y_tr.shape

((6000, 28, 28, 1), (6000,), (54000, 28, 28, 1), (54000,))

In [7]:
# RandomTranslation(height_factor=0.08, width_factor=0.08) moves (shifts) the whole image up/down and left/right by a random amount.
# height_factor=0.08 → shift up/down by up to 0.08 × 28 = 2.24 pixels
# width_factor=0.08 → shift left/right by up to 0.08 × 28 = 2.24 pixels
# So the digit might move about ±2 pixels vertically and ±2 pixels horizontally (roughly).
# When you shift an image, part of it moves out of frame and new blank area appears on the opposite side. Keras fills that new area (typically with 0 / black).

use_argmentation = True
augment = keras.Sequential([
    layers.RandomTranslation(height_factor=0.08, width_factor=0.08),
    layers.RandomRotation(0.06)
])

2026-01-11 13:36:47.370786: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2026-01-11 13:36:47.370806: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2026-01-11 13:36:47.370813: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2026-01-11 13:36:47.370825: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2026-01-11 13:36:47.370835: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
model = keras.Sequential([
    layers.Input(shape=(28, 28, 1)),
    layers.Conv2D(32, 3, padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.Conv2D(32, 3, padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),

    layers.Conv2D(64, 3, padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.Conv2D(64, 3, padding="same"),
    layers.BatchNormalization(),
    layers.Activation("relu"),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),

    layers.Flatten(),
    layers.Dense(128),
    layers.BatchNormalization(),
    layers.Activation('relu'),

    layers.Dense(10, activation="softmax")
])

In [9]:
use_sgd = False

if use_sgd:
    optimizer = keras.optimizers.SGD(learning_rate=0.05, momentum=0.9, nesterov=True)
else:
    optimizer = keras.optimizers.Adam(learning_rate=1e-3)

model.compile(
    optimizer=optimizer,
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)


In [10]:
model.summary()

In [11]:
# Callbacks: Early Stopping, Checkpoint, LR schedule
callbacks = [
    keras.callbacks.ModelCheckpoint(
        'mnist_best.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=3,
        restore_best_weights=True,
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=1,
        min_lr=1e-5,
        verbose=1
    )
]

In [12]:
history = model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=128,
    callbacks=callbacks,
    verbose=1  # set 1 to see per-epoch logs
)

Epoch 1/20


2026-01-11 13:36:48.258406: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.9085 - loss: 0.3134
Epoch 1: val_accuracy improved from None to 0.91717, saving model to mnist_best.keras
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 43ms/step - accuracy: 0.9619 - loss: 0.1363 - val_accuracy: 0.9172 - val_loss: 0.2668 - learning_rate: 0.0010
Epoch 2/20
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.9862 - loss: 0.0463
Epoch 2: val_accuracy improved from 0.91717 to 0.98767, saving model to mnist_best.keras
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 41ms/step - accuracy: 0.9871 - loss: 0.0439 - val_accuracy: 0.9877 - val_loss: 0.0390 - learning_rate: 0.0010
Epoch 3/20
[1m421/422[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 39ms/step - accuracy: 0.9885 - loss: 0.0362
Epoch 3: val_accuracy improved from 0.98767 to 0.99017, saving model to mnist_best.keras
[1m422/422[0m [32m━━━━━━━━━

In [13]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print("Test Accuracy = ", test_acc)

best = keras.models.load_model("mnist_best.keras")
best_test_loss, best_test_acc = best.evaluate(X_test, y_test, verbose=1)
print('Best saved model test accuracy ', best_test_acc)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9963 - loss: 0.0125
Test Accuracy =  0.9962999820709229
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9963 - loss: 0.0125
Best saved model test accuracy  0.9962999820709229
