In [14]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt

In [15]:
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()

train_images = train_images.astype("float32") / 255.0
test_images = test_images.astype("float32") / 255.0

train_images = train_images.reshape((60000, 28 * 28))
test_images = test_images.reshape((10000, 28 * 28))

In [16]:
def compile_and_train(model):
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=1000,
        decay_rate=0.96,
        staircase=True,
    )

    optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

    early_stop = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    history = model.fit(
        train_images,
        train_labels,
        validation_split=0.2,
        epochs=50,
        batch_size=128,
        callbacks=[early_stop],
        verbose=1,
    )
    return history

In [17]:
model_1 = keras.Sequential([
    keras.layers.Input(shape=(784,)),
    keras.layers.Dense(512, kernel_constraint=keras.constraints.max_norm(3)),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.3),

    keras.layers.Dense(128, kernel_constraint=keras.constraints.max_norm(3)),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.2),

    keras.layers.Dense(10, activation="softmax"),
])

In [18]:
model_2 = keras.Sequential([
    keras.layers.Input(shape=(784,)),
    keras.layers.Dense(256, kernel_constraint=keras.constraints.max_norm(3)),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.25),

    keras.layers.Dense(128, kernel_constraint=keras.constraints.max_norm(3)),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.25),

    keras.layers.Dense(64, kernel_constraint=keras.constraints.max_norm(3)),
    keras.layers.Activation("relu"),
    keras.layers.Dropout(0.2),

    keras.layers.Dense(10, activation="softmax"),
])

In [19]:
history_3 = compile_and_train(model_1)
history_4 = compile_and_train(model_2)

Epoch 1/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 20ms/step - accuracy: 0.8134 - loss: 0.6045 - val_accuracy: 0.9581 - val_loss: 0.1406
Epoch 2/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 17ms/step - accuracy: 0.9521 - loss: 0.1551 - val_accuracy: 0.9702 - val_loss: 0.0994
Epoch 3/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9670 - loss: 0.1090 - val_accuracy: 0.9756 - val_loss: 0.0839
Epoch 4/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.9751 - loss: 0.0822 - val_accuracy: 0.9747 - val_loss: 0.0856
Epoch 5/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9791 - loss: 0.0684 - val_accuracy: 0.9773 - val_loss: 0.0765
Epoch 6/50
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 19ms/step - accuracy: 0.9827 - loss: 0.0553 - val_accuracy: 0.9782 - val_loss: 0.0768
Epoch 7/50
[1m375

In [20]:
def plot_history(histories, names):
    plt.figure(figsize=(14, 5))

    # Loss
    plt.subplot(1, 2, 1)
    for history, name in zip(histories, names):
        plt.plot(history.history["loss"], label=f"{name} Train")
        plt.plot(history.history["val_loss"], label=f"{name} Val")
    plt.title("Training and Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    # Accuracy
    plt.subplot(1, 2, 2)
    for history, name in zip(histories, names):
        plt.plot(history.history["accuracy"], label=f"{name} Train")
        plt.plot(history.history["val_accuracy"], label=f"{name} Val")
    plt.title("Training and Validation Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()

    plt.tight_layout()
    plt.show()

plot_history([history_3, history_4], ["Model 1", "Model 2"])

In [23]:
test_loss_3, test_acc_3 = model_1.evaluate(test_images, test_labels, verbose=0)
test_loss_4, test_acc_4 = model_2.evaluate(test_images, test_labels, verbose=0)

In [25]:
print(f"Model 1 - Test Accuracy: {test_acc_3:.4f}, Test Loss: {test_loss_3:.4f}")
print(f"Model 2 - Test Accuracy: {test_acc_4:.4f}, Test Loss: {test_loss_4:.4f}")

Model 1 - Test Accuracy: 0.9810, Test Loss: 0.0623
Model 2 - Test Accuracy: 0.9780, Test Loss: 0.0750


| Metrics | Model 1 (Droput) | Model 2 (MaxNorm) |
|:--------:|:--------:|:--------:|
|  **Regularization**   |  Droput (0.3 & 0.2)   |  MaxNorm constraint (limit = 3.0)   |
|  **Final training loss**   |  ~0.04   |  ~0.035   |
|  **Final validation loss**   |  0.05   |  0.045   |
|  **Test accuracy**    |  97.80%   |  98.10%   |
|  **Test loss**   |  0.0750   |  0.0623   |

####  Model 3 – Dropout Regularization

- Applied **Dropout** with 30% and 20% dropout rates in two hidden layers.
- Dropout helps reduce overfitting by randomly deactivating neurons during training.
- Test accuracy reached **97.80%**, which is strong, but the model exhibited **higher test loss (0.0750)**.
- Slightly lower generalization performance and prediction confidence compared to MaxNorm.

---

####  Model 4 – MaxNorm Constraint Regularization

- Used **MaxNorm constraint** to limit the L2 norm of weight vectors in Dense layers.
- Achieved **higher test accuracy (98.10%)** and **lower test loss (0.0623)** than the Dropout model.
- MaxNorm helped the model learn stable representations while effectively preventing overfitting.
- Demonstrated **better generalization** and **confidence in predictions**.

---