In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Question 3: Multi-Layer Perceptron



In [2]:
# Load the MNIST train/test splits through Keras and unpack images and labels
mnist_train, mnist_test = tf.keras.datasets.mnist.load_data()
train_images, train_labels = mnist_train
test_images, test_labels = mnist_test

# Sanity-check the array shapes of the training split
print(f"Original training images shape: {train_images.shape}")
print(f"Original training labels shape: {train_labels.shape}")


Original training images shape: (60000, 28, 28)
Original training labels shape: (60000,)


In [3]:
# --- Preprocess the Data ---

# Normalize the pixel values to be between 0 and 1.
# The cast to float32 keeps the result in single precision; dividing the raw
# integer array directly would produce float64 instead.
# The dtype guard makes this cell safe to re-run: once the images have been
# converted to float they are not divided by 255 a second time.
if np.issubdtype(train_images.dtype, np.integer):
    train_images = train_images.astype('float32') / 255.0
if np.issubdtype(test_images.dtype, np.integer):
    test_images = test_images.astype('float32') / 255.0

# Flatten every image into one row vector (28 x 28 -> 784 values).
# Passing -1 lets NumPy infer the row length from the image size.
train_images_flat = train_images.reshape(train_images.shape[0], -1)
test_images_flat = test_images.reshape(test_images.shape[0], -1)

# Check the new shape to confirm it's correct
print(f"Original training images shape: {train_images.shape}")
print(f"Reshaped training images shape: {train_images_flat.shape}")
print(f"Reshaped test images shape: {test_images_flat.shape}")

Original training images shape: (60000, 28, 28)
Reshaped training images shape: (60000, 784)
Reshaped test images shape: (10000, 784)


In [4]:
# --- One-Hot Encode the Labels ---
# `to_categorical` is imported in the notebook's top import cell so that all
# dependencies are declared in one place.

# Convert each integer class label into a 10-element indicator vector
train_labels_one_hot = to_categorical(train_labels, num_classes=10)
test_labels_one_hot = to_categorical(test_labels, num_classes=10)

# Check the new shape of the labels and look at an example
print("Original training labels shape:", train_labels.shape)
print("One-hot encoded training labels shape:", train_labels_one_hot.shape)

print("\n--- Example ---")
print("Original first label:", train_labels[0])
print("One-hot encoded first label:", train_labels_one_hot[0])

Original training labels shape: (60000,)
One-hot encoded training labels shape: (60000, 10)

--- Example ---
Original first label: 5
One-hot encoded first label: [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [5]:
# --- Initialize MLP Parameters ---

# Network architecture: 784 inputs -> 20 hidden units -> 10 outputs
input_size = 784
hidden_nodes = 20
output_size = 10

# Step size applied to every parameter update during training
learning_rate = 0.1

# Use a dedicated, seeded generator so that re-running this cell (or the whole
# notebook after a kernel restart) always yields the same initial parameters.
SEED = 42
rng = np.random.default_rng(SEED)

# All parameters are drawn uniformly from [0, 0.1).
# NOTE(review): every initial weight is non-negative, which may push the
# hidden-layer sigmoids towards saturation at the start of training; a
# zero-centred initialisation might converge faster -- confirm before changing.

# 1. Input to Hidden Layer
weights_h = rng.random((input_size, hidden_nodes)) * 0.1
biases_h = rng.random(hidden_nodes) * 0.1

# 2. Hidden to Output Layer
weights_o = rng.random((hidden_nodes, output_size)) * 0.1
biases_o = rng.random(output_size) * 0.1

# Print the shapes of our new parameters to verify
print("--- MLP Parameters ---")
print("Input->Hidden Weights shape:", weights_h.shape)
print("Input->Hidden Biases shape:", biases_h.shape)
print("Hidden->Output Weights shape:", weights_o.shape)
print("Hidden->Output Biases shape:", biases_o.shape)

--- MLP Parameters ---
Input->Hidden Weights shape: (784, 20)
Input->Hidden Biases shape: (20,)
Hidden->Output Weights shape: (20, 10)
Hidden->Output Biases shape: (10,)


### Training on a small 6,000-sample subset for initial runs

In [6]:
# --- MLP Training Loop ---

# Number of complete passes over the training data
epochs = 10


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    The value is computed from exp(-|x|), which always lies in [0, 1], so
    large-magnitude inputs of either sign cannot overflow (the direct formula
    overflows in exp(-x) for large negative x and emits a RuntimeWarning).

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    NumPy scalar or ndarray
        Sigmoid of ``x`` with the same shape as ``x``; values lie in [0, 1].
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x).  For x < 0 the equivalent form
    # e^x / (1 + e^x) is used, where e^x == decay.
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return out[()]


# Train on the first SUBSET_SIZE samples only, to keep the initial runs short
SUBSET_SIZE = 6000
train_images_small = train_images_flat[:SUBSET_SIZE]
train_labels_small = train_labels_one_hot[:SUBSET_SIZE]

# The class indices of the labels do not change between epochs, so decode the
# one-hot rows once here instead of once per epoch.
true_classes = np.argmax(train_labels_small, axis=1)

for epoch in range(epochs):
    print(f"--- Epoch {epoch + 1}/{epochs} ---")

    # Parameters are updated after every single sample:
    # x is one flattened image, d is its one-hot label.
    for x, d in zip(train_images_small, train_labels_small):

        # 1. FORWARD PASS
        # Step 1.1: From Input to Hidden Layer
        hidden_layer_input = np.dot(x, weights_h) + biases_h
        hidden_layer_output = sigmoid(hidden_layer_input)

        # Step 1.2: From Hidden to Output Layer
        final_layer_input = np.dot(hidden_layer_output, weights_o) + biases_o
        final_output = sigmoid(final_layer_input)

        # 2. BACKWARD PASS (Backpropagation)

        # Step 2.1: Output-layer error term: the target/output difference
        # scaled by the sigmoid derivative y * (1 - y)
        delta_o = (d - final_output) * final_output * (1 - final_output)

        # Step 2.2: Hidden-layer error term: the output deltas propagated back
        # through weights_o (before they are updated below), scaled by the
        # hidden sigmoid derivative
        delta_h = np.dot(delta_o, weights_o.T) * hidden_layer_output * (1 - hidden_layer_output)

        # 3. UPDATE PARAMETERS (in place, so training continues from the
        # current values if this cell is re-run)

        # Step 3.1: Update Output Layer weights and biases
        weights_o += learning_rate * np.outer(hidden_layer_output, delta_o)
        biases_o += learning_rate * delta_o

        # Step 3.2: Update Hidden Layer weights and biases
        weights_h += learning_rate * np.outer(x, delta_h)
        biases_h += learning_rate * delta_h

    # 4. TRACK PERFORMANCE (at the end of each epoch)
    # Perform a full forward pass on the small training set to get the accuracy
    hidden_output = sigmoid(np.dot(train_images_small, weights_h) + biases_h)
    final_output = sigmoid(np.dot(hidden_output, weights_o) + biases_o)

    predicted_classes = np.argmax(final_output, axis=1)

    accuracy = np.mean(predicted_classes == true_classes)
    print(f"Training Accuracy: {accuracy * 100:.2f}%")

--- Epoch 1/10 ---
Training Accuracy: 10.13%
--- Epoch 2/10 ---
Training Accuracy: 20.72%
--- Epoch 3/10 ---
Training Accuracy: 51.87%
--- Epoch 4/10 ---
Training Accuracy: 72.02%
--- Epoch 5/10 ---
Training Accuracy: 76.38%
--- Epoch 6/10 ---
Training Accuracy: 81.02%
--- Epoch 7/10 ---
Training Accuracy: 86.10%
--- Epoch 8/10 ---
Training Accuracy: 87.72%
--- Epoch 9/10 ---
Training Accuracy: 89.18%
--- Epoch 10/10 ---
Training Accuracy: 90.07%


In [7]:
# --- Evaluate the MLP on the Test Set ---

# Push the whole test set through the trained network in one forward pass.
# Layer 1: input -> hidden activations
hidden_output_test = sigmoid(np.dot(test_images_flat, weights_h) + biases_h)
# Layer 2: hidden -> output activations
final_output_test = sigmoid(np.dot(hidden_output_test, weights_o) + biases_o)

# Predicted class = index of the largest output in each row;
# true class = index of the 1 in each one-hot label row
test_predicted_classes = final_output_test.argmax(axis=1)
test_true_classes = test_labels_one_hot.argmax(axis=1)

# Accuracy is the fraction of test images whose prediction matches the label
test_accuracy = (test_predicted_classes == test_true_classes).mean()

print(f"Final MLP Test Accuracy: {test_accuracy * 100:.2f}%")

Final MLP Test Accuracy: 86.19%


### Training on the full 60,000-sample set

In [8]:
# --- Initialize MLP Parameters ---
# Re-create every parameter from scratch so the full-data run does not start
# from values left behind by earlier training.

# Network architecture: 784 inputs -> 20 hidden units -> 10 outputs
input_size = 784
hidden_nodes = 20
output_size = 10

# Step size applied to every parameter update during training
learning_rate = 0.1

# Use a dedicated, seeded generator so that re-running this cell (or the whole
# notebook after a kernel restart) always yields the same initial parameters.
SEED = 42
rng = np.random.default_rng(SEED)

# All parameters are drawn uniformly from [0, 0.1).
# NOTE(review): every initial weight is non-negative, which may push the
# hidden-layer sigmoids towards saturation at the start of training; a
# zero-centred initialisation might converge faster -- confirm before changing.

# 1. Input to Hidden Layer
weights_h = rng.random((input_size, hidden_nodes)) * 0.1
biases_h = rng.random(hidden_nodes) * 0.1

# 2. Hidden to Output Layer
weights_o = rng.random((hidden_nodes, output_size)) * 0.1
biases_o = rng.random(output_size) * 0.1

# Print the shapes of our new parameters to verify
print("--- MLP Parameters ---")
print("Input->Hidden Weights shape:", weights_h.shape)
print("Input->Hidden Biases shape:", biases_h.shape)
print("Hidden->Output Weights shape:", weights_o.shape)
print("Hidden->Output Biases shape:", biases_o.shape)

--- MLP Parameters ---
Input->Hidden Weights shape: (784, 20)
Input->Hidden Biases shape: (20,)
Hidden->Output Weights shape: (20, 10)
Hidden->Output Biases shape: (10,)


In [9]:
# --- MLP Training Loop ---

# Number of complete passes over the training data
epochs = 10


def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    The value is computed from exp(-|x|), which always lies in [0, 1], so
    large-magnitude inputs of either sign cannot overflow (the direct formula
    overflows in exp(-x) for large negative x and emits a RuntimeWarning).

    NOTE: the small-subset training cell earlier in this notebook also defines
    ``sigmoid``; the definition is repeated here so this cell can run on its own.

    Parameters
    ----------
    x : scalar or array-like
        Pre-activation value(s).

    Returns
    -------
    NumPy scalar or ndarray
        Sigmoid of ``x`` with the same shape as ``x``; values lie in [0, 1].
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x).  For x < 0 the equivalent form
    # e^x / (1 + e^x) is used, where e^x == decay.
    out = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays unchanged.
    return out[()]

# The class indices of the labels do not change between epochs, so decode the
# one-hot rows once here instead of once per epoch.
true_classes = np.argmax(train_labels_one_hot, axis=1)

for epoch in range(epochs):
    print(f"--- Epoch {epoch + 1}/{epochs} ---")

    # Parameters are updated after every single sample:
    # x is one flattened image, d is its one-hot label.
    for x, d in zip(train_images_flat, train_labels_one_hot):

        # 1. FORWARD PASS
        # Step 1.1: From Input to Hidden Layer
        hidden_layer_input = np.dot(x, weights_h) + biases_h
        hidden_layer_output = sigmoid(hidden_layer_input)

        # Step 1.2: From Hidden to Output Layer
        final_layer_input = np.dot(hidden_layer_output, weights_o) + biases_o
        final_output = sigmoid(final_layer_input)

        # 2. BACKWARD PASS (Backpropagation)

        # Step 2.1: Output-layer error term: the target/output difference
        # scaled by the sigmoid derivative y * (1 - y)
        delta_o = (d - final_output) * final_output * (1 - final_output)

        # Step 2.2: Hidden-layer error term: the output deltas propagated back
        # through weights_o (before they are updated below), scaled by the
        # hidden sigmoid derivative
        delta_h = np.dot(delta_o, weights_o.T) * hidden_layer_output * (1 - hidden_layer_output)

        # 3. UPDATE PARAMETERS (in place, so training continues from the
        # current values if this cell is re-run)

        # Step 3.1: Update Output Layer weights and biases
        weights_o += learning_rate * np.outer(hidden_layer_output, delta_o)
        biases_o += learning_rate * delta_o

        # Step 3.2: Update Hidden Layer weights and biases
        weights_h += learning_rate * np.outer(x, delta_h)
        biases_h += learning_rate * delta_h

    # 4. TRACK PERFORMANCE (at the end of each epoch)
    # Perform a full forward pass on the full training set to get the accuracy
    hidden_output = sigmoid(np.dot(train_images_flat, weights_h) + biases_h)
    final_output = sigmoid(np.dot(hidden_output, weights_o) + biases_o)

    predicted_classes = np.argmax(final_output, axis=1)

    accuracy = np.mean(predicted_classes == true_classes)
    print(f"Training Accuracy: {accuracy * 100:.2f}%")

--- Epoch 1/10 ---
Training Accuracy: 86.02%
--- Epoch 2/10 ---
Training Accuracy: 89.97%
--- Epoch 3/10 ---
Training Accuracy: 90.42%
--- Epoch 4/10 ---
Training Accuracy: 91.44%
--- Epoch 5/10 ---
Training Accuracy: 91.86%
--- Epoch 6/10 ---
Training Accuracy: 92.31%
--- Epoch 7/10 ---
Training Accuracy: 92.58%
--- Epoch 8/10 ---
Training Accuracy: 92.72%
--- Epoch 9/10 ---
Training Accuracy: 92.74%
--- Epoch 10/10 ---
Training Accuracy: 92.81%


In [10]:
# --- Evaluate the MLP on the Test Set ---

# Push the whole test set through the trained network in one forward pass.
# Layer 1: input -> hidden activations
hidden_output_test = sigmoid(np.dot(test_images_flat, weights_h) + biases_h)
# Layer 2: hidden -> output activations
final_output_test = sigmoid(np.dot(hidden_output_test, weights_o) + biases_o)

# Predicted class = index of the largest output in each row;
# true class = index of the 1 in each one-hot label row
test_predicted_classes = final_output_test.argmax(axis=1)
test_true_classes = test_labels_one_hot.argmax(axis=1)

# Accuracy is the fraction of test images whose prediction matches the label
test_accuracy = (test_predicted_classes == test_true_classes).mean()

print(f"Final MLP Test Accuracy: {test_accuracy * 100:.2f}%")

Final MLP Test Accuracy: 91.97%
