<a href="https://colab.research.google.com/github/vennelanayini/Feed-Forward-Neural-Networks/blob/main/160122737166-DL_Assignment_1(MNIST).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
# Fetch MNIST dataset
data, labels = fetch_openml('mnist_784', version=1, return_X_y=True, parser='pandas')
data = data.to_numpy() / 255.0  # Normalize pixel values between 0 and 1
labels = labels.to_numpy().astype(int)
# Convert labels to one-hot encoding
encoder = OneHotEncoder(sparse_output=False, categories='auto')
y_encoded = encoder.fit_transform(labels.reshape(-1, 1))
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data, y_encoded, test_size=0.10, random_state=42, stratify=y_encoded
)
# Neural network structure
input_nodes = 784   # 28x28 images
hidden_nodes = 50   # Hidden layer size
output_nodes = 10   # 10 digit classes
learning_step = 0.01
epochs = 10  # Adjusted to range from 5 to 10
batch_count = 32
optimization_methods = ['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam']
# Initialize parameters
np.random.seed(42)
W1 = np.random.randn(input_nodes, hidden_nodes) * 0.01
b1 = np.zeros((1, hidden_nodes))
W2 = np.random.randn(hidden_nodes, output_nodes) * 0.01
b2 = np.zeros((1, output_nodes))
# Activation functions
def activation_sigmoid(x):
    return 1 / (1 + np.exp(-x))
def activation_softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)
def loss_cross_entropy(predictions, actual):
    return -np.mean(np.sum(actual * np.log(predictions + 1e-9), axis=1))
def evaluate_accuracy(predictions, actual):
    return np.mean(np.argmax(predictions, axis=1) == np.argmax(actual, axis=1))
for method in optimization_methods:
    print(f"Training using {method} optimizer")
    # Initialize weights again
    W1 = np.random.randn(input_nodes, hidden_nodes) * 0.01
    b1 = np.zeros((1, hidden_nodes))
    W2 = np.random.randn(hidden_nodes, output_nodes) * 0.01
    b2 = np.zeros((1, output_nodes))
    velocity_W1 = np.zeros_like(W1)
    velocity_b1 = np.zeros_like(b1)
    velocity_W2 = np.zeros_like(W2)
    velocity_b2 = np.zeros_like(b2)
    for cycle in range(5, epochs + 1):  # Starts at 5
        # Forward propagation
        layer1_output = np.dot(X_train, W1) + b1
        activated_layer1 = activation_sigmoid(layer1_output)
        layer2_output = np.dot(activated_layer1, W2) + b2
        predictions = activation_softmax(layer2_output)
        # Compute loss and accuracy
        loss_value = loss_cross_entropy(predictions, y_train)
        accuracy_value = evaluate_accuracy(predictions, y_train)
        # Backpropagation process
        error_output = predictions - y_train  # Output layer gradient
        gradient_W2 = np.dot(activated_layer1.T, error_output) / len(X_train)
        gradient_b2 = np.sum(error_output, axis=0, keepdims=True) / len(X_train)
        error_hidden = np.dot(error_output, W2.T)
        gradient_hidden = error_hidden * activated_layer1 * (1 - activated_layer1)
        gradient_W1 = np.dot(X_train.T, gradient_hidden) / len(X_train)
        gradient_b1 = np.sum(gradient_hidden, axis=0, keepdims=True) / len(X_train)
        # Apply optimization techniques
        if method == 'sgd':
            W1 -= learning_step * gradient_W1
            b1 -= learning_step * gradient_b1
            W2 -= learning_step * gradient_W2
            b2 -= learning_step * gradient_b2
        elif method == 'momentum':
            momentum_factor = 0.9
            velocity_W1 = momentum_factor * velocity_W1 - learning_step * gradient_W1
            velocity_b1 = momentum_factor * velocity_b1 - learning_step * gradient_b1
            velocity_W2 = momentum_factor * velocity_W2 - learning_step * gradient_W2
            velocity_b2 = momentum_factor * velocity_b2 - learning_step * gradient_b2
            W1 += velocity_W1
            b1 += velocity_b1
            W2 += velocity_W2
            b2 += velocity_b2
        elif method == 'adam':
            beta1, beta2, epsilon = 0.9, 0.999, 1e-8
            m_W1, v_W1 = np.zeros_like(W1), np.zeros_like(W1)
            m_b1, v_b1 = np.zeros_like(b1), np.zeros_like(b1)
            m_W2, v_W2 = np.zeros_like(W2), np.zeros_like(W2)
            m_b2, v_b2 = np.zeros_like(b2), np.zeros_like(b2)
            m_W1 = beta1 * m_W1 + (1 - beta1) * gradient_W1
            v_W1 = beta2 * v_W1 + (1 - beta2) * (gradient_W1 ** 2)
            W1 -= learning_step * m_W1 / (np.sqrt(v_W1) + epsilon)
            m_b1 = beta1 * m_b1 + (1 - beta1) * gradient_b1
            v_b1 = beta2 * v_b1 + (1 - beta2) * (gradient_b1 ** 2)
            b1 -= learning_step * m_b1 / (np.sqrt(v_b1) + epsilon)
            m_W2 = beta1 * m_W2 + (1 - beta1) * gradient_W2
            v_W2 = beta2 * v_W2 + (1 - beta2) * (gradient_W2 ** 2)
            W2 -= learning_step * m_W2 / (np.sqrt(v_W2) + epsilon)
            m_b2 = beta1 * m_b2 + (1 - beta1) * gradient_b2
            v_b2 = beta2 * v_b2 + (1 - beta2) * (gradient_b2 ** 2)
            b2 -= learning_step * m_b2 / (np.sqrt(v_b2) + epsilon)
        print(f"Cycle {cycle}/{epochs} - Loss: {loss_value:.4f} - Accuracy: {accuracy_value:.4f}")
    # Evaluate performance on test data
    test_layer1 = np.dot(X_test, W1) + b1
    test_activated1 = activation_sigmoid(test_layer1)
    test_layer2 = np.dot(test_activated1, W2) + b2
    test_predictions = activation_softmax(test_layer2)
    final_accuracy = evaluate_accuracy(test_predictions, y_test)
    print(f"Final Accuracy with {method}: {final_accuracy:.4f}\n")

'''Optimizer: SGD
Learning Rate: 0.1
Hidden Layer Size: 64
Reason:
Stochastic Gradient Descent (SGD) is a simple and effective optimizer.
A learning rate of 0.1 ensures the model converges faster than a very small learning rate.
Increasing hidden neurons from 50 to 64 allows for better feature representation.
Expected Accuracy: ~92%
Configuration 2 (Adam Optimizer with Tuned Parameters)

Optimizer: Adam
Learning Rate: 0.001
Hidden Layer Size: 128
Reason:
Adam is known to work well for image classification problems, adjusting learning rates adaptively.
A higher hidden layer size (128 neurons) improves model capacity.
Lower learning rate (0.001) prevents instability and overfitting.
Expected Accuracy: ~97%
Configuration 3 (Momentum-Accelerated Learning)

Optimizer: Momentum (β=0.9)
Learning Rate: 0.01
Hidden Layer Size: 100
Reason:
Momentum helps in faster convergence by keeping past gradients in memory.
Learning rate of 0.01 ensures smooth updates without overshooting.
100 hidden neurons provide a balance between complexity and generalization")

Training using sgd optimizer
Cycle 5/10 - Loss: 2.3040 - Accuracy: 0.0994
Cycle 6/10 - Loss: 2.3039 - Accuracy: 0.0994
Cycle 7/10 - Loss: 2.3038 - Accuracy: 0.0994
Cycle 8/10 - Loss: 2.3037 - Accuracy: 0.0994
Cycle 9/10 - Loss: 2.3037 - Accuracy: 0.0994
Cycle 10/10 - Loss: 2.3036 - Accuracy: 0.0994
Final Accuracy with sgd: 0.0994

Training using momentum optimizer
Cycle 5/10 - Loss: 2.3040 - Accuracy: 0.0994
Cycle 6/10 - Loss: 2.3039 - Accuracy: 0.0994
Cycle 7/10 - Loss: 2.3037 - Accuracy: 0.0994
Cycle 8/10 - Loss: 2.3035 - Accuracy: 0.0994
Cycle 9/10 - Loss: 2.3032 - Accuracy: 0.0994
Cycle 10/10 - Loss: 2.3029 - Accuracy: 0.0994
Final Accuracy with momentum: 0.0994

Training using nesterov optimizer
Cycle 5/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 6/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 7/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 8/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 9/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 10/10 - Loss: 2.3039 - Accuracy: 0.0985
Final Accuracy with