<a href="https://colab.research.google.com/github/vennelanayini/Feed-Forward-Neural-Networks/blob/main/160122737166-DL_Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
# Fetch MNIST dataset
data, labels = fetch_openml('mnist_784', version=1, return_X_y=True, parser='pandas')
data = data.to_numpy() / 255.0  # Normalize pixel values between 0 and 1
labels = labels.to_numpy().astype(int)
# Convert labels to one-hot encoding
encoder = OneHotEncoder(sparse_output=False, categories='auto')
y_encoded = encoder.fit_transform(labels.reshape(-1, 1))
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data, y_encoded, test_size=0.10, random_state=42, stratify=y_encoded
)
# Neural network structure
input_nodes = 784   # 28x28 images
hidden_nodes = 50   # Hidden layer size
output_nodes = 10   # 10 digit classes
learning_step = 0.01
epochs = 10  # Adjusted to range from 5 to 10
batch_count = 32
optimization_methods = ['sgd', 'momentum', 'nesterov', 'rmsprop', 'adam']
# Initialize parameters
np.random.seed(42)
W1 = np.random.randn(input_nodes, hidden_nodes) * 0.01
b1 = np.zeros((1, hidden_nodes))
W2 = np.random.randn(hidden_nodes, output_nodes) * 0.01
b2 = np.zeros((1, output_nodes))
# Activation functions
def activation_sigmoid(x):
    return 1 / (1 + np.exp(-x))
def activation_softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)
def loss_cross_entropy(predictions, actual):
    return -np.mean(np.sum(actual * np.log(predictions + 1e-9), axis=1))
def evaluate_accuracy(predictions, actual):
    return np.mean(np.argmax(predictions, axis=1) == np.argmax(actual, axis=1))
for method in optimization_methods:
    print(f"Training using {method} optimizer")
    # Initialize weights again
    W1 = np.random.randn(input_nodes, hidden_nodes) * 0.01
    b1 = np.zeros((1, hidden_nodes))
    W2 = np.random.randn(hidden_nodes, output_nodes) * 0.01
    b2 = np.zeros((1, output_nodes))
    velocity_W1 = np.zeros_like(W1)
    velocity_b1 = np.zeros_like(b1)
    velocity_W2 = np.zeros_like(W2)
    velocity_b2 = np.zeros_like(b2)
    for cycle in range(5, epochs + 1):  # Starts at 5
        # Forward propagation
        layer1_output = np.dot(X_train, W1) + b1
        activated_layer1 = activation_sigmoid(layer1_output)
        layer2_output = np.dot(activated_layer1, W2) + b2
        predictions = activation_softmax(layer2_output)
        # Compute loss and accuracy
        loss_value = loss_cross_entropy(predictions, y_train)
        accuracy_value = evaluate_accuracy(predictions, y_train)
        # Backpropagation process
        error_output = predictions - y_train  # Output layer gradient
        gradient_W2 = np.dot(activated_layer1.T, error_output) / len(X_train)
        gradient_b2 = np.sum(error_output, axis=0, keepdims=True) / len(X_train)
        error_hidden = np.dot(error_output, W2.T)
        gradient_hidden = error_hidden * activated_layer1 * (1 - activated_layer1)
        gradient_W1 = np.dot(X_train.T, gradient_hidden) / len(X_train)
        gradient_b1 = np.sum(gradient_hidden, axis=0, keepdims=True) / len(X_train)
        # Apply optimization techniques
        if method == 'sgd':
            W1 -= learning_step * gradient_W1
            b1 -= learning_step * gradient_b1
            W2 -= learning_step * gradient_W2
            b2 -= learning_step * gradient_b2
        elif method == 'momentum':
            momentum_factor = 0.9
            velocity_W1 = momentum_factor * velocity_W1 - learning_step * gradient_W1
            velocity_b1 = momentum_factor * velocity_b1 - learning_step * gradient_b1
            velocity_W2 = momentum_factor * velocity_W2 - learning_step * gradient_W2
            velocity_b2 = momentum_factor * velocity_b2 - learning_step * gradient_b2
            W1 += velocity_W1
            b1 += velocity_b1
            W2 += velocity_W2
            b2 += velocity_b2
        elif method == 'adam':
            beta1, beta2, epsilon = 0.9, 0.999, 1e-8
            m_W1, v_W1 = np.zeros_like(W1), np.zeros_like(W1)
            m_b1, v_b1 = np.zeros_like(b1), np.zeros_like(b1)
            m_W2, v_W2 = np.zeros_like(W2), np.zeros_like(W2)
            m_b2, v_b2 = np.zeros_like(b2), np.zeros_like(b2)
            m_W1 = beta1 * m_W1 + (1 - beta1) * gradient_W1
            v_W1 = beta2 * v_W1 + (1 - beta2) * (gradient_W1 ** 2)
            W1 -= learning_step * m_W1 / (np.sqrt(v_W1) + epsilon)
            m_b1 = beta1 * m_b1 + (1 - beta1) * gradient_b1
            v_b1 = beta2 * v_b1 + (1 - beta2) * (gradient_b1 ** 2)
            b1 -= learning_step * m_b1 / (np.sqrt(v_b1) + epsilon)
            m_W2 = beta1 * m_W2 + (1 - beta1) * gradient_W2
            v_W2 = beta2 * v_W2 + (1 - beta2) * (gradient_W2 ** 2)
            W2 -= learning_step * m_W2 / (np.sqrt(v_W2) + epsilon)
            m_b2 = beta1 * m_b2 + (1 - beta1) * gradient_b2
            v_b2 = beta2 * v_b2 + (1 - beta2) * (gradient_b2 ** 2)
            b2 -= learning_step * m_b2 / (np.sqrt(v_b2) + epsilon)
        print(f"Cycle {cycle}/{epochs} - Loss: {loss_value:.4f} - Accuracy: {accuracy_value:.4f}")
    # Evaluate performance on test data
    test_layer1 = np.dot(X_test, W1) + b1
    test_activated1 = activation_sigmoid(test_layer1)
    test_layer2 = np.dot(test_activated1, W2) + b2
    test_predictions = activation_softmax(test_layer2)
    final_accuracy = evaluate_accuracy(test_predictions, y_test)
    print(f"Final Accuracy with {method}: {final_accuracy:.4f}\n")

'''Optimizer: SGD
Learning Rate: 0.1
Hidden Layer Size: 64
Reason:
Stochastic Gradient Descent (SGD) is a simple and effective optimizer.
A learning rate of 0.1 ensures the model converges faster than a very small learning rate.
Increasing hidden neurons from 50 to 64 allows for better feature representation.
Expected Accuracy: ~92%
Configuration 2 (Adam Optimizer with Tuned Parameters)

Optimizer: Adam
Learning Rate: 0.001
Hidden Layer Size: 128
Reason:
Adam is known to work well for image classification problems, adjusting learning rates adaptively.
A higher hidden layer size (128 neurons) improves model capacity.
Lower learning rate (0.001) prevents instability and overfitting.
Expected Accuracy: ~97%
Configuration 3 (Momentum-Accelerated Learning)

Optimizer: Momentum (β=0.9)
Learning Rate: 0.01
Hidden Layer Size: 100
Reason:
Momentum helps in faster convergence by keeping past gradients in memory.
Learning rate of 0.01 ensures smooth updates without overshooting.
100 hidden neurons provide a balance between complexity and generalization")

Training using sgd optimizer
Cycle 5/10 - Loss: 2.3040 - Accuracy: 0.0994
Cycle 6/10 - Loss: 2.3039 - Accuracy: 0.0994
Cycle 7/10 - Loss: 2.3038 - Accuracy: 0.0994
Cycle 8/10 - Loss: 2.3037 - Accuracy: 0.0994
Cycle 9/10 - Loss: 2.3037 - Accuracy: 0.0994
Cycle 10/10 - Loss: 2.3036 - Accuracy: 0.0994
Final Accuracy with sgd: 0.0994

Training using momentum optimizer
Cycle 5/10 - Loss: 2.3040 - Accuracy: 0.0994
Cycle 6/10 - Loss: 2.3039 - Accuracy: 0.0994
Cycle 7/10 - Loss: 2.3037 - Accuracy: 0.0994
Cycle 8/10 - Loss: 2.3035 - Accuracy: 0.0994
Cycle 9/10 - Loss: 2.3032 - Accuracy: 0.0994
Cycle 10/10 - Loss: 2.3029 - Accuracy: 0.0994
Final Accuracy with momentum: 0.0994

Training using nesterov optimizer
Cycle 5/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 6/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 7/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 8/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 9/10 - Loss: 2.3039 - Accuracy: 0.0985
Cycle 10/10 - Loss: 2.3039 - Accuracy: 0.0985
Final Accuracy with

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Fetch Fashion MNIST dataset
data, labels = fetch_openml('Fashion-MNIST', version=1, return_X_y=True, parser='pandas')
data = data.to_numpy() / 255.0  # Normalize pixel values between 0 and 1
labels = labels.to_numpy().astype(int)

# Convert labels to one-hot encoding
encoder = OneHotEncoder(sparse_output=False, categories='auto')
y_encoded = encoder.fit_transform(labels.reshape(-1, 1))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data, y_encoded, test_size=0.10, random_state=42, stratify=y_encoded
)

# Neural network structure
input_nodes = 784   # 28x28 images
hidden_layer_sizes = [64, 128, 100]  # Different configurations
output_nodes = 10   # 10 clothing classes
learning_rates = [0.1, 0.001, 0.01]
optimizers = ['sgd', 'adam', 'momentum']
epochs = 10

# Activation functions
def activation_sigmoid(x):
    return 1 / (1 + np.exp(-x))

def activation_softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def loss_cross_entropy(predictions, actual):
    return -np.mean(np.sum(actual * np.log(predictions + 1e-9), axis=1))

def evaluate_accuracy(predictions, actual):
    return np.mean(np.argmax(predictions, axis=1) == np.argmax(actual, axis=1))

results = []

for hidden_nodes, learning_rate, optimizer in zip(hidden_layer_sizes, learning_rates, optimizers):
    print(f"Training using {optimizer} optimizer with {hidden_nodes} hidden nodes and LR={learning_rate}")

    # Initialize weights
    W1 = np.random.randn(input_nodes, hidden_nodes) * 0.01
    b1 = np.zeros((1, hidden_nodes))
    W2 = np.random.randn(hidden_nodes, output_nodes) * 0.01
    b2 = np.zeros((1, output_nodes))

    velocity_W1 = np.zeros_like(W1)
    velocity_b1 = np.zeros_like(b1)
    velocity_W2 = np.zeros_like(W2)
    velocity_b2 = np.zeros_like(b2)
    for cycle in range(1, epochs + 1):
        # Forward propagation
        layer1_output = np.dot(X_train, W1) + b1
        activated_layer1 = activation_sigmoid(layer1_output)
        layer2_output = np.dot(activated_layer1, W2) + b2
        predictions = activation_softmax(layer2_output)
        # Compute loss and accuracy
        loss_value = loss_cross_entropy(predictions, y_train)
        accuracy_value = evaluate_accuracy(predictions, y_train)
        # Backpropagation process
        error_output = predictions - y_train
        gradient_W2 = np.dot(activated_layer1.T, error_output) / len(X_train)
        gradient_b2 = np.sum(error_output, axis=0, keepdims=True) / len(X_train)
        error_hidden = np.dot(error_output, W2.T)
        gradient_hidden = error_hidden * activated_layer1 * (1 - activated_layer1)
        gradient_W1 = np.dot(X_train.T, gradient_hidden) / len(X_train)
        gradient_b1 = np.sum(gradient_hidden, axis=0, keepdims=True) / len(X_train)
        # Apply optimization techniques
        if optimizer == 'sgd':
            W1 -= learning_rate * gradient_W1
            b1 -= learning_rate * gradient_b1
            W2 -= learning_rate * gradient_W2
            b2 -= learning_rate * gradient_b2
        elif optimizer == 'momentum':
            momentum_factor = 0.9
            velocity_W1 = momentum_factor * velocity_W1 - learning_rate * gradient_W1
            velocity_b1 = momentum_factor * velocity_b1 - learning_rate * gradient_b1
            velocity_W2 = momentum_factor * velocity_W2 - learning_rate * gradient_W2
            velocity_b2 = momentum_factor * velocity_b2 - learning_rate * gradient_b2
            W1 += velocity_W1
            b1 += velocity_b1
            W2 += velocity_W2
            b2 += velocity_b2
        elif optimizer == 'adam':
            beta1, beta2, epsilon = 0.9, 0.999, 1e-8
            m_W1, v_W1 = np.zeros_like(W1), np.zeros_like(W1)
            m_b1, v_b1 = np.zeros_like(b1), np.zeros_like(b1)
            m_W2, v_W2 = np.zeros_like(W2), np.zeros_like(W2)
            m_b2, v_b2 = np.zeros_like(b2), np.zeros_like(b2)
            m_W1 = beta1 * m_W1 + (1 - beta1) * gradient_W1
            v_W1 = beta2 * v_W1 + (1 - beta2) * (gradient_W1 ** 2)
            W1 -= learning_rate * m_W1 / (np.sqrt(v_W1) + epsilon)
            m_b1 = beta1 * m_b1 + (1 - beta1) * gradient_b1
            v_b1 = beta2 * v_b1 + (1 - beta2) * (gradient_b1 ** 2)
            b1 -= learning_rate * m_b1 / (np.sqrt(v_b1) + epsilon)
        print(f"Cycle {cycle}/{epochs} - Loss: {loss_value:.4f} - Accuracy: {accuracy_value:.4f}")
    # Evaluate performance on test data
    test_layer1 = np.dot(X_test, W1) + b1
    test_activated1 = activation_sigmoid(test_layer1)
    test_layer2 = np.dot(test_activated1, W2) + b2
    test_predictions = activation_softmax(test_layer2)
    final_accuracy = evaluate_accuracy(test_predictions, y_test)
    print(f"Final Accuracy with {optimizer}: {final_accuracy:.4f}\n")
    results.append([optimizer, learning_rate, hidden_nodes, final_accuracy])
# Display final results in tabular format
results_df = pd.DataFrame(results, columns=["Optimizer", "Learning Rate", "Hidden Layer Size", "Final Accuracy"])
print(results_df)  # Prints the final conclusions in a table format

Training using sgd optimizer with 64 hidden nodes and LR=0.1
Cycle 1/10 - Loss: 2.3034 - Accuracy: 0.1000
Cycle 2/10 - Loss: 2.3029 - Accuracy: 0.1000
Cycle 3/10 - Loss: 2.3026 - Accuracy: 0.1000
Cycle 4/10 - Loss: 2.3022 - Accuracy: 0.1087
Cycle 5/10 - Loss: 2.3019 - Accuracy: 0.1686
Cycle 6/10 - Loss: 2.3017 - Accuracy: 0.1777
Cycle 7/10 - Loss: 2.3014 - Accuracy: 0.1364
Cycle 8/10 - Loss: 2.3011 - Accuracy: 0.1093
Cycle 9/10 - Loss: 2.3009 - Accuracy: 0.1021
Cycle 10/10 - Loss: 2.3006 - Accuracy: 0.1006
Final Accuracy with sgd: 0.1001

Training using adam optimizer with 128 hidden nodes and LR=0.001
Cycle 1/10 - Loss: 2.3041 - Accuracy: 0.1000
Cycle 2/10 - Loss: 2.2721 - Accuracy: 0.1899
Cycle 3/10 - Loss: 2.2450 - Accuracy: 0.2841
Cycle 4/10 - Loss: 2.2208 - Accuracy: 0.3347
Cycle 5/10 - Loss: 2.1990 - Accuracy: 0.4001
Cycle 6/10 - Loss: 2.1797 - Accuracy: 0.4489
Cycle 7/10 - Loss: 2.1627 - Accuracy: 0.4783
Cycle 8/10 - Loss: 2.1480 - Accuracy: 0.5056
Cycle 9/10 - Loss: 2.1350 - Ac

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import cifar10, cifar100
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Load CIFAR-10 dataset
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

# Normalize pixel values between 0 and 1
X_train, X_test = X_train / 255.0, X_test / 255.0

# Convert labels to one-hot encoding
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)

# Define neural network parameters
input_nodes = 32 * 32 * 3  # CIFAR images are 32x32x3 (RGB)
hidden_nodes = 128  # Hidden layer size
output_nodes = 10  # 10 classes in CIFAR-10
learning_rate = 0.001
epochs = 10
batch_size = 32

# Initialize weights
np.random.seed(42)
W1 = np.random.randn(input_nodes, hidden_nodes) * 0.01
b1 = np.zeros((1, hidden_nodes))
W2 = np.random.randn(hidden_nodes, output_nodes) * 0.01
b2 = np.zeros((1, output_nodes))

def activation_relu(x):
    return np.maximum(0, x)

def activation_softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def loss_cross_entropy(predictions, actual):
    return -np.mean(np.sum(actual * np.log(predictions + 1e-9), axis=1))

def evaluate_accuracy(predictions, actual):
    return np.mean(np.argmax(predictions, axis=1) == np.argmax(actual, axis=1))

# Training loop
for epoch in range(epochs):
    # Forward propagation
    X_train_flat = X_train.reshape(len(X_train), -1)
    layer1_output = np.dot(X_train_flat, W1) + b1
    activated_layer1 = activation_relu(layer1_output)
    layer2_output = np.dot(activated_layer1, W2) + b2
    predictions = activation_softmax(layer2_output)

    # Compute loss and accuracy
    loss_value = loss_cross_entropy(predictions, y_train)
    accuracy_value = evaluate_accuracy(predictions, y_train)

    # Backpropagation
    error_output = predictions - y_train
    gradient_W2 = np.dot(activated_layer1.T, error_output) / len(X_train)
    gradient_b2 = np.sum(error_output, axis=0, keepdims=True) / len(X_train)
    error_hidden = np.dot(error_output, W2.T) * (activated_layer1 > 0)
    gradient_W1 = np.dot(X_train_flat.T, error_hidden) / len(X_train)
    gradient_b1 = np.sum(error_hidden, axis=0, keepdims=True) / len(X_train)

    # Update weights using SGD
    W1 -= learning_rate * gradient_W1
    b1 -= learning_rate * gradient_b1
    W2 -= learning_rate * gradient_W2
    b2 -= learning_rate * gradient_b2

    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss_value:.4f} - Accuracy: {accuracy_value:.4f}")

# Evaluate on test data
X_test_flat = X_test.reshape(len(X_test), -1)
test_layer1 = np.dot(X_test_flat, W1) + b1
test_activated1 = activation_relu(test_layer1)
test_layer2 = np.dot(test_activated1, W2) + b2
test_predictions = activation_softmax(test_layer2)
final_accuracy = evaluate_accuracy(test_predictions, y_test)
print(f"Final Accuracy on CIFAR-10: {final_accuracy:.4f}")

# CIFAR-100 Implementation
(X_train100, y_train100), (X_test100, y_test100) = cifar100.load_data()
X_train100, X_test100 = X_train100 / 255.0, X_test100 / 255.0
y_train100 = to_categorical(y_train100, 100)
y_test100 = to_categorical(y_test100, 100)

# Modify output layer for CIFAR-100
output_nodes = 100
W2 = np.random.randn(hidden_nodes, output_nodes) * 0.01
b2 = np.zeros((1, output_nodes))

# Training loop for CIFAR-100
for epoch in range(epochs):
    X_train100_flat = X_train100.reshape(len(X_train100), -1)
    layer1_output = np.dot(X_train100_flat, W1) + b1
    activated_layer1 = activation_relu(layer1_output)
    layer2_output = np.dot(activated_layer1, W2) + b2
    predictions = activation_softmax(layer2_output)

    loss_value = loss_cross_entropy(predictions, y_train100)
    accuracy_value = evaluate_accuracy(predictions, y_train100)

    error_output = predictions - y_train100
    gradient_W2 = np.dot(activated_layer1.T, error_output) / len(X_train100)
    gradient_b2 = np.sum(error_output, axis=0, keepdims=True) / len(X_train100)
    error_hidden = np.dot(error_output, W2.T) * (activated_layer1 > 0)
    gradient_W1 = np.dot(X_train100_flat.T, error_hidden) / len(X_train100)
    gradient_b1 = np.sum(error_hidden, axis=0, keepdims=True) / len(X_train100)

    W1 -= learning_rate * gradient_W1
    b1 -= learning_rate * gradient_b1
    W2 -= learning_rate * gradient_W2
    b2 -= learning_rate * gradient_b2

    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss_value:.4f} - Accuracy: {accuracy_value:.4f}")

# Evaluate on CIFAR-100
test_layer1 = np.dot(X_test100.reshape(len(X_test100), -1), W1) + b1
test_activated1 = activation_relu(test_layer1)
test_layer2 = np.dot(test_activated1, W2) + b2
test_predictions = activation_softmax(test_layer2)
final_accuracy = evaluate_accuracy(test_predictions, y_test100)
print(f"Final Accuracy on CIFAR-100: {final_accuracy:.4f}")


Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step
Epoch 1/10 - Loss: 2.3022 - Accuracy: 0.0952
Epoch 2/10 - Loss: 2.3022 - Accuracy: 0.0952
Epoch 3/10 - Loss: 2.3022 - Accuracy: 0.0951
Epoch 4/10 - Loss: 2.3022 - Accuracy: 0.0952
Epoch 5/10 - Loss: 2.3022 - Accuracy: 0.0953
Epoch 6/10 - Loss: 2.3021 - Accuracy: 0.0954
Epoch 7/10 - Loss: 2.3021 - Accuracy: 0.0955
Epoch 8/10 - Loss: 2.3021 - Accuracy: 0.0956
Epoch 9/10 - Loss: 2.3021 - Accuracy: 0.0956
Epoch 10/10 - Loss: 2.3021 - Accuracy: 0.0958
Final Accuracy on CIFAR-10: 0.0932
Downloading data from https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz
[1m169001437/169001437[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 0us/step
Epoch 1/10 - Loss: 4.6057 - Accuracy: 0.0103
Epoch 2/10 - Loss: 4.6057 - Accuracy: 0.0103
Epoch 3/10 - Loss: 4.6057 - Accuracy: 0.0104
Epoch 4/10 - Loss: 4.6057 - Accuracy: 0.0104
Epo