In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the digit dataset from disk and report its (rows, columns) shape
csv_path = 'digit_data.csv'
df = pd.read_csv(csv_path)
print('Size of the dataframe: ', df.shape)

In [None]:
# Randomly select and visualize one image per label.
# DataFrameGroupBy.sample returns whole rows with every column (including
# 'label') left in its original position. groupby().apply() is avoided because
# newer pandas releases deprecate/exclude the grouping column inside apply,
# which would break the positional label/pixel split below.
random_rows = (
    df.groupby('label')
    .sample(n=1)
    .sort_values('label')
    .reset_index(drop=True)
)
fig, axes = plt.subplots(2, 5, figsize=(3, 3))
for i, ax in enumerate(axes.flat):
    ax.axis('off')
    if i >= len(random_rows):
        # Fewer labels than grid cells: leave the remaining axes blank
        continue
    # Column 0 is the label, the remaining 784 columns are the 28x28 pixels
    label = random_rows.iloc[i, 0]
    image = np.array(random_rows.iloc[i, 1:].values, dtype=float).reshape(28, 28)
    ax.imshow(image, cmap='gray')
    ax.set_title(f"Label: {label}", fontsize=8)
plt.subplots_adjust(wspace=.2, hspace=-.8)
plt.tight_layout()
plt.show()

In [None]:
# Copy the dataframe into a standalone numpy array, then shuffle its rows
# in place with a fixed seed so the row order is reproducible
df_arr = df.to_numpy(copy=True)
np.random.seed(42)
np.random.shuffle(df_arr)

In [None]:
# Split into test and train sets: the first 1000 shuffled rows are held out
# for testing, the rest is used for training. Both parts are transposed so
# that every column is one example.
test_set, train_set = (part.T for part in np.split(df_arr, [1000]))

In [None]:
# Normalize input data: row 0 of each set holds the labels, the remaining
# rows hold the pixel values, which are divided by 255
train_label = train_set[0]
train_image = train_set[1:] / 255
test_label = test_set[0]
test_image = test_set[1:] / 255

In [None]:
# Define model hyperparameters
# Layer widths: 28x28 input pixels -> hidden units -> one output per class
input_size, hidden_size, output_size = 28 * 28, 128, 10
# Optimisation: learning rate and maximum number of gradient-descent epochs
alpha, epoch = 0.01, 200
# Regularisation: L2 coefficient and dropout keep probability
lambd, keep_prob = 0.01, 0.8

In [None]:
# Show one randomly chosen training example together with its label.
# The index is drawn from the actual number of training columns: train_image
# has 1000 fewer columns than the dataframe has rows, so a hard-coded upper
# bound can point past the end of the array.
idx = np.random.randint(0, train_image.shape[1])
image = train_image[:, idx].reshape(28, 28)
label = train_label[idx]
plt.figure(figsize=(3,3))
plt.imshow(image, cmap='gray')
plt.title(f"Label: {label}", fontsize=12)
plt.axis('off')
plt.show()

In [None]:
# He initialization for weights (scale sqrt(2 / fan_in)); biases start at zero
def initialize_parameters(n_input=None, n_hidden=None, n_output=None, seed=42):
    """Create the initial weights and biases of the two-layer network.

    Weights are drawn from a standard normal distribution and scaled by
    sqrt(2 / fan_in), i.e. He initialization (the original comment called
    this Xavier, which uses a different scale).

    Parameters
    ----------
    n_input, n_hidden, n_output : int, optional
        Layer widths. Any value left as None falls back to the notebook-level
        ``input_size`` / ``hidden_size`` / ``output_size`` respectively.
    seed : int, optional
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global random generator as a side effect.

    Returns
    -------
    W0 : ndarray of shape (n_hidden, n_input)
    b0 : ndarray of shape (n_hidden, 1), all zeros
    W1 : ndarray of shape (n_output, n_hidden)
    b1 : ndarray of shape (n_output, 1), all zeros
    """
    n_input = input_size if n_input is None else n_input
    n_hidden = hidden_size if n_hidden is None else n_hidden
    n_output = output_size if n_output is None else n_output

    np.random.seed(seed)
    W0 = np.random.randn(n_hidden, n_input) * np.sqrt(2 / n_input)
    b0 = np.zeros((n_hidden, 1))
    W1 = np.random.randn(n_output, n_hidden) * np.sqrt(2 / n_hidden)
    b1 = np.zeros((n_output, 1))
    return W0, b0, W1, b1

In [None]:
# ReLU activation function
def relu_activation(x):
    """Element-wise rectified linear unit: max(0, x) for every entry of x."""
    rectified = np.maximum(0, x)
    return rectified

# ReLU derivative
def relu_derivative(x):
    """Return 1 where x is strictly positive and 0 elsewhere (integer array)."""
    is_positive = x > 0
    return np.where(is_positive, 1, 0)

# Softmax activation function
def softmax_activation(x):
    """Column-wise softmax.

    Each column of ``x`` is treated as the scores of one sample and is
    normalised independently so that it sums to 1. A 1-D ``x`` is treated as
    a single sample. Normalising over the whole matrix instead (a plain
    ``np.sum``) would make the entire batch sum to 1 and corrupt both the
    probabilities and the gradient ``A2 - Y``.

    Parameters
    ----------
    x : ndarray of shape (n_classes,) or (n_classes, n_samples)

    Returns
    -------
    ndarray with the same shape as ``x``.
    """
    # Subtract the per-column maximum before exponentiating to avoid overflow;
    # this does not change the result of the softmax.
    shifted = x - np.max(x, axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0, keepdims=True)

In [None]:
# Forward propagation
def forward_propagation(W0, b0, W1, b1, A0, keep_prob):
    """Run one forward pass: linear -> ReLU -> inverted dropout -> linear -> softmax.

    Parameters
    ----------
    W0, b0 : weights and bias of the hidden layer
    W1, b1 : weights and bias of the output layer
    A0 : input activations
    keep_prob : probability of keeping each hidden unit; kept units are
        divided by this value

    Returns
    -------
    Z1 : hidden pre-activations
    A1 : hidden activations after dropout masking and rescaling
    Z2 : output pre-activations
    A2 : softmax output
    """
    # Hidden layer
    Z1 = W0.dot(A0) + b0
    A1 = relu_activation(Z1)

    # Inverted dropout: zero out units whose random draw is >= keep_prob,
    # then rescale the survivors in place
    n_units, n_samples = A1.shape[0], A1.shape[1]
    keep_mask = np.random.rand(n_units, n_samples) < keep_prob
    np.multiply(A1, keep_mask, out=A1)
    np.divide(A1, keep_prob, out=A1)

    # Output layer
    Z2 = W1.dot(A1) + b1
    A2 = softmax_activation(Z2)
    return Z1, A1, Z2, A2

In [None]:
def one_hot_label(Y, num_classes=10):
    """One-hot encode a vector of class labels.

    Parameters
    ----------
    Y : array-like of m labels
        Values are cast to int before being used as row indices, so labels
        stored as floats (e.g. 3.0) are accepted. The input is flattened.
    num_classes : int, optional
        Number of rows in the encoding (default 10).

    Returns
    -------
    ndarray of shape (num_classes, m) with a single 1 per column, in the row
    given by that sample's label.

    Raises
    ------
    IndexError
        If a label is not a valid row index for ``num_classes`` rows.
    """
    labels = np.asarray(Y).astype(int).ravel()
    m = labels.shape[0]
    one_hot_encoded = np.zeros((num_classes, m))
    one_hot_encoded[labels, np.arange(m)] = 1
    return one_hot_encoded

In [None]:
# Backward propagation with L2 regularization
def backward_propagation(Z1, A1, Z2, A2, W0, W1, A0, Y, lambd):
    """Compute gradients of the regularised cross-entropy loss.

    Parameters
    ----------
    Z1 : hidden pre-activations
    A1 : hidden activations as returned by the forward pass, i.e. after the
        dropout mask and the 1/keep_prob rescaling have been applied
    Z2 : output pre-activations (unused, kept for interface compatibility)
    A2 : softmax output
    W0, W1 : current weights, needed for the L2 term and for back-propagating
    A0 : input activations
    Y : vector of m integer class labels
    lambd : L2 regularisation coefficient

    Returns
    -------
    dW0, db0, dW1, db1 : gradients matching the shapes of W0, b0, W1, b1
    """
    m = Y.shape[0]
    Y = one_hot_label(Y)

    # Output layer (softmax + cross-entropy)
    dZ2 = A2 - Y
    dW1 = (1/m) * np.dot(dZ2, A1.T) + (lambd / m) * W1
    db1 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)

    # Hidden layer. The forward pass computes A1 = relu(Z1) * mask / keep_prob,
    # so the slope of A1 w.r.t. Z1 is A1 / Z1 wherever Z1 > 0 (1/keep_prob for
    # kept units, 0 for dropped ones) and 0 elsewhere. Using only the ReLU
    # derivative here would push gradient through dropped units and ignore the
    # rescaling. Without dropout the ratio is 1 and this reduces to plain ReLU.
    dA1 = np.dot(W1.T, dZ2)
    local_slope = np.divide(
        A1, Z1, out=np.zeros_like(A1, dtype=float), where=Z1 > 0
    )
    dZ1 = dA1 * local_slope
    dW0 = (1/m) * np.dot(dZ1, A0.T) + (lambd / m) * W0
    db0 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)
    return dW0, db0, dW1, db1

In [None]:
# Update parameters
def update_parameters(W0, b0, W1, b1, dW0, db0, dW1, db1, alpha):
    """Apply one gradient-descent step, ``param -= alpha * grad``, to every parameter.

    The subtraction uses the augmented assignment operator, so NumPy arrays
    passed in are modified in place. The updated parameters are returned in
    the order W0, b0, W1, b1.
    """
    stepped = []
    for param, grad in zip((W0, b0, W1, b1), (dW0, db0, dW1, db1)):
        param -= alpha * grad
        stepped.append(param)
    return tuple(stepped)

In [None]:
def obtain_prediction(x):
    """Return the index of the largest score for each sample.

    Parameters
    ----------
    x : array-like
        Either a single score vector of shape (n_classes,) or a matrix of
        shape (n_classes, n_samples) holding one sample per column.

    Returns
    -------
    A single integer index for a 1-D input, or an array with one index per
    column for a 2-D input.
    """
    # argmax along axis 0 covers both cases in one vectorised call and
    # replaces the much slower np.apply_along_axis(np.argmax, ...)
    return np.argmax(np.asarray(x), axis=0)

In [None]:
# Obtain the accuracy score
def accuracy_score(predicted, actual):
    """Fraction of predictions that equal the corresponding actual label.

    Parameters
    ----------
    predicted, actual : array-like
        Must have the same shape; a mismatch raises instead of silently
        broadcasting into a meaningless comparison.

    Returns
    -------
    float
        A plain Python float (NaN for empty input). Dividing by the number of
        elements rather than by the ``shape`` tuple keeps the result a scalar,
        so it can be compared and formatted with e.g. ``:.2f``.

    Raises
    ------
    ValueError
        If ``predicted`` and ``actual`` have different shapes.
    """
    predicted = np.asarray(predicted)
    actual = np.asarray(actual)
    if predicted.shape != actual.shape:
        raise ValueError(
            f"shape mismatch: predicted {predicted.shape} vs actual {actual.shape}"
        )
    return float(np.mean(predicted == actual))

In [None]:
def gradient_descent_with_early_stopping(A0, Y, alpha, epoch,
                                         validation_image=None,
                                         validation_label=None,
                                         validation_fraction=0.1,
                                         patience=1):
    """Train the network with full-batch gradient descent and early stopping.

    The L2 coefficient ``lambd`` and the dropout ``keep_prob`` are read from
    the notebook-level hyperparameters.

    Parameters
    ----------
    A0 : ndarray of shape (n_features, m)
        Training inputs, one example per column.
    Y : ndarray of shape (m,)
        Training labels.
    alpha : float
        Learning rate.
    epoch : int
        Maximum number of epochs.
    validation_image, validation_label : ndarray, optional
        Held-out data used for early stopping. When either is missing, the
        trailing ``validation_fraction`` of the columns of ``A0`` (and the
        matching entries of ``Y``) is split off and NOT trained on.
        NOTE(review): this fallback assumes the columns are already shuffled.
    validation_fraction : float, optional
        Share of the training data used by the fallback split (default 0.1).
    patience : int, optional
        Number of consecutive epochs without a strict improvement of the
        validation accuracy before training stops. The default of 1 stops at
        the first non-improving epoch.

    Returns
    -------
    W0, b0, W1, b1 : trained parameters (as of the last executed epoch)
    accuracy_list : list of ``(epoch, (validation_accuracy, train_accuracy))``

    Raises
    ------
    ValueError
        If the fallback split would leave no training examples.
    """
    if validation_image is None or validation_label is None:
        n_samples = A0.shape[1]
        n_validation = max(1, int(n_samples * validation_fraction))
        if n_validation >= n_samples:
            raise ValueError("not enough samples to hold out a validation set")
        validation_image = A0[:, -n_validation:]
        validation_label = Y[-n_validation:]
        A0 = A0[:, :-n_validation]
        Y = Y[:-n_validation]

    W0, b0, W1, b1 = initialize_parameters()
    accuracy_list = []
    best_validation_accuracy = 0.0
    epochs_without_improvement = 0
    for i in range(1, epoch + 1):
        Z1, A1, Z2, A2 = forward_propagation(W0, b0, W1, b1, A0, keep_prob)

        # Training accuracy of this pass (dropout active, parameters not yet
        # updated); reuses A2 so no extra forward pass is needed
        accuracy = accuracy_score(obtain_prediction(A2), Y)

        dW0, db0, dW1, db1 = backward_propagation(Z1, A1, Z2, A2, W0, W1, A0, Y, lambd)
        W0, b0, W1, b1 = update_parameters(W0, b0, W1, b1, dW0, db0, dW1, db1, alpha)

        # Validation accuracy with the freshly updated parameters
        validation_predictions = make_predictions(W0, b0, W1, b1, validation_image)
        validation_accuracy = accuracy_score(validation_predictions, validation_label)

        accuracy_list.append((i, (validation_accuracy, accuracy)))

        # Early stopping
        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break

    return W0, b0, W1, b1, accuracy_list

In [None]:
# Make predictions
def make_predictions(W0, b0, W1, b1, A0):
    """Predict the class index for every sample in ``A0``.

    Parameters
    ----------
    W0, b0, W1, b1 : network parameters
    A0 : ndarray of shape (n_features, n_samples) or (n_features,)
        A 1-D vector is treated as one sample and reshaped into a column;
        otherwise it would broadcast against the (n_hidden, 1) bias and
        produce a spurious matrix of activations.

    Returns
    -------
    ndarray with one predicted class index per sample.
    """
    A0 = np.asarray(A0)
    if A0.ndim == 1:
        A0 = A0.reshape(-1, 1)
    # Dropout is a training-time regulariser: a keep probability of 1.0 keeps
    # every hidden unit, so predictions do not depend on a random mask
    _, _, _, A2 = forward_propagation(W0, b0, W1, b1, A0, 1.0)
    predictions = obtain_prediction(A2)
    return predictions

In [None]:
def test_prediction(W0, b0, W1, b1, idx):
    """Show test image ``idx`` with the predicted and the actual label.

    Parameters
    ----------
    W0, b0, W1, b1 : trained network parameters
    idx : int
        Column index of the example in the notebook-level test arrays.
    """
    image = test_set[1:, idx].reshape(28, 28)
    actual_label = test_label[idx]

    # Index with a list so the sample stays a 2-D (n_features, 1) column.
    # A 1-D vector would broadcast against the (n_hidden, 1) bias inside the
    # network and yield one "prediction" per hidden unit.
    sample = test_image[:, [idx]]
    prediction = int(np.ravel(make_predictions(W0, b0, W1, b1, sample))[0])

    plt.figure(figsize=(3, 3))
    plt.imshow(image, cmap='gray')
    plt.title(f'Predicted: {prediction}, Actual: {actual_label}')
    plt.axis('off')
    plt.show()

In [None]:
# Train the network, then show the prediction for one random test image
trained = gradient_descent_with_early_stopping(train_image, train_label, alpha, epoch)
W0, b0, W1, b1, accuracy_list = trained
sample_idx = np.random.randint(0, 1000)
test_prediction(W0, b0, W1, b1, sample_idx)

In [None]:
# Extract epoch numbers and validation-accuracy values.
# Each entry is (epoch, (validation_accuracy, train_accuracy)). The accuracy
# is coerced to a plain float because a size-1 NumPy array cannot be
# formatted with ':.2f' below.
epochs = [entry[0] for entry in accuracy_list]
accuracy_values = [float(np.asarray(entry[1][0]).item()) for entry in accuracy_list]

# Find the maximum accuracy and the (first) epoch at which it was reached
best_position = int(np.argmax(accuracy_values))
max_accuracy = accuracy_values[best_position]
max_accuracy_epoch = epochs[best_position]

# Print the maximum accuracy and its epoch
print(f"Epoch : {max_accuracy_epoch}, Maximum Accuracy: {max_accuracy*100:.2f}%")


# Plot the epoch versus accuracy curve
plt.plot(epochs, accuracy_values)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Epoch vs Accuracy')
plt.grid()
plt.show()