In [1]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split



#load the dataset
digits = load_digits()
X = digits.data
y = digits.target


#Split the dataset 
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)


#Transpose the data matrices to have shape (number of features , number of examples)
X_train = X_train.T
X_test = X_test.T

X_train = X_train/16
X_test = X_test/16

y_train = y_train.reshape(1,-1)
y_test = y_test.reshape(1,-1)

#Get dimension of data
n_x = X_train.shape[0]
m = X_train.shape[1]

#Define the architecture of the data
n_h = 10 #number of neurons in the hidden layer
n_y = 10 #number of layers in the output layer(10 digits)

def init_params():
    """"
    Intializes the weights and biases for the neural network

    Returns a dictionary containing the inital parameters
    """
    W1 = np.random.randn(n_h,n_x)-0.5
    b1 = np.random.randn(n_h,1)-0.5

    W2 = np.random.randn(n_y,n_h) - 0.5
    b2 = np.random.randn(n_y,1)-0.5

    return W1,b1,W2,b2

# calling function to see the initial shapes and values
W1,b1,W2,b2 = init_params()

print(f"Shape of W1: {W1.shape}")
print(f"Shape of b1: {b1.shape}")
print(f"Shape of W2: {W2.shape}")
print(f"Shape of b2: {b2.shape}")

# Now, it's your turn.

def init_params():
    """"
    Intializes the weights and biases for the neural network

    Returns a dictionary containing the inital parameters
    """
    W1 = np.random.randn(n_h,n_x)-0.5
    b1 = np.random.randn(n_h,1)-0.5

    W2 = np.random.randn(n_y,n_h) - 0.5
    b2 = np.random.randn(n_y,1)-0.5

    return W1,b1,W2,b2

# calling function to see the initial shapes and values
W1,b1,W2,b2 = init_params()

print(f"Shape of W1: {W1.shape}")
print(f"Shape of b1: {b1.shape}")
print(f"Shape of W2: {W2.shape}")
print(f"Shape of b2: {b2.shape}")

# Now, it's your turn.


Shape of W1: (10, 64)
Shape of b1: (10, 1)
Shape of W2: (10, 10)
Shape of b2: (10, 1)
Shape of W1: (10, 64)
Shape of b1: (10, 1)
Shape of W2: (10, 10)
Shape of b2: (10, 1)


In [2]:
def ReLU(Z):
    """
    Implements the Rectified Linear Unit (ReLU) activation function.
    
    Arguments:
    Z -- The output of the linear layer, a numpy array of any shape.
    
    Returns:
    A -- The output of ReLU(Z), same shape as Z.
    
    """
    return np.maximum(0, Z)

def softmax(Z):
    """
    Implements the Softmax activation function.
    
    Arguments:
    Z -- The output of the linear layer, a numpy array of shape (n_y, m).
    
    Returns:
    A -- The output of softmax(Z), a probability distribution over classes.
    """
    # The np.exp(Z - np.max(Z)) is a trick for numerical stability
    A = np.exp(Z - np.max(Z)) / sum(np.exp(Z - np.max(Z)))
    return A

def forward_prop(W1, b1, W2, b2, X):
    """
    Implements the forward propagation for our two-layer network.
    
    Arguments:
    W1, b1, W2, b2 -- The parameters of the model.
    X -- The input data of shape (n_x, m).
    
    Returns:
    Z1, A1, Z2, A2 -- Values computed during forward propagation.
                       We need these for the backward pass later!
    """
    # Hidden Layer
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    
    # Output Layer
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    
    return Z1, A1, Z2, A2

# Let's test it with our initialized parameters and training data
# Note: You should have W1, b1, W2, b2 from your init_params() function
# and X_train from your data loading step.
Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X_train)

print("\n--- Forward Propagation ---")
print(f"Shape of Z1: {Z1.shape}")
print(f"Shape of A1: {A1.shape}")
print(f"Shape of Z2: {Z2.shape}")
print(f"Shape of A2: {A2.shape}")

# A2 contains the predictions for each of the 'm' examples.
# Let's look at the prediction for the first example:
print(f"\nPrediction for first example:\n {A2[:, 0]}")
print(f"Sum of probabilities for first example: {np.sum(A2[:, 0])}")


--- Forward Propagation ---
Shape of Z1: (10, 1437)
Shape of A1: (10, 1437)
Shape of Z2: (10, 1437)
Shape of A2: (10, 1437)

Prediction for first example:
 [0.09062914 0.19211407 0.05824004 0.27342622 0.15168987 0.04117552
 0.02876632 0.04502442 0.10191677 0.01701764]
Sum of probabilities for first example: 1.0000000000000002


In [3]:
def one_hot(Y):
    """
    Converts a vector of labels into a one-hot encoded matrix

    Arguments:
    Y -- The label vector of shape (1, m).

    Returns:
    one_hot_Y -- A one-hot encoded matrix of shape (n_y, m).
    """
    # Create an m x (max_label + 1) matrix of zeros
    one_hot_Y = np.zeros((Y.size, Y.max() + 1)) # <-- FIX IS HERE: added parentheses
    
    # Set the element at the correct column (label) to 1 for each example
    one_hot_Y[np.arange(Y.size), Y] = 1

    # Transpose to get the correct shape (n_y, m)
    one_hot_Y = one_hot_Y.T

    return one_hot_Y

def backward_prop(Z1,A1,Z2,A2,W2,X,Y):
    """"
    Implements the backward propogation for the 2 layered network

    z1,a1,z2,a2 -- values from the forward propgation
    w2 = weight matrix of the output layer
    x -- the input data
    y -- the true labels
    """
    m = X.shape[1]

    one_hot_y = one_hot(Y)

    #output layers of the gradient
    dZ2 = A2 - one_hot_y
    dW2 = 1/m * dZ2.dot(A1.T)
    db2 = 1/m * np.sum(dZ2,axis=1,keepdims=True)

    # hidden layer gradients
    dZ1 = W2.T.dot(dZ2)*(Z1>0)  # Element wise multiplication for RELU derivative
    dW1 = 1/m * dZ1.dot(X.T)
    db1 = 1/m * np.sum(dZ1,axis=1,keepdims=True)

    return dW1,db1,dW2,db2
def update_params(W1,b1,W2,b2,dW1,db1,dW2,db2,alpha):
    


    W1 = W1 - alpha*dW1
    b1 = b1 - alpha*db1
    W2 = W2 - alpha*dW2
    b2 = b2 - alpha*db2

    return W1,b1,W2,b2





In [4]:
def get_predictions(A2):
    """
    Gets the class predictions from the output probabilities.
    
    Arguments:
    A2 -- The output probabilities from the softmax layer, shape (n_y, m).
    
    Returns:
    predictions -- A 1D array of predicted class labels.
    """
    return np.argmax(A2, 0) # Returns the index of the max value along axis 0

def get_accuracy(predictions, Y):
    """
    Calculates the accuracy of the predictions against the true labels.
    
    Arguments:
    predictions -- A 1D array of predicted class labels.
    Y -- The true labels, shape (1, m).
    
    Returns:
    accuracy -- The percentage of correct predictions.
    """
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, alpha, iterations):
    """
    Performs gradient descent to train the neural network.
    
    Arguments:
    X -- The input training data.
    Y -- The true labels for the training data.
    alpha -- The learning rate.
    iterations -- The number of training iterations (epochs).
    
    Returns:
    W1, b1, W2, b2 -- The trained parameters.
    """
    # 1. Initialize parameters
    W1, b1, W2, b2 = init_params() # Use your init_params function
    
    for i in range(iterations):
        # 2. Forward Propagation
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X) # Use your forward_prop
        
        # 3. Backward Propagation
        dW1, db1, dW2, db2 = backward_prop(Z1, A1, Z2, A2, W2, X, Y) # Use your backward_prop
        
        # 4. Update Parameters
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha) # Use your update_params
        
        # 5. Print progress (optional)
        if i % 50 == 0: # Print every 50 iterations
            print(f"Iteration: {i}")
            predictions = get_predictions(A2)
            print(f"Accuracy: {get_accuracy(predictions, Y) * 100:.2f}%")
            
    return W1, b1, W2, b2

# --- Training the model ---
# Make sure X_train and y_train are correctly prepared before this point.
# X_train should be transposed and normalized (X_train.T / 16.0)
# y_train should be reshaped (y_train.reshape(1, -1))

# It's crucial to apply the normalization to X_train and X_test here if you haven't already
# For the `load_digits` dataset, pixel values are 0-16, so divide by 16.0
# The original prompt suggested 255.0 for a typical 0-255 image. For load_digits, 16.0 is correct.
X_train = X_train / 16.0
X_test = X_test / 16.0

print("\n--- Starting Training ---")
final_W1, final_b1, final_W2, final_b2 = gradient_descent(X_train, y_train, 0.10, 1000) # You can adjust alpha and iterations
print("--- Training Complete ---")

# Now, test on the test set
print("\n--- Testing on Test Set ---")
A2_test = forward_prop(final_W1, final_b1, final_W2, final_b2, X_test)
test_predictions = get_predictions(A2_test)
test_accuracy = get_accuracy(test_predictions, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


--- Starting Training ---
Iteration: 0
[3 3 6 ... 3 3 3] [[6 0 0 ... 2 7 1]]
Accuracy: 16.70%
Iteration: 50
[6 6 6 ... 6 6 6] [[6 0 0 ... 2 7 1]]
Accuracy: 11.41%
Iteration: 100
[6 6 6 ... 6 6 6] [[6 0 0 ... 2 7 1]]
Accuracy: 14.96%
Iteration: 150
[6 6 6 ... 8 6 6] [[6 0 0 ... 2 7 1]]
Accuracy: 19.21%
Iteration: 200
[6 6 6 ... 8 6 6] [[6 0 0 ... 2 7 1]]
Accuracy: 20.53%
Iteration: 250
[6 1 6 ... 3 6 6] [[6 0 0 ... 2 7 1]]
Accuracy: 20.88%
Iteration: 300
[6 1 0 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 22.76%
Iteration: 350
[6 1 0 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 23.80%
Iteration: 400
[6 1 0 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 24.57%
Iteration: 450
[6 1 0 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 26.58%
Iteration: 500
[6 1 6 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 27.77%
Iteration: 550
[6 1 6 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 28.60%
Iteration: 600
[6 1 6 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 29.09%
Iteration: 650
[6 1 6 ... 3 6 1] [[6 0 0 ... 2 7 1]]
Accuracy: 29.