# In [None]:
import numpy as np

class LinearLayer:
    """
    A fully connected (dense) layer that performs a linear transformation.

    The layer computes ``Z = W @ X + b`` on column-major batches, i.e. every
    column of ``X`` is one example.

    Attributes:
        W (numpy.ndarray): Weight matrix with shape (output_dim, input_dim).
        b (numpy.ndarray): Bias vector with shape (output_dim, 1).
        X (numpy.ndarray): Cached input used during the forward pass
            (None until ``forward`` has been called).
        dW (numpy.ndarray): Gradient with respect to the weights
            (None until ``backward`` has been called).
        db (numpy.ndarray): Gradient with respect to the biases
            (None until ``backward`` has been called).
    """
    def __init__(self, input_dim, output_dim):
        """
        Initialize the LinearLayer with random weights and biases using He initialization.

        Args:
            input_dim (int): Dimension of the input features.
            output_dim (int): Number of neurons (output features).

        Weight initialization:
            Weights and biases are initialized from a normal distribution and scaled by sqrt(2/input_dim).
        """
        scale = np.sqrt(2.0 / input_dim)
        # Draw W before b so results for a given random seed stay reproducible.
        self.W = np.random.randn(output_dim, input_dim) * scale
        self.b = np.random.randn(output_dim, 1) * scale

        # Populated by forward() / backward(); declared here so the attributes
        # always exist and misuse can be detected explicitly.
        self.X = None
        self.dW = None
        self.db = None

    def forward(self, X):
        """
        Compute the forward pass of the linear layer.

        Args:
            X (numpy.ndarray): Input data with shape (input_dim, m) where m is the number of examples.

        Returns:
            numpy.ndarray: Linear output with shape (output_dim, m)

        Notes:
            The input X is stored for use during backpropagation.
        """
        self.X = X
        # b has shape (output_dim, 1) and broadcasts across the m columns.
        return self.W @ X + self.b

    def backward(self, dA):
        """
        Compute the backward pass of the linear layer.

        Args:
            dA (numpy.ndarray): Gradient of the loss with respect to the output of this layer,
                                having shape (output_dim, m).

        Returns:
            numpy.ndarray: Gradient of the loss with respect to the input X,
                           with shape (input_dim, m).

        Raises:
            RuntimeError: If called before ``forward`` (no cached input).

        Updates:
            Sets self.dW as the gradient with respect to W (shape: (output_dim, input_dim)).
            Sets self.db as the gradient with respect to b (shape: (output_dim, 1)).

        Notes:
            This is the plain chain rule: contributions of all examples are
            summed and no extra division by m is applied here. Any averaging
            over the batch must already be contained in ``dA``.
        """
        if self.X is None:
            raise RuntimeError("forward() must be called before backward()")

        self.dW = dA @ self.X.T
        # The bias is shared by every example, so its gradient sums over columns.
        self.db = np.sum(dA, axis=1, keepdims=True)
        return self.W.T @ dA

    def update(self, lr):
        """
        Update the parameters of the layer using gradient descent.

        Args:
            lr (float): Learning rate for the parameter update.

        Returns:
            None

        Raises:
            RuntimeError: If called before ``backward`` (no gradients available).
        """
        if self.dW is None or self.db is None:
            raise RuntimeError("backward() must be called before update()")

        self.W = self.W - lr * self.dW
        self.b = self.b - lr * self.db

class ReLU:
    """
    Rectified Linear Unit (ReLU) activation function.

    Attributes:
        X (numpy.ndarray): Input cached by ``forward`` for use in ``backward``
            (None until ``forward`` has been called).
    """
    def __init__(self):
        """Create the activation with no cached input."""
        self.X = None

    def forward(self, X):
        """
        Compute the forward pass using ReLU activation.

        Args:
            X (numpy.ndarray): Input data of any shape.

        Returns:
            numpy.ndarray: Output after applying ReLU element-wise (same shape as X).
        """
        self.X = X
        return np.maximum(0, X)

    def backward(self, dA):
        """
        Compute the backward pass for the ReLU activation.

        Args:
            dA (numpy.ndarray): Gradient of the loss with respect to the ReLU output,
                                having the same shape as the input X.

        Returns:
            numpy.ndarray: Gradient of the loss with respect to the input X.

        Raises:
            RuntimeError: If called before ``forward`` (no cached input).
        """
        if self.X is None:
            raise RuntimeError("forward() must be called before backward()")

        # The derivative is 1 where the input was strictly positive and 0
        # elsewhere (the subgradient at exactly 0 is taken to be 0).
        return dA * (self.X > 0)

    def update(self, lr):
        """
        Update function for ReLU activation. Since ReLU has no parameters, no update is performed.

        Args:
            lr (float): Learning rate

        Returns:
            None
        """
        # Intentionally a no-op: present only so every layer shares one interface.
        return None

class Softmax:
    """
    Softmax activation function typically used at the output layer for multi-class classification.

    Attributes:
        X (numpy.ndarray): Input (logits) cached by ``forward``
            (None until ``forward`` has been called).
        A (numpy.ndarray): Probabilities produced by the last ``forward`` call
            (None until ``forward`` has been called).
    """
    def __init__(self):
        """Create the activation with no cached input or output."""
        self.X = None
        self.A = None

    def forward(self, X):
        """
        Compute the forward pass using softmax activation.

        Args:
            X (numpy.ndarray): Input data with shape (n_classes, m), where n_classes is the number of classes
                               and m is the number of examples.

        Returns:
            numpy.ndarray: Softmax probabilities with shape (n_classes, m).
        """
        self.X = X
        # Subtracting the per-column maximum leaves the result unchanged
        # mathematically but keeps exp() from overflowing on large logits.
        shifted = X - np.max(X, axis=0, keepdims=True)
        exps = np.exp(shifted)
        self.A = exps / np.sum(exps, axis=0, keepdims=True)
        return self.A

    def backward(self, dA):
        """
        Compute the backward pass for the softmax activation.

        Args:
            dA (numpy.ndarray): Gradient of the loss with respect to the softmax output,
                                having shape (n_classes, m).

        Returns:
            numpy.ndarray: Passed-through gradient

        Note:
            Often the derivative is combined with cross-entropy loss simplifying the gradient.
            This method therefore returns ``dA`` unchanged: the caller is
            expected to pass the already-combined softmax + cross-entropy
            gradient (i.e. the gradient with respect to the logits), not the
            raw gradient with respect to the probabilities.
        """
        return dA

    def update(self, lr):
        """
        Update function for Softmax activation. No update is performed because softmax has no trainable parameters.

        Args:
            lr (float): Learning rate

        Returns:
            None
        """
        # Intentionally a no-op: present only so every layer shares one interface.
        return None

class CleanNumpyNeuralNetwork:
    """
    A neural network implemented using numpy for classification tasks on MNIST-like data.

    Assumed Input:
        - X: Each column is a flattened 28x28 MNIST style image, i.e., shape (784, m) where m is the number of examples.

    Example Architecture:
        - Layer 1: Linear layer mapping from 784 to 26 features.
        - Output Activation: Softmax.

    The network supports forward propagation, backpropagation (with cross-entropy loss derivative),
    converting probabilities to class labels, and training via mini-batch gradient descent.

    Every entry of ``self.layers`` is expected to provide ``forward(X)``,
    ``backward(dA)`` and ``update(lr)``.
    """
    def __init__(self, seed=42):
        """
        Initialize the neural network and its layers.

        Args:
            seed (int): Random seed for reproducibility. Default is 42.

        Notes:
            The network's weights and biases are initialized in their own init functions using He initialization.
        """
        np.random.seed(seed)

        self.L1 = LinearLayer(784, 26)
        self.softmax = Softmax()

        self.layers = [self.L1, self.softmax]

    def forward(self, X):
        """
        Perform a forward pass through the entire network.

        Args:
            X (numpy.ndarray): Input data with shape (784, m), where m is the number of examples.

        Returns:
            numpy.ndarray: Output probabilities from the network with shape (n_classes, m).
                           Here n_classes is 26.
        """
        activation = X
        # Feed the output of each layer into the next one, in order.
        for layer in self.layers:
            activation = layer.forward(activation)
        return activation

    def cross_entropy(self, Y_hat, Y):
        """
        Compute the cross-entropy loss.

        Args:
            Y_hat (numpy.ndarray): Predicted probability matrix of shape (n_classes, m).
            Y (numpy.ndarray): One-hot encoded true labels of shape (n_classes, m).

        Returns:
            float: The average cross-entropy loss over all m examples.

        Notes:
            A small constant epsilon is added to Y_hat to avoid computing log(0).
        """
        epsilon = 1e-12
        m = Y.shape[1]
        return float(-np.sum(Y * np.log(Y_hat + epsilon)) / m)

    def convert_prob_into_class(self, probs):
        """
        Convert predicted probability distributions into class labels.

        Args:
            probs (numpy.ndarray): Predicted probabilities with shape (n_classes, m).

        Returns:
            numpy.ndarray: Array of predicted class labels with shape (m,).
        """
        # Classes run along axis 0, so the arg-max is taken per column.
        return np.argmax(probs, axis=0)

    def get_accuracy(self, Y_hat, Y):
        """
        Compute the classification accuracy.

        Args:
            Y_hat (numpy.ndarray): Predicted probability matrix from the network, shape (n_classes, m).
            Y (numpy.ndarray): One-hot encoded true labels, shape (n_classes, m).

        Returns:
            float: Accuracy as a fraction between 0 and 1.
        """
        predicted = self.convert_prob_into_class(Y_hat)
        actual = np.argmax(Y, axis=0)
        return float(np.mean(predicted == actual))

    def backprop(self, Y_hat, Y):
        """
        Perform backpropagation over the entire network to compute gradients.

        Args:
            Y_hat (numpy.ndarray): Predicted output probabilities, shape (n_classes, m).
            Y (numpy.ndarray): One-hot encoded true labels, shape (n_classes, m).

        Process:
            Starts by computing the derivative of the cross-entropy loss with respect to the final layer
            and then propagate the gradients backward through all layers.

        Notes:
            The starting gradient ``(Y_hat - Y) / m`` is the derivative of the
            mean cross-entropy with respect to the pre-softmax logits (softmax
            and cross-entropy combined). The final softmax layer is therefore
            expected to pass it through unchanged, and the remaining layers
            are expected to apply the plain chain rule without dividing by m
            again.
        """
        m = Y.shape[1]
        grad = (Y_hat - Y) / m
        for layer in reversed(self.layers):
            grad = layer.backward(grad)

    def train(self, X, Y, epochs, learning_rate, batch_size=32, verbose=False):
        """
        Train the neural network using mini-batch gradient descent.

        Args:
            X (numpy.ndarray): Input data with shape (784, m), where each column is a flattened MNIST style image.
            Y (numpy.ndarray): One-hot encoded labels with shape (n_classes, m), where n_classes is 26
            epochs (int): Number of epochs for training.
            learning_rate (float): Learning rate for the parameter updates.
            batch_size (int, optional): Number of examples per mini-batch. Default is 32.
            verbose (bool, optional): If True, prints training progress for the first epoch and
                every 500th epoch after it (epochs 1, 501, 1001, ...). Default is False.

        Returns:
            dict: A dictionary containing:
                - 'loss_history': List of loss values for each epoch.
                - 'accuracy_history': List of accuracy values for each epoch.

        Process:
            - Shuffles the dataset each epoch.
            - Processes data in mini-batches.
            - Performs a forward pass, backpropagation, and parameter updates for each mini-batch.
            - Computes the loss and accuracy for the entire dataset after each epoch.
        """
        loss_history = []
        accuracy_history = []
        m = X.shape[1]

        for i in range(epochs):
            # Mini-batch processing: apply the same permutation to X and Y so
            # that every example stays paired with its label.
            permutation = np.random.permutation(m)
            X_shuffled = X[:, permutation]
            Y_shuffled = Y[:, permutation]

            for j in range(0, m, batch_size):
                # The last batch may be smaller than batch_size.
                X_batch = X_shuffled[:, j:j+batch_size]
                Y_batch = Y_shuffled[:, j:j+batch_size]

                # Forward propagation
                Y_hat = self.forward(X_batch)

                # Backward propagation
                self.backprop(Y_hat, Y_batch)

                # Update parameters
                for layer in self.layers:
                    layer.update(learning_rate)

            # Calculate metrics for the whole epoch
            Y_hat_full = self.forward(X)
            loss = self.cross_entropy(Y_hat_full, Y)
            accuracy = self.get_accuracy(Y_hat_full, Y)

            loss_history.append(loss)
            accuracy_history.append(accuracy)

            if verbose and i % 500 == 0:
                print(f"Epoch {i+1}/{epochs}")
                print(f"loss: {loss:.5f}")
                print(f"accuracy: {accuracy:.5f}")
                print("-" * 30)

        return {'loss_history': loss_history, 'accuracy_history': accuracy_history}