In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Problem 2: The MNIST dataset 

The goal is to build a neural network to recognize handwritten digits.
We'll use the MNIST dataset.

**Contents:**
- [Neural Network Stuff](#Neural-Network-Stuff)
- [The train and test MNIST datasets](#The-train-and-test-MNIST-datasets)
- [Two-layer Neural Network](#Two-layer-Neural-Network)
- [Three-layer Neural Network](#Three-layer-Neural-Network)

## Neural Network Stuff

In [None]:
class layer:
    """Dense (fully connected) layer computing ``inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the trainable parameters.

        ``weights`` is an (n_inputs, n_neurons) array of standard-normal draws
        scaled by 0.01; ``biases`` is a (1, n_neurons) array of zeros.
        """
        small_random = 0.01 * np.random.randn(n_inputs, n_neurons)
        self.weights = small_random
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Store the affine map of ``inputs`` in ``self.output``.

        ``inputs`` is also kept on the instance because ``backward`` reads it.
        Presumably ``inputs`` is (n_samples, n_inputs) -- confirm with caller.
        """
        projected = inputs.dot(self.weights)
        self.output = projected + self.biases

        # kept for the weight gradient computed in backward()
        self.inputs = inputs

    def backward(self, dvalues):
        """Back-propagate ``dvalues`` (gradient w.r.t. this layer's output).

        Sets ``dinputs`` (gradient w.r.t. the inputs), ``dweights`` and
        ``dbiases`` (gradients w.r.t. the parameters).
        """
        # gradient flowing to the previous layer
        weights_transposed = self.weights.T
        self.dinputs = dvalues.dot(weights_transposed)

        # parameter gradients; the bias gradient sums over the sample axis
        inputs_transposed = self.inputs.T
        self.dweights = inputs_transposed.dot(dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)

        
class activation_ReLU:
    """Rectified linear unit: passes positive values, replaces the rest with 0."""

    def forward(self, inputs):
        """Store ``max(0, inputs)`` element-wise in ``self.output``."""
        rectified = np.maximum(0, inputs)
        self.output = rectified

        # backward() needs to know which entries were non-positive
        self.inputs = inputs

    def backward(self, dvalues):
        """Store in ``self.dinputs`` a copy of ``dvalues`` with the gradient
        zeroed wherever the forward input was <= 0 (``dvalues`` is not modified).
        """
        blocked = self.inputs <= 0
        gradient = dvalues.copy()
        gradient[blocked] = 0
        self.dinputs = gradient
        
class activation_softmax:
    """Softmax activation: maps each row of scores to values that sum to 1."""

    def forward(self, inputs):
        """Store the row-wise softmax of ``inputs`` in ``self.output``.

        ``inputs`` must be 2-D (the reductions run over ``axis=1``); it is
        also kept on the instance as ``self.inputs``.
        """
        # Subtracting the row maximum does not change the softmax value but
        # keeps np.exp from overflowing on large scores.
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_values = np.exp(shifted)

        # normalize each row so it sums to 1
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        # remember input values
        self.inputs = inputs

    def backward(self, dvalues):
        """Store the gradient w.r.t. the softmax inputs in ``self.dinputs``.

        ``dvalues`` is the gradient w.r.t. ``self.output`` and must have the
        same shape. For one row with output ``s`` and upstream gradient ``g``
        the softmax Jacobian is ``diag(s) - s s^T``, hence

            g . (diag(s) - s s^T) = s * (g - sum(g * s))

        The right-hand side is evaluated for all rows at once, avoiding a
        Python loop and a k-by-k Jacobian per sample.
        """
        probabilities = self.output

        # per-row scalar  sum_j g_j * s_j, kept as a column for broadcasting
        row_dot = np.sum(dvalues * probabilities, axis=1, keepdims=True)

        self.dinputs = probabilities * (dvalues - row_dot)


class loss:
    """Base class for losses; subclasses provide ``forward(output, y)``."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward``."""
        sample_losses = self.forward(output, y)
        return np.mean(sample_losses)


# cross-entropy loss
class loss_crossentropy(loss):
    """Categorical cross-entropy for integer class labels."""

    # Predictions are clipped to [_EPS, 1 - _EPS] before taking a log or
    # dividing, in both forward() and backward().
    _EPS = 1e-7

    def forward(self, y_pred, y_true):
        """Return the per-sample losses ``-log(p_true)``.

        ``y_pred`` is a 2-D array of predicted probabilities (one row per
        sample); ``y_true`` holds the integer class index of each sample and
        is used to pick one entry per row.
        """
        # number of data points
        n_samples = len(y_pred)

        # clip so that log(0) cannot occur
        y_pred_clipped = np.clip(y_pred, self._EPS, 1 - self._EPS)

        # probability assigned to the correct class of each sample
        confidence_values = y_pred_clipped[range(n_samples), y_true]

        return -np.log(confidence_values)

    def backward(self, dvalues, y_true):
        """Store the gradient of the mean loss w.r.t. the predictions.

        ``dvalues`` are the predicted probabilities (same array that was given
        to ``forward``); the result is written to ``self.dinputs``.

        The predictions are clipped exactly as in ``forward``. Without the
        clipping, a probability that underflowed to 0 produced ``0 / 0 = NaN``
        for non-target classes (and ``-inf`` for the target class), which then
        poisoned every gradient further down the network.
        """
        # number of samples
        n_samples = len(dvalues)

        # one-hot matrix of the true labels
        Y = np.zeros(dvalues.shape)
        Y[np.arange(len(y_true)), y_true] = 1

        # d(-log p)/dp = -1/p on the true class, 0 elsewhere
        safe_predictions = np.clip(dvalues, self._EPS, 1 - self._EPS)
        gradient = -Y / safe_predictions

        # average over the batch, matching the mean taken in calculate()
        self.dinputs = gradient / n_samples
        
# GD (gradient descent) optimizer
class optimizer_GD:
    """Plain gradient descent: parameter <- parameter - learning_rate * gradient."""

    def __init__(self, learning_rate=1.0):
        """Remember the step size used by ``update_parameters``."""
        self.learning_rate = learning_rate

    def update_parameters(self, layer):
        """Take one descent step on ``layer``.

        Reads ``layer.dweights`` / ``layer.dbiases`` and rebinds
        ``layer.weights`` / ``layer.biases`` to the updated values (the
        previous arrays are not modified in place).
        """
        step = self.learning_rate
        new_weights = layer.weights - step * layer.dweights
        layer.weights = new_weights
        new_biases = layer.biases - step * layer.dbiases
        layer.biases = new_biases

## The train and test MNIST datasets

In [None]:
# Download the training split (a CSV file served from GitHub) and parse it
# into a DataFrame.
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/digits.csv'
data_train = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Separate the training frame into a feature matrix and a label vector.
# NOTE(review): the positional slice assumes the first 784 columns are the
# pixel values and that 'label' is not one of them -- confirm against the CSV.
X = data_train.iloc[:, :784].to_numpy()  # pixels
y = data_train.loc[:, 'label'].to_numpy()  # labels

In [None]:
# Download the test split (a CSV file served from GitHub) and parse it
# into a DataFrame.
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Theory/master/Data/digits_test.csv'
data_test = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Separate the test frame into a feature matrix and a label vector.
# NOTE(review): the positional slice assumes the first 784 columns are the
# pixel values and that 'label' is not one of them -- confirm against the CSV.
X_test = data_test.iloc[:, :784].to_numpy()  # pixels
y_test = data_test.loc[:, 'label'].to_numpy()  # labels

## Two-layer Neural Network

**Part 1:** Fit a neural network with two layers to the training dataset

In [None]:
# build the neural network 


In [None]:
# fit the neural network to the training dataset


In [None]:
# plot the loss as a function of the number of epochs


In [None]:
# plot the accuracy as a function of the number of epochs


**Part 2:** Test your neural network on the test set.
Compute the accuracy and the confusion matrix.

In [None]:
# forward pass 


In [None]:
# predictions


In [None]:
# accuracy


In [None]:
# confusion matrix


**Part 3:** Display some of the misclassified digits.

## Three-layer Neural Network

**Part 1:** Fit a neural network with three layers to the training dataset

In [None]:
# build the neural network 


In [None]:
# fit the neural network to the training dataset


In [None]:
# plot the loss as a function of the number of epochs


In [None]:
# plot the accuracy as a function of the number of epochs


**Part 2:** Test your neural network on the test set.
Compute the accuracy and the confusion matrix.

In [None]:
# forward pass 


In [None]:
# predictions


In [None]:
# accuracy


In [None]:
# confusion matrix


**Part 3:** Display some of the misclassified digits.