## [Problem 1] Fully Connected Layer

In [15]:
import numpy as np

class FC:
    """
    Fully connected (affine) layer computing ``A = X . W + B``.

    Parameters
    ----------
    n_nodes1 : int
      Number of nodes in the previous layer
    n_nodes2 : int
      Number of nodes in the next layer
    initializer : instance of initialization method
      Must provide ``W(n_nodes1, n_nodes2)`` and ``B(n_nodes2)``.
    optimizer : instance of optimization method
      Must provide ``update(layer)``; it is invoked at the end of every
      ``backward`` call.

    Attributes
    ----------
    W, B : parameters produced by the initializer
    X : ndarray, input cached by the most recent ``forward`` call
    dW, dB : gradients computed by the most recent ``backward`` call
    """
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        self.optimizer = optimizer
        self.W = initializer.W(n_nodes1, n_nodes2)
        self.B = initializer.B(n_nodes2)

    def forward(self, X):
        """
        Forward propagation
        Parameters
        ----------
        X : array-like, shape (batch_size, n_nodes1)
            Input
        Returns
        ----------
        A : ndarray, shape (batch_size, n_nodes2)
            Output
        """
        # Cache the input as an ndarray so that backward() can rely on `.T`
        # even when the caller passed a nested list or another array-like.
        self.X = np.asarray(X)
        return np.dot(self.X, self.W) + self.B

    def backward(self, dA):
        """
        Backward propagation
        Parameters
        ----------
        dA : array-like, shape (batch_size, n_nodes2)
            Gradient from the next layer
        Returns
        ----------
        dX : ndarray, shape (batch_size, n_nodes1)
            Gradient to be passed to the previous layer
        """
        dA = np.asarray(dA)
        self.dW = np.dot(self.X.T, dA)
        self.dB = np.sum(dA, axis=0)
        # The input gradient must be computed with the weights used in the
        # forward pass, i.e. before the optimizer modifies them.
        dX = np.dot(dA, self.W.T)
        # The optimizer updates this layer's parameters; its return value is
        # not needed here.
        self.optimizer.update(self)
        return dX


## [Problem 2] Initialization Method

In [16]:
class SimpleInitializer:
    """
    Initializer drawing weights from a zero-mean Gaussian and setting
    biases to zero.

    Parameters
    ----------
    sigma : float
      Standard deviation of the Gaussian used for the weights
    """
    def __init__(self, sigma):
        self.sigma = sigma

    def W(self, n_nodes1, n_nodes2):
        """
        Create the weight matrix for one layer.

        Parameters
        ----------
        n_nodes1 : int
          Number of nodes in the previous layer
        n_nodes2 : int
          Number of nodes in the next layer

        Returns
        ----------
        W : ndarray, shape (n_nodes1, n_nodes2)
            Standard-normal samples scaled by ``sigma``
        """
        standard_normal = np.random.randn(n_nodes1, n_nodes2)
        return standard_normal * self.sigma

    def B(self, n_nodes2):
        """
        Create the bias vector for one layer.

        Parameters
        ----------
        n_nodes2 : int
          Number of nodes in the next layer

        Returns
        ----------
        B : ndarray, shape (n_nodes2,)
            All-zero biases
        """
        return np.zeros(shape=n_nodes2)

## [Problem 3] Optimization Method

In [17]:
class SGD:
    """
    Plain stochastic gradient descent: each parameter moves against its
    gradient, scaled by a fixed learning rate.

    Parameters
    ----------
    lr : float
      Learning rate
    """
    def __init__(self, lr):
        self.lr = lr

    def update(self, layer):
        """
        Apply one gradient step to a layer's weights and biases.

        Parameters
        ----------
        layer : layer instance exposing ``W``, ``B``, ``dW`` and ``dB``

        Returns
        ----------
        layer : the same instance, after the update
        """
        # Weights first, then biases; each parameter is paired with its gradient.
        for param_name, grad_name in (("W", "dW"), ("B", "dB")):
            param = getattr(layer, param_name)
            param -= self.lr * getattr(layer, grad_name)
            setattr(layer, param_name, param)
        return layer

## [Problem 4] Activation Functions

In [18]:
class Tanh:
    """Hyperbolic-tangent activation."""

    def forward(self, A):
        """Apply tanh element-wise and cache the result for backward()."""
        activated = np.tanh(A)
        self.Z = activated
        return activated

    def backward(self, dZ):
        """Scale the incoming gradient by tanh's derivative, 1 - tanh(A)^2."""
        local_grad = 1 - np.square(self.Z)
        return dZ * local_grad

class Softmax:
    """Row-wise softmax activation."""

    def forward(self, A):
        """Return softmax over axis 1 and cache it for backward()."""
        # Subtracting each row's maximum keeps the exponentials from overflowing.
        row_max = np.max(A, axis=1, keepdims=True)
        exp_shifted = np.exp(A - row_max)
        row_total = np.sum(exp_shifted, axis=1, keepdims=True)
        self.Z = exp_shifted / row_total
        return self.Z

    def backward(self, Z, Y):
        """
        Return the cached probabilities minus the targets ``Y``.

        ``Z`` is accepted for interface compatibility but is not used; the
        output cached by forward() is used instead.
        """
        probabilities = self.Z
        return probabilities - Y

## [Problem 5] ReLU Activation Function

In [5]:
class ReLU:
    """Rectified linear unit activation."""

    def forward(self, A):
        """Clamp negative entries to zero; the raw input is cached for backward()."""
        self.A = A
        rectified = np.maximum(0, A)
        return rectified

    def backward(self, dZ):
        """Pass the gradient through only where the cached input was positive."""
        positive_mask = self.A > 0
        return dZ * positive_mask


## [Problem 6] Weight Initialization


In [6]:
class XavierInitializer:
    """Initializer that scales standard-normal weights by 1 / sqrt(n_nodes1)."""

    def W(self, n_nodes1, n_nodes2):
        """Return an (n_nodes1, n_nodes2) weight matrix."""
        fan_in_root = np.sqrt(n_nodes1)
        samples = np.random.randn(n_nodes1, n_nodes2)
        return samples / fan_in_root

    def B(self, n_nodes2):
        """Return an all-zero bias vector of length n_nodes2."""
        return np.zeros(shape=n_nodes2)

class HeInitializer:
    """Initializer that scales standard-normal weights by sqrt(2 / n_nodes1)."""

    def W(self, n_nodes1, n_nodes2):
        """Return an (n_nodes1, n_nodes2) weight matrix."""
        samples = np.random.randn(n_nodes1, n_nodes2)
        scale = np.sqrt(2 / n_nodes1)
        return samples * scale

    def B(self, n_nodes2):
        """Return an all-zero bias vector of length n_nodes2."""
        return np.zeros(shape=n_nodes2)


## [Problem 7] AdaGrad Optimization

In [7]:
class AdaGrad:
    """
    AdaGrad optimizer: each parameter's step is divided by the square root
    of that parameter's accumulated squared gradients.

    One instance may be shared by several layers. Accumulators are kept per
    layer, so layers with differently shaped parameters do not collide.

    Parameters
    ----------
    lr : float
      Learning rate

    Attributes
    ----------
    HW, HB : accumulated squared gradients of the most recently updated
      layer (0 before the first update)
    """
    def __init__(self, lr):
        self.lr = lr
        self.HW = 0
        self.HB = 0
        # id(layer) -> accumulators. The layer itself is stored alongside so
        # that its id cannot be recycled for another object while in use.
        self._state = {}

    def update(self, layer):
        """
        Update weights and biases for a layer
        Parameters
        ----------
        layer : layer instance exposing ``W``, ``B``, ``dW`` and ``dB``
        Returns
        ----------
        layer : the same instance, after the update
        """
        state = self._state.setdefault(
            id(layer), {"layer": layer, "HW": 0, "HB": 0}
        )
        state["HW"] = state["HW"] + layer.dW * layer.dW
        state["HB"] = state["HB"] + layer.dB * layer.dB
        # Mirror the current layer's accumulators on the public attributes.
        self.HW = state["HW"]
        self.HB = state["HB"]
        # The small constant guards against division by zero.
        layer.W -= self.lr * layer.dW / (np.sqrt(self.HW) + 1e-7)
        layer.B -= self.lr * layer.dB / (np.sqrt(self.HB) + 1e-7)
        return layer


## [Problem 8] Scratch Deep Neural Network Classifier

In [19]:




class ScratchDeepNeuralNetworkClassifier:
    """
    Three-layer fully connected classifier (FC-Tanh, FC-Tanh, FC-Softmax)
    trained with mini-batch gradient descent.

    Parameters
    ----------
    n_features : int
      Number of input features
    n_output : int
      Number of output classes
    sigma : float
      Standard deviation used by the 'simple' initializer
    lr : float
      Learning rate passed to the optimizer
    n_nodes1 : int
      Number of nodes in the first hidden layer
    n_nodes2 : int
      Number of nodes in the second hidden layer
    initializer : {'simple', 'xavier', 'he'}
      Weight initialization method
    optimizer : {'sgd', 'adagrad'}
      Optimization method

    Raises
    ------
    ValueError
      If ``initializer`` or ``optimizer`` is not one of the names above.
    """
    def __init__(self, n_features, n_output, sigma, lr, n_nodes1, n_nodes2, initializer='simple', optimizer='sgd'):
        self.n_features = n_features
        self.n_output = n_output
        self.sigma = sigma
        self.lr = lr
        self.n_nodes1 = n_nodes1
        self.n_nodes2 = n_nodes2

        initializer_factories = {
            'simple': lambda: SimpleInitializer(self.sigma),
            'xavier': XavierInitializer,
            'he': HeInitializer,
        }
        optimizer_factories = {
            'sgd': SGD,
            'adagrad': AdaGrad,
        }
        # Fail early on an unknown name instead of hitting an AttributeError
        # when the first layer is constructed.
        if initializer not in initializer_factories:
            raise ValueError(
                f"Unknown initializer {initializer!r}; "
                f"expected one of {sorted(initializer_factories)}"
            )
        if optimizer not in optimizer_factories:
            raise ValueError(
                f"Unknown optimizer {optimizer!r}; "
                f"expected one of {sorted(optimizer_factories)}"
            )

        self.initializer = initializer_factories[initializer]()
        make_optimizer = optimizer_factories[optimizer]
        # Each layer gets its own optimizer instance so that optimizers which
        # keep running statistics never mix state between layers.
        # `self.optimizer` is the first layer's instance.
        self.optimizer = make_optimizer(self.lr)

        self.FC1 = FC(self.n_features, self.n_nodes1, self.initializer, self.optimizer)
        self.activation1 = Tanh()
        self.FC2 = FC(self.n_nodes1, self.n_nodes2, self.initializer, make_optimizer(self.lr))
        self.activation2 = Tanh()
        self.FC3 = FC(self.n_nodes2, self.n_output, self.initializer, make_optimizer(self.lr))
        self.activation3 = Softmax()

    def _forward(self, X):
        """Run one forward pass through all layers and return the softmax output."""
        A1 = self.FC1.forward(X)
        Z1 = self.activation1.forward(A1)
        A2 = self.FC2.forward(Z1)
        Z2 = self.activation2.forward(A2)
        A3 = self.FC3.forward(Z2)
        return self.activation3.forward(A3)

    def fit(self, X, y, epochs, batch_size):
        """
        Train the network.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training inputs
        y : array-like, shape (n_samples, n_output)
            One-hot encoded training targets
        epochs : int
            Number of passes over the training data
        batch_size : int
            Number of samples per mini-batch
        """
        # Work on ndarrays so that positional slicing behaves the same for
        # lists, DataFrames and arrays.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        for _ in range(epochs):
            for start in range(0, n_samples, batch_size):
                X_batch = X[start:start + batch_size]
                y_batch = y[start:start + batch_size]

                # Forward
                Z3 = self._forward(X_batch)

                # Backward (each FC layer updates its parameters as it goes)
                dA3 = self.activation3.backward(Z3, y_batch)
                dZ2 = self.FC3.backward(dA3)
                dA2 = self.activation2.backward(dZ2)
                dZ1 = self.FC2.backward(dA2)
                dA1 = self.activation1.backward(dZ1)
                self.FC1.backward(dA1)

    def predict(self, X):
        """
        Predict class indices.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Inputs

        Returns
        ----------
        ndarray, shape (n_samples,)
            Index of the highest-probability class for each sample
        """
        probabilities = self._forward(np.asarray(X))
        return np.argmax(probabilities, axis=1)


## [Problem 9] Learning and Estimation

In [20]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

mnist = fetch_openml('mnist_784')

# fetch_openml may return pandas objects or NumPy arrays depending on the
# scikit-learn version and its `as_frame` default. Converting to ndarrays up
# front makes the rest of the script independent of that choice.
# Pixel values are divided by 255 to scale them down.
X = np.asarray(mnist.data, dtype=np.float64) / 255.0
y = np.asarray(mnist.target).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# OneHotEncoder expects 2-D input, so turn the label vectors into columns
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# One-hot encode the labels
encoder = OneHotEncoder(sparse_output=False)
y_train_onehot = encoder.fit_transform(y_train)
y_test_onehot = encoder.transform(y_test)


In [21]:
# Build the classifier and train it on the one-hot encoded training labels
model = ScratchDeepNeuralNetworkClassifier(
    n_features=784,
    n_output=10,
    sigma=0.01,
    lr=0.01,
    n_nodes1=100,
    n_nodes2=50,
    initializer='simple',
    optimizer='sgd',
)
model.fit(X_train, y_train_onehot, epochs=10, batch_size=32)

# Recover integer labels from the one-hot test targets and report the
# fraction of test samples predicted correctly
y_pred = model.predict(X_test)
y_test_labels = y_test_onehot.argmax(axis=1)
accuracy = (y_pred == y_test_labels).mean()
print(f'Accuracy: {accuracy}')


Accuracy: 0.9687142857142857
