# MNIST Prediction with Convolutional Neural Network

- MNIST dataset: a dataset of 60,000 28x28 grayscale training images of the 10 digits, along with a test set of 10,000 images. More info can be found at the [MNIST homepage](http://yann.lecun.com/exdb/mnist/).
- Goal: build a simple artificial neural network to predict the digit in the images.
- Reference: [Oddly Satisfying Deep Learning](https://pythonandml.github.io/dlbook/content/convolutional_neural_networks/cnn_over_mlp.html)

#### Import libraries

In [32]:
# for linear algebra
import numpy as np

# for plotting data, loss, accuracy
import matplotlib.pyplot as plt

# loading mnist dataset from keras
from keras import datasets

# show progress bar
from tqdm import tqdm

# for type hinting
from typing import Optional, Union

#### 1. Utils Functions

1. **plot_data**: plot 8 random images from the dataset.
2. **Base Layer**: base class for all the layers.
3. **Activation Functions**: Linear, ReLU, Sigmoid, Tanh, Softmax.
4. **Weight Initialization**: Zeros, Ones, Random, Random Uniform.
5. **Optimization Functions**: Gradient Descent, Stochastic Gradient Descent, RMSprop, Adam.

##### 1.1. Plotting Functions

In [33]:
def plot_data(
    X: np.ndarray, y: np.ndarray, y_proba: Optional[np.ndarray] = None
) -> None:
    """
    Show 8 randomly chosen samples of X in a 2x4 grid.

    :param X: images, indexed along the first axis
    :param y: true labels, indexed like X
    :param y_proba: optional predictions, indexed like X; each entry is
        compared directly with the matching entry of y, so despite the name
        it is treated as a predicted label

    Without predictions each caption is black; with predictions it is green
    when the prediction equals the true label and red otherwise.
    """
    grid_rows, grid_cols = 2, 4
    _, axes = plt.subplots(grid_rows, grid_cols, figsize=(8, 4))

    n_samples = X.shape[0]
    has_pred = y_proba is not None

    # axes.flat walks the grid row by row, one panel per drawn sample
    for panel in axes.flat:
        pick = np.random.randint(0, n_samples)

        panel.imshow(X[pick], cmap="gray")
        panel.set(xticks=[], yticks=[])

        caption = f"True: {y[pick]}"
        if has_pred:
            caption = f"{caption}\nPred: {y_proba[pick]}"
            tint = "green" if y[pick] == y_proba[pick] else "red"
        else:
            tint = "black"

        panel.set_xlabel(caption, color=tint)

    plt.tight_layout()
    plt.show()

##### 1.2. Base Layer class

In [34]:
class BaseLayer:
    """Interface that every layer in this notebook derives from.

    All methods here are placeholders returning None; concrete layers
    override the ones they need.
    """

    def __init__(self):
        self.input, self.output = None, None

    def forward(self, X: np.ndarray) -> np.ndarray:
        """
        Forward pass (placeholder).

        :param X: input data

        Subclasses are expected to return the layer output.
        """
        return None

    def backpropagation(self, dZ: np.ndarray, lr: float) -> np.ndarray:
        """
        Backward pass (placeholder).

        :param dZ: gradient of loss with respect to output
        :param lr: learning rate

        Subclasses are expected to return the gradient with respect to the
        layer input.
        """
        return None

    def get_dimensions(self, inp_shape: tuple[int, int, int, int]) -> None:
        """
        Shape bookkeeping (placeholder).

        :param inp_shape: shape of input

        Subclasses are expected to record the input dimensions
        (e.g. self.m, self.Nc, self.Nh, self.Nw).
        """
        return None

    def update(self, lr: float, m: int, k: int):
        """
        Parameter update (placeholder).

        :param lr: learning rate
        :param m: batch size (number of samples in the batch)
        :param k: iteration number
        """
        return None

##### 1.2. Activation Functions class

In [35]:
class Activation(BaseLayer):
    """Activation layer.

    ``forward`` caches its input; ``backpropagation`` evaluates the
    derivative of the selected function at that cached input and applies
    the chain rule to the incoming gradient.
    """

    # Accepted values of ``act``. Each name is also the name of the method
    # implementing it, and ``d_<name>`` is the matching derivative method.
    VALID_ACTIVATIONS = ("linear", "reLU", "sigmoid", "tanh", "softmax")

    def __init__(self, act: str = "linear") -> None:
        """
        :param act: activation function's name
            (linear, reLU, sigmoid, tanh, softmax)
        """
        super().__init__()
        self.act = act

    def linear(self, x: np.ndarray) -> np.ndarray:
        """Identity activation."""
        return x

    def d_linear(self, x: np.ndarray) -> np.ndarray:
        """Derivative of the identity: ones with the shape of x."""
        return np.ones(x.shape)

    def reLU(self, x: np.ndarray) -> np.ndarray:
        """Rectified linear unit: x where x > 0, else 0."""
        return x * (x > 0)

    def d_reLU(self, x: np.ndarray) -> np.ndarray:
        """Derivative of reLU: 1 where x > 0, else 0."""
        return (x > 0) * np.ones(x.shape)

    def sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Logistic sigmoid 1 / (1 + exp(-x))."""
        return 1 / (1 + np.exp(-x))

    def d_sigmoid(self, x: np.ndarray) -> np.ndarray:
        """Derivative of the sigmoid: s * (1 - s)."""
        s = self.sigmoid(x)  # evaluate once instead of twice
        return s * (1 - s)

    def tanh(self, x: np.ndarray) -> np.ndarray:
        """Hyperbolic tangent."""
        return np.tanh(x)

    def d_tanh(self, x: np.ndarray) -> np.ndarray:
        """Derivative of tanh: 1 - tanh(x)^2."""
        return 1 - self.tanh(x) ** 2

    def softmax(self, x: np.ndarray) -> np.ndarray:
        """Softmax along the last axis."""
        # subtracting the row maximum keeps exp() from overflowing
        z = x - np.max(x, axis=-1, keepdims=True)
        numerator = np.exp(z)
        denominator = np.sum(numerator, axis=-1, keepdims=True)
        return numerator / denominator

    def d_softmax(self, x: np.ndarray) -> np.ndarray:
        """
        Jacobian of the softmax for each row of x.

        :param x: 1-D input (treated as a single row) or 2-D input (n, d)
        :return: array of shape (n, d, d) holding diag(a) - a a^T per row,
            where a = softmax(row)
        """
        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(1, -1)

        _, d = x.shape
        a = self.softmax(x)
        outer = np.einsum("ij,ik->ijk", a, a)
        diag = np.einsum("ij,jk->ijk", a, np.eye(d, d))

        return diag - outer

    def _check_act(self) -> None:
        """Raise ValueError when self.act is not a supported name."""
        if self.act not in self.VALID_ACTIVATIONS:
            raise ValueError(
                "Valid activation functions are linear, reLU, sigmoid, tanh, softmax"
            )

    def get_activation(self, X: np.ndarray) -> np.ndarray:
        """
        :param X: input data to apply activation function
        :raises ValueError: if self.act is not a supported activation
        """
        self._check_act()
        return getattr(self, self.act)(X)

    def get_d_activation(self, X: np.ndarray) -> np.ndarray:
        """
        :param X: input data at which the derivative is evaluated
        :raises ValueError: if self.act is not a supported activation
        """
        self._check_act()
        return getattr(self, f"d_{self.act}")(X)

    def forward(self, X: np.ndarray) -> np.ndarray:
        """
        :param X: input data to apply activation function
        """
        self.X = X  # cached for backpropagation
        return self.get_activation(X)

    def backpropagation(
        self, dZ: np.ndarray, lr: Optional[float] = None
    ) -> np.ndarray:
        """
        :param dZ: gradient of loss with respect to output
        :param lr: learning rate; unused because the layer has no
            parameters, and optional so callers may pass only dZ
        :return: gradient of loss with respect to the cached input
        """
        f_prime = self.get_d_activation(self.X)

        # The original read self.activation_type, which is never set.
        if self.act == "softmax":
            # f_prime is a per-row Jacobian, so contract it with dZ
            dx = np.einsum("ijk,ik->ij", f_prime, dZ)
        else:
            dx = dZ * f_prime
        return dx

##### 1.3. Weight Initialization class

- Zeros initialization: $w = np.zeros(shape)$
- Ones initialization: $w = np.ones(shape)$
- Random initialization: $w = np.random.normal(size=shape)$
- Random uniform initialization: $w = np.random.uniform(size=shape)$

In [36]:
class WeightInitializer:
    """Builds an initial weight array of a given shape."""

    def __init__(self, shape: tuple, init: str = "random", seed: int = 69) -> None:
        """
        :param shape: shape of the weight matrix
        :param init: type of initialization
            (available initializations: zeros, ones, random, random_uniform)
        :param seed: seed for NumPy's global RNG; None leaves the RNG untouched
        """
        self.shape, self.init, self.seed = shape, init, seed

    def _reseed(self) -> None:
        """Seed NumPy's global RNG when a seed was provided."""
        if self.seed is not None:
            np.random.seed(self.seed)

    def zeros(self) -> np.ndarray:
        """Array of zeros (the global RNG is still reseeded)."""
        self._reseed()
        return np.zeros(self.shape)

    def ones(self) -> np.ndarray:
        """Array of ones (the global RNG is still reseeded)."""
        self._reseed()
        return np.ones(self.shape)

    def random(self) -> np.ndarray:
        """Samples from np.random.normal with default parameters."""
        self._reseed()
        return np.random.normal(size=self.shape)

    def random_uniform(self) -> np.ndarray:
        """Samples from np.random.uniform with default bounds."""
        self._reseed()
        return np.random.uniform(size=self.shape)

    def get_initializer(self) -> np.ndarray:
        """
        Build the array selected by self.init.

        :raises ValueError: if self.init is not a known initialization
        """
        builders = (
            ("zeros", self.zeros),
            ("ones", self.ones),
            ("random", self.random),
            ("random_uniform", self.random_uniform),
        )
        for label, build in builders:
            if self.init == label:
                return build()
        raise ValueError(
            "Valid initializations are: zeros, ones, random, random_uniform"
        )

##### 1.4.  Optimizers class

- Gradient Descent Optimizer: $w = w - \alpha \nabla_w L(w)$
- Stochastic Gradient Descent Optimizer (implemented here with momentum): $v = \beta_1 v + (1 - \beta_1) \nabla_w L(w)$ and $w = w - \alpha v$
- RMSprop Optimizer: $v = \beta v + (1 - \beta) \nabla_w L(w) \odot \nabla_w L(w)$ and $w = w - \alpha \frac{\nabla_w L(w)}{\sqrt{v} + \epsilon}$
- Adam Optimizer: $m = \beta_1 m + (1 - \beta_1) \nabla_w L(w)$, $v = \beta_2 v + (1 - \beta_2) \nabla_w L(w) \odot \nabla_w L(w)$, $m_{\text{corrected}} = \frac{m}{1 - \beta_1^t}$, $v_{\text{corrected}} = \frac{v}{1 - \beta_2^t}$, and $w = w - \alpha \frac{m_{\text{corrected}}}{\sqrt{v_{\text{corrected}}} + \epsilon}$

> Note: Actually, I only use the Gradient Descent Optimizer in this notebook.

In [37]:
class Optimizer:
    """Turns raw gradients (dW, db) into the step direction for an update.

    The caller is responsible for scaling the returned values by the
    learning rate and subtracting them from the parameters.
    """

    def __init__(
        self,
        op_type: str = "GD",
        shape_W: Optional[tuple[int, ...]] = None,
        shape_b: Optional[tuple[int, ...]] = None,
        m1: float = 0.9,
        m2: float = 0.999,
        epsilon: float = 1e-8,
    ) -> None:
        """
        :param op_type: type of optimizer (available optimizers: GD, SGD, RMSProp, Adam)
        :param shape_W: shape of the weight matrix
        :param shape_b: shape of the bias matrix
        :param m1: decay rate of the running mean of gradients (used by SGD and Adam)
        :param m2: decay rate of the running mean of squared gradients (used by RMSProp and Adam)
        :param epsilon: added to the denominator in RMSProp and Adam to prevent division by zero
        """
        self.op_type = op_type
        self.m1 = m1
        self.m2 = m2
        self.epsilon = epsilon

        # running mean of gradients
        self.vdW = np.zeros(shape_W)
        self.vdb = np.zeros(shape_b)

        # running mean of squared gradients
        self.SdW = np.zeros(shape_W)
        self.Sdb = np.zeros(shape_b)

    def GD(self, dW: np.ndarray, db: np.ndarray, _: int) -> tuple:
        """
        Plain gradient descent: the gradients are returned unchanged.

        :param dW: gradient of Weight W for iteration k
        :param db: gradient of bias b for iteration k
        :param _: iteration number (unused)
        """
        return dW, db

    def SGD(self, dW: np.ndarray, db: np.ndarray, _: int) -> tuple:
        """
        Momentum step: exponential moving average of the gradients.

        :param dW: gradient of Weight W for iteration k
        :param db: gradient of bias b for iteration k
        :param _: iteration number (unused)
        """
        self.vdW = self.m1 * self.vdW + (1 - self.m1) * dW
        self.vdb = self.m1 * self.vdb + (1 - self.m1) * db

        return self.vdW, self.vdb

    def RMSProp(self, dW: np.ndarray, db: np.ndarray, _: int) -> tuple:
        """
        Gradients divided by the root of the running mean of their squares.

        :param dW: gradient of Weight W for iteration k
        :param db: gradient of bias b for iteration k
        :param _: iteration number (unused)
        """
        self.SdW = self.m2 * self.SdW + (1 - self.m2) * (dW**2)
        self.Sdb = self.m2 * self.Sdb + (1 - self.m2) * (db**2)

        den_W = np.sqrt(self.SdW) + self.epsilon
        den_b = np.sqrt(self.Sdb) + self.epsilon

        return dW / den_W, db / den_b

    def Adam(self, dW: np.ndarray, db: np.ndarray, k: int) -> tuple:
        """
        Adam step with bias-corrected first and second moments.

        :param dW: gradient of Weight W for iteration k
        :param db: gradient of bias b for iteration k
        :param k: iteration number; bias correction is applied for k >= 1
        """
        # momentum
        self.vdW = self.m1 * self.vdW + (1 - self.m1) * dW
        self.vdb = self.m1 * self.vdb + (1 - self.m1) * db

        # rmsprop
        self.SdW = self.m2 * self.SdW + (1 - self.m2) * (dW**2)
        self.Sdb = self.m2 * self.Sdb + (1 - self.m2) * (db**2)

        # Bias correction. 1 - beta**k is non-zero for every k >= 1, so the
        # first iteration must be corrected too; skipping it (as "k > 1" did)
        # makes the very first step roughly (1-m1)/sqrt(1-m2) times too large.
        # Only k <= 0 is left uncorrected, to avoid dividing by zero.
        if k >= 1:
            corr1 = 1 - (self.m1**k)
            corr2 = 1 - (self.m2**k)
            vdW_h = self.vdW / corr1
            vdb_h = self.vdb / corr1
            SdW_h = self.SdW / corr2
            Sdb_h = self.Sdb / corr2
        else:
            vdW_h = self.vdW
            vdb_h = self.vdb
            SdW_h = self.SdW
            Sdb_h = self.Sdb

        den_W = np.sqrt(SdW_h) + self.epsilon
        den_b = np.sqrt(Sdb_h) + self.epsilon

        return vdW_h / den_W, vdb_h / den_b

    def get_optimizer(self, dW: np.ndarray, db: np.ndarray, k: int) -> tuple:
        """
        Dispatch to the optimizer selected by self.op_type.

        :param dW: gradient of Weight W for iteration k
        :param db: gradient of bias b for iteration k
        :param k: iteration number
        :raises ValueError: if self.op_type is not a known optimizer
        """
        if self.op_type == "GD":
            return self.GD(dW, db, k)
        elif self.op_type == "SGD":
            return self.SGD(dW, db, k)
        elif self.op_type == "RMSProp":
            return self.RMSProp(dW, db, k)
        elif self.op_type == "Adam":
            return self.Adam(dW, db, k)
        else:
            raise ValueError("Valid optiomizers are GD, SGD, RMSProp, Adam")

##### 1.5. Loss Functions class

- Mean Squared Error Loss: $L(y, \hat{y}) = \frac{1}{2} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$
- Derivative of Mean Squared Error Loss: $\frac{\partial L(y, \hat{y})}{\partial \hat{y}} = \hat{y} - y$
- Binary Cross Entropy Loss: $L(y, \hat{y}) = - \sum_{i=1}^{n} \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right]$
- Derivative of Binary Cross Entropy Loss: $\frac{\partial L(y, \hat{y})}{\partial \hat{y}} = - \frac{y}{\hat{y}} + \frac{1 - y}{1 - \hat{y}}$

In [38]:
class Loss:
    """Loss value and its gradient with respect to the prediction."""

    def __init__(self, loss: str = "mse") -> None:
        """
        :param loss: str, loss function (Available: mse, cross-entropy)
        """
        self.loss = loss

    # Mean Squared Error
    def mse(self, a: np.ndarray, y: np.ndarray) -> float:
        """
        Half the sum of squared row-wise errors.

        :param a: predicted value (2-D: the norm is taken along axis 1)
        :param y: true value
        """
        return (1 / 2) * np.sum((np.linalg.norm(a - y, axis=1)) ** 2)

    def d_mse(self, a: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Gradient of mse with respect to a.

        :param a: predicted value
        :param y: true value
        """
        return a - y

    # Cross Entropy, computed as -sum(y * log(a))
    def cross_entropy(
        self, a: np.ndarray, y: np.ndarray, epsilon: float = 1e-12
    ) -> float:
        """
        :param a: predicted value
        :param y: true value
        :param epsilon: a is clipped to [epsilon, 1 - epsilon] so log() stays finite
        """
        a = np.clip(a, epsilon, 1.0 - epsilon)
        return -np.sum(y * np.log(a))

    def d_cross_entropy(
        self, a: np.ndarray, y: np.ndarray, epsilon: float = 1e-12
    ) -> np.ndarray:
        """
        Gradient of cross_entropy with respect to a: -y / a.

        :param a: predicted value
        :param y: true value
        :param epsilon: a is clipped to [epsilon, 1 - epsilon] to avoid division by zero
        """
        a = np.clip(a, epsilon, 1.0 - epsilon)
        return -y / a

    def get_loss(self, a: np.ndarray, y: np.ndarray) -> float:
        """
        :param a: predicted value
        :param y: true value
        :raises ValueError: if self.loss is not a known loss
        """
        if self.loss == "mse":
            return self.mse(a, y)
        elif self.loss == "cross-entropy":
            return self.cross_entropy(a, y)
        else:
            raise ValueError("Valid losses are mse, cross-entropy")

    def get_d_loss(self, a: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        :param a: predicted value
        :param y: true value
        :raises ValueError: if self.loss is not a known loss
        """
        if self.loss == "mse":
            return self.d_mse(a, y)
        # Must use the same key as get_loss. The previous "bce" branch never
        # matched a documented loss name and called a method that does not exist.
        elif self.loss == "cross-entropy":
            return self.d_cross_entropy(a, y)
        else:
            raise ValueError("Valid losses are mse, cross-entropy")

##### 1.6. Learning Rate Decay class

In [39]:
class LearningRateDecay:
    """Learning-rate schedules; each method maps (t, lr, ...) to a rate."""

    def __init__(self) -> None:
        return None

    def constant(self, t: int, lr: float) -> float:
        """
        No decay: the initial rate is returned as-is.

        :param t: iteration number (ignored)
        :param lr: learning rate initial value
        """
        return lr

    def time_decay(self, t: int, lr: float, k: float) -> float:
        """
        Rate divided by (1 + k * t).

        :param t: iteration number
        :param lr: learning rate initial value
        :param k: decay rate
        """
        shrink = 1 + (k * t)
        return lr / shrink

    def step_decay(self, t: int, lr: float, F: int, D: float) -> float:
        """
        Rate multiplied by F once for every D iterations, counted via
        floor((1 + t) / D).

        :param t: iteration number
        :param lr: learning rate initial value
        :param F: factor value controlling the decay
        :param D: "Drop every" iteration
        """
        n_drops = np.floor((1 + t) / D)
        return lr * (F**n_drops)

    def exponential_decay(self, t: int, lr: float, k: float) -> float:
        """
        Rate multiplied by exp(-k * t).

        :param t: iteration number
        :param lr: learning rate initial value
        :param k: decay rate
        """
        damping = np.exp(-k * t)
        return lr * damping

#### 2. Convolutional Neural Network

##### 2.1. Convolutional Layer

In [40]:
class Padding2D(BaseLayer):
    """Zero-pads the last two axes of a (m, Nc, Nh, Nw) array."""

    def __init__(self, p: Union[str, int, tuple[int, int]] = "valid") -> None:
        """
        :param p: padding specification
            - "valid": no padding
            - "same": padding computed from kernel size and stride
            - int: that many zeros on each of the four sides
            - tuple (ph, pw): total padding per axis, split as
              (ph // 2, (ph + 1) // 2) top/bottom and likewise left/right
        """
        super().__init__()
        self.p = p

    @staticmethod
    def _pair(value):
        """Expand a single int into (value, value); pass pairs through."""
        if isinstance(value, (int, np.integer)):
            return (value, value)
        return value

    def get_dimensions(
        self,
        inp_shape: tuple,
        kernel_size: Union[int, tuple[int, int]],
        s: Union[int, tuple[int, int]] = (1, 1),
    ) -> tuple:
        """
        :param inp_shape: input shape, (m, Nc, Nh, Nw) or (Nc, Nh, Nw)
        :param kernel_size: kernel size (Kh, Kw), or a single int for both
        :param s: stride (Sh, Sw), or a single int for both

        :return: (out_shape, (pt, pb, pl, pr)) — the padded shape and the
            number of zeros added at top, bottom, left and right
        :raises ValueError: for an unsupported padding type or input rank
        """
        if len(inp_shape) == 4:
            m, Nc, Nh, Nw = inp_shape
        elif len(inp_shape) == 3:
            Nc, Nh, Nw = inp_shape
        else:
            raise ValueError("inp_shape must have 3 or 4 dimensions")

        Kh, Kw = self._pair(kernel_size)
        Sh, Sw = self._pair(s)
        p = self.p

        if isinstance(p, (int, np.integer)):
            pt, pb = p, p
            pl, pr = p, p
        elif isinstance(p, tuple):
            ph, pw = p
            pt, pb = ph // 2, (ph + 1) // 2
            pl, pr = pw // 2, (pw + 1) // 2
        elif p == "valid":
            pt, pb = 0, 0
            pl, pr = 0, 0
        elif p == "same":
            # calculating how much padding is required in all 4 directions
            # (top, bottom, left and right)
            ph = (Sh - 1) * Nh + Kh - Sh
            pw = (Sw - 1) * Nw + Kw - Sw

            pt, pb = ph // 2, (ph + 1) // 2
            pl, pr = pw // 2, (pw + 1) // 2

        else:
            raise ValueError("Valid padding types are: valid, same or tuple")

        if len(inp_shape) == 4:
            out_shape = (m, Nc, Nh + pt + pb, Nw + pl + pr)
        else:
            out_shape = (Nc, Nh + pt + pb, Nw + pl + pr)

        return out_shape, (pt, pb, pl, pr)

    def forward(
        self,
        X: np.ndarray,
        kernel_size: Union[int, tuple[int, int]],
        s: Union[int, tuple[int, int]] = (1, 1),
    ) -> np.ndarray:
        """
        :param X: input data of shape (m, Nc, Nh, Nw)
        :param kernel_size: kernel size
        :param s: stride

        :return X_pad: padded input data
        :raises ValueError: if any padding amount is negative
        """

        self.inp_shape = X.shape
        _, _, Nh, Nw = X.shape

        self.out_shape, (self.pt, self.pb, self.pl, self.pr) = self.get_dimensions(
            self.inp_shape, kernel_size, s
        )

        if min(self.pt, self.pb, self.pl, self.pr) < 0:
            raise ValueError("Padding amounts must be non-negative")

        # One allocation instead of four concatenations. The dtype matches
        # what concatenating X with float64 zero blocks would produce.
        X_pad = np.zeros(self.out_shape, dtype=np.result_type(X.dtype, np.float64))
        X_pad[:, :, self.pt : self.pt + Nh, self.pl : self.pl + Nw] = X

        return X_pad

    def backpropagation(self, dZ: np.ndarray, lr: Optional[float] = None) -> np.ndarray:
        """
        :param dZ: Backprop Error of padded X (Xp)
        :param lr: learning rate; unused (padding has no parameters) and
            optional so that callers may pass only dZ

        :return dX: Backprop Error of X
        """
        _, _, Nh, Nw = self.inp_shape
        # crop away the positions that were filled with zeros in forward()
        dX = dZ[:, :, self.pt : self.pt + Nh, self.pl : self.pl + Nw]
        return dX

In [41]:
class Conv2D(BaseLayer):
    """2-D convolution layer for inputs shaped (m, Nc, Nh, Nw).

    Parameters are created by ``init_params``; ``backpropagation`` stores
    the gradients in ``self.dK`` / ``self.dB`` and ``update`` applies them.
    """

    def __init__(
        self,
        filters: int,
        kernel_size: Union[int, tuple[int, int]] = 3,
        s: Union[int, tuple[int, int]] = (1, 1),
        p: Union[str, tuple[int, int]] = "valid",
        act: str = "linear",
        is_bias: bool = True,
        weight_init: str = "random",
        kernel_regularizer: Optional[tuple[str, float]] = None,
        seed: int = 69,
        inp_shape: Optional[tuple[int, int, int, int]] = None,
    ) -> None:
        """
        :param filters: number of filters in the convolutional layer
        :param kernel_size: size of the kernel, int or tuple of 2 integers (default: 3)
        :param s: stride, int or (Sh, Sw) (default: (1,1))
        :param p: padding type, valid or same or tuple[int,int] (default: valid)
        :param act: activation function, valid activations are linear, reLU, sigmoid, tanh, softmax (default: linear)
        :param is_bias: bool, whether the bias is updated during training
        :param weight_init: weight initialization type, valid initializations are zeros, ones, random, random_uniform (default: random)
        :param kernel_regularizer: kernel regularizer, e.g. ('L2', 0.01) or ('L1', 2); None means ('L2', 0)
        :param seed: seed to generate random values
        :param inp_shape: input shape (m, Nc, Nh, Nw)
        :raises ValueError: if kernel_size or s is neither an int nor a 2-tuple
        """
        super().__init__()
        self.padding = Padding2D(p=p)
        self.F = filters
        self.inp_shape_x = inp_shape

        self.kernel_size = self._as_pair(kernel_size, "kernel_size")
        self.Kh, self.Kw = self.kernel_size

        self.s = self._as_pair(s, "s")
        self.Sh, self.Sw = self.s

        self.act = Activation(act=act)
        self.is_bias = is_bias
        self.weight_init = weight_init

        if kernel_regularizer is None:
            self.kernel_regularizer = ("L2", 0)
        else:
            self.kernel_regularizer = kernel_regularizer

        self.seed = seed

    @staticmethod
    def _as_pair(value, name: str) -> tuple:
        """Normalise an int or 2-tuple to a 2-tuple, rejecting anything else."""
        if isinstance(value, int):
            return (value, value)
        if isinstance(value, tuple) and len(value) == 2:
            return value
        raise ValueError(f"{name} must be an int or a tuple of 2 ints, got {value!r}")

    def get_dimensions(self, inp_shape: tuple[int, int, int, int]) -> None:
        """
        Record the padded input dimensions and the resulting output shape.

        :param inp_shape: input shape (m, Nc, Nh, Nw) or (Nc, Nh, Nw)
        :raises ValueError: if inp_shape has neither 3 nor 4 entries
        """
        if len(inp_shape) not in (3, 4):
            raise ValueError("inp_shape must have 3 or 4 dimensions")

        self.inp_shape_x = inp_shape

        # self.inp_shape is the shape AFTER padding
        self.inp_shape, _ = self.padding.get_dimensions(
            self.inp_shape_x, self.kernel_size, self.s
        )

        if len(inp_shape) == 3:
            self.Nc, self.Nh, self.Nw = self.inp_shape
        else:
            self.m, self.Nc, self.Nh, self.Nw = self.inp_shape

        # Output dimensions
        self.Oh = ((self.Nh - self.Kh) // self.Sh) + 1
        self.Ow = ((self.Nw - self.Kw) // self.Sw) + 1

        if len(inp_shape) == 3:
            self.out_shape = (self.F, self.Oh, self.Ow)
        else:
            self.out_shape = (self.m, self.F, self.Oh, self.Ow)

    def init_params(self, inp_shape: tuple[int, int, int, int], opt_type: str) -> None:
        """
        Create the kernel, the bias and the optimizer state.

        :param inp_shape: input shape (m, Nc, Nh, Nw)
        :param opt_type: optimizer type (GD, SGD, RMSProp, Adam)
        """
        self.get_dimensions(inp_shape)
        shape_b = (self.F, self.Oh, self.Ow)
        shape_k = (self.F, self.Nc, self.Kh, self.Kw)

        initializer = WeightInitializer(
            shape=shape_k, init=self.weight_init, seed=self.seed
        )
        self.K = initializer.get_initializer()
        self.b = np.zeros(shape=shape_b)

        self.optimizer = Optimizer(op_type=opt_type, shape_W=shape_k, shape_b=shape_b)

    def dilate2D(self, X: np.ndarray, Dr: tuple[int, int] = (1, 1)) -> np.ndarray:
        """
        Insert (Dh - 1) zero rows and (Dw - 1) zero columns between
        neighbouring elements of the last two axes.

        :param X: input data of shape (m, C, H, W)
        :param Dr: dilation rate (Dh, Dw)
        """

        dh, dw = Dr
        _, _, H, W = X.shape
        Xd = np.insert(arr=X, obj=np.repeat(np.arange(1, W), dw - 1), values=0, axis=-1)
        Xd = np.insert(
            arr=Xd, obj=np.repeat(np.arange(1, H), dh - 1), values=0, axis=-2
        )
        return Xd

    def prepare_subMatrix(
        self, X: np.ndarray, Kh: int, Kw: int, s: tuple[int, int]
    ) -> np.ndarray:
        """
        Build a strided view holding every (Kh, Kw) window of X.

        :param X: input data of shape (m, Nc, Nh, Nw)
        :param Kh: kernel height
        :param Kw: kernel width
        :param s: stride (Sh, Sw)
        :return: view of shape (m, Nc, Oh, Ow, Kh, Kw)
        """
        # The strides below are derived from the shape alone, which is only
        # valid for a C-contiguous buffer.
        X = np.ascontiguousarray(X)
        m, Nc, Nh, Nw = X.shape
        sh, sw = s

        Oh = (Nh - Kh) // sh + 1
        Ow = (Nw - Kw) // sw + 1

        strides = (Nc * Nh * Nw, Nw * Nh, Nw * sh, sw, Nw, 1)
        strides = tuple(i * X.itemsize for i in strides)

        subM = np.lib.stride_tricks.as_strided(
            X, shape=(m, Nc, Oh, Ow, Kh, Kw), strides=strides
        )

        return subM

    def convolve(
        self,
        X: np.ndarray,
        K: np.ndarray,
        s: tuple[int, int] = (1, 1),
        mode: str = "front",
    ) -> np.ndarray:
        """
        :param X: input data
        :param K: kernel
        :param s: stride (Sh, Sw)
        :param mode: one of
            - "front": forward pass, X (m, c, ...) with K (f, c, kh, kw) -> (m, f, i, j)
            - "back": input gradient, X (m, f, ...) with K (f, d, kh, kw) -> (m, d, i, j)
            - "param": kernel gradient, X (m, c, ...) with K (m, f, kh, kw) -> (f, c, i, j)
        :raises ValueError: for any other mode
        """

        _, _, Kh, Kw = K.shape
        subM = self.prepare_subMatrix(X, Kh, Kw, s)

        if mode == "front":
            return np.einsum("fckl,mcijkl->mfij", K, subM)
        elif mode == "back":
            # The channel axis of X here is the filter axis of K, so both
            # must share the index f. Using two different letters would sum
            # them independently and mix gradients across filters.
            return np.einsum("fdkl,mfijkl->mdij", K, subM)
        elif mode == "param":
            return np.einsum("mfkl,mcijkl->fcij", K, subM)
        raise ValueError("Valid convolve modes are front, back, param")

    def dZ_D_dX(self, dZ_D: np.ndarray, Nh: int, Nw: int) -> np.ndarray:
        """
        Gradient with respect to the (unpadded) layer input.

        :param dZ_D: dilated dZ
        :param Nh: padded input height
        :param Nw: padded input width
        """
        _, _, Hd, Wd = dZ_D.shape

        # pad so that a stride-1 convolution with the kernel yields (Nh, Nw)
        ph = Nh - Hd + self.Kh - 1
        pw = Nw - Wd + self.Kw - 1

        padding_back = Padding2D(p=(ph, pw))

        dZ_Dp = padding_back.forward(dZ_D, self.kernel_size, self.s)

        # Rotate K by 180 degrees
        K_rotated = self.K[:, :, ::-1, ::-1]

        # convolve dZ_Dp with K_rotated
        dXp = self.convolve(dZ_Dp, K_rotated, mode="back")

        # Padding2D.backpropagation takes (dZ, lr); lr is not used there
        dX = self.padding.backpropagation(dXp, None)

        return dX

    def forward(self, X: np.ndarray) -> np.ndarray:
        """
        :param X: input data of shape (m, Nc, Nh, Nw)
        :return: activation of (padded X convolved with K) + b
        """
        self.X = X

        Xp = self.padding.forward(X, self.kernel_size, self.s)

        # convolve Xp with K
        Z = self.convolve(Xp, self.K, self.s) + self.b

        a = self.act.forward(Z)

        return a

    def backpropagation(self, dA: np.ndarray, lr: float) -> np.ndarray:
        """
        Compute and store self.dK / self.dB; parameters are not modified here.

        :param dA: gradient of loss with respect to output
        :param lr: learning rate
        :return: gradient of loss with respect to the layer input
        """

        Xp = self.padding.forward(self.X, self.kernel_size, self.s)

        _, _, Nh, Nw = Xp.shape

        # Activation.backpropagation takes (dZ, lr)
        dZ = self.act.backpropagation(dA, lr)

        # Dilate dZ (dZ-> dZ_D)
        dZ_D = self.dilate2D(dZ, Dr=self.s)

        dX = self.dZ_D_dX(dZ_D, Nh, Nw)

        # Gradient dK
        _, _, Hd, Wd = dZ_D.shape

        ph = self.Nh - Hd - self.Kh + 1
        pw = self.Nw - Wd - self.Kw + 1

        padding_back = Padding2D(p=(ph, pw))

        dZ_Dp = padding_back.forward(dZ_D, self.kernel_size, self.s)

        self.dK = self.convolve(Xp, dZ_Dp, mode="param")

        # Gradient db
        self.dB: np.ndarray = np.sum(dZ, axis=0)

        return dX

    def update(self, lr: float, m: int, k: int) -> None:
        """
        Apply one optimizer step to the kernel (and the bias if enabled).

        :param lr: learning rate
        :param m: batch size (number of samples in the batch)
        :param k: iteration number
        :raises ValueError: if the regularizer type is neither L1 nor L2
        """
        dK, dB = self.optimizer.get_optimizer(self.dK, self.dB, k)

        reg_type = self.kernel_regularizer[0].lower()
        reg_strength = self.kernel_regularizer[1]

        # Build a new array instead of "+=": for GD the optimizer returns
        # self.dK itself, which must not be modified in place.
        if reg_type == "l2":
            dK = dK + reg_strength * self.K
        elif reg_type == "l1":
            dK = dK + reg_strength * np.sign(self.K)
        else:
            raise ValueError("Valid kernel regularizers are L1, L2")

        # Use the optimizer's output; stepping with self.dK / self.dB would
        # silently ignore SGD, RMSProp and Adam.
        self.K -= dK * (lr / m)

        if self.is_bias:
            self.b -= dB * (lr / m)

##### 2.2. Pooling Layer

In [42]:
class Pooling2D(BaseLayer):
    """Max / mean pooling layer for inputs shaped (m, Nc, Nh, Nw)."""

    def __init__(
        self,
        pool_size: Union[int, tuple[int, int]] = (2, 2),
        s: Union[int, tuple[int, int]] = (2, 2),
        p: Union[str, tuple[int, int]] = "valid",
        pool_type: str = "max",
    ) -> None:
        """
        :param pool_size: size of the pooling window, int or tuple of 2 integers (default: (2,2))
        :param s: stride, int or tuple of 2 integers (default: (2,2))
        :param p: padding type, valid or same or tuple[int,int] (default: valid)
        :param pool_type: type of pooling, max or mean (default: max)
        :raises ValueError: if pool_size or s is neither an int nor a 2-tuple
        """
        super().__init__()
        self.padding = Padding2D(p=p)

        self.pool_size = self._as_pair(pool_size, "pool_size")
        self.Kh, self.Kw = self.pool_size

        self.s = self._as_pair(s, "s")
        self.sh, self.sw = self.s

        self.pool_type = pool_type

    @staticmethod
    def _as_pair(value, name: str) -> tuple:
        """Normalise an int or 2-tuple to a 2-tuple, rejecting anything else."""
        if isinstance(value, int):
            return (value, value)
        if isinstance(value, tuple) and len(value) == 2:
            return value
        raise ValueError(f"{name} must be an int or a tuple of 2 ints, got {value!r}")

    def get_dimensions(self, inp_shape: tuple[int, int, int, int]) -> None:
        """
        Store the output shape in self.out_shape.

        :param inp_shape: input shape (m, Nc, Nh, Nw) or (Nc, Nh, Nw)
        :raises ValueError: if inp_shape has neither 3 nor 4 entries
        """
        if len(inp_shape) not in (3, 4):
            raise ValueError("inp_shape must have 3 or 4 dimensions")

        # forward() pools the PADDED input, so the output size has to be
        # derived from the padded height and width.
        padded_shape, _ = self.padding.get_dimensions(
            inp_shape, self.pool_size, self.s
        )

        if len(inp_shape) == 4:
            m, Nc, Nh, Nw = padded_shape
        else:
            Nc, Nh, Nw = padded_shape

        Oh = (Nh - self.Kh) // self.sh + 1
        Ow = (Nw - self.Kw) // self.sw + 1

        if len(inp_shape) == 4:
            self.out_shape = (m, Nc, Oh, Ow)
        else:
            self.out_shape = (Nc, Oh, Ow)

    def prepare_subMatrix(
        self, X: np.ndarray, pool_size: tuple[int, int], s: tuple[int, int]
    ) -> np.ndarray:
        """
        Build a strided view holding every pooling window of X.

        :param X: input data of shape (m, Nc, Nh, Nw)
        :param pool_size: size of the pooling window
        :param s: stride (Sh, Sw)
        :return: view of shape (m, Nc, Oh, Ow, Kh, Kw)
        """
        # The strides below are derived from the shape alone, which is only
        # valid for a C-contiguous buffer.
        X = np.ascontiguousarray(X)
        m, Nc, Nh, Nw = X.shape
        sh, sw = s
        Kh, Kw = pool_size

        Oh = (Nh - Kh) // sh + 1
        Ow = (Nw - Kw) // sw + 1

        strides = (Nc * Nh * Nw, Nh * Nw, Nw * sh, sw, Nw, 1)
        strides = tuple(i * X.itemsize for i in strides)

        subM = np.lib.stride_tricks.as_strided(
            X, shape=(m, Nc, Oh, Ow, Kh, Kw), strides=strides
        )
        return subM

    def pooling(
        self,
        X: np.ndarray,
        pool_size: tuple[int, int] = (2, 2),
        s: tuple[int, int] = (2, 2),
    ) -> np.ndarray:
        """
        :param X: input data
        :param pool_size: size of the pooling window
        :param s: stride (Sh, Sw)
        :raises ValueError: if self.pool_type is neither 'max' nor 'mean'
        """

        subM = self.prepare_subMatrix(X, pool_size, s)

        if self.pool_type == "max":
            return np.max(subM, axis=(-2, -1))
        elif self.pool_type == "mean":
            return np.mean(subM, axis=(-2, -1))
        else:
            raise ValueError("Allowed pool types are only 'max' or 'mean'.")

    def prepare_mask(self, subM: np.ndarray, Kh: int, Kw: int) -> np.ndarray:
        """
        One-hot mask marking the maximum of every window (the first one
        when a window contains ties).

        :param subM: submatrix of shape (m, Nc, Oh, Ow, Kh, Kw)
        :param Kh: kernel height (overridden by subM.shape)
        :param Kw: kernel width (overridden by subM.shape)
        """

        m, Nc, Oh, Ow, Kh, Kw = subM.shape

        a = subM.reshape(-1, Kh * Kw)
        idx = np.argmax(a, axis=1)
        b = np.zeros(a.shape)
        b[np.arange(b.shape[0]), idx] = 1
        mask = b.reshape((m, Nc, Oh, Ow, Kh, Kw))

        return mask

    def _scatter_windows(self, dA: np.ndarray, padded_shape: tuple) -> np.ndarray:
        """
        Add per-window gradients back onto the padded input grid.

        :param dA: gradients of shape (m, Nc, Oh, Ow, Kh, Kw); entry
            [.., oh, ow, kh, kw] belongs to input position
            (oh * sh + kh, ow * sw + kw)
        :param padded_shape: shape (m, Nc, Nh, Nw) of the padded input
        :return: array of padded_shape; overlapping windows accumulate
        """
        _, _, Oh, Ow, Kh, Kw = dA.shape
        sh, sw = self.s

        dXp = np.zeros(padded_shape, dtype=dA.dtype)
        # One strided slice per in-window offset; within a single slice the
        # target positions are distinct, so "+=" accumulates correctly.
        for kh in range(Kh):
            rows = slice(kh, kh + sh * Oh, sh)
            for kw in range(Kw):
                cols = slice(kw, kw + sw * Ow, sw)
                dXp[:, :, rows, cols] += dA[:, :, :, :, kh, kw]
        return dXp

    def mask_dXp(
        self, mask: np.ndarray, Xp: np.ndarray, dZ: np.ndarray, Kh: int, Kw: int
    ) -> np.ndarray:
        """
        Route every output gradient to the input position selected by mask.

        :param mask: one-hot mask of shape (m, Nc, Oh, Ow, Kh, Kw)
        :param Xp: padded input data
        :param dZ: Output Error
        :param Kh: kernel height
        :param Kw: kernel width
        """
        dA = dZ.reshape(mask.shape[:4] + (1, 1)) * mask
        # dA is laid out window-by-window, so it cannot simply be
        # reinterpreted as an array of Xp.shape; each value has to be placed
        # at its own (row, column) position.
        return self._scatter_windows(dA, Xp.shape)

    def maxpool_backprop(self, dZ: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        :param dZ: Output Error
        :param X: input data
        :return: gradient with respect to the padded input
        """

        Xp = self.padding.forward(X, self.pool_size, self.s)

        subM = self.prepare_subMatrix(Xp, self.pool_size, self.s)

        _, _, _, _, Kh, Kw = subM.shape

        mask = self.prepare_mask(subM, Kh, Kw)

        dXp = self.mask_dXp(mask, Xp, dZ, Kh, Kw)

        return dXp

    def dZ_dZp(self, dZ: np.ndarray) -> np.ndarray:
        """
        Expand dZ by repeating every element over a (Kh, Kw) block and then
        merging the columns/rows where neighbouring windows overlap.

        Kept for backward compatibility; averagepool_backprop no longer
        calls it.
        NOTE(review): the merge step looks written for stride < pool size —
        behaviour for stride > pool size has not been verified.

        :param dZ: Output Error
        """
        sh, sw = self.s
        Kh, Kw = self.pool_size

        dZp = np.kron(dZ, np.ones((Kh, Kw), dtype=dZ.dtype))

        jh, jw = Kh - sh, Kw - sw  # jump along height and width

        if jw != 0:
            L = dZp.shape[-1] - 1

            l1 = np.arange(sw, L)
            l2 = np.arange(sw + jw, L + jw)

            mask = np.tile([True] * jw + [False] * jw, len(l1) // jw).astype(bool)

            r1 = l1[mask[: len(l1)]]
            r2 = l2[mask[: len(l2)]]

            dZp[:, :, :, r1] += dZp[:, :, :, r2]
            dZp = np.delete(dZp, r2, axis=-1)

        if jh != 0:
            L = dZp.shape[-2] - 1

            l1 = np.arange(sh, L)
            l2 = np.arange(sh + jh, L + jh)

            mask = np.tile([True] * jh + [False] * jh, len(l1) // jh).astype(bool)

            r1 = l1[mask[: len(l1)]]
            r2 = l2[mask[: len(l2)]]

            dZp[:, :, r1, :] += dZp[:, :, r2, :]
            dZp = np.delete(dZp, r2, axis=-2)

        return dZp

    def averagepool_backprop(self, dZ: np.ndarray, X: np.ndarray) -> np.ndarray:
        """
        :param dZ: Output Error of shape (m, Nc, Oh, Ow)
        :param X: input data
        :return: gradient with respect to the padded input
        """

        Xp = self.padding.forward(X, self.pool_size, self.s)

        Kh, Kw = self.pool_size

        # Each output is the mean of Kh * Kw inputs, so every input in the
        # window receives dZ / (Kh * Kw) — not dZ / (Nh * Nw).
        share = dZ / (Kh * Kw)
        dA = np.broadcast_to(share[..., None, None], share.shape + (Kh, Kw))

        return self._scatter_windows(dA, Xp.shape)

    def forward(self, X):
        """
        :param X: input data of shape (m, Nc, Nh, Nw)
        """
        self.X = X

        # padding
        Xp = self.padding.forward(X, self.pool_size, self.s)

        Z = self.pooling(Xp, self.pool_size, self.s)

        return Z

    def backpropagation(self, dZ, lr):
        """
        :param dZ: Output Error
        :param lr: learning rate (unused: pooling has no parameters)
        :raises ValueError: if self.pool_type is neither 'max' nor 'mean'
        """
        if self.pool_type == "max":
            dXp = self.maxpool_backprop(dZ, self.X)
        elif self.pool_type == "mean":
            dXp = self.averagepool_backprop(dZ, self.X)
        else:
            raise ValueError("Allowed pool types are only 'max' or 'mean'.")
        # Padding2D.backpropagation takes (dZ, lr)
        dX = self.padding.backpropagation(dXp, lr)
        return dX

##### 2.3. Flatten Layer

In [43]:
class Flatten(BaseLayer):
    """Layer that reshapes (m, Nc, Nh, Nw) input into (m, Nc * Nh * Nw) rows."""

    def __init__(self) -> None:
        pass

    def forward(self, X):
        """
        :param X: input data with four axes, unpacked as (m, Nc, Nh, Nw)
        :return: X reshaped to (m, Nc * Nh * Nw)
        """
        # Remember the incoming shape so backpropagation can restore it.
        self.m, self.Nc, self.Nh, self.Nw = X.shape
        return X.reshape((self.m, self.Nc * self.Nh * self.Nw))

    def backpropagation(self, dZ, lr):
        """
        :param dZ: Output Error
        :param lr: learning rate (not used by this method)
        :return: dZ reshaped to the recorded (m, Nc, Nh, Nw)
        """
        return dZ.reshape((self.m, self.Nc, self.Nh, self.Nw))

    def get_dimensions(self, input_shape: tuple[int, ...]) -> None:
        """
        Record the input dimensions and derive the flattened output size.

        :param input_shape: (Nc, Nh, Nw) or (m, Nc, Nh, Nw)
        :raises ValueError: if input_shape does not have 3 or 4 entries
        """
        if len(input_shape) == 4:
            self.m, self.Nc, self.Nh, self.Nw = input_shape
        elif len(input_shape) == 3:
            self.Nc, self.Nh, self.Nw = input_shape
        else:
            # Falling through would reuse dimensions left over from an earlier
            # call (or fail on a missing attribute) instead of reporting the
            # unsupported shape.
            raise ValueError(
                f"Flatten expects an input shape of length 3 or 4, got {input_shape!r}"
            )

        self.output_shape = self.Nc * self.Nh * self.Nw

##### 2.4. Dense Layer

In [46]:
class Dense(BaseLayer):
    """Fully connected layer: activation applied to X @ W + b."""

    def __init__(
        self,
        neurons: int,
        act: str = "linear",
        is_bias: bool = True,
        weight_init: str = "random",
        weight_regularizer: tuple[str, float] = ("L2", 0),
        seed: int = 69,
        inp_dim: Optional[int] = None,
    ) -> None:
        """
        :param neurons: number of neurons in the dense layer
        :param act: activation function, valid activations are linear, reLU, sigmoid, tanh, softmax (default: linear)
        :param is_bias: bool, whether to use bias in the dense layer
        :param weight_init: weight initialization type, valid initializations are zeros, ones, random, random_uniform (default: random)
        :param weight_regularizer: weight regularizer, valid regularizers are ('L2', 0.01) or ('L1', 2)
        :param seed: seed to generate random values
        :param inp_dim: input dimension
        """
        self.neurons = neurons
        self.act = Activation(act=act)
        self.is_bias = is_bias
        self.weight_init = weight_init
        self.weight_regularizer = weight_regularizer
        self.seed = seed
        self.inp_dim = inp_dim

    def init_params(self, hl: int, op_type: str) -> None:
        """
        Create the weights, the bias and the optimizer of this layer.

        :param hl: number of neurons in the previous layer
        :param op_type: optimizer type handed to Optimizer
        """
        shape_W = (hl, self.neurons)
        shape_b = (self.neurons, 1)
        initializer = WeightInitializer(
            shape=shape_W, init=self.weight_init, seed=self.seed
        )

        self.W = initializer.get_initializer()
        # The bias always starts at zero; update() only changes it when is_bias is set.
        self.b = np.zeros(shape=shape_b)

        self.optimizer = Optimizer(op_type=op_type, shape_W=shape_W, shape_b=shape_b)

    def forward(self, X):
        """
        :param X: input data, multiplied on the right by W
        :return: activation applied to X @ W + b
        """
        self.X = X
        # b is stored as a (neurons, 1) column; transpose it so it broadcasts over rows.
        self.net = X @ self.W + self.b.T
        return self.act.forward(self.net)

    def backpropagation(self, dZ, lr):
        """
        Compute and store the gradients dW and dB; return the error for the previous layer.

        :param dZ: Output Error
        :param lr: learning rate (not used here; the step is applied in update)
        :return: error with respect to the layer input
        """
        dA: np.ndarray = self.act.backpropagation(dZ)
        # dA is only read below, so no defensive copy is needed.
        self.dB = np.sum(dA, axis=0).reshape(-1, 1)
        self.dW = self.X.T @ dA
        return dA @ self.W.T

    def update(self, lr, m, k):
        """
        Apply one optimizer step to W (and to b when is_bias is set).

        :param lr: learning rate
        :param m: divisor of the step size (the step is scaled by lr / m)
        :param k: value passed through to the optimizer
        """
        dW, dB = self.optimizer.get_optimizer(self.dW, self.dB, k)

        reg_type = self.weight_regularizer[0].lower()
        reg_strength = self.weight_regularizer[1]

        # Build a new array rather than using `dW += ...`: the optimizer may hand
        # back self.dW itself or a buffer it keeps between steps (not visible
        # here), and an in-place add would write the penalty into that array.
        if reg_type == "l2":
            dW = dW + reg_strength * self.W
        elif reg_type == "l1":
            dW = dW + reg_strength * np.sign(self.W)

        self.W -= dW * (lr / m)

        if self.is_bias:
            self.b -= dB * (lr / m)

##### 2.5. Dropout Layer

In [47]:
class Dropout(BaseLayer):
    """Multiplies the input by a random 0-or-1/p mask and reuses that mask on the way back."""

    def __init__(self, p: float) -> None:
        """
        :param p: threshold of the random mask.
            NOTE(review): forward keeps an entry when its random draw is below p,
            so p acts as the keep probability although it was originally
            documented as the dropout probability -- confirm the intent.
        """
        eps = 1e-6
        # Exact 0 and 1 are nudged inward; p == 0 would otherwise divide by zero in forward.
        if p == 0:
            p = p + eps
        if p == 1:
            p = p - eps
        self.p = p

    def forward(self, X):
        """
        :param X: input data
        :return: X with the freshly drawn mask applied
        """
        kept = np.random.rand(*X.shape) < self.p
        # Surviving entries are scaled by 1 / p, dropped ones become 0.
        self.mask = kept / self.p
        return X * self.mask

    def backpropagation(self, dZ, lr):
        """
        :param dZ: Output Error
        :param lr: learning rate (not used by this method)
        :return: dZ multiplied by the mask drawn in the last forward call
        """
        return dZ * self.mask

##### 2.6. CNN class

In [45]:
class CNN:
    def __init__(self, layers: list[BaseLayer] = None) -> None:
        if layers is None:
            self.layers = []
        else:
            self.layers = layers

    def add(self, layer: BaseLayer) -> None:
        self.layers.append(layer)

    def Input(self, inp_shape: tuple[int, int, int, int]) -> None:
        self.d = inp_shape
        self.architecture = [self.d]
        self.layer_name = ["Input"]

    def create_architecture(self):
        for layer in self.layers:
            if layer.__class__.__name__ == "Conv2D":
                if layer.inp_shape_x is not None:
                    self.Input(layer.inp_shape_x)
                layer.get_dimensions(self.architecture[-1])
                self.architecture.append(layer.out_shape)