# hand-sign recognizer
Here we build a convolutional neural-network for recognizing hand-signs.

> Inspired by: [Convolutional Neural Networks](https://www.coursera.org/learn/convolutional-neural-networks)

In [1]:
import torch
import h5py
import numpy as np
from numpy.random import default_rng
from tqdm import tqdm
from matplotlib import pyplot as plt
import math
import util

import importlib
importlib.reload(util)

# Work in double precision everywhere; place tensors on the GPU when one is present.
if not torch.cuda.is_available():
    print("Cuda not found.")
    tensor_type = 'torch.DoubleTensor'
else:
    print("Cuda available.")
    tensor_type = 'torch.cuda.DoubleTensor'
    torch.backends.cuda.matmul.allow_tf32 = True

# Every tensor created without an explicit dtype/device from here on uses this type.
print(f"Setting {tensor_type} as default dtype...")
torch.set_default_tensor_type(tensor_type)

Cuda available.
Setting torch.cuda.DoubleTensor as default dtype...


In [2]:
# Read the hand-sign images and labels from the two HDF5 archives into NumPy arrays.
train_x, train_y, test_x, test_y, class_labels = (None,) * 5

with h5py.File("../data/test-hand-signs.h5", "r") as f:
    test_x, test_y = np.array(f["test_set_x"]), np.array(f["test_set_y"])

with h5py.File("../data/train-hand-signs.h5", "r") as f:
    train_x, train_y = np.array(f["train_set_x"]), np.array(f["train_set_y"])
    class_labels = np.array(f["list_classes"])

# One-line-per-fact summary of what was loaded.
print(
    f"# of training-examples: {train_x.shape[0]}",
    f"# of test-examples: {test_x.shape[0]}",
    f"image-dimensions: {test_x.shape[1:]}",
    f"class-labels: {class_labels}",
    sep="\n",
)

# of training-examples: 1080
# of test-examples: 120
image-dimensions: (64, 64, 3)
class-labels: [0 1 2 3 4 5]


## # pre-processing
We will perform mean and variance normalization of the input.

In [3]:
# Per-pixel/per-channel statistics are estimated on the training set only and
# re-used for the test set, so both sets go through the same transformation.
train_mean = np.mean(a=train_x, axis=0)
train_std = np.std(a=train_x, axis=0)
# A pixel that is constant across the whole training set has zero std; dividing
# by it would inject NaN/inf into the inputs. Such pixels carry no information,
# so divide them by 1 instead (all other entries are left untouched).
train_std = np.where(train_std > 0, train_std, 1.0)

# train-set normalization
train_X = (train_x - train_mean) / train_std
# one-hot encode the integer labels: row i of the identity matrix encodes class i
train_Y = np.eye(len(class_labels))[train_y, :].copy()

# test-set normalization
test_X = (test_x - train_mean) / train_std
test_Y = np.eye(len(class_labels))[test_y, :].copy()

print(f"Train-set X: {train_X.shape}, Y: {train_Y.shape}")
print(f"Test-set X: {test_X.shape}, Y: {test_Y.shape}")

# torch copies of the datasets (created with the default tensor type set above)
cu_train_X = torch.tensor(train_X)
cu_train_Y = torch.tensor(train_Y)
cu_test_X = torch.tensor(test_X)
cu_test_Y = torch.tensor(test_Y)

Train-set X: (1080, 64, 64, 3), Y: (1080, 6)
Test-set X: (120, 64, 64, 3), Y: (120, 6)


## # architecture
We will use a 3-layer CNN, as defined below:

<center>

| #-layer | layer-type      | component        | properties                           |         |
|---------|-----------------|------------------|--------------------------------------|---------|
| 1       | 2d-convolution  | kernel           | $(h^{[1]}_k, w^{[1]}_k)$             | $(4,4)$ |
|         |                 | #-kernels        | $c^{[1]}$                            | 8       |
|         |                 | convolve-padding | $(h^{[1]}_p, w^{[1]}_p)$             |`<same>` |
|         |                 | convolve-stride  | $(h^{[1]}_s, w^{[1]}_s)$             | $(1,1)$ |
|         |                 | pooling          | max-pooling                          |         |
|         |                 | pooling-filter   | $(h^{[1]}_l, w^{[1]}_l)$             | $(8,8)$ |
|         |                 | pooling-padding  | $({}^{l}h^{[1]}_p, {}^{l}w^{[1]}_p)$ |`<same>` |
|         |                 | pooling-stride   | $({}^{l}h^{[1]}_s, {}^{l}w^{[1]}_s)$ | $(8,8)$ |
|         |                 | activation       | ReLU                                 |         |
| 2       | 2d-convolution  | kernel           | $(h^{[2]}_k, w^{[2]}_k)$             | $(2,2)$ |
|         |                 | #-kernels        | $c^{[2]}$                            | 16      |
|         |                 | convolve-padding | $(h^{[2]}_p, w^{[2]}_p)$             |`<same>` |
|         |                 | convolve-stride  | $(h^{[2]}_s, w^{[2]}_s)$             | $(1,1)$ |
|         |                 | pooling          | max-pooling                          |         |
|         |                 | pooling-filter   | $(h^{[2]}_l, w^{[2]}_l)$             | $(4,4)$ |
|         |                 | pooling-padding  | $({}^{l}h^{[2]}_p, {}^{l}w^{[2]}_p)$ |`<same>` |
|         |                 | pooling-stride   | $({}^{l}h^{[2]}_s, {}^{l}w^{[2]}_s)$ | $(4,4)$ |
|         |                 | activation       | ReLU                                 |         |
| 3       | fully-connected | #-neurons        | $n^{[3]}$                            | $6$     |
|         |                 | activation       | softmax                              |         |

</center>

Also, the number of channels in the input, i.e. $c^{[0]} = 3$.

* Given the stride and kernel, `<same>` padding refers to the padding regime wherein the output immediately after the convolution is of the same size as that of the input, i.e.
$$
h^{[l]}_z = \left\lfloor\frac{h^{[l]}_a + 2h^{[l]}_p - h^{[l]}_k}{h^{[l]}_s} + 1\right\rfloor = h^{[l]}_a;\qquad w^{[l]}_z = \left\lfloor\frac{w^{[l]}_a + 2w^{[l]}_p - w^{[l]}_k}{w^{[l]}_s} + 1\right\rfloor = w^{[l]}_a
$$

## # forward-propagation
> **Note**: for the forward-propagation equations, read [Section-2, Back-propagation: Conv2D](./back-propagation_Conv2D.pdf).

In [25]:
def forward_propogate(X: torch.Tensor, model_arch: dict, model: dict, **kwargs) -> tuple:
    '''
    Runs the forward pass through every layer described by model_arch.

    X: shape -> (m, h_a^{0}, w_a^{0}, c^{0})
    model_arch: dictionary of attributes that define the model architecture
    model: dictionary containing the model parameters (i.e. kernels, weights, and biases)

    Returns (Al, cache):
        Al: the activation of the last layer.
        cache: dictionary with
            'm': the batch size,
            'c-l0': a 7-tuple whose last element is a copy of X,
            'c-l<l>' for a "conv" layer l:
                (layer-type, Kl, bl, Zl, Il, max_pool_switches, Al)
                where Il is stored after the pooling-padding has been applied,
            'c-l<l>' for a "flat" layer l:
                (layer-type, Al, Wl, bl, Zl)

    Raises ValueError for a layer-type other than "conv" or "flat".
    '''

    L: int = model_arch["L"]

    # the input is cached as "layer 0"; its last element plays the role of A^{0}
    cache = {'m': X.shape[0], 'c-l0': (None, None, None, None, None, None, torch.clone(X).detach())}

    m = X.shape[0]
    Al_1 = X
    Al = None
    for l in np.arange(start=1, stop=L+1):
        if model_arch["t-l" + str(l)] == "conv":
            cl = model_arch["c-l" + str(l)]

            Kl = torch.clone(model["K-l" + str(l)]).detach() # shape -> (c^{l}, h_k^{l}, w_k^{l}, c^{l-1})
            bl = torch.clone(model['b-l' + str(l)]).detach() # shape -> (c^{l})

            # shape -> (h_k^{l} * w_k^{l} * c^{l-1}, c^{l})
            Kl_v = torch.transpose(torch.reshape(torch.permute(
                Kl, dims=(0,3,2,1)), shape=(cl,-1)), dim0=1, dim1=0)

            p = model_arch["p-l" + str(l)]
            # shape -> (m, h_a^{l-1} + 2 * h_p^{l}, w_a^{l-1} + 2 * w_p^{l}, c^{l-1} )
            Al_1 = torch.nn.functional.pad(torch.clone(Al_1).detach(), 
                pad=(p[-1], p[-1], p[-2], p[-2], p[-3], p[-3], 0, 0), # 2d-Conv
                mode="constant", value=0.0)
            # shape -> (m, h_i^{l}, w_i^{l}, h_k^{l}, w_k^{l}, c^{l-1})
            win_Al_1 = torch.squeeze(util.view_as_window(
                arr_in=Al_1, window_shape=(1, *model_arch["k-l" + str(l)]),
                step=(1, *model_arch["s-l" + str(l)])))
            i = model_arch["i-l" + str(l)]
            # shape -> (m, h_i^{l} * w_i^{l}, h_k^{l} * w_k^{l} * c^{l-1})
            win_Al_1_v = torch.reshape(
                torch.permute(win_Al_1, dims=(0,2,1,5,4,3)), 
                shape=(m, i[0] * i[1], Kl_v.shape[0]))
            
            # the convolution is evaluated as one batched matrix product
            # shape -> (m, h_i^{l} * w_i^{l}, c^{l})
            Zl_v = torch.matmul(win_Al_1_v, Kl_v) \
                + torch.reshape(bl, shape=(1,1,-1)) # shape -> (1,1,c^{l})
            # the following step is only suited for element-wise activation, 
            # since we are not specifying any axis along which to normalize
            Il_v = util.activation(Zl_v, model_arch["g-l" + str(l)])

            # shape -> (m, h_i^{l}, w_i^{l}, c^{l})
            Il = torch.permute(torch.reshape(torch.transpose(
                Il_v, dim0=1,dim1=2), shape=(m, cl, i[1], i[0])), dims=(0,3,2,1))

            p = model_arch["pl-p-l" + str(l)]
            # shape -> (m, h_i^{l} + 2 * l_h_p^{l}, w_i^{l} + 2 * l_w_p^{l}, c^{l})
            Il = torch.nn.functional.pad(torch.clone(Il).detach(), 
                pad=(p[-1], p[-1], p[-2], p[-2], p[-3], p[-3], 0, 0),
                mode="constant", value=0.0)
            # shape -> (m, h_a^{l}, w_a^{l}, c^{l}, l_h_k^{l}, l_w_k^{l})
            win_Il = torch.squeeze(util.view_as_window(
                arr_in=Il,
                window_shape=(1,*model_arch["pl-k-l" + str(l)]),
                step=(1, *model_arch["pl-s-l" + str(l)])))

            a = model_arch["a-l" + str(l)]
            # shape -> (m, c^{l}, h_a^{l} * w_a^{l}, l_h_k^{l} * l_w_k^{l})
            win_Il_v = torch.reshape(torch.permute(
                win_Il, dims=(0,3,2,1,5,4)), 
                shape=(m, cl, a[0] * a[1], math.prod(model_arch["pl-k-l" + str(l)])))
            # max over each flattened pooling window; .indices are the "switches"
            # (position of the maximum inside the window) needed for back-propagation
            max_pool = torch.max(win_Il_v, dim=len(win_Il_v.shape) - 1, keepdim=False)

            # shape -> (m, h_a^{l}, w_a^{l}, c^{l})
            Al = torch.permute(torch.reshape(
                max_pool.values, shape=(m, cl, a[1], a[0])), dims=(0, 3, 2, 1))
            # shape -> (m, h_a^{l}, w_a^{l}, c^{l})
            max_pool_switches = torch.permute(torch.reshape(
                max_pool.indices, shape=(m, cl, a[1], a[0])), dims=(0, 3, 2, 1))

            # Keyed by the layer's own number: 'c-l0' is the input, and the
            # backward pass reads layer l from 'c-l<l>' and its input from 'c-l<l-1>'.
            cache['c-l' + str(l)] = (
                model_arch["t-l" + str(l)],
                torch.clone(Kl).detach(), 
                torch.clone(bl).detach(),
                # shape -> (m, h_i^{l}, w_i^{l}, c^{l})
                torch.permute(torch.reshape(torch.transpose(
                    Zl_v, dim0=1,dim1=2), shape=(m, cl, i[1], i[0])), dims=(0,3,2,1)), 
                torch.clone(Il).detach(), 
                max_pool_switches,
                torch.clone(Al).detach()
            )

        elif model_arch["t-l" + str(l)] == "flat":
            # shape -> (m, h_a * w_a * c^{l-1})
            Al_1_v = torch.reshape(torch.permute(
                Al_1, dims=(0,3,2,1)), shape=(m, -1))

            Wl = model["W-l" + str(l)]
            bl = model["b-l" + str(l)]

            # shape -> (m, n^{l})
            Zl = util.linear(Wl, Al_1_v, bl)

            # shape -> (m, n^{l})
            Al = util.activation(Zl, model_arch["g-l" + str(l)])

            # same keying convention as for the conv layers (see above)
            cache['c-l' + str(l)] = (
                model_arch["t-l" + str(l)], 
                torch.clone(Al).detach(),
                torch.clone(Wl).detach(), 
                torch.clone(bl).detach(),
                torch.clone(Zl).detach()
            )
        else:
            raise ValueError(
                f"Invalid layer-type: {model_arch['t-l' + str(l)]} at {'t-l' + str(l)}")

        # this layer's output is the next layer's input
        Al_1 = Al

    return Al, cache

## # backward-propagation
> **Note**: for the backward-propagation diagram, equations, and their derivation, read [Section-3.1, Back-propagation: Conv2D](./back-propagation_Conv2D.pdf).
>
>For the derivation of $\frac{\mathrm{d}J}{\mathrm{d}\mathbf{A}^{[3]}}$, and the derivation of $\frac{\mathrm{d}\mathbf{A}^{[3]}}{\mathrm{d}\mathbf{Z}^{[3]}}$, when $\mathbf{A}^{[3]} = \mathrm{softmax}(\mathbf{Z}^{[3]})$; see the section titled `backward-propagation` in [`..\1_multi_layer_perceptrons\hand-sign-recognizer.ipynb`](../1_multi_layer_perceptrons/hand-sign-recognizer.ipynb)
>

In [20]:
def pooling_backprop(l: int, pool_type: str, max_pool_switches: torch.Tensor, model_arch: dict) -> torch.Tensor:
    '''
    computes the derivative dA/dI, where I --(pooling)--> A.

    l: the current layer number
    pool_type: type of pooling used. Could be 'max' or 'avg'
    max_pool_switches: shape -> (m, h_a^{l}, w_a^{l}, c^{l}); 0-indexed position 
        of the maximum inside each (column-major flattened) pooling window. 
        This tensor is NOT modified.
    model_arch: dictionary defining the model architecture. This the same 
        dictionary that is passed into the initialize() method.

    dA_dI: shape -> (m, c^{l}, h_a^{l} * w_a^{l}, h_i^{l} * w_i^{l}); a 0/1 mask
        with a single 1 per pooled output, at the input position that produced 
        the maximum. For 'avg' pooling (not implemented yet) None is returned.

    Raises ValueError for an unknown pool_type.
    '''

    dA_dI = None
    m, _, _, cl = max_pool_switches.shape

    if pool_type == 'max':
        h_i, w_i, _ = model_arch["i-l" + str(l)]
        h_a, w_a, _ = model_arch["a-l" + str(l)]
        h_l, _, _ = model_arch["pl-k-l" + str(l)]
        l_h_s, l_w_s, _ = model_arch["pl-s-l" + str(l)]
        l_h_p, l_w_p, _ = model_arch["pl-p-l" + str(l)]

        # Work on a copy: the switches live in the forward cache, and shifting 
        # them in place would corrupt the cache for any later use.
        # Integer arithmetic throughout, because scatter needs an int64 index.
        switches = max_pool_switches.to(torch.long) + 1 # 0-indexed -> 1-indexed
        # (row, column) of the maximum inside its pooling window, 1-indexed;
        # floor((s-1) / h_l) + 1 == ceil(s / h_l) for s >= 1
        c_b = torch.div(switches - 1, h_l, rounding_mode="floor") + 1
        r_b = switches - (c_b - 1) * h_l

        # (row, column) of every pooled output, 1-indexed
        r_a = torch.reshape(torch.arange(
            start=1, end=h_a + 1, step=1, device=switches.device), shape=(1, h_a, 1, 1))
        c_a = torch.reshape(torch.arange(
            start=1, end=w_a + 1, step=1, device=switches.device), shape=(1, 1, w_a, 1))

        # top-left corner of each pooling window, in un-padded input coordinates
        r_t = 1 + l_h_s * (r_a - 1) - l_h_p
        c_t = 1 + l_w_s * (c_a - 1) - l_w_p

        # (row, column) of the maximum in the input I
        r_i = r_b + r_t - 1
        c_i = c_b + c_t - 1
        # NOTE(review): with non-zero pooling-padding a maximum located in the 
        # padded border would give an index outside [0, h_i * w_i) -- confirm 
        # this cannot happen for the paddings in use.

        # shape -> max_pool_switches.shape
        j = (c_i - 1) * h_i + r_i - 1 # column-major flat index; -1 for zero-indexing
        # shape -> (m, c^{l}, h_a^{l} * w_a^{l}, 1)
        j = torch.reshape(torch.permute(
            j, dims=(0, 3, 2, 1)), shape=(m, cl, h_a * w_a, 1))
        
        # shape -> (m, c^{l}, h_a^{l} * w_a^{l}, h_i^{l} * w_i^{l})
        dA_dI = torch.zeros(size=(m, cl, h_a * w_a, h_i * w_i), device=switches.device)
        dA_dI = torch.scatter(input=dA_dI, dim=-1, index=j, value=1.0)
    elif pool_type == 'avg':
        # TODO: implement this
        dA_dI = None
    else:
        raise ValueError(f"Invalid pooling-type: {pool_type}")
    
    return dA_dI

def gradients(Y: torch.Tensor, forward_cache: dict, model: dict, model_arch: dict, **kwargs):
    '''Computes gradients for backward propagation.

    Y: shape -> (m, nL)
    forward_cache: the cache of matrices returned by the forward_propogate() method;
        layer l is read from 'c-l<l>' and its input from 'c-l<l-1>'
    model: dictionary containing the model parameters (i.e. kernels, weights, and biases) 
    model_arch: dictionary defining the model architecture. This the same 
        dictionary that is passed into the initialize() method.

    Returns backward_cache, with one entry 'dc-l<l>' per layer:
        if model_arch["t-l" + str(l)] == 'flat': (dAl, dZl, dWl, dbl)
            dWl: shape -> (n^{l}, n^{l-1}), dbl: shape -> (1, n^{l})
        if model_arch["t-l" + str(l)] == 'conv': (dAl, dIl, dZl, dKl, dbl)
            dKl: shape -> (c^{l}, h_k^{l}, w_k^{l}, c^{l-1}), dbl: shape -> (c^{l}, 1)

    Raises ValueError for an unknown layer-type or activation function.
    '''

    backward_cache = dict()
    m = forward_cache['m']
    # the layer count lives in the architecture; initialize() never stores it in model
    L = model_arch["L"]

    dAl, dWl, dbl, dZl = None, None, None, None
    # the last layer is expected to be 'flat', whose cache holds A^{L} at index 1
    Al = forward_cache['c-l' + str(L)][1]

    _m, nL = Al.shape
    assert _m == m, f"batch-sizes {m} != {_m}"

    Al_3d = Al.reshape((m, nL, 1))
    # dJ/dA^{L}; shape -> (m, nL, 1)
    dAl = torch.divide(- Y.reshape((m, nL, 1)), m * Al_3d)
    for l in np.arange(start=L, stop=0, step=-1):
        if model_arch['t-l' + str(l)] == 'flat':
            _, Al, Wl, bl, Zl = forward_cache['c-l' + str(l)]

            if model_arch['g-l' + str(l)] == 'softmax':
                # Jacobian of softmax: dA_i/dZ_j = delta_ij * A_i - A_i * A_j,
                # contracted with dAl over i.
                # shape -> (m, nL)
                dZl = torch.multiply(dAl , 
                    torch.multiply(torch.eye(nL, dtype=Al.dtype, device=Al.device)
                        .reshape((1, nL, nL)), Al_3d) \
                    - torch.multiply(Al_3d, torch.transpose(Al_3d, dim0=1, dim1=2))) \
                    .sum(dim=1, keepdim=False)
            else:
                raise ValueError(
                    f"Unknown activation-function {model_arch['g-l' + str(l)]} for {'g-l' + str(l)}")

            # Input of this layer: a 'flat' cache keeps its activation at index 1,
            # every other cache (conv layers and the input 'c-l0') at the last index.
            prev_cache = forward_cache['c-l' + str(l-1)]
            Al_1 = prev_cache[1] if prev_cache[0] == 'flat' else prev_cache[-1]
            # flatten exactly like forward_propogate() does before the linear step
            # shape -> (m, n^{l-1})
            if Al_1.dim() == 4:
                Al_1 = torch.reshape(torch.permute(
                    Al_1, dims=(0,3,2,1)), shape=(m, -1))
            else:
                Al_1 = torch.reshape(Al_1, shape=(m, -1))

            # shape -> (n^{l}, n^{l-1}); update_model() transposes it to match W
            dWl = torch.matmul(dZl.T, Al_1)

            # shape -> (1, n^{l}); sum over the batch
            dbl = torch.sum(dZl, dim=0, keepdim=True)
            backward_cache["dc-l" + str(l)] = (dAl, dZl, dWl, dbl)

            dAl_1 = torch.matmul(dZl, Wl.T) # computes dA^{[l-1]}; shape -> (m, n^{l-1})

            dAl = dAl_1
        elif model_arch['t-l' + str(l)] == 'conv':
            _, Kl, bl, Zl, Il, max_pool_switches, Al = forward_cache['c-l' + str(l)]
            cl = model_arch["c-l" + str(l)]
            i = model_arch["i-l" + str(l)]
            a = model_arch["a-l" + str(l)]

            dIl_mask = pooling_backprop(l=l, pool_type=model_arch["pl-t-l" + str(l)], 
                max_pool_switches=max_pool_switches, model_arch=model_arch)
            
            # Whether dAl arrives flattened (from a 'flat' layer) or as 
            # (m, c^{l}, h_a^{l} * w_a^{l}) (from a 'conv' layer), it needs a 
            # trailing axis to broadcast against the pooling mask.
            # shape -> (m, c^{l}, h_a^{l} * w_a^{l}, 1)
            dAl = torch.reshape(dAl, shape=(m, cl, a[0] * a[1], 1))
            # shape -> (m, c^{l}, h_i^{l} * w_i^{l})
            dIl = torch.multiply(dAl, dIl_mask).sum(dim=2)
            
            # shape -> (m, c^{l}, h_i^{l} * w_i^{l})
            dI_dZ = None
            if model_arch["g-l" + str(l)] == 'relu':
                # shape -> (m, c^{l}, h_i^{l} * w_i^{l})
                Zl = torch.reshape(torch.permute(
                    Zl, dims=(0,3,2,1)), shape=(m, cl, i[0] * i[1]))
                dI_dZ = torch.where(Zl > 0, 1, 0)
            else:
                raise ValueError(f"Invalid activation function: {model_arch['g-l' + str(l)]}")
            
            # shape -> (m, c^{l}, h_i^{l} * w_i^{l})
            dZl = torch.multiply(dIl, dI_dZ)

            k = model_arch["k-l" + str(l)]
            s = model_arch["s-l" + str(l)]
            Al_1 = torch.clone(forward_cache["c-l" + str(l-1)][-1]).detach()
            cl_1 = model_arch["c-l" + str(l-1)]
            # NOTE(review): forward_propogate() pads A^{l-1} with "p-l<l>" before 
            # windowing, whereas the windows here (and for int_dA below) are taken 
            # over the un-padded cached activation -- confirm the shapes agree 
            # when that padding is non-zero.
            # shape -> (m, h_i^{l}, w_i^{l}, c^{l-1}, h_k^{l}, w_k^{l})
            Al_1 = torch.squeeze(util.view_as_window(
                arr_in=Al_1, window_shape=(1, k[0], k[1], 1), step=(1, s[0], s[1], 1)))
            # shape -> (m, c^{l-1}, h_i^{l} * w_i^{l}, h_k^{l} * w_k^{l})
            Al_1 = torch.reshape(torch.permute(
                Al_1, dims=(0, 3, 2, 1, 5, 4)), shape=(m, cl_1, i[0] * i[1], k[0] * k[1]))
            # shape -> (m, c^{l-1}, h_i^{l} * w_i^{l}, h_k^{l} * w_k^{l})
            dZ_dK = Al_1
            # shape -> (m, c^{l-1}, c^{l}, h_k^{l} * w_k^{l})
            dKl = torch.matmul(torch.reshape(
                dZl, shape=(m, 1, cl, i[0] * i[1])), dZ_dK)
            
            # shape -> (m, c^{l})
            dbl = torch.sum(dZl, dim=2)

            backward_cache["dc-l" + str(l)] = (
                # shape -> (m, c^{l}, h_a^{l} * w_a^{l})
                torch.clone(torch.reshape(dAl, shape=(m, cl, a[0] * a[1]))).detach(),
                # shape -> (m, c^{l}, h_i^{l} * w_i^{l})
                torch.clone(dIl).detach(),
                # shape -> (m, c^{l}, h_i^{l} * w_i^{l})
                torch.clone(dZl).detach(),
                torch.clone(
                    torch.transpose(torch.reshape(torch.permute(torch.sum(dKl,dim=0), 
                        # shape -> (c^{l}, c^{l-1}, h_k^{l} * w_k^{l})
                        dims=(1,0,2)),
                        shape=(cl, cl_1, k[1], k[0])), 
                        # shape -> (c^{l}, h_k^{l}, w_k^{l}, c^{l-1})
                        dim0=1, dim1=3)
                ).detach(),
                torch.clone(torch.reshape(torch.sum(dbl, dim=0), shape=(cl,1))).detach()
            )

            # dZ^{l}/dA^{l-1}: slide the kernel window over an identity "image" 
            # to learn which input positions feed which output positions.
            al_1 = model_arch["a-l" + str(l-1)]
            int_dA = torch.eye(n=al_1[0] * al_1[1])
            # shape -> (h_a^{l-1}, w_a^{l-1}, h_a^{l-1} * w_a^{l-1})
            int_dA = torch.permute(torch.reshape(
                int_dA, shape=(al_1[0] * al_1[1], al_1[1], al_1[0])), dims=(2,1,0))
            # shape -> (h_i^[l], w_i^{l}, h_a^{l-1} * w_a^{l-1}, h_k^{l}, w_k^{l})
            int_dA = torch.squeeze(util.view_as_window(
                arr_in=int_dA, window_shape=(k[0], k[1], 1), step=(s[0], s[1], 1)))
            # shape -> (1, h_i^[l] * w_i^{l}, h_a^{l-1} * w_a^{l-1}, h_k^{l} * w_k^{l})
            int_dA = torch.reshape(torch.permute(
                int_dA, dims=(2,1,0,4,3)), shape=(1, i[0] * i[1], al_1[0] * al_1[1], k[0] * k[1]))
            # shape -> (c^{l-1}, 1, h_k^{l} * w_k^{l}, c^{l})
            k_vec = torch.transpose(torch.reshape(torch.permute(
                Kl, dims=(3,0,2,1)), shape=(cl_1, 1, cl, k[0] * k[1])), dim0=1, dim1=2)
            # shape -> (c^{l-1}, h_i^[l] * w_i^{l}, h_a^{l-1} * w_a^{l-1}, c^{l})
            #       -> (c^{l-1}, c^{l}, h_i^[l] * w_i^{l}, h_a^{l-1} * w_a^{l-1})
            dZ_dA = torch.permute(torch.matmul(
                int_dA, k_vec), dims=(0,3,1,2))
            # shape -> (m, c^{l-1}, c^{l}, h_i^[l] * w_i^{l}, h_a^{l-1} * w_a^{l-1})
            dAl_1 = torch.multiply(
                torch.reshape(dZl, shape=((m, 1, cl, i[0] * i[1], 1))),
                torch.reshape(dZ_dA, shape=(1, cl_1, cl, i[0] * i[1], al_1[0] * al_1[1])))
            # shape -> (m, c^{l-1}, h_a^{l-1} * w_a^{l-1})
            dAl_1 = torch.sum(dAl_1, dim=(2,3), keepdim=False)

            dAl = dAl_1
        else:
            raise ValueError(
                f"Invalid layer-type: {model_arch['t-l' + str(l)]} in cache: {'c-l' + str(l)}")

    return backward_cache

In [21]:
def initialize(model_arch: dict):
    """Creates the trainable parameters and completes the architecture description.

    model_arch: dictionary defining the model architecture. For every "conv" 
        layer l it is updated in place: the padding entries "p-l<l>" and 
        "pl-p-l<l>" are replaced by the result of util.compute_padding(), and 
        the sizes "i-l<l>" (after convolution) and "a-l<l>" (after pooling) 
        are added.

    Returns the dictionary of parameters: "K-l<l>"/"b-l<l>" for "conv" layers
    and "W-l<l>"/"b-l<l>" for "flat" layers. Kernels and weights are drawn 
    with Xavier-normal initialization, biases start at zero.

    Raises KeyError for a layer-type other than "conv" or "flat".
    """
    model = dict()
    n_layers = model_arch["L"]

    for l in np.arange(start=1, stop=n_layers + 1):
        cur, prev = str(l), str(l - 1)
        layer_type = model_arch["t-l" + cur]

        if layer_type == "flat":
            n_inputs = math.prod(model_arch["a-l" + prev])
            n_outputs = model_arch["n-l" + cur]

            model["W-l" + cur] = torch.nn.init.xavier_normal_(
                torch.empty(size=(n_inputs, n_outputs)))
            model["b-l" + cur] = torch.zeros(size=(n_outputs, 1))
        elif layer_type == "conv":
            n_kernels = model_arch["c-l" + cur]
            kernel_shape = [n_kernels] + list(model_arch["k-l" + cur])
            kernels = torch.nn.init.xavier_normal_(torch.empty(size=kernel_shape))
            biases = torch.zeros(size=(n_kernels, 1))

            # --- convolution: resolve the padding, then the resulting size ---
            model_arch["p-l" + cur] = util.compute_padding(
                p_template=model_arch["p-l" + cur],
                a_shape=model_arch["a-l" + prev],
                k_shape=model_arch["k-l" + cur],
                s_shape=model_arch["s-l" + cur])
            conv_out = list(util.convolved_size(
                a_shape=model_arch["a-l" + prev],
                k_shape=model_arch["k-l" + cur],
                p_shape=model_arch["p-l" + cur],
                s_shape=model_arch["s-l" + cur]))
            # the channel entry is the number of kernels
            # value -> (h_i^{l}, w_i^{l}, c^{l})
            conv_out[-1] = n_kernels
            model_arch["i-l" + cur] = tuple(conv_out)

            # --- pooling: same two steps, applied to the convolved size ---
            model_arch["pl-p-l" + cur] = util.compute_padding(
                p_template=model_arch["pl-p-l" + cur],
                a_shape=model_arch["i-l" + cur],
                k_shape=model_arch["pl-k-l" + cur],
                s_shape=model_arch["pl-s-l" + cur])
            pool_out = list(util.convolved_size(
                a_shape=model_arch["i-l" + cur],
                k_shape=model_arch["pl-k-l" + cur],
                p_shape=model_arch["pl-p-l" + cur],
                s_shape=model_arch["pl-s-l" + cur]))
            # value -> (h_a^{l}, w_a^{l}, c^{l})
            pool_out[-1] = n_kernels
            model_arch["a-l" + cur] = tuple(pool_out)

            model["K-l" + cur] = kernels
            model["b-l" + cur] = biases
        else:
            raise KeyError(f"Invalid layer-type: {layer_type}")

    return model

In [22]:
# ADAM state (first/second moment estimates), shared across update_model() calls.
last_grad_params = None


def _adam_step(model: dict, state: dict, key: str, grad: torch.Tensor, iteration: int,
               alpha: float, beta1: float, beta2: float, epsilon: float) -> None:
    """Applies one bias-corrected ADAM update, in place, to model[key]."""
    state['v-' + key] = (1 - beta1) * grad + beta1 * state['v-' + key]
    state['s-' + key] = (1 - beta2) * torch.square(grad) + beta2 * state['s-' + key]

    # bias correction compensates for the zero-initialised moment estimates
    v_hat = state['v-' + key] / (1 - beta1 ** iteration)
    s_hat = state['s-' + key] / (1 - beta2 ** iteration)
    model[key] += -alpha * torch.divide(v_hat, torch.sqrt(s_hat) + epsilon)


def update_model(model: dict, iteration: int,
                 backward_cache: dict, alpha: float=0.001 , beta1: float=0.9, 
                 beta2: float=0.999, epsilon: float=1e-08, model_arch: dict=None, 
                 **kwargs):
    """Uses the ADAM optimization technique to update the parameters.

    model: the dictionary of parameters returned by the call to initialize();
        its tensors are updated in place
    iteration: the current iteration across all epochs and batches (1-based);
        iteration 1 restarts the moment estimates
    backward_cache: 
        if model_arch["t-l" + str(l)] == 'flat': (dAl, dZl, dWl, dbl)
        if model_arch["t-l" + str(l)] == 'conv': (dAl, dIl, dZl, dKl, dbl)
    alpha: the learning rate
    beta1: the smoothing constant for momentum
    beta2: the smoothing constant for RMS-prop
    epsilon: non-zero factor for RMS-prop
    model_arch: dictionary defining the model architecture. When omitted, the 
        notebook-level variable `model_arch` is used.
    kwargs: ignored; accepted so callers can forward their own keyword arguments

    Raises ValueError for an unknown layer-type, or when no architecture is 
    available.
    """
    # the state is re-bound below, so it must be declared global; without this
    # the first read would raise UnboundLocalError
    global last_grad_params

    if model_arch is None:
        model_arch = globals().get("model_arch")
        if model_arch is None:
            raise ValueError("model_arch was not passed and is not defined globally")

    n_layers = model_arch["L"]

    # (Re)create the moment estimates on first use and at the start of every 
    # training run, so moments of a previous run never leak into a new one.
    if last_grad_params is None or iteration == 1:
        last_grad_params = dict()
        for l in range(1, n_layers + 1):
            if model_arch["t-l" + str(l)] == "flat":
                last_grad_params['v-W-l' + str(l)] = torch.zeros_like(model['W-l' + str(l)])
                last_grad_params['s-W-l' + str(l)] = torch.zeros_like(model['W-l' + str(l)])
            elif model_arch["t-l" + str(l)] == "conv":
                last_grad_params['v-K-l' + str(l)] = torch.zeros_like(model['K-l' + str(l)])
                last_grad_params['s-K-l' + str(l)] = torch.zeros_like(model['K-l' + str(l)])
            
            last_grad_params['v-b-l' + str(l)] = torch.zeros_like(model['b-l' + str(l)])
            last_grad_params['s-b-l' + str(l)] = torch.zeros_like(model['b-l' + str(l)])
    
    for l in range(1, n_layers + 1):
        layer_type = model_arch["t-l" + str(l)]
        if layer_type == 'flat':
            _, _, dWl, dbl = backward_cache["dc-l" + str(l)]
            # the flat-layer gradients are stored transposed w.r.t. the parameters
            weight_key, weight_grad, bias_grad = 'W-l' + str(l), dWl.T, dbl.T
        elif layer_type == 'conv':
            _, _, _, dKl, dbl = backward_cache["dc-l" + str(l)]
            weight_key, weight_grad, bias_grad = 'K-l' + str(l), dKl, dbl
        else:
            raise ValueError(f"Unknown layer-type: {model_arch['t-l' + str(l)]}")

        _adam_step(model, last_grad_params, weight_key, weight_grad, 
            iteration, alpha, beta1, beta2, epsilon)
        _adam_step(model, last_grad_params, 'b-l' + str(l), bias_grad, 
            iteration, alpha, beta1, beta2, epsilon)


def optimize(X: torch.Tensor, Y: torch.Tensor, model: dict, model_arch: dict,
        epochs: int = 1500, batch_size=None,
        # debug tools
        debug_mode: bool=False, cache_per_iter: int=100, 
        # updating the gradients
        alpha: float = 0.0001, **kwargs):
    """Optimizes the weights and biases, i.e. trains them.

    X: shape -> (m, h_a^{0}, w_a^{0}, c^{0})
    Y: shape -> (m, h_a^{L}) ; here L is the final-layers' number
    model: the dictionary of parameters returned by initialize(); updated in place
    model_arch: dictionary defining the model architecture
    epochs: number of passes over the (re-shuffled) training set
    batch_size: mini-batch size; the whole set is used when None or 0
    debug_mode: when True the forward and backward caches are also recorded
    cache_per_iter: the cost is recorded every `cache_per_iter` iterations
    alpha: the learning rate
    kwargs: forwarded to forward_propogate(), gradients(), update_model() and 
        util.softmax_cost()

    Returns (model, train_cache, error_message):
        train_cache: list of ((epoch, batch, iteration), cost) entries, extended 
            by (forward_cache, backward_cache) in debug mode
        error_message: None, or a description when training aborted on a NaN cost
    """

    m = X.shape[0]
    
    batch_size = m if not batch_size else batch_size

    train_cache = list()
    
    print(f"Gradient-descent... {{alpha: {alpha}, epochs: {epochs}}}")
    print(f"................... {{batch-size: {batch_size}}}")
    print(f"................... {{kwargs: {kwargs}}}")
    
    iter_cost = None
    iter_count = 0
    rng = default_rng(2) # fixed seed -> reproducible shuffling
    X_idx = np.arange(start=0, stop=m, step=1)
    n_batches = math.ceil(m / batch_size)
    for e in tqdm(range(epochs)):
        # a new random order of the examples in every epoch
        rng.shuffle(X_idx)
        X_shfl, Y_shfl = X[X_idx, :, :, :].reshape(X.shape), Y[X_idx, :].reshape(Y.shape)
        for i in range(n_batches):
            Xi, Yi = X_shfl[batch_size*i:batch_size*(i+1), :], \
                Y_shfl[batch_size*i:batch_size*(i+1), :]

            Al, forward_cache = forward_propogate(Xi, model_arch, model, **kwargs)

            backward_cache = gradients(Yi, forward_cache, model, model_arch, **kwargs)
            # the architecture is passed explicitly rather than relying on a 
            # notebook-level variable of the same name
            update_model(model=model, forward_cache=forward_cache, 
                backward_cache=backward_cache, iteration=iter_count+1, 
                alpha=alpha, model_arch=model_arch, **kwargs)
            
            if iter_count % cache_per_iter == 0:
                iter_cost = util.softmax_cost(Al=Al, Y=Yi, **kwargs)
                if debug_mode:
                    train_cache.append(((e,i, iter_count), iter_cost, 
                        forward_cache, backward_cache))
                else:
                    train_cache.append(((e, i, iter_count), iter_cost))

                # the cost only changes here, so this is the only place where 
                # a NaN can newly appear
                if torch.isnan(iter_cost):
                    return model, train_cache, f"iter-(epoch, batch)-{(e,i)}... NaN-Abort!"

            iter_count += 1

    return model, train_cache, None

In [23]:
def multiclass_classify(Al):
    """Returns, for every row of Al, the index of its largest entry (the predicted class)."""
    predicted_class = Al.argmax(dim=1)
    return predicted_class

def measure_accuracy(X_train: torch.Tensor, X_test: torch.Tensor, 
        Y_train: torch.Tensor, Y_test: torch.Tensor, 
        model: dict, model_arch: dict):
    """Prints the classification accuracy (in %) on the train- and the test-set.

    X_train, X_test: inputs, in the layout expected by forward_propogate()
    Y_train, Y_test: one-hot labels, shape -> (m, #-classes)
    model: dictionary containing the model parameters
    model_arch: dictionary defining the model architecture
    """
    m_train = X_train.shape[0]
    Al, _ = forward_propogate(X_train, model_arch=model_arch, model=model)
    train_acc = 100 * torch.sum(multiclass_classify(Al) 
            == torch.argmax(Y_train, axis=1)) / m_train
    
    m_test = X_test.shape[0]    
    # same call as for the train-set: both the architecture and the parameters
    Al, _ = forward_propogate(X_test, model_arch=model_arch, model=model)
    # labels are one-hot along axis 1 (one row per example)
    test_acc = 100 * torch.sum(multiclass_classify(Al) 
            == torch.argmax(Y_test, axis=1)) / m_test

    print(f"Train accuracy: {train_acc}%")
    print(f"Test accuracy: {test_acc}%")

def model(model_arch: dict, **kwargs):
    """Initializes, trains and evaluates a network on the hand-sign data.

    model_arch: dictionary defining the model architecture
    kwargs: forwarded to optimize()

    When training aborts, the error message is printed. Otherwise the train-
    and test-accuracies are printed and the recorded cost is plotted against 
    the training iteration.

    Returns (params, train_cache) as produced by optimize().
    """
    params, train_cache, err_msg = optimize(
        X=cu_train_X, Y=cu_train_Y,
        model=initialize(model_arch=model_arch),
        model_arch=model_arch, **kwargs)

    # training aborted: report why and skip the evaluation
    if err_msg:
        print(err_msg)
        return params, train_cache

    measure_accuracy(
        X_train=cu_train_X, X_test=cu_test_X,
        Y_train=cu_train_Y, Y_test=cu_test_Y,
        model=params, model_arch=model_arch)

    if train_cache is None:
        return params, train_cache

    # every entry starts with ((epoch, batch, iteration), cost, ...)
    costs = [entry[1].cpu().detach().numpy() for entry in train_cache]
    iterations = [entry[0][2] for entry in train_cache]

    _, axis = plt.subplots(nrows=1, ncols=1, figsize=(15,10))
    axis.plot(iterations, costs)
    axis.set_title("Cost-function vs Iteration")
    axis.set_ylabel("cost-function ($J$)")
    axis.set_xlabel("training-iteration ($i$)")
    plt.show()

    return params, train_cache

## # training-model

In [None]:
# Architecture of the 3-layer CNN described in the table above.
# Key prefixes: t = layer-type, k = kernel (h, w, input-channels), c = #-kernels,
# p = padding, s = stride, pl-* = the same quantities for the pooling step,
# g = activation, n = #-neurons.
model_arch = {
    "L": 3,

    "a-l0": cu_train_X.shape[1:], # set this equal to X.shape[1:]
    "c-l0": cu_train_X.shape[-1], # set this equal to X.shape[-1]

    "t-l1": "conv",
    "k-l1": (4,4,3),
    "c-l1": 8,
    "p-l1": (-1,-1,0), # -1 denotes same-padding in that dimension
    "s-l1": (1,1,1),
    "pl-t-l1": "max",
    "pl-k-l1": (8,8,1),
    "pl-p-l1": (-1,-1,0),
    "pl-s-l1": (8,8,1),
    "g-l1": "relu",

    "t-l2": "conv",
    # the kernel depth must equal the number of channels coming out of 
    # layer 1, i.e. "c-l1"
    "k-l2": (2,2,8),
    "c-l2": 16,
    "p-l2": (-1,-1,0),
    "s-l2": (1,1,1),
    "pl-t-l2": "max",
    "pl-k-l2": (4,4,1),
    "pl-p-l2": (-1,-1,0),
    "pl-s-l2": (4,4,1),
    "g-l2": "relu",

    "t-l3": "flat",
    "n-l3": 6,
    "g-l3": "softmax"
}

# mini-batches of 64 examples (the same value as before, which was read off the 
# image height, cu_train_X.shape[1], rather than stated as a batch size)
params, train_cache = model(model_arch=model_arch, 
        scaling_type='xavier', epochs=1500, batch_size=64, alpha=0.0001, 
        grad_scheme='batch',
        debug_mode=False, cache_per_iter=100)