Convolutional layer implementation using NumPy

In [None]:
import numpy as np

Convolutional layer implementation:
The constructor takes the number of kernels and the kernel size as inputs.

In [None]:
class ConvolutionLayer:
    """2-D convolution layer for a single-channel image (no padding, stride 1).

    Holds ``kernel_num`` square kernels of side ``kernel_size`` in
    ``self.kernels`` with shape ``(kernel_num, kernel_size, kernel_size)``.
    """

    def __init__(self, kernel_num, kernel_size):
        self.kernel_num = kernel_num
        self.kernel_size = kernel_size
        # Standard-normal weights, scaled down by the kernel area (n * n).
        self.kernels = np.random.randn(kernel_num, kernel_size, kernel_size) / (kernel_size**2)

    def patches_generator(self, image):
        """Yield ``(patch, row, col)`` for every kernel-sized window of ``image``.

        ``image`` must be 2-D. An image of height h and width w yields
        (h - f + 1) * (w - f + 1) windows for an f x f kernel, in row-major
        order. The image is remembered on ``self.image`` for ``back_prop``.
        """
        rows, cols = image.shape
        self.image = image
        size = self.kernel_size
        for row in range(rows - size + 1):
            for col in range(cols - size + 1):
                window = image[row:row + size, col:col + size]
                yield window, row, col

    def forward_prop(self, image):
        """Return the feature maps, shape ``(h - f + 1, w - f + 1, kernel_num)``."""
        rows, cols = image.shape
        size = self.kernel_size
        feature_maps = np.zeros((rows - size + 1, cols - size + 1, self.kernel_num))
        for window, row, col in self.patches_generator(image):
            # Broadcasting the window against every kernel gives one response
            # per kernel once the two spatial axes are summed out.
            feature_maps[row, col] = (window * self.kernels).sum(axis=(1, 2))
        return feature_maps

    def back_prop(self, dE_dY, alpha):
        """Apply one gradient-descent step to the kernels.

        ``dE_dY`` is the loss gradient with respect to the output of
        ``forward_prop`` and ``alpha`` is the learning rate. Returns the
        gradient of the loss with respect to the kernel weights.
        """
        kernel_grad = np.zeros(self.kernels.shape)
        for window, row, col in self.patches_generator(self.image):
            upstream = dE_dY[row, col]
            for k in range(self.kernel_num):
                # Each window contributes to kernel k in proportion to the
                # upstream gradient at that output position.
                kernel_grad[k] += window * upstream[k]
        self.kernels -= alpha * kernel_grad
        return kernel_grad

The AvgPooling layer takes the size of the pooling kernel in its constructor and the convolutional layer's output as its input.

In [None]:
class AvgPoolingLayer:
    """Average pooling over non-overlapping ``kernel_size`` x ``kernel_size`` windows.

    Rows and columns that do not fill a complete window are ignored.
    """

    def __init__(self, kernel_size):
        self.kernel_size = kernel_size

    def patches_generator(self, image):
        """Yield ``(window, row, col)`` for each complete pooling window.

        ``row`` and ``col`` index the pooled output. The input is remembered
        on ``self.image`` so ``back_prop`` can recover its shape.
        """
        size = self.kernel_size
        pooled_rows = image.shape[0] // size
        pooled_cols = image.shape[1] // size
        self.image = image

        for row in range(pooled_rows):
            top = row * size
            for col in range(pooled_cols):
                left = col * size
                yield image[top:top + size, left:left + size], row, col

    def forward_prop(self, image):
        """Return the per-channel mean of every window.

        ``image`` has shape ``(h, w, c)``; the result has shape
        ``(h // kernel_size, w // kernel_size, c)``.
        """
        rows, cols, channels = image.shape
        size = self.kernel_size
        pooled = np.zeros((rows // size, cols // size, channels))
        for window, row, col in self.patches_generator(image):
            pooled[row, col] = window.mean(axis=(0, 1))
        return pooled

    def back_prop(self, dE_dY):
        """Return the loss gradient with respect to the layer input.

        Every element of a window contributed equally to its mean, so each
        receives the upstream gradient divided by the window area. Positions
        outside any complete window keep a zero gradient.
        """
        input_grad = np.zeros(self.image.shape)
        size = self.kernel_size
        pooled_rows, pooled_cols, _ = dE_dY.shape
        for row, col in np.ndindex(pooled_rows, pooled_cols):
            top, left = row * size, col * size
            share = dE_dY[row, col, :] / (size ** 2)
            input_grad[top:top + size, left:left + size, :] = share

        return input_grad

Softmax activation is used for the final predictions:

In [None]:
class SoftmaxLayer:
    """Fully connected layer followed by a softmax activation.

    ``forward_prop`` flattens its input, applies ``x @ weight + bias`` and
    returns the softmax of the result. ``back_prop`` performs one
    gradient-descent step on ``weight`` and ``bias``.
    """

    def __init__(self, input_units, output_units):
        # Random weights scaled down by the number of inputs; zero bias.
        self.weight = np.random.randn(input_units, output_units)/input_units
        self.bias = np.zeros(output_units)

    @staticmethod
    def _softmax(logits):
        """Return the softmax of a 1-D array of logits."""
        # Subtracting the maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing to inf, which would turn the result into NaN.
        exps = np.exp(logits - np.max(logits))
        return exps / np.sum(exps)

    def forward_prop(self, image):
        """Return the class probabilities for ``image`` (any shape).

        The input shape, the flattened input and the linear output (logits)
        are stored on the instance for use by ``back_prop``.
        """
        self.original_shape = image.shape  # stored for backprop
        self.flattened_input = image.flatten()  # stored for backprop
        # linear output
        self.output = np.dot(self.flattened_input, self.weight) + self.bias
        # softmax activation
        return self._softmax(self.output)

    def back_prop(self, dE_dY, alpha):
        """Update the parameters and return the gradient w.r.t. the input.

        Args:
            dE_dY: gradient of the loss with respect to the softmax output,
                one entry per output unit.
            alpha: learning rate.

        Returns:
            Gradient of the loss with respect to the layer input, reshaped to
            the shape of the array last passed to ``forward_prop``. An
            all-zero ``dE_dY`` yields an all-zero gradient (never ``None``).
        """
        dE_dY = np.asarray(dE_dY, dtype=float)
        probs = self._softmax(self.output)

        # gradient of loss with respect to the linear output Z.
        # The softmax Jacobian is diag(p) - p p^T, so its product with dE_dY
        # is p * (dE_dY - <dE_dY, p>). This accounts for every non-zero entry
        # of dE_dY rather than only the first one.
        dE_dZ = probs * (dE_dY - np.dot(dE_dY, probs))

        # gradient of loss with respect to weight, bias, input
        dE_dw = self.flattened_input[:, np.newaxis] * dE_dZ[np.newaxis, :]
        dE_db = dE_dZ
        # Must be computed before the weights are updated in place below.
        dE_dX = self.weight @ dE_dZ

        # update parameters
        self.weight -= alpha*dE_dw
        self.bias -= alpha*dE_db

        return dE_dX.reshape(self.original_shape)

Functions for performing the training using the entire network

In [None]:
def CNN_forward(image, label, layers):
    """Run ``image`` through ``layers`` and score the prediction against ``label``.

    The image is divided by 255 before the first layer. Returns a tuple
    ``(output, loss, accuracy)``: the final layer's output, the
    cross-entropy loss ``-log(output[label])``, and 1 if the arg-max of the
    output equals ``label``, otherwise 0.
    """
    activation = image / 255.
    for layer in layers:
        activation = layer.forward_prop(activation)
    # cross-entropy loss for the true class
    loss = -np.log(activation[label])
    # 1 for a correct top prediction, 0 otherwise
    accuracy = int(np.argmax(activation) == label)
    return activation, loss, accuracy

def CNN_backprop(gradient, layers, alpha=0.05):
    """Propagate ``gradient`` backwards through ``layers``, last layer first.

    Convolution and softmax layers (including subclasses) receive the
    learning rate ``alpha`` and update their parameters; average-pooling
    layers only pass the gradient on. Layers of any other type are skipped.
    Returns whatever the first layer's ``back_prop`` returned, or
    ``gradient`` itself if no layer handled it.
    """
    grad_back = gradient
    for layer in reversed(layers):
        # isinstance (rather than an exact type comparison) keeps subclasses
        # of the layer classes in the backward pass.
        if isinstance(layer, (ConvolutionLayer, SoftmaxLayer)):
            grad_back = layer.back_prop(grad_back, alpha)
        elif isinstance(layer, AvgPoolingLayer):
            grad_back = layer.back_prop(grad_back)
    return grad_back


def CNN_training(image, label, layers, alpha=0.05):
    """Run one training step (forward pass, then backprop) on a single sample.

    Args:
        image: input sample, passed to ``CNN_forward``.
        label: index of the true class.
        layers: the network's layers, in forward order.
        alpha: learning rate passed to ``CNN_backprop``.

    Returns:
        ``(loss, accuracy)`` as computed by ``CNN_forward`` before the update.
    """
    # forward step
    output, loss, accuracy = CNN_forward(image, label, layers)

    # initial gradient of the cross-entropy loss w.r.t. the network output:
    # zero everywhere except -1/p at the true class. It is sized from the
    # output so the function is not tied to a 10-class network.
    gradient = np.zeros(output.shape)
    gradient[label] = -1/output[label]

    # backprop step (updates the layers in place)
    CNN_backprop(gradient, layers, alpha)

    return loss, accuracy

In [None]:
import tensorflow as tf

This snippet trains the network for only 4 epochs and reports the loss and accuracy on the training set.

In [None]:
def main(num_epochs=4, train_size=5000):
    """Train the small CNN on a subset of MNIST and print running statistics.

    Args:
        num_epochs: number of passes over the training subset.
        train_size: number of leading MNIST training samples to use.

    After every 100 training steps the average loss and the number of
    correct predictions over those 100 steps are printed.
    """
    # MNIST data; the test split is not used here
    (X_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    X_train = X_train[:train_size]
    y_train = y_train[:train_size]

    layers = [
        ConvolutionLayer(16, 3),  # layer with 16 3x3 filters, output (26,26,16)
        AvgPoolingLayer(2),  # pooling layer 2x2, output (13,13,16)
        SoftmaxLayer(13*13*16, 10),  # softmax layer with 13*13*16 input and 10 outputs
    ]

    for epoch in range(num_epochs):
        print('Epoch {} ->'.format(epoch+1))
        # shuffle training data
        permutation = np.random.permutation(len(X_train))
        X_train = X_train[permutation]
        y_train = y_train[permutation]
        # training the CNN
        loss = 0
        accuracy = 0
        for i, (image, label) in enumerate(zip(X_train, y_train)):
            loss_1, accuracy_1 = CNN_training(image, label, layers)
            loss += loss_1
            accuracy += accuracy_1
            # Report after each full window of 100 steps. Reporting before
            # the step instead would print an empty window at step 1 and
            # never show the last 100 steps of an epoch.
            if (i + 1) % 100 == 0:
                print("Step {}. For the last 100 steps: average loss {}, accuracy {}".format(i+1, loss/100, accuracy))
                loss = 0
                accuracy = 0

# Run the training demo only when this file is executed as a script.
if __name__ == '__main__':
  main()

Epoch 1 ->
Step 1. For the last 100 steps: average loss 0.0, accuracy 0
Step 101. For the last 100 steps: average loss 1.8847034394040136, accuracy 38
Step 201. For the last 100 steps: average loss 1.0730965966293853, accuracy 71
Step 301. For the last 100 steps: average loss 0.7937386502381307, accuracy 79
Step 401. For the last 100 steps: average loss 0.9550129525211772, accuracy 70
Step 501. For the last 100 steps: average loss 0.7335506267417213, accuracy 77
Step 601. For the last 100 steps: average loss 0.6321832950583985, accuracy 78
Step 701. For the last 100 steps: average loss 0.5957851476769828, accuracy 80
Step 801. For the last 100 steps: average loss 0.6497543607863918, accuracy 81
Step 901. For the last 100 steps: average loss 0.8993925826000817, accuracy 71
Step 1001. For the last 100 steps: average loss 0.438797420956733, accuracy 90
Step 1101. For the last 100 steps: average loss 0.7580861045273292, accuracy 79
Step 1201. For the last 100 steps: average loss 0.69948501

KeyboardInterrupt: ignored