# Parallel CNN layers for image classification

# Group members

| Member | Student ID |
|--------|------------|
| Nguyễn Thế Hưng | 19127154 |
| Trần Minh Thiện | 19127281 |
| Lê Tâm Anh | 19127330 |

In [11]:
import numpy as np
from numba import config
from numba import jit, cuda, prange
import math
import tensorflow as tf
import time

# Functions for Parallell Convolutional Layer:

In [12]:
@cuda.jit
def cnn_forward_kernel(patches, kernels, convolution_output):
    r, c = cuda.grid(2)

    if r < patches.shape[0] and c < patches.shape[1]:
        for k in range(kernels.shape[0]):
            sum = 0
            for i in range(kernels.shape[1]):
                for j in range(kernels.shape[2]):
                    sum += patches[r, c, i, j] * kernels[k, i, j]
            convolution_output[r, c, k] = sum

@cuda.jit
def cnn_backward_kernel(patches, dE_dY, dE_dk):
    x, y, z = cuda.grid(3)

    if x < dE_dk.shape[0] and y < dE_dk.shape[1] and z < dE_dk.shape[2]:
        temp = 0
        for h in range(patches.shape[0]):
            for w in range(patches.shape[1]):
                temp += patches[h, w, y, z] * dE_dY[h, w, x]
        dE_dk[x, y, z] = temp

# Functions for Parallell Max Pooling Layer:

In [13]:
#@cuda.jit(device=True)
def patches_generator(image,kernel_size):
        """
        Divide the input image in patches to be used during pooling.
        Yields the tuples containing the patches and their coordinates.
        """
        # Compute the ouput size
        output_h = image.shape[0] // kernel_size
        output_w = image.shape[1] // kernel_size
        #self.image = image
        #c,r=cuda.grid(2)
        for h in range(output_h):
            for w in range(output_w):
                patch = image[(h*kernel_size):(h*kernel_size+kernel_size), (w*kernel_size):(w*kernel_size+kernel_size)]
                yield patch,h,w

def forward_prop(image,kernel_size):
        image_h, image_w, num_kernels = image.shape
        max_pooling_output = np.zeros((image_h//kernel_size, image_w//kernel_size, num_kernels))
        for patch, h, w in patches_generator(image,kernel_size):
            max_pooling_output[h,w] = np.amax(patch, axis=(0,1))
        return max_pooling_output



def back_prop(image,dE_dY,kernel_size):
        """
        Takes the gradient of the loss function with respect to the output and computes the gradients of the loss function with respect
        to the kernels' weights.
        dE_dY comes from the following layer, typically softmax.
        There are no weights to update, but the output is needed to update the weights of the convolutional layer.
        """
        dE_dk_temp1 = np.zeros(image.shape)
        #dE_dk=np.ascontiguousarray(dE_dk_temp1)
        #cuda.pinned(dE_dk)
        for patch,h,w in patches_generator(image,kernel_size):
            image_h, image_w, num_kernels = patch.shape
            max_val = np.amax(patch, axis=(0,1))
            block_size = (16, 16)
            grid_size = (math.ceil(image_w / block_size[0]),
                        math.ceil(image_h / block_size[1]))
            patch=np.ascontiguousarray(patch)

            dA=cuda.to_device(patch)
            dE=cuda.to_device(dE_dk_temp1)
            back_prob_sup[grid_size, block_size](image_h,image_w,num_kernels,dA,max_val,dE_dY,dE,h,w,kernel_size)
            dE_dk=dE.copy_to_host()
            return dE_dk

@cuda.jit
def back_prob_sup(image_h,image_w,num_kernels,patch,max_val,dE_dY,dE_dk,h,w,kernel_size):
    c,r=cuda.grid(2)
    #print("hihi")
    if r < image_h and c < image_w:
          for idx_k in range(num_kernels):
                if patch[r,c,idx_k] == max_val[idx_k]:
                  #print("hihi")
                  dE_dk[h*kernel_size+r, w*kernel_size+c, idx_k] = dE_dY[h,w,idx_k]
    #return dE_dk

# Functions for Parallell Softmax Layer:

In [14]:
#Hàm nhân vector với ma trận
@cuda.jit
def dot(a, b, c):
  col = cuda.grid(1)
  if (col < b.shape[1]):
    sum = 0.0
    for i in range(b.shape[0]):
      sum += a[i] * b[i, col]
    c[col] = sum

#Hàm nhân ma trận với vector
@cuda.jit
def cu_matrix_vector(A, b, c):
  row = cuda.grid(1)
  if (row < A.shape[0]):
    sum = 0.0
    for i in range(A.shape[1]):
      sum += A[row, i] * b[i]
    c[row] = sum

#Hàm nhân hai ma trận
@cuda.jit
def matmul(A,B,C):
  i,j = cuda.grid(2)
  if i < C.shape[0] and j < C.shape[1]:
    tmp = A[i,0] * B[0,j]
    C[i,j] = tmp

# CNN classes

In [15]:
class ConvolutionLayer:
    def __init__(self, kernel_num, kernel_size):
        """
        Constructor takes as input the number of kernels and their size. I assume only squared filters of size kernel_size x kernel_size
        """
        self.kernel_num = kernel_num
        self.kernel_size = kernel_size
        # Generate random filters of shape (kernel_num, kernel_size, kernel_size). Divide by kernel_size^2 for weight normalization
        self.kernels = np.random.randn(kernel_num, kernel_size, kernel_size) / (kernel_size**2)

    def patches_generator(self, image):
        """
        Divide the input image in patches to be used during convolution.
        Yields the tuples containing the patches and their coordinates.
        """
        # Extract image height and width
        image_h, image_w = image.shape
        self.image = image
        # The number of patches, given a fxf filter is h-f+1 for height and w-f+1 for width
        patches = np.empty((image_h-self.kernel_size+1, image_w-self.kernel_size+1, self.kernel_size, self.kernel_size))
        for h in range(image_h-self.kernel_size+1):
            for w in range(image_w-self.kernel_size+1):
                patches[h, w] = image[h:(h+self.kernel_size), w:(w+self.kernel_size)]
        return patches

    def forward_prop(self, image):
        """
        Perform forward propagation for the convolutional layer.
        """
        # Extract image height and width
        image_h, image_w = image.shape
        # Initialize the convolution output volume of the correct size
        convolution_output = np.zeros((image_h-self.kernel_size+1, image_w-self.kernel_size+1, self.kernel_num))
        # Unpack the generator
        patches = self.patches_generator(image)
        block_size = (16, 16)
        grid_size = (math.ceil(convolution_output.shape[1] / block_size[0]),
                     math.ceil(convolution_output.shape[0] / block_size[1]))
        cnn_forward_kernel[grid_size, block_size](patches, self.kernels, convolution_output)
        cuda.synchronize()
        return convolution_output

    def back_prop(self, dE_dY, alpha):
        """
        Takes the gradient of the loss function with respect to the output and computes the gradients of the loss function with respect
        to the kernels' weights.
        dE_dY comes from the following layer, typically max pooling layer.
        It updates the kernels' weights
        """
        # Initialize gradient of the loss function with respect to the kernel weights
        dE_dk = np.zeros(self.kernels.shape)
        patches = self.patches_generator(self.image)
        block_size = (16, 4, 4)
        grid_size = (math.ceil(dE_dk.shape[2] / block_size[0]),
                     math.ceil(dE_dk.shape[1] / block_size[1]),
                     math.ceil(dE_dk.shape[0] / block_size[2]))
        cnn_backward_kernel[grid_size, block_size](patches, dE_dY, dE_dk)
        cuda.synchronize()
        # Update the parameters
        self.kernels -= alpha*dE_dk
        return dE_dk


class MaxPoolingLayer:
    def __init__(self, kernel_size):
        """
        Constructor takes as input the size of the kernel
        """
        self.kernel_size = kernel_size

    def patches_generator(self, image):
        """
        Divide the input image in patches to be used during pooling.
        Yields the tuples containing the patches and their coordinates.
        """
        # Compute the ouput size
        output_h = image.shape[0] // self.kernel_size
        output_w = image.shape[1] // self.kernel_size
        self.image = image

        for h in range(output_h):
            for w in range(output_w):
                patch = image[(h*self.kernel_size):(h*self.kernel_size+self.kernel_size), (w*self.kernel_size):(w*self.kernel_size+self.kernel_size)]
                yield patch, h, w

    def forward_prop(self, image):
        image_h, image_w, num_kernels = image.shape
        max_pooling_output = np.zeros((image_h//self.kernel_size, image_w//self.kernel_size, num_kernels))
        for patch, h, w in self.patches_generator(image):
            max_pooling_output[h,w] = np.amax(patch, axis=(0,1)) # Chia nhỏ để tìm số lớn nhất song song sau đó so sánh với nhau
        return max_pooling_output

    def back_prop(self, dE_dY):
        """
        Takes the gradient of the loss function with respect to the output and computes the gradients of the loss function with respect
        to the kernels' weights.
        dE_dY comes from the following layer, typically softmax.
        There are no weights to update, but the output is needed to update the weights of the convolutional layer.
        """
        dE_dk = np.zeros(self.image.shape)
        for patch,h,w in self.patches_generator(self.image):
            image_h, image_w, num_kernels = patch.shape
            max_val = np.amax(patch, axis=(0,1))

            for idx_h in range(image_h):
                for idx_w in range(image_w):
                    for idx_k in range(num_kernels):
                        if patch[idx_h,idx_w,idx_k] == max_val[idx_k]:
                            dE_dk[h*self.kernel_size+idx_h, w*self.kernel_size+idx_w, idx_k] = dE_dY[h,w,idx_k]
            return dE_dk


class SoftmaxLayer:
    """
    Takes the volume coming from convolutional & pooling layers. It flattens it and it uses it in the next layers.
    """
    def __init__(self, input_units, output_units):
        # Initiallize weights and biases
        self.weight = np.random.randn(input_units, output_units)/input_units
        self.bias = np.zeros(output_units)

    def forward_prop(self, image):
      self.original_shape = image.shape # stored for backprop
      # Flatten the image
      #print("image: ", image)
      image_flattened = image.flatten()
      #print("image_flattened: ", image_flattened)
      self.flattened_input = image_flattened # stored for backprop

      # Perform matrix multiplication and add bias
      C = np.empty(10)
      dA = cuda.to_device(image_flattened)
      dB = cuda.to_device(self.weight)
      dC = cuda.to_device(C)
      dot[(self.weight.shape[0]+255)//256, 256](dA,dB,dC)
      #cu_matrix_vector[(dZ_dX.shape[0]+511)//512, 512](dZ_dX,dE_dZ,C)
      result = dC.copy_to_host()
      first_output = result  + self.bias
      self.output = first_output
      # Apply softmax activation
      softmax_output = np.exp(first_output) / np.sum(np.exp(first_output), axis=0)

      return softmax_output


    def back_prop(self, dE_dY, alpha):
      for i, gradient in enumerate(dE_dY):
        if gradient == 0:
          continue
        transformation_eq = np.exp(self.output)
        S_total = np.sum(transformation_eq)

        # Compute gradients with respect to output (Z)
        dY_dZ = -transformation_eq[i]*transformation_eq / (S_total**2)
        dY_dZ[i] = transformation_eq[i]*(S_total - transformation_eq[i]) / (S_total**2)

        # Compute gradients of output Z with respect to weight, bias, input
        dZ_dw = self.flattened_input
        dZ_db = 1
        dZ_dX = self.weight

        # Gradient of loss with respect ot output
        dE_dZ = gradient * dY_dZ

        # Gradient of loss with respect to weight, bias, input

        #C = np.empty((dZ_dw[np.newaxis].T.shape[0], dE_dZ[np.newaxis].shape[1]))
        #dA = cuda.to_device(dZ_dw[np.newaxis].T)
        #dB = cuda.to_device(dE_dZ[np.newaxis])
        #dC = cuda.to_device(C)
        #blockx = int(np.ceil(C.shape[0] / 16))
        #blocky = int(np.ceil(C.shape[1] / 16))
        #blockspergrid = (blockx, blocky)
        #matmul[blockspergrid, (16,16)](dA,dB,dC)
        #dE_dW = dC.copy_to_host()
        dE_dW = dZ_dw[np.newaxis].T @ dE_dZ[np.newaxis]

        # Matrix-vector multiply function
        C = np.empty(dZ_dX.shape[0])
        dA = cuda.to_device(dZ_dX)
        dB = cuda.to_device(dE_dZ)
        dC = cuda.to_device(C)
        cu_matrix_vector[(dZ_dX.shape[0]+15)//16, 16](dA,dB,dC)
        dE_dX = dC.copy_to_host()

       # dE_dX = dZ_dX @ dE_dZ

        # Update parameters
        self.weight -= alpha * dE_dW
        self.bias -= alpha * (dE_dZ * dZ_db)

        return dE_dX.reshape(self.original_shape)

def CNN_forward(image, label, layers):
    output = image/255.
    for layer in layers:
        output = layer.forward_prop(output)
    # Compute loss (cross-entropy) and accuracy
    loss = -np.log(output[label])
    accuracy = 1 if np.argmax(output) == label else 0
    return output, loss, accuracy

def CNN_backprop(gradient, layers, alpha=0.05):
    grad_back = gradient
    for layer in layers[::-1]:
        if type(layer) in [ConvolutionLayer, SoftmaxLayer]:
            grad_back = layer.back_prop(grad_back, alpha)
        elif type(layer) == MaxPoolingLayer:
            grad_back = layer.back_prop(grad_back)
    return grad_back


def CNN_training(image, label, layers, alpha=0.05):
    # Forward step
    output, loss, accuracy = CNN_forward(image, label, layers)

    # Initial gradient
    gradient = np.zeros(10)
    gradient[label] = -1/output[label]

    # Backprop step
    gradient_back = CNN_backprop(gradient, layers, alpha)

    return loss, accuracy

# Main function

## Train function

In [16]:
def train(network, X_train, y_train, epochs = 1, learning_rate = 0.05, verbose = True):
    for epoch in range(1):
        if verbose == True:
            print('Epoch {} ->'.format(epoch+1))
        # Shuffle training data
        permutation = np.random.permutation(len(X_train))
        X_train = X_train[permutation]
        y_train = y_train[permutation]
        # Training the CNN
        loss = 0
        accuracy = 0
        for i, (image, label) in enumerate(zip(X_train, y_train)):
            if i % 100 == 0: # Every 100 examples
                if verbose == True:
                    print("Step {}. For the last 100 steps: average loss {}, accuracy {}".format(i+1, loss/100, accuracy))
                loss = 0
                accuracy = 0
            loss_1, accuracy_1 = CNN_training(image, label, network)
            loss += loss_1
            accuracy += accuracy_1

## Predict & Evaluate functions

In [17]:
def predict(network, image):
    output = image/255.
    for layer in network:
        output = layer.forward_prop(output)
    return np.argmax(output) # return a number
def evaluate(network, X_test, y_test):
    correct = 0
    for x, y in zip(X_test, y_test):
        pred = predict(network, x)
        if y == pred:
            correct += 1
    acc = correct / y_test.shape[0]
    print(f'Accuracy for the test set is {acc *100}')

## Loading train data and define network layers


In [18]:
network = [
    ConvolutionLayer(16,3), # layer with 8 3x3 filters, output (26,26,16)
    MaxPoolingLayer(2), # pooling layer 2x2, output (13,13,16)
    SoftmaxLayer(13*13*16, 10) # softmax layer with 13*13*16 input and 10 output
    ]

# Load training data
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train[:10000]
y_train = y_train[:10000]

## Training CNN and calculating running time

In [19]:
%%time
train(network, X_train, y_train, epochs=1, learning_rate=0.05)

Epoch 1 ->
Step 1. For the last 100 steps: average loss 0.0, accuracy 0




Step 101. For the last 100 steps: average loss 1.9097160814767398, accuracy 35
Step 201. For the last 100 steps: average loss 1.109052480010852, accuracy 67
Step 301. For the last 100 steps: average loss 0.8324638541247977, accuracy 76
Step 401. For the last 100 steps: average loss 0.7470558742310915, accuracy 79
Step 501. For the last 100 steps: average loss 0.6985758651196469, accuracy 79
Step 601. For the last 100 steps: average loss 0.7519752213159417, accuracy 78
Step 701. For the last 100 steps: average loss 0.680425700848134, accuracy 78
Step 801. For the last 100 steps: average loss 0.475219389278869, accuracy 86
Step 901. For the last 100 steps: average loss 0.606489227591856, accuracy 82
Step 1001. For the last 100 steps: average loss 0.4948608858440089, accuracy 84
Step 1101. For the last 100 steps: average loss 0.35734936609070594, accuracy 93
Step 1201. For the last 100 steps: average loss 0.4959704418301374, accuracy 83
Step 1301. For the last 100 steps: average loss 0.48

## Predict for 10000 images and find the accuracy

In [20]:
evaluate(network, X_test, y_test)

Accuracy for the test set is 91.47
