In [1]:
#!/usr/bin/env python


import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import numpy as np
import time
import matplotlib.pyplot as plt
from scipy.signal import convolve2d


In [12]:
class Convolution:
	def __init__(self):
		# """
		# Attributes for instance of EncoderDecoder module
		# """
		self.mod = None
		pass
	
	def getSourceModule(self, method):
		# kernel code wrapper
		kernelwrapper_naive = """
			#include <stdio.h>
			__global__ 
			void conv_gpu_naive(float *N, float *P, float *M, int height, int width, int mask_width){

				// the coordinate of thread (also coordinate in N or P)
				int col = blockDim.x * blockIdx.x + threadIdx.x;
				int row = blockDim.y * blockIdx.y + threadIdx.y;

				// copy to register
				int mask_w = mask_width;
				int n_w = width;
				int n_h = height;
				// start point of the kernel
				int col_start = col - mask_w/2;
				int row_start = row - mask_w/2;

				float p_value = 0.0f;

				// in y direction of mask
				for(int i=0; i<mask_w; i++){
					// x coordinate in mask
					int row_mask = mask_w - 1 - i;
					// x coordinate in N
					int row_n = row_start + i;

					// in x direction of mask
					for(int j=0; j<mask_w; j++){
						// y coordinate in mask
						int col_mask = mask_w - 1 - j;
						// y coordinate in N
						int col_n = col_start + j;

						// if in the range of N
						if ((col_n>=0) && (col_n<n_w) && (row_n>=0) && (row_n<n_h)){
							p_value += N[row_n*n_w+col_n] * M[row_mask*mask_w+col_mask];
						}
					}
				}
				P[row*n_w+col] = p_value;
			}
		
		""" # you can either use a string or save the kernel in kernel.cu file and reference it here.
		# Compile the kernel code when an instance
		# of this class is made. 
		if method == 'naive':
			mod = SourceModule(kernelwrapper_naive)
		else:
			print("Please enter the correct method name! -naive")
			
		self.mod = mod

	def getBlockGridDim(self, N, blocksize=32):
		BlockDim = (blocksize, blocksize,1)
		GridDim = (N.shape[0]//blocksize+1, N.shape[1]//blocksize+1,1)
		return BlockDim, GridDim

	def conv_gpu_naive(self, N, M):
		"""
		convolution with global memory
		:param N: input matrix
		:param M: mask
		:return:
		- out: a tensor with the same shape as x
		- cache: (train phase) cache a random dropout mask used in feedforward process
				(test phase) None
		"""
		# implement this, note you can change the function signature (arguments and return type)
		# convert the datatype
		N = N.astype(np.float32)
		M = M.astype(np.float32)

		self.getSourceModule('naive')
		func_conv = self.mod.get_function("conv_gpu_naive")
		height, width = N.shape
		# print(height,width)
		mask_width = M.shape[0]
		# print(mask_width)
		# the result matrix
		P = np.empty_like(N)
		# copy to device global memory
		N_d = gpuarray.to_gpu(N)
		M_d = gpuarray.to_gpu(M)
		P_d = gpuarray.to_gpu(P)
		# block and grid size
		BlockDim, GridDim = self.getBlockGridDim(N)

		func_conv(N_d, P_d, M_d, np.int32(height), np.int32(width), np.int32(mask_width), block=BlockDim, grid = GridDim)

		P = P_d.get()

		return P


	def conv_gpu_shared_mem(self):
		# implement this, note you can change the function signature (arguments and return type)
		pass

	def conv_gpu_shared_and_constant_mem(self):
		# implement this, note you can change the function signature (arguments and return type)
		pass

	def test_conv_pycuda(self):
		# implement this, note you can change the function signature (arguments and return type)
		pass

In [13]:
N = np.random.rand(6,5)
M = np.random.rand(3,3)
M = np.array([[1,0,0],[0,0,0],[0,0,0]])
# print(M)
conver = Convolution()
P_cu = conver.conv_gpu_naive(N,M)
P_sp = convolve2d(N.astype(np.float32), M.astype(np.float32), mode='same')
print(np.allclose(P_cu, P_sp))
print(N,'\n')
print(P_cu,'\n')
print(P_sp)

False
[[0.65003337 0.00969795 0.51755718 0.12056966 0.98800904]
 [0.28205336 0.25314011 0.08230002 0.04768282 0.42372167]
 [0.91771188 0.8662492  0.34550165 0.57290916 0.66774025]
 [0.12833394 0.32253131 0.23299593 0.2697578  0.20994982]
 [0.80223568 0.88153013 0.95294165 0.96739439 0.06743692]
 [0.93834359 0.86327429 0.32940206 0.89496798 0.89563055]] 

[[0.2531401 0.        0.        0.        0.       ]
 [0.8662492 0.        0.        0.        0.       ]
 [0.        0.        0.        0.        0.       ]
 [0.        0.        0.        0.        0.       ]
 [0.        0.        0.        0.        0.       ]
 [0.        0.        0.        0.        0.       ]] 

[[0.2531401  0.08230002 0.04768282 0.42372167 0.        ]
 [0.8662492  0.34550166 0.5729092  0.6677402  0.        ]
 [0.3225313  0.23299593 0.2697578  0.20994982 0.        ]
 [0.8815301  0.95294166 0.9673944  0.06743692 0.        ]
 [0.8632743  0.32940206 0.894968   0.89563054 0.        ]
 [0.         0.         0.      





In [33]:
"""
for(int i=0; i<mask_w; i++){
    // x coordinate in mask
    int row_mask = mask_w - 1 - i
    // x coordinate in N
    int row_n = row_start + i;

    for(int j=0; j<mask_w; j++){
        // y coordinate in mask
        int col_mask = mask_w - 1 - j
        // y coordinate in N
        int col_n = col_start + j;

        // if in the range of N
        if(col_n>=0 && col_n<n_w && row_n>=0 && row_n<n_h ){
            p_value += N[row_n*n_w+col_n] * M[row_mask*mask_w+col_mask];
        }
    }
    
}
"""
mask_w = 5
height, width = 20,30
row, col = 10,20
row_n_start = row - mask_w//2
col_n_start = col - mask_w//2
for i in range(mask_w):
    row_mask = mask_w - 1 - i
    row_n = row_n_start + i
    for j in range(mask_w):
        col_mask = mask_w - 1 - j
        col_n = col_n_start + j
        if (row_n>=0) & (row_n<height) & (col_n>=0) & (col_n<width):
            # print(row_n,'  ', col_n, '  ', row_mask, '  ', col_mask)
            print("(", row_n, ',', col_n, ')', '  ', row_n*width+col_n, '  ', row_mask*mask_w+col_mask)
    print('\n')




( 8 , 18 )    258    24
( 8 , 19 )    259    23
( 8 , 20 )    260    22
( 8 , 21 )    261    21
( 8 , 22 )    262    20


( 9 , 18 )    288    19
( 9 , 19 )    289    18
( 9 , 20 )    290    17
( 9 , 21 )    291    16
( 9 , 22 )    292    15


( 10 , 18 )    318    14
( 10 , 19 )    319    13
( 10 , 20 )    320    12
( 10 , 21 )    321    11
( 10 , 22 )    322    10


( 11 , 18 )    348    9
( 11 , 19 )    349    8
( 11 , 20 )    350    7
( 11 , 21 )    351    6
( 11 , 22 )    352    5


( 12 , 18 )    378    4
( 12 , 19 )    379    3
( 12 , 20 )    380    2
( 12 , 21 )    381    1
( 12 , 22 )    382    0




In [10]:
N = np.random.rand(5,6)
height, width = N.shape
print(height, width)
BlockDim = (*N.shape,1)
print(BlockDim)
M = np.array(([0,0,0],[0,1,0],[0,0,0]))
print(M)

5 6
(5, 6, 1)
[[0 0 0]
 [0 1 0]
 [0 0 0]]


In [11]:
import numpy as np

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

# DEVICE SETUP
BLOCK_SIZE = 32  # Max 32. 32**2 = 1024, max for GTX1060

def getSourceModule():
    kernel = """
    __global__ 
    void conv(const float *A, const float *B, int aw, int ah, int bw, int bh, int b_sum, float *C){

        /*Get row and column to operate on from thread coordinates*/
        int tx = threadIdx.x;
        int ty = threadIdx.y;
        
        int bx = blockIdx.x;
        int by = blockIdx.y;
        
        int row = by*blockDim.y + ty;
        int col = bx*blockDim.x + tx;
        
        /*Calculate "padding" radius of convolution kernel (distance around central pixel)*/
        int pw = (bw-1)/2;
        int ph = (bh-1)/2;

        /*If within the range of C (ie A - padding)*/
        if( row < (ah-2*ph) && col < (aw-2*pw) ) {
            
            /*Set initial pixel value*/
            int val = 0;
            
            /*For each vertical position on the kernel matrix, relative to the central pixel*/
            for(int i=-ph; i<=ph; i=i+1){
                /*Calculate zero-indexed row ID on kernel matrix*/
                int b_row = i+ph; 

                /*For each horizontal position on the kernel matrix, relative to the central pixel*/
                for(int j=-pw; j<=pw; j=j+1){
                    /*Calculate zero-indexed column ID on kernel matrix*/
                    int b_col = j+pw;

                    /*Add product of kernel value and corresponding image value to running total*/
                    val += A[ (row+ph +i)*aw + (col+pw +j) ] * B[ b_row*bw + b_col ];
                }
            }
            
            /*Copy appropriately normalised resulting pixel value to position on C matrix*/
            C[row*(aw-2*pw) + col] = val/b_sum;
        }
    }
    """
    return SourceModule(kernel)
    
# Compile kernel
mod = getSourceModule()

# Get functions
conv = mod.get_function("conv")




def convolve(a, b):
    global BLOCK_SIZE
    global conv
    
    a, b = [np.array(i).astype(np.float32) for i in [a, b]]
    
    # Matrix A 
    aw = np.int32(a.shape[1])  # Widthof in matrix
    ah = np.int32(a.shape[0])  # Height of in matrix
    
    # Matrix B (kernel)
    bw = np.int32(b.shape[1])  # Widthof in matrix
    if bw % 2 == 0:
        print("Kernel width is not an odd number! Strange things will happen...")
    bh = np.int32(b.shape[0])  # Height of in matrix
    if bh % 2 == 0:
        print("Kernel height is not an odd number! Strange things will happen...")
    b_sum = np.int32(np.absolute(b).sum())
    
    # Matrix C, subtract 2*padding, *2 because it's taken off all sides
    c = np.empty([ah-(bh-1), aw-(bw-1)])
    c = c.astype(np.float32)
    
    # Allocate memory on device
    a_gpu = cuda.mem_alloc(a.nbytes)
    b_gpu = cuda.mem_alloc(b.nbytes)
    c_gpu = cuda.mem_alloc(c.nbytes)
    
    # Copy matrix to memory
    cuda.memcpy_htod(a_gpu, a)
    cuda.memcpy_htod(b_gpu, b)

    # Set grid size from A matrix
    grid = (int(aw/BLOCK_SIZE+(0 if aw % BLOCK_SIZE is 0 else 1)), 
            int(ah/BLOCK_SIZE+(0 if ah % BLOCK_SIZE is 0 else 1)), 
                          1)
    
    # Call gpu function
    conv(a_gpu, b_gpu, aw, ah, bw, bh, b_sum, c_gpu, block=(BLOCK_SIZE, BLOCK_SIZE, 1), grid=grid)
    
    # Copy back the result
    cuda.memcpy_dtoh(c, c_gpu)
    
    # Free memory. May not be useful? Ask about this.
    a_gpu.free()
    b_gpu.free()
    c_gpu.free()
    
    # Return the result
    return c

In [16]:
N = np.random.rand(30,40)*10
M = np.random.rand(3,3)*10
# M = np.array([[1,0,0],[0,0,0],[0,0,0]])
# print(M)
P_cu = convolve(N,M)
# P_sp = convolve2d(N.astype(np.float32), M.astype(np.float32), mode='full')
# print(np.allclose(P_cu, P_sp))
print(N,'\n')
print(P_cu,'\n')
# print(P_sp)

[[6.23089391 0.12648244 9.84181206 ... 4.38169318 9.23371247 8.44490616]
 [9.0537361  1.22756095 1.12970777 ... 1.99929716 8.66201555 1.32353132]
 [2.44116335 6.99184915 3.61355678 ... 8.37585567 1.13886745 2.69067491]
 ...
 [0.43791003 3.87850462 1.00171959 ... 4.45572115 3.38204623 7.43212451]
 [1.80912737 0.2074232  5.94952211 ... 9.50651707 7.78914735 1.64754239]
 [1.48632327 3.8389076  7.07190159 ... 5.33207501 1.98681875 7.21444586]] 

[[3. 3. 5. ... 5. 4. 4.]
 [3. 4. 4. ... 4. 6. 4.]
 [2. 3. 3. ... 5. 6. 5.]
 ...
 [4. 5. 5. ... 5. 5. 5.]
 [3. 4. 4. ... 5. 5. 5.]
 [2. 3. 5. ... 4. 5. 6.]] 

