In [1]:
import pyopencl as cl
import numpy as np
import pyopencl.array as array
import matplotlib.pyplot as plt
from pyopencl import Buffer, MemoryObject
from scipy.signal import convolve2d
import time


In [4]:
class Convolution:
	def __init__(self):
		"""
		Attributes for instance of clModule
		Includes OpenCL context, command queue, kernel code.
		"""

		# Get platform and device property
		NAME = 'NVIDIA CUDA'
		platforms = cl.get_platforms()
		devs = None
		for platform in platforms:
			if platform.name == NAME:
				devs = platform.get_devices()       

		# Create Context:
		self.ctx = cl.Context(devs)

		# Setup Command Queue:
		self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

		# kernel - Write your kernel code here or in a .cu file and link it here.
		kernel_code = """
		//****************************************************************************************
		// global memory function

		__kernel void conv_gpu_naive_openCL(__global float *N, __global float *M, __global float *P, 
											const int width, const int height, const int mask_width){
			
			// position of threads
			//const int row = get_local_size(0)*get_gruop_id(0) + get_local_id(0);
			//const int col = get_local_size(1)*get_gruop_id(1) + get_local_id(1);
			const int row = get_global_id(0);
			const int col = get_global_id(1);

			// copy to register
			const int mask_w = mask_width;
			const int n_w = width;
			const int n_h = height;

			// start point of the kernel
			const int col_start = col - mask_w/2;
			const int row_start = row - mask_w/2;

			float p_value = 0.0f;

			// see if the thread in the range of N
			if((row<n_h)&&(col<n_w)){

				for(int i=0; i<mask_w; i++){			// in y direction of mask

					int row_mask = mask_w - 1 - i;		// x coordinate in mask
					int row_n = row_start + i;			// x coordinate in N
					
					for(int j=0; j<mask_w; j++){		// in x direction of mask

						int col_mask = mask_w - 1 - j;	// y coordinate in mask
						int col_n = col_start + j;		// y coordinate in N

						// if in the range of N
						if ((col_n>=0) && (col_n<n_w) && (row_n>=0) && (row_n<n_h)){
							p_value += N[row_n*n_w+col_n] * M[row_mask*mask_w+col_mask];
						}
					}
				}
				P[row*n_w+col] = p_value;
			}
		}

		//****************************************************************************************

		// predefined parameters
		#define MASK_SIZE 5
		#define TILE_SIZE 12
		#define TILE_SIZE_PAD (TILE_SIZE + MASK_SIZE - 1)

		//****************************************************************************************
		// local memory function (which is shared memory in cuda)

		__kernel void conv_gpu_shared_mem_openCL(__global float *N, __global float *M, __global float *P, 
											const int width, const int height){
		
			// copy to register
			const int n_w = width;
			const int n_h = height;

			__local float N_ds[TILE_SIZE][TILE_SIZE];		// copy input matrix to local memory (which is shared memory in cuda)
			//__local float M_ds[MASK_SIZE][MASK_SIZE];		// copy mask to lcoal memory

			// current position of thread
			const int tx = get_local_id(0);
			const int ty = get_local_id(1);

			// current position of input matrix
			const int col = tx + get_group_id(0)*TILE_SIZE;
			const int row = ty + get_group_id(1)*TILE_SIZE;

			// start point of mask
			const int col_start = col - MASK_SIZE/2;
			const int row_start = row - MASK_SIZE/2;

			// copy mask to shared memory
			//for(int i=0; i<MASK_SIZE; i++){				// in x direction of mask
				//for(int j=0; j<MASK_SIZE; j++){			// in y direction of mask
					//M_ds[i][j] = M[i*MASK_SIZE+j];
				//}
			//}

			// copy each tiled matrix into local memory
			if((row_start>=0) && (row_start<n_h) && (col_start>=0) && (col_start<n_w)){
				N_ds[ty][tx] = N[row_start*n_w+col_start];
			}
			else{
				N_ds[ty][tx] = 0.0f;
			}

			// need to wait all the thread have done copy
    		barrier(CLK_LOCAL_MEM_FENCE);
			
			float p_value = 0.0f;
			if((ty<TILE_SIZE) && (tx<TILE_SIZE)){			// in range of tile

				// in y direction of mask
				for(int i=0; i<MASK_SIZE; i++){
					int row_n = ty+i;						// y coordinate in N_ds
					int row_mask = MASK_SIZE - 1 - i;		// y coordinate in mask

					// in x direction of mask
					for(int j=0; j<MASK_SIZE; j++){
						int col_n = tx+j;					// x coordinate in N_ds
						int col_mask = MASK_SIZE - 1 - j;	// x coordinate in mask
						p_value += N_ds[row_n][col_n] * M[row_mask*MASK_SIZE+col_mask];
					}
				}
				if((row<n_h) && (col<n_w)){					// in range of input matrix
					P[row*n_w+col] = p_value;
				}
			}
		}


		//****************************************************************************************


		""" 

		# Build kernel code
		self.prg = cl.Program(self.ctx, kernel_code).build()
		self.mask_size = 5
		self.tile_size = 12
		self.tile_size_pad = self.tile_size + self.mask_size - 1

	def conv_gpu_naive(self, N, M):
		"""
		parallel convolution using global memory
		visit input matrix and mask from global memory in the kernel
		mask can be any size

		params:
		- N: input matrix
		- M: mask

		return:
		- P: result
		- time
		"""
		import time
		# start to record
		t_start = time.time()

		height, width = N.shape
		mask_width = M.shape[0]

		P = np.empty_like(N)

		# device memory allocation
		N_d = cl.array.to_device(self.queue, N)
		M_d = cl.array.to_device(self.queue, M)
		# P_d = cl.array.empty_like(N)
		P_d = cl.array.to_device(self.queue, P)
		
		GlobalSize = (height, width, 1)
		# workgroup size
		# Local_size = (height//workitem_size+1, width//workitem_size+1,1)
		self.prg.conv_gpu_naive_openCL(self.queue, GlobalSize, None, N_d.data, M_d.data, P_d.data, np.int32(width), 
										np.int32(height), np.int32(mask_width))

		# wait for execution to complete.
		self.queue.finish()

		# Copy output from GPU to CPU
		P = np.array(P_d.get())

		# Record execution time 
		time = time.time() - t_start

		return np.array(P), time*1e3




	def conv_gpu_shared_mem(self, N, M):
		"""
		parallel convolution using local memory
		visit input matrix and mask from local memory in the kernel
		mask have to be 5*5

		params:
		- N: input matrix
		- M: mask

		return:
		- P: result
		- time
		"""
		import time
		# start to record
		t_start = time.time()

		height, width = N.shape

		P = np.empty_like(N)

		# device memory allocation
		N_d = cl.array.to_device(self.queue, N)
		M_d = cl.array.to_device(self.queue, M)
		# P_d = cl.array.empty_like(N)
		P_d = cl.array.to_device(self.queue, P)
		
		# grid size   
		GlobalSize = ((height//self.tile_size+1)*self.tile_size_pad, (width//self.tile_size+1)*self.tile_size_pad)
		# block size
		LocalSize = (self.tile_size_pad, self.tile_size_pad)

		# k = self.prg.conv_gpu_shared_mem_openCL

		# k.set_arg(0, N_d.data)
		# k.set_arg(1, M_d.data)
		# k.set_arg(2, P_d.data)
		# k.set_arg(3, np.int32(width))
		# k.set_arg(4, np.int32(height))
		# cl.enqueue_nd_range_kernel(self.queue, k, global_work_size = (height, width, 1), local_work_size = LocalSize)

		# print(LocalSize)
		# print(GlobalSize)
		# workgroup size
		# Local_size = (height//workitem_size+1, width//workitem_size+1,1)
		event = self.prg.conv_gpu_shared_mem_openCL(self.queue, GlobalSize, LocalSize, N_d.data, M_d.data, P_d.data, 
											np.int32(width), np.int32(height))

		event.wait()

		# wait for execution to complete.
		# self.queue.finish()

		# Copy output from GPU to CPU
		P = P_d.get()

		# Record execution time 
		time = time.time() - t_start

		return np.array(P), time*1e3

	def conv_gpu_shared_and_constant_mem(self):
		# implement this, note you can change the function signature (arguments and return type)
		pass

	def test_conv_pyopencl(self):
		# implement this, note you can change the function signature (arguments and return type)
		pass

	def conv_scipy(self, N, M):
		"""
		Serial convolution using scipy.signal.convolve2d

		params:
		- N: input matrix
		- M: mask

		return:
		- P: result
		- time
		"""
		start = time.time()

		P = convolve2d(N.astype(np.float32), M.astype(np.float32), mode='same')

		return P, (time.time()-start)*1000

In [5]:
n_size = 1024
N = np.random.rand(n_size,n_size).astype(np.float32)
M = np.random.rand(5,5).astype(np.float32)

conver = Convolution()

# P_cl_naive, t_naive = conver.conv_gpu_naive(N,M)
P_cl_shared, t_shared = conver.conv_gpu_shared_mem(N,M)

P_scipy, t_scipy = conver.conv_scipy(N,M)

# print(np.allclose(P_cl_naive, P_scipy))
print(np.allclose(P_cl_shared, P_scipy))
# print(P_cl_naive)
# print(P_scipy)

LogicError: clWaitForEvents failed: <unknown error -9999>