In [22]:
!pip install -r requirements.txt



In [23]:
import numpy as np
from numba import cuda, float32
import time

In [24]:
def myconvolve2d_cpu(img, kernel):
    """Apply a 2-D filter to an image on the CPU with zero padding.

    The kernel is slid over a zero-padded copy of ``img`` and, at every pixel,
    the element-wise product of the kernel and the covered region is summed.
    The kernel is applied as given (it is NOT flipped), so strictly speaking
    this is a cross-correlation; for symmetric kernels the two are identical.

    Args:
        img: 2-D array-like image.
        kernel: 2-D NumPy array whose height and width are both odd, so the
            kernel has a well-defined centre pixel.

    Returns:
        A NumPy array with the same shape as ``img``. Its dtype is the
        promotion of the image and kernel dtypes, so an integer image filtered
        with a fractional kernel is not truncated.

    Raises:
        ValueError: if either kernel dimension is even.
    """
    img = np.asarray(img)
    kernel_height = kernel.shape[0]
    kernel_width = kernel.shape[1]
    if kernel_height % 2 == 0 or kernel_width % 2 == 0:
        raise ValueError(
            f"kernel dimensions must be odd, got {kernel_height}x{kernel_width}"
        )
    pad_h = kernel_height // 2
    pad_w = kernel_width // 2

    height = img.shape[0]
    width = img.shape[1]
    padded = np.pad(img, ((pad_h, pad_h), (pad_w, pad_w)), mode='constant')

    # Allocate the output at the original image size. Writing into a padded
    # buffer and cropping with out[pad:-pad] breaks when pad == 0 (1xN / Nx1
    # kernels), because the slice 0:-0 is empty.
    out = np.zeros((height, width), dtype=np.result_type(img.dtype, kernel.dtype))

    for i in range(height):
        for j in range(width):
            # Output pixel (i, j) is centred on padded pixel (i+pad_h, j+pad_w),
            # so the covered window starts at (i, j) in padded coordinates.
            region = padded[i:i + kernel_height, j:j + kernel_width]
            out[i, j] = np.sum(region * kernel)

    return out

@cuda.jit
def gpu_convolve2d(img, kernel, out):
    """CUDA kernel: filter ``img`` with ``kernel``, one output pixel per thread.

    Each thread takes its 2-D grid position ``(x, y)`` as the (row, column) of
    the output pixel it computes. Taps that fall outside the image contribute
    zero (zero padding), so every element of ``out`` is written, including the
    borders. The kernel is applied without flipping (cross-correlation).

    Args:
        img: 2-D device array holding the input image.
        kernel: 2-D device array holding the filter; its centre is taken to be
            at ``(kernel.shape[0] // 2, kernel.shape[1] // 2)``, which is only
            the true centre for odd dimensions.
        out: 2-D device array receiving the result; indexed with the same
            (row, column) coordinates as ``img``, so it must be at least as
            large as ``img``.
    """
    x, y = cuda.grid(2)

    H, W = img.shape
    # Threads of the grid that fall outside the image have nothing to compute.
    if x >= H or y >= W:
        return

    kh, kw = kernel.shape
    pad_h = kh // 2
    pad_w = kw // 2

    val = 0.0
    for i in range(kh):
        r = x + i - pad_h
        if r >= 0 and r < H:
            for j in range(kw):
                c = y + j - pad_w
                if c >= 0 and c < W:
                    # Tuple indexing; avoids creating an intermediate row view.
                    val += img[r, c] * kernel[i, j]

    # Single store after the full accumulation (not once per kernel row).
    out[x, y] = val

def benchmark(shape=(1024, 1024), seed=0):
    """Time the CPU and GPU filters on a random image and compare their output.

    Prints the CPU wall time, the GPU kernel wall time (excluding host/device
    transfers and JIT compilation), and the mean absolute error between the
    two results over the interior pixels.

    Args:
        shape: (height, width) of the random float32 test image.
        seed: seed for the random generator, so repeated runs use the same image.
    """
    # Image and kernel
    rng = np.random.default_rng(seed)
    img = rng.random(shape, dtype=np.float32)
    kernel = np.array([[1, 0, -1],
                       [1, 0, -1],
                       [1, 0, -1]], dtype=np.float32)  # Prewitt horizontal-gradient filter

    # CPU time
    start = time.perf_counter()
    out_cpu = myconvolve2d_cpu(img, kernel)
    cpu_time = time.perf_counter() - start
    print(f"CPU time: {cpu_time:.4f}s")

    # GPU mem alloc
    d_img = cuda.to_device(img)
    d_kernel = cuda.to_device(kernel)
    d_out = cuda.device_array_like(img)

    # configure grid/block: enough blocks to cover every pixel (ceil division)
    threads_per_block = (16, 16)  # threads per block in x and y directions
    blocks_per_grid = tuple(
        (dim + tpb - 1) // tpb for dim, tpb in zip(img.shape, threads_per_block)
    )

    # Warm-up launch: the first call of a @cuda.jit function compiles it, which
    # would otherwise be included in (and dominate) the measured GPU time.
    gpu_convolve2d[blocks_per_grid, threads_per_block](d_img, d_kernel, d_out)
    cuda.synchronize()

    # GPU time
    start = time.perf_counter()
    gpu_convolve2d[blocks_per_grid, threads_per_block](d_img, d_kernel, d_out)
    cuda.synchronize()  # launches are asynchronous; wait before stopping the clock
    gpu_time = time.perf_counter() - start
    print(f"GPU time: {gpu_time:.4f}s")

    out_gpu = d_out.copy_to_host()
    # Both outputs are image-sized. Compare interior pixels only, cropping BOTH
    # arrays so their shapes match; border handling may differ between the CPU
    # and GPU implementations.
    interior = (slice(1, -1), slice(1, -1))
    print("mean absolute error: ", np.mean(np.abs(out_cpu[interior] - out_gpu[interior])))

In [None]:
# Run the CPU-vs-GPU benchmark defined above.
benchmark()