## Example of using Numba for adding two vectors and computing average of elements in a vector.

Make sure to change the run time to have GPU.

In [1]:
from numba import cuda
import numpy as np
import math
import time
import matplotlib.pyplot as plt

gpu = cuda.get_current_device()
print("name = %s" % gpu.name)
print("maxThreadsPerBlock = %s" % str(gpu.MAX_THREADS_PER_BLOCK))
print("maxBlockDimX = %s" % str(gpu.MAX_BLOCK_DIM_X))
print("maxBlockDimY = %s" % str(gpu.MAX_BLOCK_DIM_Y))
print("maxBlockDimZ = %s" % str(gpu.MAX_BLOCK_DIM_Z))
print("maxGridDimX = %s" % str(gpu.MAX_GRID_DIM_X))
print("maxGridDimY = %s" % str(gpu.MAX_GRID_DIM_Y))
print("maxGridDimZ = %s" % str(gpu.MAX_GRID_DIM_Z))
print("maxSharedMemoryPerBlock = %s" % str(gpu.MAX_SHARED_MEMORY_PER_BLOCK))

name = b'NVIDIA GeForce RTX 3070 Ti Laptop GPU'
maxThreadsPerBlock = 1024
maxBlockDimX = 1024
maxBlockDimY = 1024
maxBlockDimZ = 64
maxGridDimX = 2147483647
maxGridDimY = 65535
maxGridDimZ = 65535
maxSharedMemoryPerBlock = 49152


## Vector summation
This can be done without synchronization across blocks and threads. When the kernel is called for the first time, it takes sometime to compile the function to run on GPU, but subsequent calls are faster.

In [2]:
@cuda.jit
def add_numba(a, b, c):
    # Obtain the thread id w.r.t. the grid
    # Alternatively: threadIdx.x + (blockIdx.x * blockDim.x)
    tid = cuda.grid(1) 
    size = len(c)

    if tid < size:
        c[tid] = a[tid] + b[tid]


* The kernel is invoked by also specifying the grid and block dimensions in a square bracket. 
* Kernels do not have return values, so variables have to be passed in to store the results.
* By default, the variables are copied to and from device automatically.

In [3]:
a = np.random.randint(low=0, high=3, size=3)
b = np.random.randint(low=0, high=3, size=3)
c = np.zeros_like(a)
num_blocks = 1
num_threads = len(a)
assert num_threads<gpu.MAX_THREADS_PER_BLOCK

add_numba[num_blocks, num_threads](a, b, c)
print("{}+{}={}".format(a, b, c))



[0 2 0]+[0 1 2]=[0 3 2]




Memory can be pre-allocated on device and copied back to host.

In [4]:
# Specify a and b in host memory
N = int(1e8)
a_cpu = np.random.random(N)
b_cpu = np.random.random(N)

# Move a, b to device, and allocate memory for c on device
a_gpu = cuda.to_device(np.random.random(N))
b_gpu = cuda.to_device(np.random.random(N))
c_gpu = cuda.device_array_like(a_cpu)

In [5]:
# Specify the number of threads and blocks
num_threads_per_block=100
num_blocks = int(N//num_threads_per_block)+1

# Time GPU code
t0 = time.time()
add_numba[num_blocks, num_threads_per_block](a_gpu, b_gpu, c_gpu)
print("First run took {}s to compute on GPU.".format(time.time()-t0))
t0 = time.time()
add_numba[num_blocks, num_threads_per_block](a_gpu, b_gpu, c_gpu)
print("Second run took {}s to compute on GPU.".format(time.time()-t0))

t0 = time.time()
_ = c_gpu.copy_to_host()
print("Took {}s to copy from GPU to CPU.".format(time.time()-t0))

# Time CPU code
t0 = time.time()
c = a_cpu+b_cpu
print("Took {}s to compute on CPU.".format(time.time()-t0))


First run took 0.04236555099487305s to compute on GPU.
Second run took 0.000324249267578125s to compute on GPU.
Took 0.1907796859741211s to copy from GPU to CPU.
Took 0.16444754600524902s to compute on CPU.


## Compute sum. Need synchronization.

In [6]:
@cuda.jit(fastmath=True)
def mean_numba(v):
    tid = cuda.threadIdx.x
    numel = len(v)
    num_threads = cuda.blockDim.x
    repeat = int(math.ceil(numel/num_threads))

    if tid<numel:
        s = 1
        while s < numel:
            for ri in range(repeat):
                gi = tid + ri*num_threads
                if (gi % (2*s) == 0) and ((gi+s)<numel):
                    v[gi] += v[gi+s]
            s *= 2
            cuda.syncthreads()
        
    if tid==0:
        v[0] = v[0]/numel
    

v_cpu = c_gpu.copy_to_host()
v_gpu = cuda.to_device(v_cpu)

In [7]:
t0 = time.time()
num_threads_per_block = 100
mean_numba[1,num_threads_per_block](c_gpu)
print("First run took {}s on GPU".format(time.time()-t0))
t0 = time.time()
num_threads_per_block = 100
mean_numba[1,num_threads_per_block](c_gpu)
print("Second run took {}s on GPU".format(time.time()-t0))

t0 = time.time()
mean_cpu = np.mean(v_cpu)
print("{}s on CPU".format(time.time()-t0))

First run took 0.07258892059326172s on GPU
Second run took 0.00034928321838378906s on GPU
0.0423429012298584s on CPU




In [8]:
%timeit -n 5 -r 5 mean_numba[1,num_threads_per_block](v_gpu)

%timeit -n 5 -r 5 np.mean(v_cpu)

The slowest run took 5.72 times longer than the fastest. This could mean that an intermediate result is being cached.
38.9 µs ± 33 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)
41.3 ms ± 599 µs per loop (mean ± std. dev. of 5 runs, 5 loops each)
