In [1]:
import numpy as np
import math
import random

## Numba JIT (on CPU)

The `numba.jit()` decorator marks a function for compilation by the Numba just-in-time (JIT) compiler. See also the `numba.njit()` decorator, which is shorthand for `jit(nopython=True)`.

In [2]:
from numba import jit
# from numba import njit

In [3]:
@jit
def test01(x, y):
    """Return sqrt(r*r + 1), where r = max(|x|, |y|) / min(|x|, |y|).

    The smaller magnitude is the divisor, so it must be non-zero.
    """
    abs_x, abs_y = abs(x), abs(y)
    smaller = min(abs_x, abs_y)
    larger = max(abs_x, abs_y)
    ratio = larger / smaller
    return math.sqrt(ratio * ratio + 1)

In [4]:
%timeit test01.py_func(10.0, 20.0)

376 ns ± 4.1 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [5]:
%timeit test01(10.0, 20.0)

The slowest run took 20.44 times longer than the fastest. This could mean that an intermediate result is being cached.
1.2 μs ± 2.02 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
@jit
def test02(n):
    """Monte Carlo estimate of pi from n random points in the unit square.

    Counts the points whose squared distance from the origin is below 1
    (i.e. inside the quarter unit circle) and scales the hit ratio by 4.
    n should be positive, since the result is divided by 0.25 * n.
    """
    inside = 0
    for trial in range(n):
        px = random.random()
        py = random.random()
        if px**2 + py**2 < 1.0:
            inside += 1
    return inside / (0.25 * n)

In [7]:
%timeit test02.py_func(1000)

221 μs ± 3.12 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
%timeit test02(1000)

5.86 μs ± 31.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


## Numba Vectorize (on CPU or on GPU)

In [9]:
from numba import cuda
from numba import vectorize

In [10]:
# Small benchmark inputs: two random float32 vectors of length d.
d = 1000
x = np.random.rand(d).astype(np.float32)
y = np.random.rand(d).astype(np.float32)

In [11]:
# numpy
def test03a(x, y):
    return x**2 + y**4 + 17

In [12]:
%timeit test03a(x, y)

9 μs ± 39.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [13]:
# numba on CPU
@vectorize
def test03b(x, y):
    """Scalar kernel x**2 + y**4 + 17, turned into a ufunc by @vectorize."""
    x_squared = x**2
    y_fourth = y**4
    return x_squared + y_fourth + 17

In [14]:
%timeit test03b(x, y)

1.64 μs ± 8.28 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [15]:
# numba on GPU
@vectorize(['float32(float32, float32)'], target='cuda')
def test03c(x, y):
    """Scalar kernel x**2 + y**4 + 17, compiled as a float32 ufunc for the CUDA target."""
    x_squared = x**2
    y_fourth = y**4
    return x_squared + y_fourth + 17

In [16]:
%timeit test03c(x, y)



558 μs ± 18.8 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


**REMARK:** Could you explain why numba on GPU is much slower than numba on CPU or even numpy (in the example above)?

In [17]:
# Larger benchmark inputs: rebinds d, x and y to random float32 vectors of
# length 1,000,000 for the test04* comparisons below.
d = 1000000
x = np.random.rand(d).astype(np.float32)
y = np.random.rand(d).astype(np.float32)

In [18]:
# numpy
def test04a(x, y):
    return np.exp((x + y)**2)

In [19]:
%timeit test04a(x, y)

1.85 ms ± 163 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [20]:
# numba on CPU
@vectorize
def test04b(x, y):
    """Scalar kernel exp((x + y)**2), turned into a ufunc by @vectorize."""
    total = x + y
    return math.exp(total**2)

In [21]:
%timeit test04b(x, y)

4.13 ms ± 131 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [22]:
# numba on GPU
@vectorize(['float32(float32, float32)'], target='cuda')
def test04c(x, y):
    """Scalar kernel exp((x + y)**2), compiled as a float32 ufunc for the CUDA target."""
    total = x + y
    return math.exp(total**2)

In [23]:
%timeit test04c(x, y)

2.79 ms ± 98.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Numba CUDA JIT (on GPU)

In [24]:
from numba import cuda

@cuda.jit(device=True)
def polar_to_cartesian(rho, theta):
    """Device function: convert polar (rho, theta) to Cartesian (x, y)."""
    return rho * math.cos(theta), rho * math.sin(theta)

@vectorize(['float32(float32, float32, float32, float32)'], target='cuda')
def polar_distance(rho1, theta1, rho2, theta2):
    """Euclidean distance between two points given in polar coordinates.

    Both points are converted to Cartesian coordinates with the
    polar_to_cartesian device function before the distance is taken.
    """
    ax, ay = polar_to_cartesian(rho1, theta1)
    bx, by = polar_to_cartesian(rho2, theta2)
    dx = ax - bx
    dy = ay - by

    return (dx**2 + dy**2)**0.5

In [25]:
# Random polar coordinates for two sets of d points, stored as float32:
# radii (r1, r2) drawn uniformly between 0.25 and 1.75, angles (t1, t2)
# drawn uniformly between -pi and pi.
d = 1000000
r1 = np.random.uniform(0.25, 1.75, d).astype(np.float32)
t1 = np.random.uniform(-np.pi, np.pi, d).astype(np.float32)
r2 = np.random.uniform(0.25, 1.75,d).astype(np.float32)
t2 = np.random.uniform(-np.pi, np.pi, d).astype(np.float32)

In [26]:
%timeit polar_distance(r1, t1, r2, t2)

5.31 ms ± 112 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


**ASSIGNMENT:** Compare with numpy and numba on CPU implementations.

### Numba on CPU

In [27]:
from numba import jit

@jit
def polar_to_cartesian_cpu(rho, theta):
    """Convert a single polar coordinate (rho, theta) to Cartesian (x, y)."""
    return rho * math.cos(theta), rho * math.sin(theta)

@jit
def polar_distance_cpu(rho1, theta1, rho2, theta2):
    """Pairwise Euclidean distances between points given in polar coordinates.

    Loops over the first axis of rho1 and returns a float32 array with one
    distance per index; all four inputs are indexed with the same position.
    """
    count = rho1.shape[0]
    distances = np.empty(count, dtype=np.float32)
    for idx in range(count):
        ax, ay = polar_to_cartesian_cpu(rho1[idx], theta1[idx])
        bx, by = polar_to_cartesian_cpu(rho2[idx], theta2[idx])
        dx = ax - bx
        dy = ay - by
        distances[idx] = (dx**2 + dy**2)**0.5
    return distances

In [28]:
%timeit polar_distance_cpu(r1, t1, r2, t2)

16.6 ms ± 440 μs per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Numpy

In [29]:
def polar_to_cartesian_numpy(rho, theta):
    """Convert polar coordinates (rho, theta) to Cartesian (x, y) with NumPy.

    Works element-wise on arrays as well as on scalars; returns the pair (x, y).
    """
    return rho * np.cos(theta), rho * np.sin(theta)

def polar_distance_numpy(rho1, theta1, rho2, theta2):
    """Element-wise Euclidean distance between points given in polar coordinates.

    Both point sets are converted with polar_to_cartesian_numpy first.
    """
    ax, ay = polar_to_cartesian_numpy(rho1, theta1)
    bx, by = polar_to_cartesian_numpy(rho2, theta2)
    dx = ax - bx
    dy = ay - by
    return np.sqrt(dx**2 + dy**2)

In [30]:
%timeit polar_distance_numpy(r1, t1, r2, t2)

21.2 ms ± 883 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Numba CUDA kernels

In [31]:
@cuda.jit
def kernel01(x, y, out):
    """Element-wise addition kernel: each thread computes one out[i] = x[i] + y[i].

    x, y and out are indexed with the thread's global 1-D index, so a launch
    only fills as many leading elements as it has threads.
    """
    thread_idx = cuda.grid(1)
    # Guard against grids that contain more threads than elements: without it,
    # the surplus threads would read and write past the end of the arrays.
    if thread_idx < out.size:
        out[thread_idx] = x[thread_idx] + y[thread_idx]

In [32]:
# Vector length as an integer: a float length makes np.arange build a float64
# array first and cannot be used in integer grid-size arithmetic.
d = int(1e6)
x = np.arange(1, d+1).astype(np.int32)
y = np.arange(2, d+2).astype(np.int32)

# Copy the inputs to the GPU and allocate an output array of the same shape/dtype.
x_device = cuda.to_device(x)
y_device = cuda.to_device(y)
z_device = cuda.device_array_like(x_device)

In [33]:
# Launch configuration: 32 blocks * 128 threads = 4096 threads in total.
# kernel01 writes one element per thread, so this launch only fills the first
# 4096 entries of z_device; the rest of the output is left unwritten.
blocks_per_grid = 32
threads_per_block = 128

kernel01[blocks_per_grid, threads_per_block](x_device, y_device, z_device)
# Wait for the kernel to finish before the result is used.
cuda.synchronize()



In [34]:
# Copy the result from the device back to the host and display it.
z_host = z_device.copy_to_host()
z_host

array([3, 5, 7, ..., 0, 0, 0], dtype=int32)

**ASSIGNMENT:** In `kernel01`, each thread processes only one coordinate of the vector. This may be inefficient for very long vectors (e.g. `d = 1e6`), because a very large number of threads must be launched. Refactor the kernel so that each thread processes multiple coordinates; for instance, thread $k$ may process every coordinate $i$ such that $i \bmod \text{number\_of\_threads} = k$. You may assume that the length of the vector is a multiple of the number of threads.

In [None]:
from numba import cuda
import numpy as np

@cuda.jit
def kernel01(x, y, out):
    """Element-wise addition on the GPU: out[i] = x[i] + y[i].

    Each thread handles several coordinates: thread k processes every index i
    with i mod (total number of launched threads) == k, so a fixed-size grid
    covers a vector of any length.
    """
    first = cuda.grid(1)       # global 1-D index of this thread
    stride = cuda.gridsize(1)  # total number of threads in the launched grid
    # The range bound also acts as the bounds check: a thread whose first index
    # is already past the end of the array performs no iterations.
    for i in range(first, out.shape[0], stride):
        out[i] = x[i] + y[i]

d = int(1e6)
x = np.arange(1, d+1).astype(np.int32)
y = np.arange(2, d+2).astype(np.int32)

x_device = cuda.to_device(x)
y_device = cuda.to_device(y)
z_device = cuda.device_array_like(x_device)

# A small fixed grid (32 * 128 = 4096 threads) is enough: every thread strides
# through the vector, so the thread count no longer has to match d.
threads_per_block = 128
blocks_per_grid = 32

kernel01[blocks_per_grid, threads_per_block](x_device, y_device, z_device)
cuda.synchronize()

z = z_device.copy_to_host()
# Check the whole result against NumPy, not just the first few entries.
assert np.array_equal(z, x + y)
print(z[:10])


[ 3  5  7  9 11 13 15 17 19 21]
