In [135]:
import numpy as np
from timeit import default_timer as timer
from numba import vectorize, jit, cuda
import time

In [110]:
@numba.jit
def clamp(x, xmin, xmax):
    if x < xmin:
        return xmin
    elif x > xmax:
        return xmax
    else:
        return x
    
@numba.cuda.jit
def clamp_array(x, xmin, xmax, out):
    # Assuming 1D array
    start = numba.cuda.grid(1)
    stride = numba.cuda.gridsize(1)
    
    for i in range(start, x.shape[0], stride):
        out[i] = clamp(x[i], xmin, xmax)  # call "CPU" function here



In [111]:
x = np.linspace(-10, 10, 1000)
out_device = numba.cuda.device_array_like(x)
clamp_array[64, 64](x, -1, 1, out_device)

out = out_device.copy_to_host()
print(out[:10])
print(out[500:510])
print(out[-10:])

[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
[0.01001001 0.03003003 0.05005005 0.07007007 0.09009009 0.11011011
 0.13013013 0.15015015 0.17017017 0.19019019]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [161]:
SQRT_TWOPI = np.float32(math.sqrt(2 * math.pi))

@numba.vectorize(['float32(float32, float32, float32)'], target='cuda')
def gaussian(x, x0, sigma):
    return math.exp(-((x - x0) / sigma)**2 / 2) / SQRT_TWOPI / sigma

def gaussianno(x, x0, sigma):
    return np.exp(-((x - x0) / sigma)**2 / 2) / SQRT_TWOPI / sigma

@jit(nopython = True)
def gaussianjit(x, x0, sigma):
    return np.exp(-((x - x0) / sigma)**2 / 2) / SQRT_TWOPI / sigma

In [159]:
x = np.linspace(-3, 3, 100000000, dtype=np.float32)

start = time.time()
for i in range(100):
    g = gaussian(x, 0, 1)  # 1D result
    x2d = x.reshape((10000,10000))
    g2d = gaussian(x2d, 0, 1) # 2D result

end = time.time()
print(end - start)

start = time.time()
for i in range(100):
    g2 = gaussianno(x, 0, 1)  # 1D result
    x2d2 = x.reshape((10000,10000))
    g2d2 = gaussianno(x2d2, 0, 1) # 2D result

end = time.time()
print(end - start)


152.97770643234253
291.8466238975525


In [162]:
start = time.time()
for i in range(100):
    g2 = gaussianno(x, 0, 1)  # 1D result
    x2d2 = x.reshape((10000,10000))
    g2d2 = gaussianjit(x2d2, 0, 1) # 2D result

end = time.time()
print(end - start)

197.19673657417297


In [115]:
@numba.cuda.jit
def gpu_cos(x, out):
    # Assuming 1D array
    start = numba.cuda.grid(1)
    stride = numba.cuda.gridsize(1)
    
    for i in range(start, x.shape[0], stride):
        out[i] = math.cos(x[i])
        
def do_cos(x):
    out = numba.cuda.device_array_like(x)
    gpu_cos[64, 64](x, out)
    return out.copy_to_host()

In [116]:
# check if works locally first
test_x = np.random.uniform(-10, 10, 1000).astype(np.float32)
result = do_cos(test_x)
np.testing.assert_allclose(result, np.cos(test_x), rtol=1e-6)