In [1]:
from numba import cuda
import numpy as np
import IPython

In [2]:
# Problem size and host-side (NumPy) input/output buffers.
n = 5_000_000
x = np.arange(n).astype(np.float32)  # 0, 1, 2, ... as float32
y = 2 * x
# Zero-initialise the output: later cells inspect elements the kernel never
# writes, and np.empty_like would leave arbitrary memory contents there.
out = np.zeros_like(x)

In [3]:
# Sanity check: show the first three values of x.
x[:3]

array([0., 1., 2.], dtype=float32)

In [4]:
# Sanity check: show the first three values of y (defined as 2 * x).
y[:3]

array([0., 2., 4.], dtype=float32)

In [5]:
@cuda.jit
def add_kernel_long(x, y, out):
    """Element-wise add: each thread computes ``out[i] = x[i] + y[i]`` for one index.

    The global index ``i`` is assembled by hand from the thread index, the
    block index and the block width. Threads whose index is not below
    ``len(out)`` do nothing.
    """
    thread_in_block = cuda.threadIdx.x  # this thread's position within its 1D block
    block_in_grid = cuda.blockIdx.x     # this block's position within the 1D grid
    block_width = cuda.blockDim.x       # number of threads per block

    # Flatten (block, thread) into a single global element index.
    i = block_in_grid * block_width + thread_in_block

    # Guard: surplus threads past the end of the output exit immediately.
    # Assumes x and y are the same length as out.
    if i >= len(out):
        return
    out[i] = x[i] + y[i]

In [7]:
# Launch configuration: 30 blocks of 128 threads each.
threads_per_block, blocks_per_grid = 128, 30

In [9]:
# Total number of threads launched = blocks per grid * threads per block.
# Computed from the launch configuration so it cannot drift from it.
blocks_per_grid * threads_per_block

3840

In [8]:
# Launch with the small fixed grid. x, y and out are host NumPy arrays here,
# so Numba transfers them to the device and copies them back around the call.
add_kernel_long[blocks_per_grid, threads_per_block](x, y, out)
print(out[:10])

[ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27.]


In [10]:
# Only blocks_per_grid * threads_per_block = 3840 threads run, so indices
# >= 3840 are never written by the kernel and still hold the buffer's
# initial contents.
add_kernel_long[blocks_per_grid, threads_per_block](x, y, out)
print(out[3840:3850])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [16]:
# NOTE(review): n / threads_per_block is 39062.5 here and int() truncates it,
# so this many blocks covers 39062 * 128 = 4,999,936 elements -- 64 short of n.
# A grid meant to cover every element needs the block count rounded up.
int(n / threads_per_block)

39062

In [17]:
# Size the grid to cover all n elements. The block count is rounded UP so the
# final, partially filled block is launched too; the kernel's bounds check
# keeps the surplus threads in that block from writing out of range.
add_kernel_long[(n + threads_per_block - 1) // threads_per_block, threads_per_block](x, y, out)
print(out[3840:3850])

[11520. 11523. 11526. 11529. 11532. 11535. 11538. 11541. 11544. 11547.]


In [18]:
# Shut down the running IPython kernel without restarting it (restart=False).
# NOTE(review): every cell below this point needs a fresh kernel, so this
# notebook cannot be executed top-to-bottom with "Run All".
app = IPython.Application.instance()
app.kernel.do_shutdown(False)

{'status': 'ok', 'restart': False}

In [1]:
from numba import cuda
import numpy as np
import IPython

In [2]:
# Problem size and host-side (NumPy) input/output buffers.
n = 5_000_000
x = np.arange(n).astype(np.float32)  # 0, 1, 2, ... as float32
y = 2 * x
# Zero-initialise the output: later cells inspect elements the kernel never
# writes, and np.empty_like would leave arbitrary memory contents there.
out = np.zeros_like(x)

In [5]:
@cuda.jit
def add_kernel_short(x, y, out):
    """Element-wise add: each thread computes ``out[idx] = x[idx] + y[idx]``.

    ``cuda.grid(1)`` supplies the thread's absolute position in the 1D grid.
    Threads whose index is not below ``len(out)`` do nothing.
    """
    idx = cuda.grid(1)

    # Guard: surplus threads past the end of the output exit immediately.
    # Assumes x and y are the same length as out.
    if idx >= len(out):
        return
    out[idx] = x[idx] + y[idx]

In [6]:
# Launch configuration: 30 blocks of 128 threads each.
threads_per_block, blocks_per_grid = 128, 30

In [7]:
# Launch with the small fixed grid. x, y and out are host NumPy arrays here,
# so Numba transfers them to the device and copies them back around the call.
add_kernel_short[blocks_per_grid, threads_per_block](x, y, out)
print(out[:10])

[ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27.]


In [8]:
# Only blocks_per_grid * threads_per_block = 3840 threads run, so indices
# >= 3840 are never written by the kernel and still hold the buffer's
# initial contents.
add_kernel_short[blocks_per_grid, threads_per_block](x, y, out)
print(out[3840:3850])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [11]:
# Size the grid to cover all n elements. The block count is rounded UP so the
# final, partially filled block is launched too; the kernel's bounds check
# keeps the surplus threads in that block from writing out of range.
add_kernel_short[(n + threads_per_block - 1) // threads_per_block, threads_per_block](x, y, out)
print(out[3840:3850])

[11520. 11523. 11526. 11529. 11532. 11535. 11538. 11541. 11544. 11547.]


In [12]:
# Copy both inputs to the GPU once, and allocate a device-side output buffer
# with the same shape and dtype as x, so later launches need no host transfers.
x_device, y_device = cuda.to_device(x), cuda.to_device(y)
out_device = cuda.device_array_like(x)

In [None]:
%time add_kernel_short[int(n / blocks_per_grid), threads_per_block](x, y, out)

In [13]:
%time add_kernel_short[int(n / blocks_per_grid), threads_per_block](x, y, out)

CPU times: user 24.8 ms, sys: 4.53 ms, total: 29.3 ms
Wall time: 27.8 ms


In [14]:
%time add_kernel_short[int(n / blocks_per_grid), threads_per_block](x_device, y_device, out_device)

CPU times: user 219 µs, sys: 226 µs, total: 445 µs
Wall time: 383 µs


In [15]:
%time add_kernel_short[int(n / blocks_per_grid), threads_per_block](x_device, y_device, out_device); out_device.copy_to_host()

CPU times: user 5.26 ms, sys: 9.34 ms, total: 14.6 ms
Wall time: 13.1 ms


array([0.0000000e+00, 3.0000000e+00, 6.0000000e+00, ..., 1.4999991e+07,
       1.4999994e+07, 1.4999997e+07], dtype=float32)

In [None]:
cuda.synchronize()
%time add_kernel_short[int(n / blocks_per_grid), threads_per_block](x_device, y_device, out_device)
cuda.synchronize()

In [None]:
# GPU input/output arrays, include explicit synchronization in timing
cuda.synchronize()
%time add_kernel_short[int(n / blocks_per_grid), threads_per_block](x_device, y_device, out_device); cuda.synchronize()