In [9]:
# A function compiled with device=True can be called from inside a kernel ("on device").
import numpy as np
import cupy as cp
from cupyx import jit

# Device-side helper (device=True): hands its two arguments back in swapped
# order, so a caller unpacking the result receives (y, x).
@jit.rawkernel(device=True)
def do_a_thing(x, y):
    return y, x
    
# Kernel: exchanges x[k] and y[k] for every k below `size`, delegating each
# individual swap to the device function do_a_thing.
@jit.rawkernel()
def do_more_things(x, y, size):
    # Grid-stride loop along the x dimension: each thread begins at its global
    # index and advances by the total number of launched threads, so any
    # launch shape covers all `size` elements.
    stride = jit.gridDim.x * jit.blockDim.x
    first = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    for idx in range(first, size, stride):
        x[idx], y[idx] = do_a_thing(x[idx], y[idx])

size = 10
x = cp.random.random(size, dtype=np.float32)
y = cp.random.random(size, dtype=np.float32)

# Contents before the kernel runs (x first, then y).
print(x, y, sep="\n")

# Launch arguments are (grid, block, kernel args): 128 blocks of 1024 threads.
do_more_things((128,), (1024,), (x, y, size))

# Contents afterwards: the two arrays should have traded values.
print(x, y, sep="\n")


[0.73879254 0.36430386 0.5560938  0.13465711 0.49230078 0.8651738
 0.01775549 0.39530468 0.26143515 0.01097945]
[0.96701926 0.31582886 0.45942357 0.02675973 0.44531643 0.38146228
 0.26420838 0.38977566 0.22158593 0.36414465]
[0.96701926 0.31582886 0.45942357 0.02675973 0.44531643 0.38146228
 0.26420838 0.38977566 0.22158593 0.36414465]
[0.73879254 0.36430386 0.5560938  0.13465711 0.49230078 0.8651738
 0.01775549 0.39530468 0.26143515 0.01097945]


In [26]:
# Passing a plain Python float as a kernel scalar argument doesn't work (the result comes back all zeros); pass a NumPy scalar such as np.float32 instead.
import numpy as np
import cupy as cp
from cupyx import jit

# Kernel: scales the first `size` entries of `a` in place by the scalar `m`.
@jit.rawkernel()
def scalar_multiply(a, m, size):
    # Grid-stride loop along the x dimension: start at this thread's global
    # index and step by the total number of launched threads.
    step = jit.gridDim.x * jit.blockDim.x
    start = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    for idx in range(start, size, step):
        a[idx] = a[idx] * m

size = 5
a = cp.random.random(size, dtype=np.float32)

print(a)

# Apply the same kernel twice, printing `a` after each launch:
#   1st pass - multiplier is a NumPy float32 scalar; `a` is scaled by 5.
#   2nd pass - multiplier is a plain Python float; in the recorded output `a`
#              comes back as all zeros instead of being scaled again.
# NOTE(review): presumably the Python float is handed to the kernel with a
# different width than the compiled parameter type - TODO confirm.
for factor in (np.float32(5.0), 5.0):
    scalar_multiply((128,), (1024,), (a, factor, size))
    print(a)

[0.17959814 0.42873758 0.77541053 0.8213136  0.8284943 ]
[0.8979907 2.143688  3.8770528 4.1065683 4.1424713]
[0. 0. 0. 0. 0.]


In [117]:
%%time
x = cp.random.random(10000000, dtype=np.float32) * 2

CPU times: user 153 µs, sys: 87 µs, total: 240 µs
Wall time: 176 µs


In [102]:
%%time
y = cp.random.uniform(0, 2, 10000000, dtype=np.float32)

CPU times: user 236 µs, sys: 0 ns, total: 236 µs
Wall time: 181 µs
