In [17]:
import cupy as cp
import numpy as np
# Sanity check: ask CuPy whether CUDA is available before running any GPU code.
cp.cuda.is_available()

True

In [3]:
# Small integer vector built through CuPy; used by the norm and device checks.
x = cp.array([1, 2, 3])

In [7]:
# Norm of x with default settings: sqrt(1 + 4 + 9), about 3.742.
# The result is displayed as a 0-d array rather than a plain Python float.
cp.linalg.norm(x)

array(3.74165739)

In [8]:
# Show which CUDA device the array x is associated with.
x.device

<CUDA Device 0>

In [11]:
# Create the array inside an explicit Device(0) context.
# Note: this rebinds `x`, replacing the 3-element array defined earlier.
with cp.cuda.Device(0):
    x = cp.arange(10)

In [12]:
# Confirm which device the array created inside the Device(0) context reports.
x.device

<CUDA Device 0>

In [16]:
# Element-wise squared difference, z = (x - y)^2, with both inputs and the
# output declared as float32.
squared_diff = cp.ElementwiseKernel(
    in_params='float32 x, float32 y',
    out_params='float32 z',
    operation='z = (x - y) * (x - y)',
    name='squared_diff',
)

In [22]:
# A (2, 5) float32 grid and a length-5 float32 row. The kernel call combines
# them into a (2, 5) result, i.e. the row is applied to each row of the grid.
xp = cp.arange(10, dtype='float32').reshape(2, 5)
yp = cp.arange(5, dtype='float32')

squared_diff(xp, yp)

array([[ 0.,  0.,  0.,  0.,  0.],
       [25., 25., 25., 25., 25.]], dtype=float32)

In [21]:
# Type-generic variant of the squared-difference kernel: the placeholder `T`
# stands for a dtype that is resolved from the arguments at call time, so
# both inputs and the output share one element type.
# The kernel is named explicitly; without a name it falls back to the default
# 'kernel', which is also the entry-point name of the RawKernel defined later
# and makes the two indistinguishable in profiler output and compile errors.
squared_diff_generic = cp.ElementwiseKernel(
    'T x, T y',
    'T z',
    'z = (x - y) * (x - y)',
    'squared_diff_generic',
)

In [23]:
# Same float32 inputs as the fixed-type kernel call; the output should match it.
squared_diff_generic(xp, yp)

array([[ 0.,  0.,  0.,  0.,  0.],
       [25., 25., 25., 25., 25.]], dtype=float32)

In [32]:
import math

# Raw CUDA C kernel: element-wise z = x + y over float64 buffers.
# The grid is rounded up to whole blocks, so it can contain more threads than
# there are elements (here 3 blocks * 4 threads = 12 threads for 10 elements).
# The `gx < n` guard stops the surplus threads from reading or writing past
# the end of the buffers.
kernel = cp.RawKernel(
    r"""
    extern "C" __global__
    void kernel(const double* x, const double* y, double* z, const int n) {
        const int gx = blockIdx.x * blockDim.x + threadIdx.x;
        if (gx < n) {
            z[gx] = x[gx] + y[gx];
        }
    }
    """,
    'kernel'
)

xp = cp.arange(10, dtype='float64')
yp = cp.arange(10, dtype='float64')
zp = cp.empty_like(xp)

n = xp.size
THREADS_PER_BLOCK = (4, 1, 1)
# Ceiling division: enough blocks to cover all n elements.
BLOCKS_PER_GRID = (math.ceil(n / THREADS_PER_BLOCK[0]), 1, 1)
kernel(
    BLOCKS_PER_GRID, THREADS_PER_BLOCK,
    # A bare Python int is passed as a 64-bit `long long`; wrap it so the
    # argument matches the kernel's 32-bit `int n` parameter.
    (xp, yp, zp, cp.int32(n))
)
zp

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18.])