In [4]:
import cupy as cp
import numpy as np
assert cp.cuda.is_available()

In [5]:
import torch
torch.cuda.is_available()

True

In [6]:
x = cp.array([1,2,3])

In [15]:
import cupy
import torch

cupy_custom_kernel_fwd = cupy.RawKernel(
    r"""
extern "C" __global__
void cupy_custom_kernel_fwd(const float* x, float* y, int size) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < size)
        y[tid] = log(x[tid]);
}
""",
    "cupy_custom_kernel_fwd",
)


cupy_custom_kernel_bwd = cupy.RawKernel(
    r"""
extern "C" __global__
void cupy_custom_kernel_bwd(const float* x, float* gy, float* gx, int size) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < size)
        gx[tid] = gy[tid] / x[tid];
}
""",
    "cupy_custom_kernel_bwd",
)


class CuPyLog(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.input = x
        # Enforce contiguous arrays to simplify RawKernel indexing.
        cupy_x = cupy.ascontiguousarray(cupy.from_dlpack(x.detach()))
        cupy_y = cupy.empty(cupy_x.shape, dtype=cupy_x.dtype)
        x_size = cupy_x.size
        bs = 128
        cupy_custom_kernel_fwd(
            (bs,), ((x_size + bs - 1) // bs,), (cupy_x, cupy_y, x_size)
        )
        # the ownership of the device memory backing cupy_y is implicitly
        # transferred to torch_y, so this operation is safe even after
        # going out of scope of this function.
        torch_y = torch.from_dlpack(cupy_y)
        return torch_y

    @staticmethod
    def backward(ctx, grad_y):
        # Enforce contiguous arrays to simplify RawKernel indexing.
        cupy_input = cupy.from_dlpack(ctx.input.detach()).ravel()
        cupy_grad_y = cupy.from_dlpack(grad_y.detach()).ravel()
        cupy_grad_x = cupy.zeros(cupy_grad_y.shape, dtype=cupy_grad_y.dtype)
        gy_size = cupy_grad_y.size
        bs = 128
        cupy_custom_kernel_bwd(
            (bs,),
            ((gy_size + bs - 1) // bs,),
            (cupy_input, cupy_grad_y, cupy_grad_x, gy_size),
        )
        # the ownership of the device memory backing cupy_grad_x is implicitly
        # transferred to torch_y, so this operation is safe even after
        # going out of scope of this function.
        torch_grad_x = torch.from_dlpack(cupy_grad_x)
        return torch_grad_x
    
    
inp = torch.randn(10).cuda()
outp = CuPyLog.apply(inp)

In [16]:
outp

tensor([    nan,     nan, -0.8041,     nan,  0.3397,     nan,     nan,     nan,
        -2.3179,     nan], device='cuda:0')

In [3]:
cp.linalg.norm(x)

array(3.74165739)

In [8]:
x.device

<CUDA Device 0>

In [11]:
with cp.cuda.Device(0):
    x = cp.arange(10)

In [12]:
x.device

<CUDA Device 0>

In [16]:
# from cupy import ElementwiseKernel

squared_diff = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = (x - y) * (x - y)',
    'squared_diff'
)

In [22]:
xp = cp.arange(10, dtype=np.float32).reshape(2,5)
yp = cp.arange(5, dtype=np.float32)

squared_diff(xp, yp)

array([[ 0.,  0.,  0.,  0.,  0.],
       [25., 25., 25., 25., 25.]], dtype=float32)

In [21]:
squared_diff_generic = cp.ElementwiseKernel(
    'T x, T y',
    'T z',
    'z = (x - y) * (x - y)',
)

In [23]:
squared_diff_generic(xp, yp)

array([[ 0.,  0.,  0.,  0.,  0.],
       [25., 25., 25., 25., 25.]], dtype=float32)

In [32]:
import math
kernel = cp.RawKernel(
    r"""
    extern "C" __global__
    void kernel(double* x, double* y, double* z) {
        const int tx = threadIdx.x;
        const int gx = tx + blockIdx.x * blockDim.x;
        z[gx] = x[gx] + y[gx];
    }
    """,
    'kernel'
)

xp = cp.arange(10, dtype='float64')
yp = cp.arange(10, dtype='float64')
zp = cp.empty_like(xp)

n = len(xp)
THREADS_PER_BLOCK = (4,1,1)
BLOCKS_PER_GRID = (math.ceil(n / THREADS_PER_BLOCK[0]), 1, 1)
kernel(
    BLOCKS_PER_GRID, THREADS_PER_BLOCK,
    (xp,yp,zp)
)
zp

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18.])