I'd prefer columns over 3d arrays.  Is the norm just as fast?  Yes.

In [1]:
import numpy as np
import cupy as cp
from cupyx import jit
print(f"CuPy version {cp.__version__}")

CuPy version 11.0.0


In [2]:
size = 100000000
mempool = cp.get_default_memory_pool()
mempool.free_all_blocks()

Using linalg.norm

In [3]:
x_v = cp.random.random((size, 3), dtype=np.float32)
print(f"mempool.used_bytes {mempool.used_bytes()}")

mempool.used_bytes 1200000000


In [4]:
%%time
norm = cp.linalg.norm(x_v, axis=1)

CPU times: user 80.9 ms, sys: 8.08 ms, total: 88.9 ms
Wall time: 89.7 ms


In [5]:
print(f"mempool.used_bytes {mempool.used_bytes()}")
del norm
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")

mempool.used_bytes 1600000000
mempool.used_bytes 1200000000


In [6]:
%%time
norm = cp.linalg.norm(x_v, axis=1)

CPU times: user 2.02 ms, sys: 334 µs, total: 2.35 ms
Wall time: 24.1 ms


In [7]:
del norm, x_v

using cp.sqrt

In [8]:
x_col = cp.random.random(size, dtype=np.float32)
y_col = cp.random.random(size, dtype=np.float32)
z_col = cp.random.random(size, dtype=np.float32)

In [9]:
@cp.fuse()
def norm_from_cols(x,y,z):
    return cp.sqrt(x * x + y * y + z * z)

In [10]:
%%time
norm_col = norm_from_cols(x_col, y_col, z_col)

CPU times: user 633 ms, sys: 144 µs, total: 633 ms
Wall time: 633 ms


In [11]:
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")
del norm_col
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")

mempool.used_bytes 1600000000
mempool.used_bytes 1200000000


In [12]:
%%time
norm_col = norm_from_cols(x_col, y_col, z_col)

CPU times: user 74 µs, sys: 0 ns, total: 74 µs
Wall time: 76.3 µs


In [13]:
del norm_col

using rawkernel

In [14]:
@jit.rawkernel()
def normalize_in_place(x, y, z, size):
    tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    ntid = jit.gridDim.x * jit.blockDim.x
    for i in range(tid, size, ntid):
        norm = cp.sqrt(x[i] * x[i] + y[i] * y[i] + z[i] * z[i])
        x[i] = x[i] / norm
        y[i] = y[i] / norm
        z[i] = z[i] / norm

  cupy._util.experimental('cupyx.jit.rawkernel')


In [15]:
%%time
normalize_in_place((128,), (1024,), (x_col, y_col, z_col, size))

CPU times: user 88.5 ms, sys: 85 µs, total: 88.6 ms
Wall time: 88.3 ms


In [16]:
%%time
normalize_in_place((128,), (1024,), (x_col, y_col, z_col, size))

CPU times: user 83 µs, sys: 0 ns, total: 83 µs
Wall time: 85.4 µs


In [17]:
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")
del x_col, y_col, z_col
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")

mempool.used_bytes 1200000000
mempool.used_bytes 0
