This demonstrates the simpler slow rotation method, with lots of temporary allocations.

In [1]:
import numpy as np
import cupy as cp
print(f"CuPy version {cp.__version__}")
mempool = cp.get_default_memory_pool()
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")

CuPy version 11.0.0
mempool.used_bytes 0


In [2]:
def anyPerpendicular(vx, vy, vz):
    return (
        cp.where(
            vz < vx, vy, cp.zeros( len(vy) )),
        cp.where(
            vz < vx, -vx, -vz),
        cp.where(
            vz < vx, cp.zeros(len(vy)), vy)
    )
    
@cp.fuse()
def norm(vx, vy, vz):
    return cp.sqrt(vx * vx + vy * vy + vz * vz)

@cp.fuse()
def normalize(vx, vy, vz):
    vn = norm(vx, vy, vz)
    #vn = vn[:, None]
    return (vx/vn, vy/vn, vz/vn)

@cp.fuse()
def rotateAround(vx, vy, vz, ux, uy, uz, theta):
    (ux, uy, uz) = normalize(ux, uy, uz)
    
    cost = cp.cos(theta)
    sint = cp.sin(theta)
    one_cost = 1 - cost
        
    X = vx
    Y = vy
    Z = vz
    
    x = (cost + ux * ux * one_cost) * X + (ux * uy * one_cost - uz * sint) * Y + (
            ux * uz * one_cost + uy * sint) * Z
    y = (uy * ux * one_cost + uz * sint) * X + (cost + uy * uy * one_cost) * Y + (
            uy * uz * one_cost - ux * sint) * Z
    z = (uz * ux * one_cost - uy * sint) * X + (uz * uy * one_cost + ux * sint) * Y + (
            cost + uz * uz * one_cost) * Z

    return (x, y, z)

In [3]:
size = 15000000
vx = cp.random.random(size, dtype=np.float32)
print(f"mempool.used_bytes {mempool.used_bytes()}")
vy = cp.random.random(size, dtype=np.float32)
print(f"mempool.used_bytes {mempool.used_bytes()}")
vz = cp.random.random(size, dtype=np.float32)
print(f"mempool.used_bytes {mempool.used_bytes()}")
# these should be perpendicular
(ux, uy, uz) = anyPerpendicular(vx, vy, vz)
#ux = cp.random.random(size, dtype=np.float32)
#uy = cp.random.random(size, dtype=np.float32)
#uz = cp.random.random(size, dtype=np.float32)
#theta = cp.random.random(size, dtype=np.float32)
theta = cp.full(size, np.pi/2,dtype=np.float32)
print(f"mempool.used_bytes {mempool.used_bytes()}")

mempool.used_bytes 60000256
mempool.used_bytes 120000512
mempool.used_bytes 180000768
mempool.used_bytes 540001280


In [4]:
%%time
(rx, ry, rz) = rotateAround(vx, vy, vz, ux, uy, uz, theta)

CPU times: user 4.29 ms, sys: 139 µs, total: 4.43 ms
Wall time: 4.11 ms


In [5]:
%%time
(rx, ry, rz) = rotateAround(vx, vy, vz, ux, uy, uz, theta)

CPU times: user 0 ns, sys: 1.96 ms, total: 1.96 ms
Wall time: 1.73 ms


In [6]:
# check perpendicularity
dot = (vx * rx + vy * ry + vz * rz)
print(cp.amax(dot))

-2.4984611907609303e-12


In [7]:
del vx, vy, vz, ux, uy, uz, rx, ry, rz, theta