Try some CuPy basics here.

Do some operations similar to what the photon simulator does.  The main hard things it does are:

* rotation
* scattering


In [1]:
## %load_ext pyinstrument
import numpy as np
import cupy as cp
from cupyx import jit
import math

print(f"CuPy version {cp.__version__}")

CuPy version 13.2.0


In [2]:
def rotateAround(X, Y, Z, ux, uy, uz, theta):
    """Rotate the vector (X, Y, Z) by angle ``theta`` about the axis (ux, uy, uz).

    Applies the axis-angle (Rodrigues) rotation matrix component-wise, so the
    arguments may be scalars or arrays that broadcast against each other.
    The axis is used as given; it is NOT normalized here, so the caller must
    pass a unit vector to get a pure rotation.

    Args:
        X, Y, Z: components of the vector(s) to rotate.
        ux, uy, uz: components of the rotation axis (expected unit length).
        theta: rotation angle in radians.

    Returns:
        Tuple ``(X_rot, Y_rot, Z_rot)`` with the rotated components.  The
        inputs are not modified.
    """
    cost = np.cos(theta)
    sint = np.sin(theta)
    one_cost = 1 - cost

    # Every output row must be computed from the ORIGINAL X, Y, Z.  Assigning
    # back to X before computing Y (and Y before Z) would feed already-rotated
    # components into the later rows and produce a wrong result.
    X_rot = ((cost + ux * ux * one_cost) * X
             + (ux * uy * one_cost - uz * sint) * Y
             + (ux * uz * one_cost + uy * sint) * Z)
    Y_rot = ((uy * ux * one_cost + uz * sint) * X
             + (cost + uy * uy * one_cost) * Y
             + (uy * uz * one_cost - ux * sint) * Z)
    Z_rot = ((uz * ux * one_cost - uy * sint) * X
             + (uz * uy * one_cost + ux * sint) * Y
             + (cost + uz * uz * one_cost) * Z)

    # Rebinding the parameters is invisible to the caller, so the result has
    # to be returned explicitly.
    return X_rot, Y_rot, Z_rot


    

    
    
    
# Elementwise CUDA kernel built with cp.ElementwiseKernel.
# For each element it takes float32 inputs (x, y, theta) and writes two
# float32 outputs:
#     z1 = y * x * x * cos(theta)
#     z2 = y * x * x * sin(theta)
# This is the same arithmetic as gpu_m / cpu_m below, expressed as raw CUDA.
gpu_k = cp.ElementwiseKernel(
    in_params = 'float32 x, float32 y, float32 theta',
    out_params = 'float32 z1, float32 z2',
    # The operation string is CUDA C executed once per element; cosf/sinf are
    # the single-precision math functions.
    operation = '''
        float cosx = cosf(theta);
        float sinx = sinf(theta);
        float xt = y * x * x * cosx;
        float yt = y * x * x * sinx;
        z1 = xt;
        z2 = yt;
    ''',
    name = 'gpu_k',
    # no_return=False: the kernel call hands back its outputs (the cells below
    # unpack them as a pair).
    no_return = False)

@cp.fuse()
def gpu_m(x, y, theta):
    """Fused CuPy version of the x/y/theta transform.

    Returns the tuple ``(y*x*x*cos(theta), y*x*x*sin(theta))``, the same
    arithmetic as ``gpu_k`` and ``cpu_m``.
    """
    # Shared factor; multiplication order matches the unfused expression.
    scaled = y * x * x
    return (scaled * cp.cos(theta), scaled * cp.sin(theta))

def cpu_m(x, y, theta):
    """NumPy (CPU) reference for the x/y/theta transform.

    Returns the tuple ``(y*x*x*cos(theta), y*x*x*sin(theta))``.
    """
    # Shared factor; multiplication order matches y * x * x * cos/sin.
    scaled = y * x * x
    return (scaled * np.cos(theta), scaled * np.sin(theta))

@jit.rawkernel()
def elementwise_m(x, y, theta, size):
    """In-place cupyx.jit kernel for the x/y/theta transform.

    For every index ``i < size`` it overwrites
        x[i] <- y[i] * x[i] * x[i] * cos(theta[i])
        y[i] <- y[i] * x[i] * x[i] * sin(theta[i])
    which is the same (cos -> first output, sin -> second output) convention
    used by ``gpu_k``, ``gpu_m`` and ``cpu_m``.

    Args:
        x, y: arrays updated in place.
        theta: array of angles in radians (read only).
        size: number of elements to process.
    """
    # Each thread starts at its global index and strides by the total number
    # of launched threads, so any grid size covers all `size` elements.
    tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    ntid = jit.gridDim.x * jit.blockDim.x
    for i in range(tid, size, ntid):
        cosx = cp.cos(theta[i])
        sinx = cp.sin(theta[i])
        # Compute both results from the old x[i], y[i] before storing either.
        xt = y[i] * x[i] * x[i] * cosx
        yt = y[i] * x[i] * x[i] * sinx
        x[i] = xt
        y[i] = yt

# shuffle in place

def gpu_shuffle(x):
    """Shuffle ``x`` by calling ``cp.random.shuffle`` on it; returns None."""
    cp.random.shuffle(x)

# Explicit CuPy Generator backed by the Philox4x3210 bit generator.  No seed is
# passed, so runs are not reproducible.  The default_rng() form is kept for
# reference.
#rnd = cp.random.default_rng()
rnd = cp.random.Generator(cp.random.Philox4x3210())

def cp_random(generator, size, out):
    """Draw ``size`` float32 samples from ``generator``.

    Args:
        generator: object exposing ``random(size, dtype=..., out=...)``
            (e.g. a ``cp.random.Generator``).
        size: number of samples (or output shape) forwarded to the generator.
        out: optional pre-allocated output array forwarded to the generator;
            pass ``None`` to let the generator allocate the result.

    Returns:
        Whatever ``generator.random`` returns.  Returning it matters for
        ``out=None``, where the result would otherwise be discarded and the
        caller would receive ``None``.
    """
    return generator.random(size, dtype = np.float32, out = out)

# Observed to be ~5x faster than cp_random in this notebook; the reason is
# unexplained.  NOTE(review): un-synchronized %%time readings may only reflect
# launch cost, so confirm with a synchronized measurement.
def cp_legacy_random(size):
    """Return ``size`` float32 samples from the module-level ``cp.random.random``."""
    samples = cp.random.random(size, dtype=np.float32)
    return samples
    
# 101 float32 edges from 0 to 1 inclusive, i.e. 100 equal-width bins.
bin_edges = cp.linspace(0, 1.0, num=101, endpoint=True, dtype=np.float32)


def make_hist(x):
    """Histogram ``x`` with the fixed module-level ``bin_edges``.

    Returns whatever ``cp.histogram`` returns for those bins.
    """
    return cp.histogram(x, bins=bin_edges)

def choice(x):
    """Return 100 samples drawn from ``x`` via ``cp.random.choice`` defaults."""
    n_samples = 100
    return cp.random.choice(x, size=n_samples)

# def get_phi_elementwise():
# need curand

@cp.fuse()
def get_phi_fuse(size):
    """Return ``size`` angles drawn by ``cp.random.uniform`` between 0 and 2*pi."""
    # Alternative formulation kept for reference:
    #    return cp.random.random(size) * 2 * np.pi
    two_pi = 2 * np.pi
    return cp.random.uniform(0, two_pi, size)

def get_phi_cpu(size):
    """Return ``size`` angles: ``np.random.random`` samples scaled by 2*pi."""
    unit_samples = np.random.random(size)
    return unit_samples * 2 * np.pi
# def get_phi_rawkernel(theta, phi, size):
# need curand

@cp.fuse()
def get_theta_fuse(size, g):
    """Return ``size`` scattering angles for parameter ``g`` (CuPy version).

    With u drawn uniformly between 0 and 2*g:
        ratio     = (1 - g^2) / (1 - g + u)
        cos_theta = (1 + g^2 - ratio^2) / (2 * g)
    and the result is arccos(cos_theta).
    NOTE(review): this looks like Henyey-Greenstein inverse-CDF sampling with
    anisotropy g -- confirm.  g == 0 divides by zero.
    """
    scaled_uniform = cp.random.uniform(0, 2 * g, size)
    ratio = (1 - g * g) / (1 - g + scaled_uniform)
    cos_theta = (1 + g * g - ratio * ratio) / (2 * g)
    return cp.arccos(cos_theta)

def get_theta_cpu(size, g):
    """Return ``size`` scattering angles for parameter ``g`` (NumPy version).

    With xi from ``np.random.random``:
        ratio     = (1 - g^2) / (1 - g + 2*g*xi)
        cos_theta = (1 + g^2 - ratio^2) / (2 * g)
    and the result is arccos(cos_theta).
    NOTE(review): this looks like Henyey-Greenstein inverse-CDF sampling with
    anisotropy g -- confirm.  g == 0 divides by zero.
    """
    xi = np.random.random(size)
    ratio = (1 - g * g) / (1 - g + 2 * g * xi)
    cos_theta = (1 + g * g - ratio * ratio) / (2 * g)
    return np.arccos(cos_theta)

  cupy._util.experimental('cupyx.jit.rawkernel')


In [3]:
# Problem size (elements per array) used by every benchmark below.
size = 20_000_000

# Placeholders for the work arrays; later cells allocate them.
x_gpu = y_gpu = theta_gpu = None
x_cpu = y_cpu = None

In [4]:
# Take CuPy's default device memory pool and release its cached blocks so the
# readings below start from a trimmed pool.
mempool = cp.get_default_memory_pool()
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Allocate the three float32 work arrays (size elements each) on the GPU.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing: array sizes and pool usage with all three arrays alive
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: the arrays are still referenced, and the recorded output shows
# used/total unchanged by free_all_blocks()
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 786944
mempool.total_bytes 786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944


In [5]:
# CPU counterparts of the GPU work arrays: float32, `size` elements each.
# np.random.random yields float64, so the samples are cast down to float32.
x_cpu = np.random.random(size).astype(np.float32)
y_cpu = np.ones(size, dtype=np.float32)
theta_cpu = np.random.random(size).astype(np.float32)

In [6]:
%%time
(x_cpu, y_cpu) = cpu_m(x_cpu, y_cpu, theta_cpu)

CPU times: user 95.1 ms, sys: 30.3 ms, total: 125 ms
Wall time: 125 ms


In [7]:
%%time
(x_cpu, y_cpu) = cpu_m(x_cpu, y_cpu, theta_cpu)

CPU times: user 108 ms, sys: 16.4 ms, total: 124 ms
Wall time: 123 ms


In [8]:
# Size in bytes of the CPU result arrays, for comparison with the GPU arrays.
print(f"x_cpu.bytes {x_cpu.nbytes}")  
print(f"y_cpu_bytes {y_cpu.nbytes}") 

x_cpu.bytes 80000000
y_cpu_bytes 80000000


In [9]:
# Drop the references to the previous GPU arrays before re-allocating.
x_gpu = None
y_gpu = None
theta_gpu = None
# after dropping the references, before re-allocating (recorded run:
# used_bytes falls while total_bytes stays, so the pool keeps the blocks)
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Fresh inputs for the gpu_k benchmark.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing: all three arrays alive
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: the arrays are still referenced, and the recorded output
# shows used/total unchanged
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944


In [10]:
%%time
(x_gpu, y_gpu) = gpu_k(x_gpu, y_gpu, theta_gpu)

CPU times: user 118 ms, sys: 3.78 ms, total: 121 ms
Wall time: 122 ms


In [11]:
%%time
(x_gpu, y_gpu) = gpu_k(x_gpu, y_gpu, theta_gpu)

CPU times: user 65 µs, sys: 21 µs, total: 86 µs
Wall time: 89.9 µs


In [12]:
# before freeing: pool state after the gpu_k runs (recorded run: total_bytes
# exceeds used_bytes, so the pool is holding unused blocks)
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: recorded run shows total_bytes back down to used_bytes
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 400786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944


In [13]:
# Drop the references to the previous GPU arrays before re-allocating.
x_gpu = None
y_gpu = None
theta_gpu=None
# after dropping the references, before re-allocating
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Fresh inputs for the gpu_m (cp.fuse) benchmark.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing: all three arrays alive
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: the arrays are still referenced, and the recorded output
# shows used/total unchanged
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944


In [14]:
%%time
(x_gpu, y_gpu) = gpu_m(x_gpu, y_gpu, theta_gpu)

CPU times: user 119 ms, sys: 6.96 ms, total: 126 ms
Wall time: 126 ms


In [15]:
%%time
(x_gpu, y_gpu) = gpu_m(x_gpu, y_gpu, theta_gpu)

CPU times: user 172 µs, sys: 0 ns, total: 172 µs
Wall time: 222 µs


In [16]:
# before freeing: pool state after the gpu_m runs (recorded run: total_bytes
# exceeds used_bytes, so the pool is holding unused blocks)
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: recorded run shows total_bytes back down to used_bytes
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 400786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944


In [17]:
# Drop the references to the previous GPU arrays before re-allocating.
x_gpu = None
y_gpu = None
theta_gpu = None
# after dropping the references, before re-allocating
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Fresh inputs for the elementwise_m (cupyx.jit raw kernel) benchmark.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing: all three arrays alive
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: the arrays are still referenced, and the recorded output
# shows used/total unchanged
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944


In [18]:
%%time
elementwise_m((128,), (1024,), (x_gpu, y_gpu, theta_gpu, size))

CPU times: user 245 ms, sys: 21 ms, total: 266 ms
Wall time: 281 ms


In [19]:
%%time
elementwise_m((128,), (1024,), (x_gpu, y_gpu, theta_gpu, size))

CPU times: user 123 µs, sys: 35 µs, total: 158 µs
Wall time: 157 µs


In [20]:
# before freeing: pool state after the in-place elementwise_m runs (recorded
# run: total_bytes equals used_bytes, consistent with no extra allocations)
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: recorded run shows used/total unchanged
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 240786944
mempool.total_bytes 240786944


In [21]:
# Drop the references to the previous GPU arrays before re-allocating.
x_gpu = None
y_gpu = None
theta_gpu = None
# after dropping the references, before re-allocating
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Only x_gpu is needed for the shuffle benchmark; y/theta stay unallocated.
x_gpu = cp.random.random(size, dtype=np.float32)
#y_gpu = cp.ones(size, dtype=np.float32)
#theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing: only x_gpu alive
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: recorded run shows total_bytes dropping to used_bytes
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
mempool.used_bytes 80786944
mempool.total_bytes 240786944
x_gpu.bytes 80000000
mempool.used_bytes 80786944
mempool.total_bytes 80786944


In [22]:
%%time
gpu_shuffle(x_gpu)

CPU times: user 260 ms, sys: 34.1 ms, total: 294 ms
Wall time: 304 ms


In [23]:
%%time
gpu_shuffle(x_gpu)

CPU times: user 77.9 ms, sys: 0 ns, total: 77.9 ms
Wall time: 77.2 ms


In [24]:
# before freeing: pool state after the shuffles (recorded run: total_bytes is
# far above used_bytes, so the shuffle left sizeable unused blocks in the pool)
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: recorded run shows total_bytes back down to used_bytes
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
mempool.used_bytes 80786944
mempool.total_bytes 640865280
x_gpu.bytes 80000000
mempool.used_bytes 80786944
mempool.total_bytes 80786944


In [25]:
# Drop the references to the GPU arrays ahead of the random-number benchmarks.
x_gpu = None
y_gpu = None
theta_gpu=None
# after dropping the references (recorded run: used_bytes falls, total_bytes
# stays, so the pool keeps the block)
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 786944
mempool.total_bytes 80786944


In [26]:
# Pre-allocate an uninitialized float32 buffer for cp_random to fill via out=.
x_gpu = cp.empty(size, dtype=np.float32)

In [27]:
%%time
cp_random(rnd, size, x_gpu)

CPU times: user 90.4 ms, sys: 886 µs, total: 91.3 ms
Wall time: 90.4 ms


In [28]:
%%time
x_gpu = cp_random(rnd, size, None)

CPU times: user 67 µs, sys: 18 µs, total: 85 µs
Wall time: 86.8 µs


In [29]:
%%time
x_gpu = cp_legacy_random(size)

CPU times: user 113 µs, sys: 0 ns, total: 113 µs
Wall time: 109 µs


In [30]:
# before freeing: pool state after the random-number benchmarks (recorded run:
# total_bytes exceeds used_bytes by one array's worth)
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: recorded run shows total_bytes back down to used_bytes
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
mempool.used_bytes 80786944
mempool.total_bytes 160786944
x_gpu.bytes 80000000
mempool.used_bytes 80786944
mempool.total_bytes 80786944


In [31]:
%%time
x_hist = make_hist(x_gpu)

  jitify._init_module()


CPU times: user 28.1 s, sys: 1.27 s, total: 29.4 s
Wall time: 29.3 s


In [32]:
# before freeing: pool state after the histogram (recorded run: used_bytes is
# slightly above x_gpu alone, and total_bytes slightly above used_bytes)
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing: recorded run shows total_bytes back down to used_bytes
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
mempool.used_bytes 80787968
mempool.total_bytes 80789504
x_gpu.bytes 80000000
mempool.used_bytes 80787968
mempool.total_bytes 80787968


In [33]:
%%time
sample_gpu = choice(x_gpu)

CPU times: user 536 ms, sys: 283 µs, total: 536 ms
Wall time: 537 ms


In [34]:
# Show the 100 sampled values as a sanity check.
print(sample_gpu)

[7.92707205e-01 7.17558920e-01 6.34726763e-01 2.98439294e-01
 3.15965444e-01 1.49325624e-01 9.36188161e-01 8.57110918e-02
 3.36252153e-01 3.48491478e-04 7.25599587e-01 4.68829006e-01
 9.16691422e-01 9.51784074e-01 1.29345655e-01 9.94839311e-01
 9.11161780e-01 6.47370338e-01 3.62567544e-01 4.67382520e-01
 6.64483488e-01 1.29149884e-01 8.17031980e-01 6.51130497e-01
 1.97072372e-01 3.46116632e-01 8.33095074e-01 5.93230247e-01
 8.61332595e-01 1.38607323e-01 4.09931034e-01 3.39515150e-01
 8.96784604e-01 8.45306516e-02 7.22557247e-01 3.00334781e-01
 6.13482833e-01 5.05394757e-01 9.80029225e-01 3.50664556e-01
 5.77210784e-01 1.50985166e-03 6.31036222e-01 3.15086432e-02
 3.04389894e-01 1.28392518e-01 3.11612695e-01 7.92914391e-01
 4.97273326e-01 9.00324434e-02 7.65472233e-01 7.28279233e-01
 2.95508742e-01 7.87179708e-01 3.08787733e-01 7.06596375e-01
 9.17851686e-01 5.90957068e-02 8.58026087e-01 3.09485614e-01
 2.36553967e-01 3.90042394e-01 5.98381221e-01 4.09037471e-01
 8.67384434e-01 6.190707

In [35]:
# Scale the samples by 10 (presumably mapping [0, 1) onto [0, 10) -- confirm
# x_gpu's range) ahead of the integer cast in the next cell.
x_gpu_10 = x_gpu * 10


In [36]:
# Cast to int16 so the values can be counted by cp.bincount in the next cell.
# This rebinds x_gpu_10 from the float array to the integer array.
x_gpu_10 = x_gpu_10.astype(np.int16)


In [37]:
%%time
bc = cp.bincount(x_gpu_10)

CPU times: user 1.28 s, sys: 28.2 ms, total: 1.31 s
Wall time: 1.3 s


In [38]:
%%time
phi_gpu = get_phi_fuse(size)

CPU times: user 175 ms, sys: 3.91 ms, total: 178 ms
Wall time: 179 ms


In [39]:
%%time
# CPU baseline for the azimuth sampling, to compare against get_phi_fuse.
phi_cpu = get_phi_cpu(size)

CPU times: user 128 ms, sys: 16 ms, total: 144 ms
Wall time: 144 ms


In [40]:
%%time
theta_gpu = get_theta_fuse(size, 0.9)

CPU times: user 547 ms, sys: 12.2 ms, total: 559 ms
Wall time: 555 ms


In [41]:
%%time
# CPU baseline for the scattering-angle sampling (g = 0.9), to compare against
# get_theta_fuse.
theta_cpu = get_theta_cpu(size, 0.9)

CPU times: user 397 ms, sys: 60.3 ms, total: 457 ms
Wall time: 454 ms
