Try some CuPy basics here.

Do some operations similar to what the photon simulator does. The main hard things it does are:

* rotation
* scattering


In [1]:
## %load_ext pyinstrument
import numpy as np
import cupy as cp
from cupyx import jit
import math

print(f"CuPy version {cp.__version__}")

CuPy version 11.0.0


In [2]:
def rotateAround(X, Y, Z, ux, uy, uz, theta):
    """Rotate the vector(s) (X, Y, Z) about the axis (ux, uy, uz) by ``theta``.

    Applies the axis-angle rotation matrix built from ``cos(theta)``,
    ``sin(theta)`` and the axis components. Scalars and arrays are both
    accepted; the arithmetic is elementwise.

    The axis is used as given and is NOT normalized here
    (assumes (ux, uy, uz) has unit length -- TODO confirm at call sites).

    Returns:
        tuple ``(X_rot, Y_rot, Z_rot)`` with the rotated components. The
        inputs are not modified.
    """
    cost = np.cos(theta)
    sint = np.sin(theta)
    one_cost = 1 - cost

    # Every output row must be computed from the ORIGINAL X, Y, Z. Writing the
    # results back into X and Y first would feed already-rotated components
    # into the later rows.
    X_rot = ((cost + ux * ux * one_cost) * X
             + (ux * uy * one_cost - uz * sint) * Y
             + (ux * uz * one_cost + uy * sint) * Z)
    Y_rot = ((uy * ux * one_cost + uz * sint) * X
             + (cost + uy * uy * one_cost) * Y
             + (uy * uz * one_cost - ux * sint) * Z)
    Z_rot = ((uz * ux * one_cost - uy * sint) * X
             + (uz * uy * one_cost + ux * sint) * Y
             + (cost + uz * uz * one_cost) * Z)

    return X_rot, Y_rot, Z_rot


    

    
    
    
# Elementwise CUDA kernel. For each element it computes, in float32,
#   z1 = y * x * x * cos(theta)
#   z2 = y * x * x * sin(theta)
# The kernel declares two outputs (z1, z2); the timing cells unpack the call
# result as a pair: (a, b) = gpu_k(x, y, theta).
gpu_k = cp.ElementwiseKernel(
    in_params = 'float32 x, float32 y, float32 theta',
    out_params = 'float32 z1, float32 z2',
    operation = '''
        float cosx = cosf(theta);
        float sinx = sinf(theta);
        float xt = y * x * x * cosx;
        float yt = y * x * x * sinx;
        z1 = xt;
        z2 = yt;
    ''',
    name = 'gpu_k',
    no_return = False)

@cp.fuse()
def gpu_m(x, y, theta): # return tuple
    """Fused CuPy version of the benchmark transform.

    Returns the pair ``(y*x*x*cos(theta), y*x*x*sin(theta))``.
    """
    scaled = y * x * x
    return (scaled * cp.cos(theta), scaled * cp.sin(theta))

def cpu_m(x, y, theta):
    """NumPy (host) version of the benchmark transform.

    Returns the pair ``(y*x*x*cos(theta), y*x*x*sin(theta))``.
    """
    scaled = y * x * x
    return (scaled * np.cos(theta), scaled * np.sin(theta))

# In-place raw-kernel version of the benchmark transform.
# For every index i < size it overwrites
#   x[i] <- y[i] * x[i] * x[i] * cos(theta[i])
#   y[i] <- y[i] * x[i] * x[i] * sin(theta[i])
# which is the same x/cos, y/sin pairing used by gpu_k, gpu_m and cpu_m.
# Launch as elementwise_m((grid,), (block,), (x, y, theta, size)).
@jit.rawkernel()
def elementwise_m(x, y, theta, size):
    # Grid-stride loop: each thread starts at its global index and advances by
    # the total number of launched threads, so any grid size covers `size`.
    tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    ntid = jit.gridDim.x * jit.blockDim.x
    for i in range(tid, size, ntid):
        cosx = cp.cos(theta[i])
        sinx = cp.sin(theta[i])
        # Compute both results from the old x[i], y[i] before storing either.
        xt = y[i] * x[i] * x[i] * cosx
        yt = y[i] * x[i] * x[i] * sinx
        x[i] = xt
        y[i] = yt

# shuffle in place

def gpu_shuffle(x):
    """Shuffle ``x`` by delegating to ``cp.random.shuffle``; nothing is returned."""
    cp.random.shuffle(x)

#rnd = cp.random.default_rng()
# Module-level generator built on the Philox4x3210 bit generator. No seed is
# passed here, so draws are presumably not reproducible between sessions.
rnd = cp.random.Generator(cp.random.Philox4x3210())

def cp_random(generator, size, out):
    """Draw ``size`` uniform float32 samples from ``generator``.

    Args:
        generator: object exposing ``random(size, dtype=..., out=...)``,
            e.g. a ``cp.random.Generator``.
        size: number (or shape) of samples to draw.
        out: preallocated float32 array to fill, or ``None`` to let the
            generator allocate the result.

    Returns:
        Whatever ``generator.random`` returns (the filled array). Returning it
        matters when ``out`` is ``None``: otherwise the freshly allocated
        samples would be lost and the caller would receive ``None``.
    """
    return generator.random(size, dtype = np.float32, out = out)

# this is 5x faster.  ?
# NOTE(review): the %%time cells that compare the two APIs do not synchronize
# with the GPU, so the difference may reflect launch overhead rather than
# generation speed -- confirm with a synchronized measurement.
def cp_legacy_random(size):
    """Return ``size`` uniform float32 samples from the module-level ``cp.random.random``."""
    return cp.random.random(size, dtype=np.float32)
    
# 101 evenly spaced float32 edges from 0 to 1 inclusive, i.e. 100 bins.
bin_edges = cp.linspace(0, 1.0, num=101, endpoint=True, dtype=np.float32)
def make_hist(x):
    """Histogram ``x`` over the module-level ``bin_edges`` and return ``cp.histogram``'s result."""
    return cp.histogram(x, bin_edges)

def choice(x, num_samples=100):
    """Draw a random sample from ``x`` with ``cp.random.choice``.

    Args:
        x: source passed straight to ``cp.random.choice``.
        num_samples: number of elements to draw. Defaults to 100, the value
            that used to be hard-coded, so existing ``choice(x)`` calls behave
            as before.

    Returns:
        The array returned by ``cp.random.choice(x, num_samples)``.
    """
    return cp.random.choice(x, num_samples)

# TODO: def get_phi_elementwise():
# not written yet -- generating random numbers inside the kernel would need cuRAND

@cp.fuse()
def get_phi_fuse(size):
    """Return ``size`` samples drawn by ``cp.random.uniform`` between 0 and 2*pi.

    NOTE(review): no array argument is passed in, so it is unclear whether
    ``cp.fuse`` fuses anything here -- confirm the decorator has an effect.
    """
    full_turn = 2 * np.pi
    return cp.random.uniform(0, full_turn, size)
#    return cp.random.random(size) * 2 * np.pi

def get_phi_cpu(size):
    """Return ``size`` NumPy samples of ``np.random.random`` scaled by 2*pi."""
    unit_samples = np.random.random(size)
    return unit_samples * 2 * np.pi
# TODO: def get_phi_rawkernel(theta, phi, size):
# not written yet -- generating random numbers inside the kernel would need cuRAND

@cp.fuse()
def get_theta_fuse(size, g):
    """Sample ``size`` angles (radians) on the GPU for parameter ``g``.

    Evaluates
        temp = (1 - g^2) / (1 - g + u),   u = cp.random.uniform(0, 2*g, size)
        cos(theta) = (1 + g^2 - temp^2) / (2*g)
    and returns ``arccos`` of that. ``g`` must be nonzero because the
    expression divides by ``2*g``.
    """
    g_squared = g * g
    denominator = 1 - g + cp.random.uniform(0, 2 * g, size)
    temp = (1 - g_squared) / denominator
    cos_theta = (1 + g_squared - temp * temp) / (2 * g)
    return cp.arccos(cos_theta)

def get_theta_cpu(size, g):
    """Sample ``size`` scattering angles (radians) with NumPy for parameter ``g``.

    For ``g != 0`` this evaluates, with xi uniform in [0, 1),
        temp = (1 - g^2) / (1 - g + 2*g*xi)
        cos(theta) = (1 + g^2 - temp^2) / (2*g)
    which has the form of the Henyey-Greenstein inverse CDF.

    Args:
        size: number (or shape) of samples.
        g: anisotropy parameter. ``g == 0`` is handled as the isotropic limit
            ``cos(theta) = 2*xi - 1``; evaluating the general formula there
            would be 0/0 and yield NaN.

    Returns:
        Array of angles in [0, pi].
    """
    xi = np.random.random(size)
    if g == 0:
        # Limit of the general expression as g -> 0.
        cost = 2 * xi - 1
    else:
        temp = (1 - g * g) / (1 - g + 2 * g * xi)
        cost = (1 + g * g - temp * temp) / (2 * g)
    # Rounding can push cost marginally outside [-1, 1], where arccos is NaN.
    return np.arccos(np.clip(cost, -1.0, 1.0))

  cupy._util.experimental('cupyx.jit.rawkernel')


In [3]:
# Number of elements in every benchmark array.
size = 20_000_000
# Placeholders; the arrays themselves are allocated in later cells.
x_gpu = y_gpu = theta_gpu = None
x_cpu = y_cpu = None

In [4]:
# Handle to CuPy's default device memory pool; later cells reuse `mempool`.
mempool = cp.get_default_memory_pool()
# Ask the pool to release the blocks it is holding but not using, then take a
# baseline reading.
mempool.free_all_blocks()
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Three float32 arrays of `size` elements each.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing
# (the arrays are still referenced; the recorded output shows used_bytes
# unchanged by free_all_blocks)
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 16384512
mempool.total_bytes 16384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512


In [5]:
# Host-side float32 inputs with the same layout as the GPU arrays.
x_cpu = np.random.random(size).astype(np.float32)
y_cpu = np.ones(size, np.float32)
theta_cpu = np.random.random(size).astype(np.float32)

In [6]:
%%time
# CPU (NumPy) baseline. x_cpu / y_cpu are rebound to the outputs, so each run
# of this cell feeds on the previous run's results.
(x_cpu, y_cpu) = cpu_m(x_cpu, y_cpu, theta_cpu)

CPU times: user 103 ms, sys: 13.1 ms, total: 116 ms
Wall time: 115 ms


In [7]:
%%time
# Second run of the CPU baseline, for comparison with the first-run timing.
(x_cpu, y_cpu) = cpu_m(x_cpu, y_cpu, theta_cpu)

CPU times: user 81.7 ms, sys: 32.2 ms, total: 114 ms
Wall time: 113 ms


In [8]:
# Sizes in bytes of the host result arrays.
print(f"x_cpu.bytes {x_cpu.nbytes}")  
print(f"y_cpu_bytes {y_cpu.nbytes}") 

x_cpu.bytes 80000000
y_cpu_bytes 80000000


In [9]:
# Drop the references to the previous arrays (the recorded output shows
# used_bytes falling back to the baseline afterwards).
x_gpu = None
y_gpu = None
theta_gpu = None
# before allocating
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Fresh float32 inputs for the next GPU timing cells.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 16384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512


In [10]:
%%time
(x_gpu, y_gpu) = gpu_k(x_gpu, y_gpu, theta_gpu)

CPU times: user 3.88 ms, sys: 0 ns, total: 3.88 ms
Wall time: 3.4 ms


In [11]:
%%time
(x_gpu, y_gpu) = gpu_k(x_gpu, y_gpu, theta_gpu)

CPU times: user 66 µs, sys: 0 ns, total: 66 µs
Wall time: 68.9 µs


In [12]:
# Memory-pool report after the gpu_k runs.
# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Release the blocks the pool holds but is not using.
mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 416384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512


In [13]:
# Drop the references to the previous arrays before re-allocating.
x_gpu = None
y_gpu = None
theta_gpu=None
# before allocating
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Fresh float32 inputs for the next GPU timing cells.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 16384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512


In [14]:
%%time
(x_gpu, y_gpu) = gpu_m(x_gpu, y_gpu, theta_gpu)

CPU times: user 1.93 ms, sys: 569 µs, total: 2.5 ms
Wall time: 2.22 ms


In [15]:
%%time
(x_gpu, y_gpu) = gpu_m(x_gpu, y_gpu, theta_gpu)

CPU times: user 80 µs, sys: 0 ns, total: 80 µs
Wall time: 82.3 µs


In [16]:
# Memory-pool report after the gpu_m runs.
# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Release the blocks the pool holds but is not using.
mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 416384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512


In [17]:
# Drop the references to the previous arrays before re-allocating.
x_gpu = None
y_gpu = None
theta_gpu = None
# before allocating
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Fresh float32 inputs for the next GPU timing cells.
x_gpu = cp.random.random(size, dtype=np.float32)
y_gpu = cp.ones(size, dtype=np.float32)
theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 16384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512


In [18]:
%%time
elementwise_m((128,), (1024,), (x_gpu, y_gpu, theta_gpu, size))

CPU times: user 79 ms, sys: 7.75 ms, total: 86.8 ms
Wall time: 86 ms


In [19]:
%%time
elementwise_m((128,), (1024,), (x_gpu, y_gpu, theta_gpu, size))

CPU times: user 190 µs, sys: 0 ns, total: 190 µs
Wall time: 188 µs


In [20]:
# Memory-pool report after the elementwise_m runs (the recorded output shows
# no growth in total_bytes for this in-place kernel).
# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Release the blocks the pool holds but is not using.
mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
y_gpu.bytes 80000000
mempool.used_bytes 256384512
mempool.total_bytes 256384512


In [21]:
# Drop the references to the previous arrays before re-allocating.
x_gpu = None
y_gpu = None
theta_gpu = None
# before allocating
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Only x_gpu is allocated from here on; y_gpu and theta_gpu stay None.
x_gpu = cp.random.random(size, dtype=np.float32)
#y_gpu = cp.ones(size, dtype=np.float32)
#theta_gpu = cp.random.random(size, dtype=np.float32)

# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 16384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
mempool.used_bytes 96384512
mempool.total_bytes 256384512
x_gpu.bytes 80000000
mempool.used_bytes 96384512
mempool.total_bytes 96384512


In [22]:
%%time
gpu_shuffle(x_gpu)

CPU times: user 54.4 ms, sys: 4.72 ms, total: 59.1 ms
Wall time: 58.7 ms


In [23]:
%%time
gpu_shuffle(x_gpu)

CPU times: user 86.7 ms, sys: 0 ns, total: 86.7 ms
Wall time: 86.6 ms


In [24]:
# Memory-pool report after the shuffle runs (the recorded output shows
# total_bytes far above used_bytes until the pool is freed).
# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Release the blocks the pool holds but is not using.
mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
mempool.used_bytes 96384512
mempool.total_bytes 656462848
x_gpu.bytes 80000000
mempool.used_bytes 96384512
mempool.total_bytes 96384512


In [25]:
# Drop every GPU array reference and report the pool state.
x_gpu = None
y_gpu = None
theta_gpu=None
# after dropping the references (free_all_blocks is not called in this cell)
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

mempool.used_bytes 16384512
mempool.total_bytes 96384512


In [26]:
# Uninitialized float32 buffer, passed as `out` to cp_random in the next timing cell.
x_gpu = cp.empty(size, dtype=np.float32)

In [27]:
%%time
cp_random(rnd, size, x_gpu)

CPU times: user 1.04 ms, sys: 4.28 ms, total: 5.32 ms
Wall time: 4.67 ms


In [28]:
%%time
x_gpu = cp_random(rnd, size, None)

CPU times: user 2.96 ms, sys: 525 µs, total: 3.48 ms
Wall time: 2.72 ms


In [29]:
%%time
x_gpu = cp_legacy_random(size)

CPU times: user 154 µs, sys: 0 ns, total: 154 µs
Wall time: 108 µs


In [30]:
# Memory-pool report after the random-generation runs.
# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Release the blocks the pool holds but is not using.
mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
mempool.used_bytes 96384512
mempool.total_bytes 336384512
x_gpu.bytes 80000000
mempool.used_bytes 96384512
mempool.total_bytes 96384512


In [31]:
%%time
x_hist = make_hist(x_gpu)

CPU times: user 164 ms, sys: 0 ns, total: 164 ms
Wall time: 164 ms


In [32]:
# Memory-pool report after the histogram run.
# before freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

# Release the blocks the pool holds but is not using.
mempool.free_all_blocks()

# after freeing
print(f"x_gpu.bytes {x_gpu.nbytes}")  
#print(f"y_gpu.bytes {y_gpu.nbytes}")  
print(f"mempool.used_bytes {mempool.used_bytes()}")
print(f"mempool.total_bytes {mempool.total_bytes()}")

x_gpu.bytes 80000000
mempool.used_bytes 96385536
mempool.total_bytes 96402432
x_gpu.bytes 80000000
mempool.used_bytes 96385536
mempool.total_bytes 96385536


In [33]:
%%time
sample_gpu = choice(x_gpu)

CPU times: user 3.51 ms, sys: 833 µs, total: 4.35 ms
Wall time: 3.63 ms


In [34]:
# Show the sampled values.
print(sample_gpu)

[0.9673034  0.78861386 0.03630557 0.969549   0.49728525 0.4267892
 0.08688607 0.61109346 0.06837947 0.5307299  0.938618   0.9567219
 0.8148009  0.8731207  0.49270767 0.09155727 0.6196373  0.09936445
 0.06785747 0.7607554  0.32874042 0.9426855  0.97815406 0.28870896
 0.00219207 0.26553318 0.09715155 0.4749295  0.39415708 0.9606625
 0.5845747  0.76110905 0.71495086 0.48257342 0.00472326 0.90373063
 0.7164281  0.7696101  0.7337606  0.6353176  0.72164303 0.96857464
 0.61266106 0.14326867 0.79571736 0.04455008 0.73452824 0.8434999
 0.83892715 0.47755793 0.12048929 0.10405807 0.7094321  0.418502
 0.5909079  0.5034845  0.09191117 0.9248774  0.96532196 0.01893075
 0.94168454 0.8625469  0.14171694 0.7849885  0.52917933 0.6521083
 0.7624057  0.7122722  0.32651362 0.29371423 0.88430387 0.45636043
 0.744007   0.7520833  0.90566504 0.8794248  0.16618133 0.52297086
 0.57631135 0.10248693 0.9287606  0.33522478 0.0474641  0.534089
 0.7982924  0.8083813  0.35821313 0.78831685 0.77486926 0.11912993
 0.8

In [35]:
# Scale the samples by 10 ahead of the integer conversion in the next cell.
x_gpu_10 = x_gpu * 10


In [36]:
# Convert to int16 so the values can be counted by cp.bincount below.
# Rebinds x_gpu_10, so the float version is no longer available afterwards.
x_gpu_10 = x_gpu_10.astype(np.int16)


In [37]:
%%time
bc = cp.bincount(x_gpu_10)

CPU times: user 1.02 ms, sys: 3.31 ms, total: 4.34 ms
Wall time: 3.64 ms


In [38]:
%%time
phi_gpu = get_phi_fuse(size)

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 19.4 ms


In [39]:
%%time
# CPU (NumPy) sampling of phi, for comparison with the GPU version.
phi_cpu = get_phi_cpu(size)

CPU times: user 131 ms, sys: 11.6 ms, total: 142 ms
Wall time: 141 ms


In [65]:
%%time
theta_gpu = get_theta_fuse(size, 0.9)

CPU times: user 1.66 ms, sys: 0 ns, total: 1.66 ms
Wall time: 1.2 ms


In [45]:
%%time
# CPU (NumPy) sampling of theta with g = 0.9, for comparison with the GPU version.
theta_cpu = get_theta_cpu(size, 0.9)

CPU times: user 479 ms, sys: 70.7 ms, total: 550 ms
Wall time: 549 ms
