In [10]:
import numpy as np
# Small demo vector plus a scalar-broadcast add (result intentionally unused).
a = np.array([1, 2, 3, 4])
# 0, 10, 20, ..., 499990 -- built with a vectorized multiply.
b = np.arange(50000) * 10
np.add(a, 500)

# (50000, 4) int32 matrix holding 0 .. 199999 in row-major order.
c = np.arange(50000 * 4).astype(np.int32).reshape(50000, 4)
# Column-vector view of `b` so it broadcasts against every column of `c`.
b_col = b.astype(np.int32)[:, np.newaxis]
b_col

array([[     0],
       [    10],
       [    20],
       ...,
       [499970],
       [499980],
       [499990]])

In [8]:
from numba import vectorize

@vectorize(['int32(int32, int32)'], target='cuda')
def add_ufunc(x, y):
    """Element-wise int32 addition compiled as a CUDA ufunc."""
    total = x + y
    return total

In [12]:
# CPU baseline: NumPy's built-in add, broadcasting the (50000, 1) column against the (50000, 4) matrix.
%timeit np.add(b_col, c)

1.27 ms ± 119 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
# Same operands through the CUDA-targeted add_ufunc, for comparison with np.add.
# NOTE(review): the recorded timing is slower than the NumPy run; presumably host<->device
# copies of the NumPy inputs dominate such a cheap operation -- confirm with device arrays.
%timeit add_ufunc(b_col, c)

10.9 ms ± 1.59 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
import math

# sqrt(2*pi) as float32; read as a global constant by the ufunc below.
SQRT_2PI = np.float32((2*math.pi)**0.5)

@vectorize(['float32(float32, float32, float32)'], target='cuda')
def gaussian_pdf(x, mu, sigma):
  """Normal (Gaussian) probability density evaluated element-wise.

  Computes exp(-0.5 * ((x - mu) / sigma)**2) / (sigma * sqrt(2*pi)).

  Args:
    x: point at which to evaluate the density.
    mu: mean of the distribution.
    sigma: standard deviation of the distribution.

  Returns:
    The density value as float32 (per the ufunc signature).
  """
  # The exponent must be negative: without the minus sign the value grows
  # with |x - mu| and is not a probability density.
  return math.exp(-0.5 * ((x - mu)/sigma)**2) / (sigma * SQRT_2PI)

x = np.random.uniform(-3, 3, size=1000000).astype(np.float32)
mu =  np.float32(0.0)
sigma = np.float32(1.0)

import scipy.stats
norm_pdf = scipy.stats.norm
%timeit norm_pdf.pdf(x, loc=mu, scale=sigma)

%timeit gaussian_pdf(x, mu, sigma)

129 ms ± 16.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
19.9 ms ± 4.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [36]:
from numba import cuda

@cuda.jit(device=True)
def polar_to_cartesian(r, theta):
  """Device function: convert polar (r, theta) to a Cartesian (x, y) tuple.

  theta is passed straight to math.cos / math.sin.
  """
  return r * math.cos(theta), r * math.sin(theta)

@vectorize(['float32(float32, float32, float32, float32)'], target='cuda')
def polar_distance(r1, r2, theta1, theta2):
  """Euclidean distance between two points given in polar coordinates.

  Each point is converted with polar_to_cartesian, then the straight-line
  distance between the resulting (x, y) pairs is returned.
  """
  ax, ay = polar_to_cartesian(r1, theta1)
  bx, by = polar_to_cartesian(r2, theta2)

  dx = ax - bx
  dy = ay - by
  return (dx**2 + dy**2)**0.5

n = 10_000_000

# Draw order is r1, r2, theta1, theta2 (generators are consumed left to right).
# Radii are uniform on [0.5, 1.5); angles are uniform on [-pi, pi).
r1, r2 = (
    np.random.uniform(0.5, 1.5, size=n).astype(np.float32) for _ in range(2)
)
theta1, theta2 = (
    np.random.uniform(-np.pi, np.pi, size=n).astype(np.float32) for _ in range(2)
)

print(polar_distance(r1, r2, theta1, theta2))

# Select GPU 0 and then close it via numba.cuda once the demo is finished.
cuda.select_device(0)
cuda.close()

[1.4192139 1.1245332 1.4098022 ... 0.8995225 2.4052236 1.9447448]
