In [2]:
from numba import vectorize

@vectorize(['int64(int64, int64)'], target='cuda')
def add_ufunc(x, y):
    """Scalar kernel for an element-wise int64 addition ufunc built for the CUDA target."""
    total = x + y
    return total

In [3]:
import numpy as np
# Two length-4 integer vectors: 1..4 and the same values scaled by ten.
a = np.arange(1, 5)
b = 10 * a
# View b as a 4x1 column so it can broadcast against 2-D operands.
b_col = b.reshape(-1, 1)
b_col.shape

(4, 1)

In [4]:
# Display the column-shaped view of b built in the previous cell.
b_col

array([[10],
       [20],
       [30],
       [40]])

In [5]:
# 4x4 matrix holding 0..15, filled row by row.
c = np.arange(16).reshape(4, 4)

# Same-shape operands first, then a (4, 1) column broadcast against the
# (4, 4) matrix; a blank line separates the two printed results.
print('a+b:\n', add_ufunc(a, b), end='\n\n')
print('b_col + c:\n', add_ufunc(b_col, c))

a+b:
 [11 22 33 44]

b_col + c:
 [[10 11 12 13]
 [24 25 26 27]
 [38 39 40 41]
 [52 53 54 55]]




In [6]:
# CPU baseline: NumPy's built-in add on the same broadcast operands.
%timeit np.add(b_col, c)

873 ns ± 3.16 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [7]:
# Same broadcast addition through the CUDA ufunc. The recorded run is far
# slower than the NumPy baseline; presumably the tiny inputs are dominated by
# host/device transfer and launch overhead -- TODO confirm by profiling.
%timeit add_ufunc(b_col, c) # Numba on GPU

672 µs ± 2.33 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [8]:
import math  # Note that for the CUDA target, we need to use the scalar functions from the math module, not NumPy

# sqrt(2*pi), rounded once to float32 so the kernel below works with a
# single-precision constant; Numba freezes module globals at compile time.
SQRT_2PI = np.float32((2 * math.pi) ** 0.5)

@vectorize(['float32(float32, float32, float32)'], target='cuda')
def gaussian_pdf(x, mean, sigma):
    '''Gaussian probability density at x for the given mean and standard deviation sigma.'''
    # Standardised distance of x from the mean.
    z = (x - mean) / sigma
    normalization = sigma * SQRT_2PI
    return math.exp(-0.5 * z**2) / normalization

In [9]:
# Evaluate the Gaussian distribution PDF a million times!
# Draw the sample from a locally seeded generator so the data (and the
# quick-test value printed below) is identical after Restart & Run All.
rng = np.random.default_rng(seed=0)
x = rng.uniform(-3, 3, size=1000000).astype(np.float32)
# Distribution parameters as float32 scalars, matching the ufunc signature.
mean = np.float32(0.0)
sigma = np.float32(1.0)

# Quick test: evaluate the ufunc on a single sample (mean 0, sigma 1) before
# timing it on the full array.
gaussian_pdf(x[0], 0.0, 1.0)



array([0.00869729])

In [10]:
import scipy.stats # for definition of gaussian distribution
# Note: despite the name, this binds the scipy.stats.norm distribution object;
# the density itself is its .pdf method, called below.
norm_pdf = scipy.stats.norm
# CPU reference timing: SciPy's normal PDF over the whole sample array x.
%timeit norm_pdf.pdf(x, loc=mean, scale=sigma)

29.3 ms ± 147 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Time the CUDA ufunc on the same inputs as the SciPy reference above.
%timeit gaussian_pdf(x, mean, sigma)

4.45 ms ± 49.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
from numba import cuda

@cuda.jit(device=True)
def polar_to_cartesian(rho, theta):
    """Device helper: convert polar coordinates (rho, theta) to an (x, y) tuple."""
    return rho * math.cos(theta), rho * math.sin(theta)

@vectorize(['float32(float32, float32, float32, float32)'], target='cuda')
def polar_distance(rho1, theta1, rho2, theta2):
    """Euclidean distance between two points given in polar coordinates."""
    # Convert both points to Cartesian form with the device helper.
    x1, y1 = polar_to_cartesian(rho1, theta1)
    x2, y2 = polar_to_cartesian(rho2, theta2)

    dx = x1 - x2
    dy = y1 - y2
    return (dx**2 + dy**2)**0.5

In [14]:
n = 1000000
# Draw all four inputs from one locally seeded generator so the data (and any
# result computed from it) is identical after Restart & Run All.
rng = np.random.default_rng(seed=1)
# Radii in [0.5, 1.5] and angles in [-pi, pi], stored as float32 to match the
# float32 signature of the distance ufuncs.
rho1 = rng.uniform(0.5, 1.5, size=n).astype(np.float32)
theta1 = rng.uniform(-np.pi, np.pi, size=n).astype(np.float32)
rho2 = rng.uniform(0.5, 1.5, size=n).astype(np.float32)
theta2 = rng.uniform(-np.pi, np.pi, size=n).astype(np.float32)

In [15]:
# Run the CUDA ufunc over all n point pairs and display the resulting array.
polar_distance(rho1, theta1, rho2, theta2)

array([1.28781067, 1.10001969, 1.90797696, ..., 1.52320821, 1.20499107,
       1.90555966])

In [16]:
import numba

@numba.jit
def polar_to_cartesian_cpu(rho, theta):
    """CPU counterpart of the device helper: polar (rho, theta) to an (x, y) tuple."""
    return rho * math.cos(theta), rho * math.sin(theta)

@vectorize(['float32(float32, float32, float32, float32)'])  # default target is CPU
def polar_distance_cpu(rho1, theta1, rho2, theta2):
    """CPU ufunc: Euclidean distance between two points given in polar coordinates."""
    x1, y1 = polar_to_cartesian_cpu(rho1, theta1)
    x2, y2 = polar_to_cartesian_cpu(rho2, theta2)

    dx = x1 - x2
    dy = y1 - y2
    return (dx**2 + dy**2)**0.5

# Cross-check: the CUDA ufunc must agree with the CPU ufunc on the same
# inputs, within a small relative/absolute tolerance.
np.testing.assert_allclose(
    actual=polar_distance(rho1, theta1, rho2, theta2),
    desired=polar_distance_cpu(rho1, theta1, rho2, theta2),
    rtol=1e-7,
    atol=5e-7,
)

In [17]:
# Time the CPU-target ufunc on the n point pairs.
%timeit polar_distance_cpu(rho1, theta1, rho2, theta2)

18.6 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Time the CUDA-target ufunc on the same host arrays for comparison.
%timeit polar_distance(rho1, theta1, rho2, theta2)

7.99 ms ± 31.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


: 

In [11]:
def find_all_primes_cpu(upper):
    """Return, in ascending order, every prime p with 2 <= p < upper.

    Uses plain trial division: a candidate is kept when no integer from 2
    through candidate // 2 divides it evenly.
    """
    primes = []
    for candidate in range(2, upper):
        has_divisor = False
        for divisor in range(2, candidate // 2 + 1):
            if candidate % divisor == 0:
                has_divisor = True
                break
        if not has_divisor:
            primes.append(candidate)
    return primes

In [2]:
# Time the prime search up to 10_000 (10 loops, single run).
# NOTE(review): the execution counts in this part of the notebook are out of
# order, so confirm which definition of find_all_primes_cpu was live when the
# recorded timing was taken.
%timeit -n 10 -r 1 find_all_primes_cpu(10_000)

125 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [9]:
from numba import jit

@jit(nopython=True)
def find_all_primes_cpu(upper):
    """Numba-compiled trial division: every prime p with 2 <= p < upper, ascending."""
    found = []
    for n in range(2, upper):
        # Largest divisor worth trying for this candidate.
        limit = n // 2
        d = 2
        # Advance until a divisor is hit or the candidates are exhausted.
        while d <= limit and n % d != 0:
            d += 1
        # Running past the limit means nothing divided n, so it is prime.
        if d > limit:
            found.append(n)
    return found

In [7]:
# Same measurement as before (upper = 10_000, 10 loops, single run), intended
# for the @jit-decorated definition from the preceding cell.
%timeit -n 10 -r 1 find_all_primes_cpu(10_000)

29.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [12]:
from numba import jit

upper = 10_000
%timeit -n 10 -r 1 jit(nopython=True)(find_all_primes_cpu)(upper)

114 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)
