In [1]:
from numba import jit

### Basic `@jit` usage (compiled via LLVM)

In [2]:
@jit
def f(x, y):
    """Return the product of ``x`` and ``y`` (no explicit signature is given)."""
    product = x * y
    return product


hex(f(0xfffffff, 0xfffffff))

'0xffffffe0000001'

### Signature specifications
Explicit `@jit` signatures can use a number of types. Here are some common ones:

* `void` is the return type of functions returning nothing (which actually return None when called from Python)
* `intp` and `uintp` are pointer-sized integers (signed and unsigned, respectively)
* `intc` and `uintc` are equivalent to C int and unsigned int integer types
* `int8`, `uint8`, `int16`, `uint16`, `int32`, `uint32`, `int64`, `uint64` are fixed-width integers of the corresponding bit width (signed and unsigned)
* `float32` and `float64` are single- and double-precision floating-point numbers, respectively
* `complex64` and `complex128` are single- and double-precision complex numbers, respectively
* array types can be specified by indexing any numeric type, e.g. `float32[:]` for a one-dimensional single-precision array or `int8[:,:]` for a two-dimensional array of 8-bit integers.

In [3]:
from numba import jit, int32, int64

@jit(int32(int32, int32))
def f(x, y):
    """Multiply two values under an explicit ``int32(int32, int32)`` signature."""
    # The product of the arguments used below does not fit in the declared
    # int32 return type, which is why the displayed result is negative.
    return x * y
hex(f(0xfffffff, 0xfffffff))

'-0x1fffffff'

In [4]:
@jit(int64(int32, int32))
def f(x, y):
    """Multiply two int32 arguments and return the product as an int64."""
    product = x * y
    return product


# The 64-bit return type is wide enough to hold the full product.
hex(f(0xfffffff, 0xfffffff))

'0xffffffe0000001'

### Jitted functions can call or inline other jitted functions

In [5]:
import math
@jit
def square(x):
    """Return ``x`` raised to the power of two."""
    squared = x ** 2
    return squared


@jit
def hypot(x, y):
    """Return ``sqrt(x**2 + y**2)``, using the jitted ``square`` for each term."""
    sum_of_squares = square(x) + square(y)
    return math.sqrt(sum_of_squares)

In [6]:
# hypot is jitted and itself calls the jitted square.
hypot(1,2)

2.23606797749979

In [7]:
# Rebind the global name `square` to a plain Python function that returns 0.
def square(x):
    return 0
# The result below is the same as before the rebinding: hypot does not pick
# up the new `square` (presumably because it was already compiled against the
# jitted version; see Numba's notes on globals as compile-time constants).
hypot(1,2)

2.23606797749979

### Compilation options
`nopython`

A Numba compilation mode that generates code that does not access the Python C API. This compilation mode produces the highest performance code, but requires that the native types of all values in the function can be inferred. Unless otherwise instructed, the @jit decorator will automatically fall back to object mode if nopython mode cannot be used.

In [8]:
@jit(nopython=True)
def f(x, y):
    """Return ``x + y``; compiled with ``nopython=True``."""
    total = x + y
    return total

`nogil`

Whenever Numba optimizes Python code to native code that only works on native types and variables (rather than Python objects), it is not necessary anymore to hold Python’s global interpreter lock (GIL). Numba will release the GIL when entering such a compiled function if you passed `nogil=True`.

In [9]:
@jit(nogil=True)
def f(x, y):
    """Return ``x + y``; compiled with ``nogil=True``."""
    total = x + y
    return total

`cache`

To avoid compilation times each time you invoke a Python program, you can instruct Numba to write the result of function compilation into a file-based cache. 

In [10]:
@jit(cache=True)
def f(x, y):
    """Return ``x + y``; compiled with ``cache=True``."""
    total = x + y
    return total

`parallel`

Enables automatic parallelization (and related optimizations) for those operations in the function known to have parallel semantics. For a list of supported operations, see the “Automatic parallelization with @jit” section of the Numba documentation. This feature is enabled by passing `parallel=True` and must be used in conjunction with `nopython=True`:

In [11]:
@jit(nopython=True, parallel=True)
def f(x, y):
    """Return ``x + y``; compiled with ``nopython=True`` and ``parallel=True``."""
    total = x + y
    return total

### Creating NumPy universal functions
`@vectorize` compiles a Python function that takes scalar input arguments into a NumPy ufunc.

The `@vectorize` decorator has two modes of operation:

* Eager, or decoration-time, compilation: If you pass one or more type signatures to the decorator, you will be building a Numpy universal function (ufunc). The rest of this subsection describes building ufuncs using decoration-time compilation.
* Lazy, or call-time, compilation: When not given any signatures, the decorator will give you a Numba dynamic universal function (DUFunc) that dynamically compiles a new kernel when called with a previously unsupported input type. A later subsection, “Dynamic universal functions”, describes this mode in more depth.

In [12]:
from numba import vectorize, float64
import numpy as np

@vectorize([float64(float64, float64)])
def f(x, y):
    """Scalar kernel for the ufunc: add two float64 values."""
    total = x + y
    return total


f(np.arange(10, dtype=np.float64), np.arange(10, dtype=np.float64))

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14., 16., 18.])

In [13]:
# Plain Python lists are accepted; with only a float64 signature registered,
# the result comes back as a float array.
f([1,2,3], [3,2,1])

array([4., 4., 4.])

If you pass several signatures, beware that you have to pass the most specific signatures before the least specific ones (e.g., single-precision floats before double-precision floats), otherwise type-based dispatching will not work as expected:

In [14]:
from numba import vectorize, float64, float32, int32, int64
# Signatures are listed from most specific to least specific, as required
# for type-based dispatch to pick the intended loop.
@vectorize([int32(int32, int32),
            int64(int64, int64),
            float32(float32, float32),
            float64(float64, float64)])
def f(x, y):
    """Scalar kernel for the ufunc: add ``x`` and ``y``."""
    return x + y
# With integer signatures available, integer inputs now give an integer array.
f([1,2,3], [3,2,1])

array([4, 4, 4])

Ufuncs created with `@vectorize` support broadcasting, reduction, and accumulation:

In [15]:
# Build a 3x4 integer matrix to demonstrate the ufunc methods.
a = np.arange(12).reshape(3, 4)
print(a)
# With no axis given, reduce collapses the first axis: one sum per column.
f.reduce(a)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


array([12, 15, 18, 21])

In [16]:
# Reduce along axis 1 instead: one sum per row.
f.reduce(a, axis=1)

array([ 6, 22, 38])

In [17]:
# Running application of f down the first axis (a cumulative sum per column).
f.accumulate(a)

array([[ 0,  1,  2,  3],
       [ 4,  6,  8, 10],
       [12, 15, 18, 21]])

In [18]:
# A (10, 1) column and a (1, 10) row broadcast to a (10, 10) result.
f(np.arange(10)[:, None], np.arange(10)[None,:])

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
       [ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       [ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
       [ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
       [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]])

In [19]:
# When `where=` is used without an explicit `out=`, the positions where the
# mask is False are left uninitialized and show arbitrary memory contents.
# Supplying a zero-filled output array makes the result deterministic.
values = np.arange(5)
f(values, values, where=[True, False, False, True, True], out=np.zeros_like(values))

array([0, 0, 0, 6, 8])

`vectorize` supports multiple targets: `cpu`, `parallel`, `cuda`

In [21]:
from numba import vectorize, float64, float32, int32, int64
# Same signature list and kernel body as a CPU ufunc; only `target` differs.
@vectorize([int32(int32, int32),
            int64(int64, int64),
            float32(float32, float32),
            float64(float64, float64)], target='cuda')
def f(x, y):
    return x + y
# NOTE(review): presumably requires a CUDA-capable GPU and toolkit; confirm
# before expecting this cell to run on another machine.
f(np.arange(10000), np.arange(10000))

array([    0,     2,     4, ..., 19994, 19996, 19998])

### The @guvectorize decorator
The `guvectorize()` decorator takes the concept one step further and allows you to write ufuncs that will work on an arbitrary number of elements of input arrays, and take and return arrays of differing dimensions. The typical example is a running median or a convolution filter.

`guvectorize()` functions don’t return their result value: they take it as an array argument, which must be filled in by the function. This is because the array is actually allocated by NumPy’s dispatch mechanism, which calls into the Numba-generated code.

In [23]:
from numba import guvectorize
@guvectorize([(int64[:], int64, int64[:])], '(n),()->(n)')
def g(x, y, res):
    """Add the scalar ``y`` to every element of ``x``, writing into ``res``.

    Nothing is returned; the result is stored in the ``res`` output array.
    """
    length = x.shape[0]
    for idx in range(length):
        res[idx] = x[idx] + y

In [24]:
# Only the two inputs are passed; the `res` output array is not supplied by
# the caller.
g([1,2,3], 5)

array([6, 7, 8], dtype=int64)

### Ahead-of-time (AOT) compilation

In [26]:
from numba.pycc import CC

# Declare an extension module named 'my_module' to be compiled ahead of time.
cc = CC('my_module')
# Print the build steps while compiling.
cc.verbose = True

# `mult` is exported twice, once per (exported name, signature) pair.
@cc.export('multf', 'f8(f8, f8)')
@cc.export('multi', 'i4(i4, i4)')
def mult(a, b):
    """Return the product of ``a`` and ``b``."""
    return a * b

@cc.export('square', 'f8(f8)')
def square(a):
    """Return ``a`` squared."""
    return a ** 2
# Build the extension module containing the exported functions.
cc.compile()

generating LLVM code for 'my_module' into C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\my_module.cp37-win_amd64.o
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users\tzerj
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users\tzerj\Anaconda3
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users\tzerj\Anaconda3\envs
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users\tzerj\Anaconda3\envs\test
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users\tzerj\Anaconda3\envs\test\lib
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users\tzerj\Anaconda3\envs\test\lib\site-packages
creating C:\Users\tzerj\AppData\Local\Temp\pycc-build-my_module-qml2jywq\Users\tzerj\Anaconda3\envs\test\lib\site-packages\numba
creating C:\Users\tzerj\AppData\Local\Temp

In [28]:
# List the working directory to confirm the compiled extension was written.
!ls

01-numba-intro.ipynb
my_module.cp37-win_amd64.pyd


In [29]:
# Import the ahead-of-time compiled extension and call its exported `multi`.
import my_module
my_module.multi(3, 4)

12

### Explicit Parallel Loops


In [30]:
from numba import njit, prange

@njit(parallel=True)
def prange_test(A):
    """Return the sum of ``A[i]`` over the first axis of ``A``.

    The loop is written with ``prange`` so that it can run in parallel under
    ``parallel=True``.
    """
    total = 0
    n_items = A.shape[0]
    # If the decorator did not pass parallel=True, prange would behave
    # exactly like range here.
    for idx in prange(n_items):
        total += A[idx]
    return total