In [22]:
import numpy as np

# Seeded generator: every run of the notebook draws the same benchmark data.
rng = np.random.default_rng(seed=42)

# Benchmark input: a 50,000 x 10,000 array of standard-normal samples.
X = rng.standard_normal((50_000, 10_000))

# np.sum

In [3]:
%%timeit -n 20 -r 5

# Baseline: NumPy's built-in reduction of X along axis 1.
np.sum(X, axis=1)

94.7 ms ± 2.59 ms per loop (mean ± std. dev. of 5 runs, 20 loops each)


# Matrix multiplication with `@`

In [4]:
# Right-hand operand for the matrix product: 10,000 x 2,000 standard-normal samples.
Y = rng.standard_normal((10_000, 2_000))

In [6]:
# Matrix product of X and Y via the `@` operator.
Z = X @ Y

## How to run `add` in parallel?

## Numba

In [41]:
from numba import njit, prange

In [42]:
@njit(parallel=True)
def numba_sum_axis_1(X):
    """Sum each row of a 2-D array, with rows distributed across threads.

    Parameters
    ----------
    X : 2-D array
        Array whose rows are summed.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``X.shape[0]``; entry ``i`` is the sum
        of ``X[i, :]``.
    """
    rows, cols = X.shape
    out = np.zeros(rows)

    # Each prange iteration owns exactly one output slot, so there are no
    # cross-iteration dependencies.
    for i in prange(rows):
        # Accumulate in a local scalar and store once per row instead of
        # doing a read-modify-write on `out[i]` for every element.
        row_total = 0.0
        for j in range(cols):
            row_total += X[i, j]
        out[i] = row_total
    return out

In [44]:
%%timeit -n 20 -r 5

numba_sum_axis_1(X)

2.4 ms ± 173 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


## PyTorch

In [10]:
import torch

In [11]:
def torch_sum(X, axis):
    """Sum ``X`` along ``axis`` with PyTorch and return a NumPy array.

    ``X`` is converted with ``torch.asarray``, reduced with a sum over
    dimension ``axis``, and the result is converted back with
    ``np.asarray``.
    """
    reduced = torch.asarray(X).sum(dim=axis)
    return np.asarray(reduced)

In [23]:
%%timeit -n 20 -r 5

# Times the PyTorch round trip defined in torch_sum, summing X along axis 1.
torch_sum(X, axis=1)

33.3 ms ± 924 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


In [27]:
from threadpoolctl import threadpool_limits

In [31]:
%%timeit -n 20 -r 5

# Same torch_sum call, run while threadpoolctl limits OpenMP thread pools to
# a single thread. Entering and leaving the context manager is part of the
# timed statement.
with threadpool_limits(limits=1, user_api="openmp"):
    torch_sum(X, axis=1)

66.7 ms ± 321 µs per loop (mean ± std. dev. of 5 runs, 20 loops each)


## Pandas!

In [32]:
import pandas as pd
rng = np.random.default_rng(0)
X = rng.poisson(lam=3.0, size=(1_000_000, 10))

In [33]:
data = pd.DataFrame(X)
roll = data.rolling(100)

### Default Pandas

In [37]:
%%timeit -n 10 -r 3
# Rolling mean with pandas' default engine (no `engine` argument is passed).
_ = roll.mean()

143 ms ± 350 µs per loop (mean ± std. dev. of 3 runs, 10 loops each)


### Numba with Parallel

In [40]:
%%timeit -n 20 -r 10
_ = roll.mean(engine="numba", engine_kwargs={"parallel":True})

38.4 ms ± 622 µs per loop (mean ± std. dev. of 10 runs, 20 loops each)
