https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html

In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.__version__

'0.24.2'

In [3]:
# Demo frame used by every benchmark below:
#   'a', 'b' -> float columns later passed as the integration bounds
#   'N'      -> integer column (100..999) later passed as the number of steps
#   'x'      -> constant string column
# NOTE: the random generator is not seeded, so the values differ on every run.
N_ROWS = 1000
df = pd.DataFrame({
    'a': np.random.randn(N_ROWS),
    'b': np.random.randn(N_ROWS),
    'N': np.random.randint(100, 1000, size=N_ROWS),
    'x': 'x',
})

In [4]:
df.head()

Unnamed: 0,a,b,N,x
0,0.745424,-0.040583,369,x
1,-1.045403,0.409226,957,x
2,-0.131158,-0.767767,348,x
3,0.197363,-1.023801,325,x
4,1.114628,0.737428,263,x


In [5]:
def f(x):
    """Integrand used by the benchmarks: return ``x * (x - 1)``."""
    return x * (x - 1)


def integrate_f(a, b, N):
    """Approximate the integral of ``f`` over ``[a, b]`` with a left Riemann sum.

    Parameters
    ----------
    a, b : number
        Lower and upper integration bounds.
    N : int
        Number of equal-width sub-intervals; it is used both as a divisor
        and as the argument of ``range``.

    Returns
    -------
    number
        ``dx * sum(f(a + i * dx) for i in range(N))`` with ``dx = (b - a) / N``.
    """
    s = 0
    dx = (b - a) / N
    for i in range(N):
        # Sample the integrand at the left edge of the i-th sub-interval.
        s += f(a + i * dx)
    return s * dx

In [6]:
print(f(2))

2


In [7]:
integrate_f(0, 1, 100)

-0.16664999999999994

In [8]:
%timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1)

161 ms ± 6.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


161 ms ± 6.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [9]:
%prun -l 4 df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) 

 

```
675658 function calls (670638 primitive calls) in 0.313 seconds

   Ordered by: internal time
   List reduced from 212 to 4 due to restriction <4>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     1000    0.159    0.000    0.231    0.000 <ipython-input-6-a096a18ae598>:5(integrate_f)
   556371    0.072    0.000    0.072    0.000 <ipython-input-6-a096a18ae598>:1(f)
     3000    0.010    0.000    0.053    0.000 base.py:3090(get_value)
     3000    0.006    0.000    0.061    0.000 series.py:764(__getitem__)

```

In [10]:
%load_ext Cython

In [11]:
%%cython
   ...: def f_plain(x):
   ...:     return x * (x - 1)
   ...: def integrate_f_plain(a, b, N):
   ...:     s = 0
   ...:     dx = (b - a) / N
   ...:     for i in range(N):
   ...:         s += f_plain(a + i * dx)
   ...:     return s * dx

In [12]:
f_plain(1)

0

In [13]:
%timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1)

89.9 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


91.9 ms ± 571 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In [14]:
%%cython
   ...: cdef double f_typed(double x) except? -2:
   ...:     return x * (x - 1)
   ...: cpdef double integrate_f_typed(double a, double b, int N):
   ...:     cdef int i
   ...:     cdef double s, dx
   ...:     s = 0
   ...:     dx = (b - a) / N
   ...:     for i in range(N):
   ...:         s += f_typed(a + i * dx)
   ...:     return s * dx

In [15]:
%timeit df.apply(lambda x: integrate_f_plain(x['a'], x['b'], x['N']), axis=1)

89.2 ms ± 980 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


89.2 ms ± 980 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

### Using Numba

In [16]:
import numba

In [17]:
numba.__version__

'0.45.0'

In [20]:
@numba.jit
def f_plain(x):
    """Numba-compiled integrand: return ``x * (x - 1)``.

    NOTE(review): this rebinds the name ``f_plain`` that the earlier
    ``%%cython`` cell also defines.
    """
    shifted = x - 1
    return x * shifted


@numba.jit
def integrate_f_numba(a, b, N):
    """Left Riemann sum of ``f_plain`` over ``[a, b]`` using ``N`` steps.

    ``N`` is used both as a divisor and as the argument of ``range``.
    """
    width = (b - a) / N
    total = 0
    for step in range(N):
        # Sample the integrand at the left edge of each sub-interval.
        total = total + f_plain(a + step * width)
    return total * width


@numba.jit
def apply_integrate_f_numba(col_a, col_b, col_N):
    """Call ``integrate_f_numba`` once per row of three equally long columns.

    Returns a new float64 array whose ``k``-th entry is
    ``integrate_f_numba(col_a[k], col_b[k], col_N[k])``.
    """
    n_rows = len(col_N)
    out = np.empty(n_rows, dtype='float64')
    # All three columns must have the same length.
    assert len(col_a) == len(col_b) == n_rows
    for row in range(n_rows):
        out[row] = integrate_f_numba(col_a[row], col_b[row], col_N[row])
    return out

def compute_numba(df):
    """Integrate every row of ``df`` with the Numba kernels and return a Series.

    Reads columns ``'a'``, ``'b'`` and ``'N'`` of ``df``, passes them as NumPy
    arrays to ``apply_integrate_f_numba``, and wraps the result in a Series
    named ``'result'`` that shares ``df``'s index.
    """
    # Deliberately NOT decorated with @numba.jit: the body only manipulates
    # pandas objects, which Numba cannot type. Jitting it buys nothing and
    # either triggers the (deprecated) object-mode fallback or fails outright
    # on newer Numba releases. The heavy lifting is already compiled below.
    integrals = apply_integrate_f_numba(
        df['a'].to_numpy(),
        df['b'].to_numpy(),
        df['N'].to_numpy(),
    )
    return pd.Series(integrals, index=df.index, name='result')

In [22]:
%timeit df_out = compute_numba(df)

1.16 ms ± 12.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


1.36 ms ± 271 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [25]:
df_out = compute_numba(df)

In [26]:
df_out.head()

0    0.139163
1    0.868182
2   -0.435132
3   -0.860688
4    0.021135
Name: result, dtype: float64

### Short intro to Numba

https://numba.pydata.org/numba-doc/dev/user/5minguide.html

In [29]:
from numba import jit
import numpy as np

x = np.arange(100).reshape(10, 10)

# @jit(nopython=True) # Set "nopython" mode for best performance, equivalent to @njit
def go_fast(a):
    """Add the summed ``tanh`` of the diagonal entries of ``a`` to every element.

    The diagonal is read as ``a[k, k]`` for ``k`` in ``range(a.shape[0])``.
    While the decorator line above stays commented out this runs as plain
    Python; enabling it lets Numba compile the function on its first call.
    """
    diag_total = 0
    for k in range(a.shape[0]):                      # explicit loop (Numba-friendly)
        diag_total = diag_total + np.tanh(a[k, k])   # NumPy ufunc on a scalar
    return a + diag_total                            # scalar broadcast over the array

print(go_fast(x))

[[  9.  10.  11.  12.  13.  14.  15.  16.  17.  18.]
 [ 19.  20.  21.  22.  23.  24.  25.  26.  27.  28.]
 [ 29.  30.  31.  32.  33.  34.  35.  36.  37.  38.]
 [ 39.  40.  41.  42.  43.  44.  45.  46.  47.  48.]
 [ 49.  50.  51.  52.  53.  54.  55.  56.  57.  58.]
 [ 59.  60.  61.  62.  63.  64.  65.  66.  67.  68.]
 [ 69.  70.  71.  72.  73.  74.  75.  76.  77.  78.]
 [ 79.  80.  81.  82.  83.  84.  85.  86.  87.  88.]
 [ 89.  90.  91.  92.  93.  94.  95.  96.  97.  98.]
 [ 99. 100. 101. 102. 103. 104. 105. 106. 107. 108.]]


In [30]:
%timeit go_fast(x)

29.6 µs ± 405 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


* with @jit: 1.02 µs ± 15.6 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)

* no @jit: 29.6 µs ± 405 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

In [31]:
x

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])