# Pandas Speed Comparison - Complex Function

In [1]:
import numpy as np
import pandas as pd

In [2]:
column_size = 100_000  # number of rows in every column
# Seeded generator: a fresh kernel rebuilds exactly the same frame, so every
# timing in this notebook is taken on identical, reproducible data.
rng = np.random.default_rng(42)
# Five columns (A-E) of uniform floats drawn from [0, 1).
df = pd.DataFrame(
    {name: rng.random(column_size) for name in ("A", "B", "C", "D", "E")}
)

In [3]:
def complex_function(a, b, c, d, e):
    """Report whether both ``a`` and ``b`` lie in the half-open range [0, 0.3).

    ``c``, ``d`` and ``e`` are accepted but never read. ``b`` is only examined
    when ``a`` is already in range. The result is always a plain ``bool``.
    """
    return bool((0 <= a < 0.3) and (0 <= b < 0.3))

## Iterrows

In [4]:
%%timeit
# Baseline: walk the frame with iterrows(), which yields (index, row) pairs.
# Each row's A-E values are read by attribute and passed to complex_function;
# the index itself is not used.
result = []
for each_index, each_row in df.iterrows():
    each_result = complex_function(each_row.A, each_row.B, each_row.C, each_row.D, each_row.E)
    result.append(each_result)
# Writes a RESULT column onto the shared df, so every later cell iterates over
# a frame that also contains RESULT.
df["RESULT"] = result

12.5 s ± 1.01 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Apply

In [5]:
%%timeit
# Row-wise DataFrame.apply: with axis='columns' the lambda receives one row at
# a time and forwards its A-E values to complex_function.
df["RESULT"] = df.apply(lambda x : complex_function(x.A, x.B, x.C, x.D, x.E), axis='columns')

4.54 s ± 446 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Pandarallel

In [6]:
from pandarallel import pandarallel

In [7]:
# Start pandarallel with its default settings; the output captured below
# reports the worker count and data-transfer mode it selected.
pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [8]:
%%timeit
# Same row-wise lambda as the plain apply cell, dispatched through
# pandarallel's parallel_apply instead of DataFrame.apply.
df["RESULT"] = df.parallel_apply(lambda x : complex_function(x.A, x.B, x.C, x.D, x.E), axis='columns')

2.33 s ± 127 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Itertuples 

In [9]:
%%timeit
# Loop over itertuples(): each row arrives as a tuple whose columns are read
# by attribute (each_row.A ...), then passed to complex_function.
result = []
for each_row in df.itertuples():
    each_result = complex_function(each_row.A, each_row.B, each_row.C, each_row.D, each_row.E)
    result.append(each_result)
# Collected per-row booleans become the RESULT column.
df["RESULT"] = result

149 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Swifter

In [23]:
import swifter

In [24]:
def complex_function_swifter(df):
    """Report whether the ``A`` and ``B`` attributes of ``df`` both lie in [0, 0.3).

    Despite its name, ``df`` is the single object handed in by a row-wise apply;
    only ``df.A`` and ``df.B`` are read, and ``df.B`` only when ``df.A`` is
    already in range. The result is always a plain ``bool``.
    """
    return bool((0 <= df.A < 0.3) and (0 <= df.B < 0.3))

In [25]:
%%timeit
# Route the row-wise apply through the swifter accessor, with its progress bar
# switched off; complex_function_swifter receives one row per call.
df["RESULT"] = df.swifter.progress_bar(False).apply(complex_function_swifter, axis='columns')

4.08 s ± 425 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Pandas Vectorize

In [12]:
%%timeit
# Whole-column version: start with RESULT all False, then use one boolean mask
# to flip to True the rows where both A and B fall in [0, 0.3).
df["RESULT"] = False
df.loc[(df['A'] >= 0) & (df['A'] < 0.3) & (df['B'] >= 0) & (df['B'] < 0.3), 'RESULT'] = True

2.2 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# Note: use pd.cut() for multiple conditions and binning 

## Numpy Vectorize

In [14]:
%%timeit
# Wrap the scalar complex_function with np.vectorize and feed it the raw column
# arrays. The wrapper is rebuilt inside the timed body, so its construction is
# part of the measurement.
# NOTE(review): the NumPy docs describe np.vectorize as a convenience wrapper
# that is essentially a loop, not compiled vectorization.
complex_function_vectorize = np.vectorize(complex_function)
df["RESULT"] = complex_function_vectorize(df["A"].values, df["B"].values, df["C"].values, df["D"].values, df["E"].values)

41.9 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
%%timeit
# Build the boolean mask on the raw NumPy arrays (.values) rather than on
# Series, then let np.where map it to True/False for the RESULT column.
df["RESULT"] = np.where((df['A'].values >= 0) & (df['A'].values < 0.3) & (df['B'].values >= 0) & (df['B'].values < 0.3), True, False)

865 µs ± 136 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
%%timeit
# np.select form of the same test: one condition paired with one choice.
# Rows where both A and B are in [0, 0.3) get True ...
conditions = [
    (df['A'].values >= 0) & (df['A'].values < 0.3) & (df['B'].values >= 0) & (df['B'].values < 0.3)
]
choices = [
    True
]
# ... and rows matching no condition get the default, False.
df["RESULT"] = np.select(conditions, choices, default=False)

913 µs ± 93.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Numba

In [17]:
import numba

In [18]:
@numba.njit()
def complex_function_numba(a, b, c, d, e):
    """Element-wise test of whether both ``a`` and ``b`` lie in [0, 0.3).

    Compiled with numba's ``njit``. ``c``, ``d`` and ``e`` are accepted but not
    used. The combined mask is passed through ``np.where`` to produce
    True/False values.
    """
    a_in_range = (a >= 0) & (a < 0.3)
    b_in_range = (b >= 0) & (b < 0.3)
    return np.where(a_in_range & b_in_range, True, False)

In [19]:
%%timeit
df["RESULT"] = complex_function_numba(df["A"].values, df["B"].values, df["C"].values, df["D"].values, df["E"].values)

723 µs ± 270 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# Note: for big datasets, consider modin or dask (they stand in for the pandas API and offer a faster read_csv)