# Pandas Speed Comparison - Simple Function

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fixed seed so every "Restart Kernel -> Run All" benchmarks the same data.
SEED = 42
rng = np.random.default_rng(SEED)

# Number of rows in the benchmark frame.
column_size = 100_000

# Five columns of uniform random floats in [0, 1); each benchmark below feeds
# them to the function under test as the arguments a..e.
df = pd.DataFrame(
    {
        "A" : rng.random(column_size),
        "B" : rng.random(column_size),
        "C" : rng.random(column_size),
        "D" : rng.random(column_size),
        "E" : rng.random(column_size),
    }
)

In [3]:
def simple_function(a, b, c, d, e):
    """Return ``(a + b) - (c * d) / e``.

    Only the arithmetic operators are used, so the same function is called
    with per-row scalars as well as with whole columns/arrays in the
    benchmarks of this notebook.
    """
    # Single expression on purpose: the grouping below is exactly what
    # operator precedence gives ``a + b - c * d / e``.
    return (a + b) - ((c * d) / e)

## Iterrows

In [4]:
%%timeit
result = []
for each_index, each_row in df.iterrows():
    each_result = simple_function(each_row.A, each_row.B, each_row.C, each_row.D, each_row.E)
    result.append(each_result)
df["RESULT"] = result

8.72 s ± 384 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Apply

In [5]:
%%timeit
df["RESULT"] = df.apply(lambda x : simple_function(x.A, x.B, x.C, x.D, x.E), axis='columns')

4.26 s ± 518 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Pandarallel

In [6]:
from pandarallel import pandarallel

In [7]:
# Initialize pandarallel with its default settings (no arguments are passed).
# The log lines printed by this call report the number of workers and the
# data-transfer mechanism it selected.
pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [8]:
%%timeit
df["RESULT"] = df.parallel_apply(lambda x : simple_function(x.A, x.B, x.C, x.D, x.E), axis='columns')

2.39 s ± 219 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Itertuples 

In [9]:
%%timeit
result = []
for each_row in df.itertuples():
    each_result = simple_function(each_row.A, each_row.B, each_row.C, each_row.D, each_row.E)
    result.append(each_result)
df["RESULT"] = result

164 ms ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Swifter

In [10]:
import swifter

In [11]:
%%timeit
df["RESULT"] = df.swifter.progress_bar(False).apply(lambda x : simple_function(x.A, x.B, x.C, x.D, x.E), axis='columns')

4.49 ms ± 523 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Pandas Vectorized (whole Series)

In [12]:
%%timeit
df["RESULT"] = simple_function(df["A"], df["B"], df["C"], df["D"], df["E"])

1.86 ms ± 251 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## NumPy Vectorized (raw arrays via `.values`)

In [13]:
%%timeit
df['RESULT'] = simple_function(df["A"].values, df["B"].values, df["C"].values, df["D"].values, df["E"].values)

1.31 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Numba

In [14]:
import numba

In [15]:
@numba.njit()
def simple_function_numba(a, b, c, d, e):
    """Numba-jitted counterpart of ``simple_function``: ``a + b - c * d / e``."""
    # Deliberately the same single expression as the pure-Python version so
    # the benchmark compares like with like.
    return a + b - c * d / e

In [16]:
%%timeit
df['RESULT'] = simple_function_numba(df["A"].values, df["B"].values, df["C"].values, df["D"].values, df["E"].values)

911 µs ± 153 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# Note: for big datasets, consider Modin or Dask instead of plain pandas
# (both stand in for the pandas API and offer a faster read_csv).