In [13]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import time
from pandarallel import pandarallel
import math
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Initialize pandarallel

In [14]:
# Initialize pandarallel with its default settings; the parallel_* methods
# called in the following cells rely on this having been run first.
pandarallel.initialize()

Pandarallel will run on 4 workers


# DataFrame.apply

In [15]:
# Benchmark input: 5 million rows.
#   a -> random integers from np.random.randint(1, 8, ...), i.e. 1..7
#   b -> random floats from np.random.rand, i.e. in [0, 1)
df_size = 5_000_000
df = pd.DataFrame({
    "a": np.random.randint(1, 8, df_size),
    "b": np.random.rand(df_size),
})

In [16]:
def func(x):
    """Return sin(a**2) + sin(b**2) for a single row.

    `x` is any row-like object exposing numeric attributes `a` and `b`
    (this notebook passes DataFrame rows through `apply(..., axis=1)`).
    """
    sin_of_a_squared = math.sin(x.a ** 2)
    sin_of_b_squared = math.sin(x.b ** 2)
    return sin_of_a_squared + sin_of_b_squared

In [17]:
%%time
# Baseline: plain pandas apply; axis=1 hands each row of df to func.
res = df.apply(func, axis=1)

CPU times: user 2min 21s, sys: 445 ms, total: 2min 22s
Wall time: 2min 22s


In [18]:
%%time
# Same row-wise call through pandarallel's parallel_apply; stored under a
# separate name so it can be compared with the pandas baseline `res`.
res_parallel = df.parallel_apply(func, axis=1)

CPU times: user 190 ms, sys: 124 ms, total: 314 ms
Wall time: 1min 5s


In [19]:
# Sanity check: the parallel result must equal the pandas baseline.
res.equals(res_parallel)

True

# DataFrame.applymap

In [20]:
# Benchmark input: 10 million rows.
#   a -> random integers from np.random.randint(1, 8, ...), i.e. 1..7
#   b -> random floats from np.random.rand, i.e. in [0, 1)
df_size = 10_000_000
df = pd.DataFrame({
    "a": np.random.randint(1, 8, df_size),
    "b": np.random.rand(df_size),
})

In [21]:
def func(x):
    """Return sin(x**2) - cos(x**2) for a single scalar value."""
    squared = x ** 2
    return math.sin(squared) - math.cos(squared)

In [22]:
%%time
# Baseline: plain pandas applymap, calling func on every element of df.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas (2.1+) in
# favour of DataFrame.map -- confirm which pandas version this notebook targets.
res = df.applymap(func)

CPU times: user 19.1 s, sys: 1.12 s, total: 20.2 s
Wall time: 20.2 s


In [23]:
%%time
# Same element-wise call through pandarallel's parallel_applymap; stored
# under a separate name so it can be compared with the baseline `res`.
res_parallel = df.parallel_applymap(func)

CPU times: user 459 ms, sys: 723 ms, total: 1.18 s
Wall time: 11.5 s


In [24]:
# Sanity check: the parallel result must equal the pandas baseline.
res.equals(res_parallel)

True

# DataFrame.groupby.apply

In [None]:
# Benchmark input: 30 million rows.
#   a -> random integers from np.random.randint(1, 1000, ...); used as the
#        groupby key in this section
#   b -> random floats from np.random.rand
df_size = int(3e7)
df = pd.DataFrame(dict(a=np.random.randint(1, 1000, df_size),
                       b=np.random.rand(df_size)))

In [None]:
def func(df):
    """Return the mean of log10(sqrt(exp(v**2))) over the values of `df.b`.

    `df` is any object whose `b` attribute is iterable and has a length
    (this notebook passes one groupby group at a time). An empty `b`
    raises ZeroDivisionError.
    """
    column_b = df.b
    running_total = 0
    # Explicit accumulation loop: keeps the additions in iteration order.
    for value in column_b:
        running_total = running_total + math.log10(math.sqrt(math.exp(value ** 2)))

    return running_total / len(column_b)

In [None]:
%%time
# Baseline: plain pandas groupby-apply; func receives each group of column "a".
res = df.groupby("a").apply(func)

In [None]:
%%time
# Same groupby-apply through pandarallel's parallel_apply; stored under a
# separate name so it can be compared with the baseline `res`.
res_parallel = df.groupby("a").parallel_apply(func)

In [None]:
# Sanity check: the parallel result must equal the pandas baseline.
res.equals(res_parallel)

# DataFrame.groupby.rolling.apply

In [None]:
# Benchmark input: 1 million rows.
#   a -> random integers from np.random.randint(1, 300, ...), i.e. 1..299;
#        used as the groupby key in this section
#   b -> random floats from np.random.rand, i.e. in [0, 1)
df_size = 1_000_000
df = pd.DataFrame({
    "a": np.random.randint(1, 300, df_size),
    "b": np.random.rand(df_size),
})

In [None]:
def func(x):
    """Combine the first four window values as w0 + w1**2 + w2**3 + w3**4.

    `x` must support positional access through `.iloc` and hold at least
    four elements.
    """
    first, second, third, fourth = x.iloc[0], x.iloc[1], x.iloc[2], x.iloc[3]
    return first + second ** 2 + third ** 3 + fourth ** 4

In [None]:
%%time
# Baseline: plain pandas rolling apply over column b with a window of 4,
# computed within each group of 'a'. raw=False passes each window to func as
# a Series, which func indexes with .iloc.
res = df.groupby('a').b.rolling(4).apply(func, raw=False)

In [None]:
%%time
# Same grouped rolling computation through pandarallel's parallel_apply;
# stored under a separate name so it can be compared with the baseline `res`.
res_parallel = df.groupby('a').b.rolling(4).parallel_apply(func, raw=False)

In [None]:
# Sanity check: the parallel result must equal the pandas baseline.
res.equals(res_parallel)

# Series.map

In [None]:
# Benchmark input: 50 million rows with a single column `a` holding
# np.random.rand values shifted by +1.
df_size = int(5e7)
df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))

In [None]:
def func(x):
    """Return log10(sqrt(exp(x**2))) for a single scalar value."""
    exponential = math.exp(x ** 2)
    root = math.sqrt(exponential)
    return math.log10(root)

In [None]:
%%time
# Baseline: plain pandas Series.map, calling func on every value of column a.
res = df.a.map(func)

In [None]:
%%time
# Same element-wise call through pandarallel's parallel_map; stored under a
# separate name so it can be compared with the baseline `res`.
res_parallel = df.a.parallel_map(func)

In [None]:
# Sanity check: the parallel result must equal the pandas baseline.
res.equals(res_parallel)

# Series.apply

In [None]:
# Benchmark input: 35 million rows with a single column `a` holding
# np.random.rand values shifted by +1.
df_size = int(3.5e7)
df = pd.DataFrame(dict(a=np.random.rand(df_size) + 1))

In [None]:
def func(x, power, bias=0):
    """Return log10(sqrt(exp(x**power))) + bias for a single scalar value.

    Parameters
    ----------
    x : scalar value to transform.
    power : exponent applied to `x` before the exp/sqrt/log10 chain.
    bias : constant added to the result (defaults to 0).
    """
    root = math.sqrt(math.exp(x ** power))
    return math.log10(root) + bias

In [None]:
%%time
# Baseline: plain pandas Series.apply. args=(2,) supplies func's `power`
# positionally and bias=3 is forwarded as a keyword argument.
res = df.a.apply(func, args=(2,), bias=3)

In [None]:
%%time
# Same call (power=2 via args, bias=3 via keyword) through pandarallel's
# parallel_apply; stored separately so it can be compared with `res`.
res_parallel = df.a.parallel_apply(func, args=(2,), bias=3)

In [None]:
# Sanity check: the parallel result must equal the pandas baseline.
res.equals(res_parallel)

# Series.rolling.apply

In [None]:
# Benchmark input: 1 million rows.
#   a -> random integers from np.random.randint(1, 8, ...), i.e. 1..7
#   b -> the consecutive integers 0 .. df_size - 1
df_size = 1_000_000
df = pd.DataFrame({
    "a": np.random.randint(1, 8, df_size),
    "b": list(range(df_size)),
})

In [None]:
def func(x):
    """Return w0 + w1**2 + w2**3 + w3**4 for the first four window values.

    `x` must support positional access through `.iloc` and hold at least
    four elements.
    """
    linear_term = x.iloc[0]
    squared_term = x.iloc[1] ** 2
    cubed_term = x.iloc[2] ** 3
    quartic_term = x.iloc[3] ** 4
    return linear_term + squared_term + cubed_term + quartic_term

In [None]:
%%time
# Baseline: plain pandas rolling apply over column b with a window of 4.
# raw=False passes each window to func as a Series, which func indexes
# with .iloc.
res = df.b.rolling(4).apply(func, raw=False)

In [None]:
%%time
# Same rolling computation through pandarallel's parallel_apply; stored
# under a separate name so it can be compared with the baseline `res`.
res_parallel = df.b.rolling(4).parallel_apply(func, raw=False)

In [None]:
# Sanity check: the parallel result must equal the pandas baseline.
res.equals(res_parallel)