In [1]:
from catboost.datasets import titanic
from catboost import CatBoostClassifier, Pool, metrics, cv
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import os

# Walk up from the current working directory until a directory that contains
# a "freqtrade" entry is reached (presumably so the `freqtrade` package
# imported right after this loop resolves).
start_dir = os.getcwd()
while "freqtrade" not in os.listdir():
    current_dir = os.getcwd()
    os.chdir("..")
    if os.getcwd() == current_dir:
        # At the filesystem root ".." is a no-op, so without this guard the
        # loop would spin forever. Restore the original cwd and fail loudly.
        os.chdir(start_dir)
        raise FileNotFoundError(
            f"No 'freqtrade' entry found in {start_dir!r} or any parent directory"
        )

from freqtrade.nbtools.remote_utils import load_lightning_container
import numpy as np

In [30]:
from joblib import Parallel, delayed, parallel_backend
import pandas as pd

# Synthetic benchmark data: ROWS_PER_PAIR random "close" values for each pair.
PAIRS = ["BTC/USDT", "ETH/USDT", "BNB/USDT", "ADA/USDT", "BCH/USDT", "DAI/USDT", "ETC/USDT"]
ROWS_PER_PAIR = 100000

# Build every per-pair frame first and concatenate once: growing a frame with
# pd.concat inside the loop re-copies all previous rows on each iteration, and
# seeding it with an empty DataFrame relies on deprecated concat behaviour.
# Each per-pair frame keeps its own 0..ROWS_PER_PAIR-1 index, so index labels
# repeat across pairs in the combined frame.
df = pd.concat(
    [
        pd.DataFrame({"pair": pair, "close": np.random.randn(ROWS_PER_PAIR)})
        for pair in PAIRS
    ],
    axis=0,
)

df["pair"].unique()

array(['BTC/USDT', 'ETH/USDT', 'BNB/USDT', 'ADA/USDT', 'BCH/USDT',
       'DAI/USDT', 'ETC/USDT'], dtype=object)

In [21]:
def df_processing_function(df_inp):
    """Return ``df_inp`` extended with rolling means of its ``close`` column.

    Adds the columns ``sma_0`` .. ``sma_99``, where ``sma_i`` is
    ``df_inp["close"].rolling(i).mean()``. ``sma_0`` is kept so the column set
    is unchanged; it shows up as all-NaN in the notebook's displayed output.

    The input frame is NOT modified: a new DataFrame is returned. This avoids
    writing into a boolean-mask slice of another frame (SettingWithCopyWarning)
    and avoids inserting 100 columns one at a time, which fragments the frame.

    Parameters
    ----------
    df_inp : pandas.DataFrame
        Frame with a numeric ``close`` column.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by ``sma_0`` .. ``sma_99``. Existing
        ``sma_*`` columns with the same names are replaced, not duplicated.
    """
    close = df_inp["close"]
    # Build all SMA columns in one shot; the dict keys become the column names.
    sma = pd.concat(
        {f"sma_{i}": close.rolling(i).mean() for i in range(100)},
        axis=1,
    )
    # Drop stale SMA columns so re-processing an already processed frame
    # overwrites them instead of producing duplicate column labels.
    base = df_inp.drop(columns=sma.columns, errors="ignore")
    return pd.concat([base, sma], axis=1)

In [31]:
# Vanilla
def vanilla(df_input, func, split_column: str):
    """Apply ``func`` to each ``split_column`` group sequentially and re-join.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame to process.
    func : callable
        Takes one group (a DataFrame) and returns a DataFrame.
    split_column : str
        Column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        The per-group results concatenated in order of each group's first
        appearance in ``df_input``.
    """
    # A single groupby pass replaces one full boolean-mask scan per category;
    # sort=False yields groups in order of first appearance, like unique().
    pieces = [func(group) for _, group in df_input.groupby(split_column, sort=False)]
    if not pieces:
        # pd.concat raises ValueError on an empty list; run func on an empty
        # slice instead so the result still has the processed column layout.
        return func(df_input.iloc[:0])
    return pd.concat(pieces)

%timeit vanilla(df, df_processing_function, "pair")

3.01 s ± 58.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# Parallel

def parallelize_dataframe(df_input, func, split_column: str, n_cores=4):
    """Apply ``func`` to each ``split_column`` group via joblib and re-join.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame to process.
    func : callable
        Takes one group (a DataFrame) and returns a DataFrame.
    split_column : str
        Column whose distinct values define the groups.
    n_cores : int, default 4
        Passed to ``Parallel`` as ``n_jobs``.

    Returns
    -------
    pandas.DataFrame
        The per-group results concatenated in order of each group's first
        appearance in ``df_input``.
    """
    # A single groupby pass replaces one full boolean-mask scan per category;
    # sort=False yields groups in order of first appearance, like unique().
    groups = [group for _, group in df_input.groupby(split_column, sort=False)]
    if not groups:
        # Nothing to dispatch, and pd.concat raises ValueError on an empty
        # list; run func on an empty slice so the column layout is preserved.
        return func(df_input.iloc[:0])
    # One delayed call per group; the results list follows the order of `groups`.
    result = Parallel(n_jobs=n_cores)(delayed(func)(group) for group in groups)
    return pd.concat(result)

%timeit parallelize_dataframe(df, df_processing_function, "pair", n_cores=8)

315 ms ± 83.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# Sequential run over every pair; the bare name on the last line makes the
# notebook render the resulting frame.
df_vanilla = vanilla(
    df_input=df,
    func=df_processing_function,
    split_column="pair",
)
df_vanilla

Unnamed: 0,pair,close,close.1,close.2,close.3,close.4,close.5,close.6,close.7,close.8,...,close.9,close.10,close.11,close.12,close.13,close.14,close.15,close.16,close.17,close.18
0,BTC/USDT,0.600912,,0.600912,,,,,,,...,,,,,,,,,,
1,BTC/USDT,0.074778,,0.074778,0.337845,,,,,,...,,,,,,,,,,
2,BTC/USDT,-0.385818,,-0.385818,-0.155520,0.096624,,,,,...,,,,,,,,,,
3,BTC/USDT,0.428992,,0.428992,0.021587,0.039318,0.179716,,,,...,,,,,,,,,,
4,BTC/USDT,0.016016,,0.016016,0.222504,0.019730,0.033492,0.146976,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ETC/USDT,0.555811,,0.555811,-0.970215,-0.937351,-0.885973,-0.679484,-0.730623,-0.532570,...,-0.113314,-0.107996,-0.106293,-0.102101,-0.110046,-0.116144,-0.103852,-0.083085,-0.080169,-0.079988
9996,ETC/USDT,-1.554297,,-1.554297,-0.499243,-1.164909,-1.091587,-1.019638,-0.825286,-0.848291,...,-0.114716,-0.129149,-0.123717,-0.121863,-0.117549,-0.125248,-0.131125,-0.118805,-0.098097,-0.095059
9997,ETC/USDT,0.186813,,0.186813,-0.683742,-0.270558,-0.826978,-0.835907,-0.818563,-0.680701,...,-0.138686,-0.111403,-0.125714,-0.120378,-0.118579,-0.114346,-0.121998,-0.127847,-0.115687,-0.095219
9998,ETC/USDT,-1.273444,,-1.273444,-0.543315,-0.880309,-0.521279,-0.916272,-0.908830,-0.883546,...,-0.140011,-0.151155,-0.124034,-0.138056,-0.132645,-0.130736,-0.126420,-0.133868,-0.139537,-0.127381


In [9]:
# joblib-backed run over every pair with 4 jobs; the bare name on the last
# line makes the notebook render the resulting frame.
df_parallel = parallelize_dataframe(
    df_input=df,
    func=df_processing_function,
    split_column="pair",
    n_cores=4,
)
df_parallel

Unnamed: 0,pair,close,sma_0,sma_1,sma_2,sma_3,sma_4,sma_5,sma_6,sma_7,...,sma_90,sma_91,sma_92,sma_93,sma_94,sma_95,sma_96,sma_97,sma_98,sma_99
0,BTC/USDT,0.079422,,0.079422,,,,,,,...,,,,,,,,,,
1,BTC/USDT,-0.296388,,-0.296388,-0.108483,,,,,,...,,,,,,,,,,
2,BTC/USDT,0.289599,,0.289599,-0.003394,0.024211,,,,,...,,,,,,,,,,
3,BTC/USDT,-1.855582,,-1.855582,-0.782991,-0.620790,-0.445737,,,,...,,,,,,,,,,
4,BTC/USDT,1.306838,,1.306838,-0.274372,-0.086382,-0.138883,-0.095222,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,ETC/USDT,0.338796,,0.338796,0.509162,0.215276,-0.067891,-0.063915,0.016781,-0.219624,...,0.011726,0.005405,0.009336,-0.007925,-0.006938,-0.009997,0.001921,-0.003457,-0.009167,-0.002611
99996,ETC/USDT,-0.407306,,-0.407306,-0.034255,0.203673,0.059630,-0.135774,-0.121146,-0.043803,...,0.020314,0.007122,0.000919,0.004856,-0.012174,-0.011152,-0.014136,-0.002297,-0.007577,-0.013188
99997,ETC/USDT,-0.347350,,-0.347350,-0.377328,-0.138620,0.065917,-0.021766,-0.171036,-0.153461,...,0.029806,0.016274,0.003269,-0.002826,0.001109,-0.015702,-0.014654,-0.017571,-0.005818,-0.011010
99998,ETC/USDT,0.441203,,0.441203,0.046927,-0.104484,0.006336,0.140974,0.055396,-0.083574,...,0.020169,0.034327,0.020892,0.007978,0.001898,0.005741,-0.010943,-0.009955,-0.012890,-0.001303


In [29]:
import time

def done():
    """Sleep for one second, then print a completion message.

    Only referenced from the commented-out ``Parallel`` call that follows it
    in this cell.
    """
    pause_seconds = 1
    time.sleep(pause_seconds)
    print("I'm done!")

# Parallel(n_jobs=10)(delayed(done)() for i in range(10))

# One frame per pair, produced by a single groupby pass instead of one full
# boolean-mask scan per pair; sort=False keeps pairs in first-appearance order.
df_split = [pair_frame for _, pair_frame in df.groupby("pair", sort=False)]

%timeit Parallel(n_jobs=8, prefer="threads")(delayed(df_processing_function)(_df) for _df in df_split)

369 ms ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
import pandas as pd
import qgrid

In [4]:
# Three-row demo frame for the grid widget; note that ages are stored as
# strings, not integers.
df = pd.DataFrame(
    [
        ("Zara", "29"),
        ("Ann", "31"),
        ("Amel", "25"),
    ],
    columns=["name", "age"],
)

df

Unnamed: 0,name,age
0,Zara,29
1,Ann,31
2,Amel,25


In [6]:
# Pass the demo frame to qgrid's show_grid; as the cell's last expression,
# the value it returns is what the notebook renders.
qgrid.show_grid(df)