In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from typing import List, Callable, Tuple, Any
from wandb.wandb_run import Run
from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import attr
import pandas as pd
import gc
import os
import wandb
import nest_asyncio

nest_asyncio.apply()

while "freqtrade" not in os.listdir():
    os.chdir("..")
    
from freqtrade.ml.lightning import LightningModule, LightningConfig
from freqtrade.ml.trainer import TradingTrainer
from freqtrade.ml.container import LightningContainer
from freqtrade.ml import loader


from freqtrade.nbtools.helper import free_mem
from freqtrade.nbtools.pairs import PAIRS_HIGHCAP_NONSTABLE

container = None

# Lightning Module

In [41]:
attr.s(repr=False)
class RandomForest(LightningModule):
    """ Template for LightningModule """
        
    def on_configure(self) -> LightningConfig:
        
        # This datetime can be replaced with datetime.now()
        now = datetime(2021, 7, 26)
        
        # Lighting Configuration
        config = LightningConfig(
            
            # Basic info
            name        = "5n20-randomforest",
            timeframe   = "5m",
            exchange    = "binance",
            
            # Train and validation datetime
            trainval_start  = now - timedelta(days=120),
            trainval_end    = now - timedelta(days=60),
            
            # Backtest Optimization datetime
            opt_start = now - timedelta(days=59),
            opt_end   = now - timedelta(days=30),
            
            # Unbiased Backtest datetime
            test_start = now - timedelta(days=29),
            test_end   = now,
        )
        
        # Optional custom config attributes
        config.add_custom("num_epochs", 1000)
        config.add_custom("num_future_candles", 4)
        config.add_custom("num_classification_classes", 3)
        
        return config
        
    def on_get_data_paths(self, cwd: Path, timeframe: str, exchange: str) -> List[Path]:
        path_data_exchange = cwd.parent / "mount" / "data" / exchange

        return [
            datapath
            for datapath in list(path_data_exchange.glob(f"*-{timeframe}.json"))
            if datapath.name.split("-")[0].replace("_", "/")
            in PAIRS_HIGHCAP_NONSTABLE[:5]
        ]
    
    def on_add_features(self, df_onepair: pd.DataFrame) -> pd.DataFrame:
        import talib.abstract as ta

        # Start add features
        spaces = [3, 5, 9, 15, 25, 50, 100, 200]
        for i in spaces:
            df_onepair[f"ml_smadiff_{i}"] = (df_onepair['close'].rolling(i).mean() - df_onepair['close'])
            df_onepair[f"ml_maxdiff_{i}"] = (df_onepair['close'].rolling(i).max() - df_onepair['close'])
            df_onepair[f"ml_mindiff_{i}"] = (df_onepair['close'].rolling(i).min() - df_onepair['close'])
            df_onepair[f"ml_std_{i}"] = df_onepair['close'].rolling(i).std()
            df_onepair[f"ml_ma_{i}"] = df_onepair['close'].pct_change(i).rolling(i).mean()
            df_onepair[f"ml_rsi_{i}"] = ta.RSI(df_onepair["close"], timeperiod=i)

        df_onepair['ml_bop'] = ta.BOP(df_onepair['open'], df_onepair['high'], df_onepair['low'], df_onepair['close'])
        df_onepair["ml_volume_pctchange"] = df_onepair['volume'].pct_change()
        df_onepair['ml_z_score_120'] = ((df_onepair["ml_ma_15"] - df_onepair["ml_ma_15"].rolling(21).mean() + 1e-9) 
                             / (df_onepair["ml_ma_15"].rolling(21).std() + 1e-9))

        return df_onepair
    
    def on_add_labels(self, df_onepair: pd.DataFrame) -> pd.DataFrame:
        # Create labels for classification task
        future_price = df_onepair['close'].shift(-self.config.num_future_candles)
        ml_label = (future_price - df_onepair['close']) / df_onepair['close']
        df_onepair[self.config.column_y] = pd.qcut(ml_label, self.config.num_classification_classes, labels=False)
        return df_onepair
    
    def on_final_processing(self, df_allpairs: pd.DataFrame) -> Tuple[Any, Any, Any, Any]:
        df_allpairs = self._balance_class_dataset(df_allpairs)
        X = df_allpairs[self.config.columns_x]
        y = df_allpairs[self.config.column_y]
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
        return X_train, X_val, y_train, y_val
    
    def _balance_class_dataset(self, df_allpairs: pd.DataFrame) -> pd.DataFrame:
        """Balance num of datas in every class"""
        lengths_every_class = list(df_allpairs.groupby(by=["ml_label"]).count()["date"])
        df_allpairs_copy = pd.DataFrame()

        for classname in df_allpairs["ml_label"].unique():
            minimum_of_all = min(lengths_every_class)
            df_allpairs_copy = df_allpairs_copy.append(df_allpairs.loc[df_allpairs["ml_label"] == classname, :].iloc[:minimum_of_all])

        # Performance improvements
        df_allpairs_copy = df_allpairs_copy.astype(
            {col: "float32" for col in df_allpairs_copy.columns if "float" in str(df_allpairs_copy[col].dtype)}
        )
        free_mem(df_allpairs)
        return df_allpairs_copy
    
    def on_define_model(self, run: Run, X_train, X_val, y_train, y_val) -> Any:
        return RandomForestClassifier(max_depth=2, random_state=0)
    
    def on_start_training(self, run: Run, X_train, X_val, y_train, y_val):
        print("Start Training...")
        self.model: RandomForestClassifier
        self.model.fit(X_train, y_train)
    
    def on_predict(self, df_input_onepair: pd.DataFrame) -> pd.DataFrame:
        df_input_np = df_input_onepair.to_numpy()
        preds = self.model.predict_proba(df_input_np)
        df_preds = pd.DataFrame(preds)
        
        return df_preds
    
    def on_training_step(self, run: Run, data: dict):
        raise NotImplementedError()
        

# Start Training

In [42]:
module = RandomForest()

with wandb.init(project=module.config.name) as run:
    trainer = TradingTrainer()
    container = trainer.fit(module, run)

[34m[1mwandb[0m: wandb version 0.11.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Start Training...


[34m[1mwandb[0m: Adding directory to artifact (C:\CS\Python\crypto-collection\freqtrade\superft\.temp\lightning_5n20-randomforest_2021-07-31_19-58-16)... Done. 0.0s


VBox(children=(Label(value=' 0.26MB of 0.26MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

# Dataset EDA

In [5]:
if container is None:
    container = LightningContainer(module)
    
df = container._load_df_allpairs()
df.head()

Unnamed: 0,date,open,high,low,close,volume,pair,ml_smadiff_3,ml_maxdiff_3,ml_mindiff_3,...,ml_smadiff_200,ml_maxdiff_200,ml_mindiff_200,ml_std_200,ml_ma_200,ml_rsi_200,ml_bop,ml_volume_pctchange,ml_z_score_120,ml_label
309247,2021-03-29 09:15:00,1.21278,1.21788,1.21278,1.21585,1671711.75,ADA/USDT,-0.002193,0.0,-0.00353,...,-0.028321,0.0,-0.04285,0.00941,-0.00097,54.502419,0.601959,0.219201,-0.906313,2.0
309248,2021-03-29 09:20:00,1.21579,1.2241,1.21566,1.22308,2904280.75,ADA/USDT,-0.005837,0.0,-0.01028,...,-0.035375,0.0,-0.05008,0.00974,-0.000881,55.407448,0.863743,0.73731,-0.800306,2.0
309249,2021-03-29 09:25:00,1.22308,1.22446,1.21774,1.2235,1897667.625,ADA/USDT,-0.00269,0.0,-0.00765,...,-0.035606,0.0,-0.0505,0.010062,-0.000787,55.459171,0.062497,-0.346596,-0.575933,1.0
309250,2021-03-29 09:30:00,1.22349,1.22562,1.21966,1.22029,2518210.0,ADA/USDT,0.002,0.00321,0.0,...,-0.032221,0.00321,-0.04729,0.010318,-0.000691,54.969372,-0.536923,0.327003,-0.309143,2.0
309251,2021-03-29 09:35:00,1.2203,1.226,1.21866,1.22575,1680297.625,ADA/USDT,-0.00257,0.0,-0.00546,...,-0.037479,0.0,-0.05275,0.010654,-0.000565,55.639103,0.742513,-0.332741,0.260539,0.0


# Model EDA

In [43]:
eth_usdt = df.loc[df["pair"] == "ETH/USDT"]
eth_usdt = eth_usdt.drop(columns=["pair"])
eth_usdt.head()

Unnamed: 0,date,open,high,low,close,volume,ml_smadiff_3,ml_maxdiff_3,ml_mindiff_3,ml_std_3,...,ml_smadiff_200,ml_maxdiff_200,ml_mindiff_200,ml_std_200,ml_ma_200,ml_rsi_200,ml_bop,ml_volume_pctchange,ml_z_score_120,ml_label
378700,2021-03-29 09:15:00,1744.349976,1746.98999,1741.359985,1742.030029,3171.518066,0.049967,2.319946,-2.170044,2.245412,...,-51.149723,2.319946,-76.23999,14.813927,-0.010366,54.862041,-0.412068,-0.358517,2.698296,2.0
378701,2021-03-29 09:20:00,1741.660034,1746.719971,1741.199951,1743.76001,2513.226074,-0.380005,0.589966,-1.72998,1.205753,...,-52.688755,0.589966,-77.969971,15.244114,-0.010228,55.047752,0.380429,-0.207564,2.627061,2.0
378702,2021-03-29 09:25:00,1743.77002,1749.780029,1742.52002,1749.349976,3718.205322,-4.303304,0.0,-7.319946,3.825837,...,-58.053169,0.0,-83.559937,15.764766,-0.010089,55.640442,0.768588,0.479455,2.547096,2.0
378703,2021-03-29 09:30:00,1749.359985,1750.47998,1744.060059,1744.780029,4408.751465,1.183309,4.569946,-1.02002,2.976927,...,-53.274624,4.569946,-78.98999,16.191444,-0.009945,55.04414,-0.713398,0.18572,2.411457,2.0
378704,2021-03-29 09:35:00,1744.780029,1755.569946,1744.589966,1752.040039,5304.447754,-3.316691,0.0,-7.26001,3.670345,...,-60.297234,0.0,-86.25,16.723162,-0.009787,55.800442,0.661204,0.203163,2.352111,2.0


In [44]:
eth_usdt_with_preds = container.predict(eth_usdt)
eth_usdt_with_preds.head()

Not dropping X columns
Returned columns from df_preds: [0, 1, 2]


Unnamed: 0,date,open,high,low,close,volume,ml_smadiff_3,ml_maxdiff_3,ml_mindiff_3,ml_std_3,...,ml_std_200,ml_ma_200,ml_rsi_200,ml_bop,ml_volume_pctchange,ml_z_score_120,ml_label,0,1,2
378700,2021-03-29 09:15:00,1744.349976,1746.98999,1741.359985,1742.030029,3171.518066,0.049967,2.319946,-2.170044,2.245412,...,14.813927,-0.010366,54.862041,-0.412068,-0.358517,2.698296,2.0,,,
378701,2021-03-29 09:20:00,1741.660034,1746.719971,1741.199951,1743.76001,2513.226074,-0.380005,0.589966,-1.72998,1.205753,...,15.244114,-0.010228,55.047752,0.380429,-0.207564,2.627061,2.0,,,
378702,2021-03-29 09:25:00,1743.77002,1749.780029,1742.52002,1749.349976,3718.205322,-4.303304,0.0,-7.319946,3.825837,...,15.764766,-0.010089,55.640442,0.768588,0.479455,2.547096,2.0,,,
378703,2021-03-29 09:30:00,1749.359985,1750.47998,1744.060059,1744.780029,4408.751465,1.183309,4.569946,-1.02002,2.976927,...,16.191444,-0.009945,55.04414,-0.713398,0.18572,2.411457,2.0,,,
378704,2021-03-29 09:35:00,1744.780029,1755.569946,1744.589966,1752.040039,5304.447754,-3.316691,0.0,-7.26001,3.670345,...,16.723162,-0.009787,55.800442,0.661204,0.203163,2.352111,2.0,,,


In [49]:
eth_usdt_np = eth_usdt[container.module.config.columns_x]
eth_usdt_np = eth_usdt_np.to_numpy()
preds = container.module.model.predict_proba(eth_usdt_np)
preds = pd.DataFrame(preds)
preds.head()

# for pred_col in preds.columns:
#     if pred_col not in eth_usdt.columns:
#         eth_usdt[pred_col] = preds[pred_col]
        
# eth_usdt.head()

# TODO: Fix Index that makes casting predictions to NaN
# TODO: Fix Index that makes casting predictions to NaN
# TODO: Fix Index that makes casting predictions to NaN

Unnamed: 0,0,1,2
0,0.338233,0.338225,0.323542
1,0.338233,0.338225,0.323542
2,0.339165,0.33769,0.323145
3,0.338233,0.338225,0.323542
4,0.339165,0.33769,0.323145
