In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from typing import List, Callable, Tuple, Any
from wandb.wandb_run import Run
from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

import attr
import pandas as pd
import gc
import os
import wandb
import nest_asyncio
import logging
import sys
import numpy as np

# Patch asyncio so event loops can be nested inside the already-running
# Jupyter loop.
nest_asyncio.apply()

# Walk up from the notebook's directory until the working directory contains a
# "freqtrade" entry (presumably so the ``freqtrade`` imports below resolve).
# Stop with an error at the filesystem root instead of looping forever:
# ``os.chdir("..")`` is a no-op there.
while "freqtrade" not in os.listdir():
    if Path.cwd().parent == Path.cwd():
        raise FileNotFoundError(
            'Could not find a parent directory containing "freqtrade".'
        )
    os.chdir("..")

# Mirror "freqtrade.ml" log records to the notebook's stdout. This runs whether
# or not the directory had to change, and re-running the cell does not stack
# duplicate handlers.
logger = logging.getLogger("freqtrade.ml")
if not any(
    isinstance(existing, logging.StreamHandler)
    and getattr(existing, "stream", None) is sys.stdout
    for existing in logger.handlers
):
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(logging.Formatter("%(name)s - %(message)s"))
    logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
    
from freqtrade.ml.lightning import LightningModule, LightningConfig
from freqtrade.ml.trainer import TradingTrainer
from freqtrade.ml.container import LightningContainer
from freqtrade.ml import loader

from freqtrade.nbtools.helper import free_mem
from freqtrade.nbtools.pairs import PAIRS_HIGHCAP_NONSTABLE

# Pick up the container left over from a previous run of this notebook, if any.
# On a fresh kernel the name does not exist yet and this yields None.
container = globals().get("container")

if container is not None:
    print("Deleting container")
    free_mem(container)
    # Later cells test ``container is None`` to decide whether to rebuild it.
    container = None

# Last expression of the cell: shows the number of unreachable objects found.
gc.collect()

0

# Lightning Module

In [10]:
# NOTE(review): the next line is a bare expression, not a decorator. It builds
# an attrs class decorator and discards it, so it has no effect on ``CatBoost``.
# It was probably meant to be ``@attr.s(repr=False)``; it is left unchanged
# because applying it would alter the class (generated __init__/__eq__).
# Confirm against LightningModule before changing.
attr.s(repr=False)
class CatBoost(LightningModule):
    """LightningModule that trains a CatBoost classifier on per-pair candle features.

    Each ``on_*`` method is a hook of the project's ``LightningModule``; when
    each one is invoked is decided by the trainer/container and is not visible
    in this notebook.
    """

    def on_configure(self) -> LightningConfig:
        """Build the run configuration: identity, date windows and custom settings.

        Returns:
            The populated ``LightningConfig``.
        """
        # Fixed anchor so the date windows below stay reproducible.
        # This datetime can be replaced with datetime.now()
        now = datetime(2021, 8, 10)

        # Lightning Configuration
        config = LightningConfig(

            # Basic info
            name        = "5n20-catboosttest",
            timeframe   = "5m",
            exchange    = "binance",

            # Train and validation datetime
            trainval_start  = now - timedelta(days=120),
            trainval_end    = now - timedelta(days=60),

            # Backtest Optimization datetime
            opt_start = now - timedelta(days=59),
            opt_end   = now - timedelta(days=30),

            # Unbiased Backtest datetime
            test_start = now - timedelta(days=29),
            test_end   = now,
        )

        # Optional custom config attributes (read back as ``self.config.<name>``)
        config.add_custom("num_epochs", 10000)
        config.add_custom("num_future_candles", 4)
        config.add_custom("num_classification_classes", 3)

        return config

    def on_get_data_paths(self, cwd: Path, timeframe: str, exchange: str) -> List[Path]:
        """Select the candle files to load.

        Args:
            cwd: Base directory; data is looked up under ``cwd.parent/mount/data``.
            timeframe: Candle timeframe used in the file name suffix (e.g. ``"5m"``).
            exchange: Exchange sub-directory name.

        Returns:
            Paths of ``*-{timeframe}.json`` files whose pair (file-name prefix
            with ``_`` replaced by ``/``) is among the first five entries of
            ``PAIRS_HIGHCAP_NONSTABLE``.
        """
        path_data_exchange = cwd.parent / "mount" / "data" / exchange
        wanted_pairs = PAIRS_HIGHCAP_NONSTABLE[:5]

        return [
            datapath
            for datapath in path_data_exchange.glob(f"*-{timeframe}.json")
            if datapath.name.split("-")[0].replace("_", "/") in wanted_pairs
        ]

    def on_add_features(self, df_onepair: pd.DataFrame) -> pd.DataFrame:
        """Add the ``ml_*`` feature columns to one pair's candle frame.

        The frame is modified in place and also returned. Rolling features are
        NaN for the first rows of each window.

        Args:
            df_onepair: Candles of a single pair; the code reads the ``open``,
                ``high``, ``low``, ``close`` and ``volume`` columns.

        Returns:
            The same frame with the feature columns added.
        """
        # Local import kept from the original cell.
        import talib.abstract as ta

        close = df_onepair["close"]

        # Start add features: one set of rolling statistics per window length
        spaces = [3, 5, 9, 15, 25, 50, 100, 200]
        for i in spaces:
            df_onepair[f"ml_smadiff_{i}"] = close.rolling(i).mean() - close
            df_onepair[f"ml_maxdiff_{i}"] = close.rolling(i).max() - close
            df_onepair[f"ml_mindiff_{i}"] = close.rolling(i).min() - close
            df_onepair[f"ml_std_{i}"] = close.rolling(i).std()
            df_onepair[f"ml_ma_{i}"] = close.pct_change(i).rolling(i).mean()
            df_onepair[f"ml_rsi_{i}"] = ta.RSI(close, timeperiod=i)

        df_onepair["ml_bop"] = ta.BOP(
            df_onepair["open"], df_onepair["high"], df_onepair["low"], df_onepair["close"]
        )
        df_onepair["ml_volume_pctchange"] = df_onepair["volume"].pct_change()

        # Z-score of ml_ma_15 over a 21-row window; the 1e-9 terms avoid a zero
        # denominator.
        # NOTE(review): the column is named "..._120" although the window is 21.
        # The name is kept because it is part of the feature set.
        ma_15 = df_onepair["ml_ma_15"]
        df_onepair["ml_z_score_120"] = (
            (ma_15 - ma_15.rolling(21).mean() + 1e-9)
            / (ma_15.rolling(21).std() + 1e-9)
        )

        return df_onepair

    def on_add_labels(self, df_onepair: pd.DataFrame) -> pd.DataFrame:
        """Add the classification label column ``self.config.column_y``.

        The label is the quantile bin (``num_classification_classes`` bins,
        integer-coded) of the relative change in ``close`` after
        ``num_future_candles`` rows. The last ``num_future_candles`` rows have
        no future price and get a NaN label.
        """
        # Create labels for classification task
        future_price = df_onepair["close"].shift(-self.config.num_future_candles)
        ml_label = (future_price - df_onepair["close"]) / df_onepair["close"]
        df_onepair[self.config.column_y] = pd.qcut(
            ml_label, self.config.num_classification_classes, labels=False
        )
        return df_onepair

    def on_final_processing(self, df_allpairs: pd.DataFrame) -> Tuple[Any, Any, Any, Any]:
        """Balance the classes and split into train/validation sets.

        Returns:
            ``(X_train, X_val, y_train, y_val)`` from an 80/20 split.
        """
        df_allpairs = self._balance_class_dataset(df_allpairs)
        X = df_allpairs[self.config.columns_x]
        y = df_allpairs[self.config.column_y]
        # NOTE(review): train_test_split shuffles by default, so validation rows
        # are interleaved in time with training rows. Confirm this is intended.
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)
        return X_train, X_val, y_train, y_val

    def _balance_class_dataset(self, df_allpairs: pd.DataFrame) -> pd.DataFrame:
        """Balance the number of rows in every class.

        Keeps the first ``n`` rows of each label value, where ``n`` is the size
        of the smallest class, then casts float columns to ``float32``. Rows
        with a NaN label are dropped. The original index is preserved.

        Args:
            df_allpairs: Frame containing the label column
                ``self.config.column_y``; it is handed to ``free_mem`` before
                returning.

        Returns:
            A new, class-balanced frame.
        """
        labels = df_allpairs[self.config.column_y]
        # value_counts() ignores NaN labels.
        class_sizes = labels.value_counts()

        if class_sizes.empty:
            # No labelled rows: return an empty frame with the same columns.
            balanced = df_allpairs.iloc[:0].copy()
        else:
            rows_per_class = int(class_sizes.min())
            # Collect every per-class slice first and concatenate once.
            balanced = pd.concat(
                [
                    df_allpairs.loc[labels == classname].iloc[:rows_per_class]
                    for classname in labels.dropna().unique()
                ]
            )

        # Performance improvements: halve the footprint of float columns
        balanced = balanced.astype(
            {col: "float32" for col in balanced.columns if "float" in str(balanced[col].dtype)}
        )
        free_mem(df_allpairs)
        return balanced

    def on_define_model(self, run: Run, X_train, X_val, y_train, y_val) -> Any:
        """Create the untrained CatBoost classifier (CPU, ``num_epochs`` iterations)."""
        return CatBoostClassifier(
            iterations=self.config.num_epochs,
            task_type="CPU",
        )

    def on_start_training(self, run: Run, X_train, X_val, y_train, y_val):
        """Fit ``self.model`` with the validation set as eval set and print its accuracy."""
        print("Start Training...")
        # Annotation only (no assignment): ``self.model`` is set outside this method.
        self.model: CatBoostClassifier
        self.model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)]
        )
        print("Accuracy: %.2f" % self.model.score(X_val, y_val))

    def on_predict(self, df_input_onepair: pd.DataFrame) -> pd.DataFrame:
        """Return class probabilities for the given feature rows.

        Returns:
            A frame with one column per class and a fresh 0..n-1 index.
        """
        df_input_np = df_input_onepair.to_numpy()
        preds = self.model.predict_proba(df_input_np)
        df_preds = pd.DataFrame(preds)

        return df_preds

    def on_training_step(self, run: Run, data: dict):
        """Not used by this module; always raises ``NotImplementedError``."""
        raise NotImplementedError()


module = CatBoost()

# Dataset Inspector Before Training

In [5]:
# Build the combined all-pairs dataset through a throwaway container.
df_ = LightningContainer(module)._load_df_allpairs()
# Keep the timestamps in a separate global: dataframe_statistics() reads
# `dates`, and the selection below presumably does not include "date".
dates = df_["date"]
# Narrow the frame to the model inputs plus the label column.
df_ = df_[module.config.columns_x + [module.config.column_y]]

In [6]:
def dataframe_statistics(df_: pd.DataFrame, dates: pd.Series = None):
    """Print a data-quality report for ``df_``.

    Reports the number of rows and the columns containing NaN or +/-inf, the
    columns whose dtype is not ``float32``, the row count and the date range.

    Args:
        df_: Frame to inspect. Only numeric columns are checked for infinities.
        dates: Timestamps used for the date range. When omitted, the frame's
            own ``"date"`` column is used if present; otherwise the
            notebook-level ``dates`` variable is used if it exists.
    """
    # Detect nan, infinity, and too large values in dataset
    # Infinity checks only make sense for numeric columns (not dates/strings).
    numeric = df_.select_dtypes(include=[np.number])
    inf_mask = numeric.isin([np.inf, -np.inf])

    print(f"N rows na: {df_.isna().any(axis=1).sum()}")
    # Count only rows that really contain an infinity (NaN rows are reported above).
    print(f"N rows inf: {inf_mask.any(axis=1).sum()}")
    print(f"Cols na: {df_.columns[df_.isna().any()].tolist()}")
    print(f"Cols inf: {inf_mask.columns[inf_mask.any()].tolist()}")

    # Detect columns with irregular datatypes
    supported_datatypes = ["float32"]
    irregular = [
        (col, df_[col].dtype)
        for col in df_.columns
        if df_[col].dtype not in supported_datatypes
    ]
    print(f"Irregular (col, dtypes): {irregular}")

    # Dataset statistics
    print("Dataset Length: %s" % len(df_))

    # Prefer dates that belong to the inspected frame over the notebook global.
    if dates is None:
        if "date" in df_.columns:
            dates = df_["date"]
        else:
            dates = globals().get("dates")

    if dates is None:
        print("Date Min: n/a")
        print("Date Max: n/a")
    else:
        print("Date Min: %s" % dates.min())
        print("Date Max: %s" % dates.max())
    
# Print the data-quality report for the training frame, then hand the frame to
# free_mem (project helper; presumably releases its memory).
dataframe_statistics(df_)
free_mem(df_)

N rows na: 0
N rows inf: 0
Cols na: []
Cols inf: []
Irregular (col, dtypes): []
Dataset Length: 83960
Date Min: 2021-04-13 09:15:00+00:00
Date Max: 2021-06-10 23:40:00+00:00


# Start Training

In [11]:
# Train inside a Weights & Biases run named after the module's config; the
# context manager closes the run when the block exits.
with wandb.init(project=module.config.name) as run:
    trainer = TradingTrainer()
    # NOTE(review): the meaning of the positional `True` is not visible in this
    # notebook. Confirm against TradingTrainer.fit.
    container = trainer.fit(module, run, True)



Start Training...
Learning rate set to 0.048738
0:	learn: 1.0960390	test: 1.0962408	best: 1.0962408 (0)	total: 44.4ms	remaining: 7m 24s
1:	learn: 1.0936807	test: 1.0940452	best: 1.0940452 (1)	total: 67.1ms	remaining: 5m 35s
2:	learn: 1.0915098	test: 1.0920524	best: 1.0920524 (2)	total: 88.5ms	remaining: 4m 54s
3:	learn: 1.0895224	test: 1.0902904	best: 1.0902904 (3)	total: 109ms	remaining: 4m 33s
4:	learn: 1.0874784	test: 1.0884345	best: 1.0884345 (4)	total: 131ms	remaining: 4m 20s
5:	learn: 1.0856031	test: 1.0867683	best: 1.0867683 (5)	total: 153ms	remaining: 4m 14s
6:	learn: 1.0839840	test: 1.0852979	best: 1.0852979 (6)	total: 174ms	remaining: 4m 8s
7:	learn: 1.0822596	test: 1.0837447	best: 1.0837447 (7)	total: 195ms	remaining: 4m 3s
8:	learn: 1.0805526	test: 1.0821898	best: 1.0821898 (8)	total: 216ms	remaining: 4m
9:	learn: 1.0789602	test: 1.0807580	best: 1.0807580 (9)	total: 238ms	remaining: 3m 57s
10:	learn: 1.0775403	test: 1.0794109	best: 1.0794109 (10)	total: 260ms	remaining: 3m 

[34m[1mwandb[0m: Adding directory to artifact (C:\CS\Python\crypto-collection\freqtrade\superft\.temp\lightning_5n20-catboosttest_2021-08-14_19-55-05)... Done. 0.1s


VBox(children=(Label(value=' 21.18MB of 21.18MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

# Model EDA

In [12]:
# Validate the prediction function by simulating freqtrade data being fed into it
df_with_preds = trainer.validate_predict(container)
# Last expression of the cell: display the frame with the prediction columns.
df_with_preds

freqtrade.ml.container - Not dropping X columns in predict because it doesn't exist in predict columns


INFO:freqtrade.ml.container:Not dropping X columns in predict because it doesn't exist in predict columns


freqtrade.ml.container - Returned new columns from df_preds: ['ml_0', 'ml_1', 'ml_2']


INFO:freqtrade.ml.container:Returned new columns from df_preds: ['ml_0', 'ml_1', 'ml_2']



Dataset: Binance BTC/USDT 5m loc[410000:414000] (Freqtrade Regularized)


DF WITH PREDICTIONS INFO
----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4001 entries, 410000 to 414000
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype              
---  ------  --------------  -----              
 0   date    4001 non-null   datetime64[ns, UTC]
 1   open    4001 non-null   float64            
 2   high    4001 non-null   float64            
 3   low     4001 non-null   float64            
 4   close   4001 non-null   float64            
 5   volume  4001 non-null   float64            
 6   ml_0    3602 non-null   float64            
 7   ml_1    3602 non-null   float64            
 8   ml_2    3602 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(8)
memory usage: 281.4 KB
None


Original DF
----------
                            date      open      high       low     close  \
410000 2021-07-10 18:40:00+00:00  33434.42  33446.42  33390.76  3

Unnamed: 0,date,open,high,low,close,volume,ml_0,ml_1,ml_2
410000,2021-07-10 18:40:00+00:00,33434.42,33446.42,33390.76,33402.09,87.119679,,,
410001,2021-07-10 18:45:00+00:00,33401.41,33432.15,33401.40,33416.81,44.021276,,,
410002,2021-07-10 18:50:00+00:00,33416.81,33437.78,33413.39,33421.01,47.821799,,,
410003,2021-07-10 18:55:00+00:00,33421.01,33468.98,33420.01,33466.71,38.742812,,,
410004,2021-07-10 19:00:00+00:00,33466.71,33544.00,33448.99,33535.73,147.871811,,,
...,...,...,...,...,...,...,...,...,...
413996,2021-07-24 15:40:00+00:00,33871.76,33873.99,33816.98,33843.32,212.499511,0.292625,0.503711,0.203664
413997,2021-07-24 15:45:00+00:00,33843.31,33874.87,33817.00,33874.86,104.111770,0.278904,0.530893,0.190203
413998,2021-07-24 15:50:00+00:00,33874.86,33899.61,33854.21,33898.25,63.273788,0.290635,0.412389,0.296977
413999,2021-07-24 15:55:00+00:00,33898.25,33898.25,33843.41,33847.51,139.420180,0.346612,0.363772,0.289617


In [106]:
# Print the NaN / inf / dtype report for the frame that carries the predictions.
dataframe_statistics(df_with_preds)

Unnamed: 0,date,open,high,low,close,volume,ml_0,ml_1,ml_2
410395,2021-07-12 03:35:00+00:00,34375.05,34441.49,34370.35,34403.54,84.704248,,,
410396,2021-07-12 03:40:00+00:00,34403.97,34469.0,34400.04,34441.52,94.008576,,,
410397,2021-07-12 03:45:00+00:00,34441.53,34488.38,34408.24,34478.09,119.304031,,,
410398,2021-07-12 03:50:00+00:00,34478.09,34532.46,34476.05,34506.52,157.73811,,,
410399,2021-07-12 03:55:00+00:00,34509.54,34512.87,34438.53,34447.92,69.374371,0.325801,0.357249,0.31695


# Dataset EDA

In [4]:
# Reuse the container produced by training when there is one; otherwise build
# a new one from the module.
if container is None:
    container = LightningContainer(module)
    
# Load the combined all-pairs frame and preview its first rows.
df = container._load_df_allpairs()
df.head()

Unnamed: 0,date,open,high,low,close,volume,pair,ml_smadiff_3,ml_maxdiff_3,ml_mindiff_3,...,ml_smadiff_200,ml_maxdiff_200,ml_mindiff_200,ml_std_200,ml_ma_200,ml_rsi_200,ml_bop,ml_volume_pctchange,ml_z_score_120,ml_label
309247,2021-03-29 09:15:00,1.21278,1.21788,1.21278,1.21585,1671711.75,ADA/USDT,-0.002193,0.0,-0.00353,...,-0.028321,0.0,-0.04285,0.00941,-0.00097,54.502419,0.601959,0.219201,-0.906313,2.0
309248,2021-03-29 09:20:00,1.21579,1.2241,1.21566,1.22308,2904280.75,ADA/USDT,-0.005837,0.0,-0.01028,...,-0.035375,0.0,-0.05008,0.00974,-0.000881,55.407448,0.863743,0.73731,-0.800306,2.0
309249,2021-03-29 09:25:00,1.22308,1.22446,1.21774,1.2235,1897667.625,ADA/USDT,-0.00269,0.0,-0.00765,...,-0.035606,0.0,-0.0505,0.010062,-0.000787,55.459171,0.062497,-0.346596,-0.575933,1.0
309250,2021-03-29 09:30:00,1.22349,1.22562,1.21966,1.22029,2518210.0,ADA/USDT,0.002,0.00321,0.0,...,-0.032221,0.00321,-0.04729,0.010318,-0.000691,54.969372,-0.536923,0.327003,-0.309143,2.0
309251,2021-03-29 09:35:00,1.2203,1.226,1.21866,1.22575,1680297.625,ADA/USDT,-0.00257,0.0,-0.00546,...,-0.037479,0.0,-0.05275,0.010654,-0.000565,55.639103,0.742513,-0.332741,0.260539,0.0


In [7]:
# Keep only the ETH/USDT rows and drop the now-constant "pair" column.
eth_usdt = (
    df.loc[df["pair"] == "ETH/USDT"]
    .drop(columns=["pair"])
)
# Preview the first rows of the single-pair frame.
eth_usdt.head()

Unnamed: 0,date,open,high,low,close,volume,ml_smadiff_3,ml_maxdiff_3,ml_mindiff_3,ml_std_3,...,ml_smadiff_200,ml_maxdiff_200,ml_mindiff_200,ml_std_200,ml_ma_200,ml_rsi_200,ml_bop,ml_volume_pctchange,ml_z_score_120,ml_label
378700,2021-03-29 09:15:00,1744.349976,1746.98999,1741.359985,1742.030029,3171.518066,0.049967,2.319946,-2.170044,2.245412,...,-51.149723,2.319946,-76.23999,14.813927,-0.010366,54.862041,-0.412068,-0.358517,2.698296,2.0
378701,2021-03-29 09:20:00,1741.660034,1746.719971,1741.199951,1743.76001,2513.226074,-0.380005,0.589966,-1.72998,1.205753,...,-52.688755,0.589966,-77.969971,15.244114,-0.010228,55.047752,0.380429,-0.207564,2.627061,2.0
378702,2021-03-29 09:25:00,1743.77002,1749.780029,1742.52002,1749.349976,3718.205322,-4.303304,0.0,-7.319946,3.825837,...,-58.053169,0.0,-83.559937,15.764766,-0.010089,55.640442,0.768588,0.479455,2.547096,2.0
378703,2021-03-29 09:30:00,1749.359985,1750.47998,1744.060059,1744.780029,4408.751465,1.183309,4.569946,-1.02002,2.976927,...,-53.274624,4.569946,-78.98999,16.191444,-0.009945,55.04414,-0.713398,0.18572,2.411457,2.0
378704,2021-03-29 09:35:00,1744.780029,1755.569946,1744.589966,1752.040039,5304.447754,-3.316691,0.0,-7.26001,3.670345,...,-60.297234,0.0,-86.25,16.723162,-0.009787,55.800442,0.661204,0.203163,2.352111,2.0
