In this notebook, we build upon findings of the `xgboost` model in `classification_models.ipynb`

In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedKFold
from sklearn.metrics import f1_score, accuracy_score, log_loss, confusion_matrix, classification_report, roc_auc_score

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys
from pathlib import Path

src_path = Path("..", "src")
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from feature_engineering import add_vwap, add_atr, add_ema, add_dow, add_return, add_jump_categories_3, add_jump_categories_5
from utility_functions import classification_summary

We will do all the data cleaning / feature engineering and `xgboost` specific preprocessing as seen in the aforementioned notebook

In [3]:
df_raw = pd.read_csv("./../input/ETHUSDT_1h_2020_2024_join_final.csv")

df_raw.drop(df_raw.columns[df_raw.columns.str.contains('unnamed', case=False)], axis=1, inplace=True)

print(df_raw.info())

df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43817 entries, 0 to 43816
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   date                      43817 non-null  float64
 1   open                      43817 non-null  float64
 2   high                      43817 non-null  float64
 3   low                       43817 non-null  float64
 4   close                     43817 non-null  float64
 5   volume                    43817 non-null  float64
 6   base_asset_volume         43817 non-null  float64
 7   no_trades                 43817 non-null  int64  
 8   taker_buy_vol             43817 non-null  float64
 9   taker_buy_base_asset_vol  43817 non-null  float64
dtypes: float64(9), int64(1)
memory usage: 3.3 MB
None


Unnamed: 0,date,open,high,low,close,volume,base_asset_volume,no_trades,taker_buy_vol,taker_buy_base_asset_vol
0,1577840000000.0,129.16,129.19,128.68,128.87,7769.17336,1000930.0,2504,4149.93345,534619.339
1,1577840000000.0,128.87,130.65,128.78,130.64,11344.65516,1474278.0,4885,5930.54276,770486.0567
2,1577840000000.0,130.63,130.98,130.35,130.85,7603.35623,994025.6,3046,3324.35218,434675.4447
3,1577850000000.0,130.85,130.89,129.94,130.2,4968.55433,647361.0,2818,1810.03564,235890.3302
4,1577850000000.0,130.21,130.74,130.15,130.2,3397.90747,443006.7,2264,1839.74371,239848.3483


In [4]:
# general data cleaning
df = df_raw.copy()

df = add_return(df)

# add jump feature and target variable
df = add_jump_categories_3(df, up_margin=0.005, down_margin=0.003)
df['next_jump'] = df['jump'].shift(-1)

# feature engineering
df = add_atr(df)
df = add_ema(df)
df = add_vwap(df)

df = add_dow(df)
df = pd.get_dummies(df, columns=['day_of_week'], prefix='dow', drop_first=True)
df = df.dropna()

# lag features
lag_factor = 5
cols = ['open', 'high', 'low', 'close', 'volume', 'atr', 'ema', 'VWAP']

for lag in range(1, lag_factor+1):
    for col in cols:
        newcol = np.zeros(df.shape[0]) * np.nan
        newcol[lag:] = df[col].values[:-lag]
        df.insert(len(df.columns), "{0}_{1}".format(col, lag), newcol)

df = df.dropna()

# move the jump and target variable (jump_tmr) to the end
df = pd.get_dummies(df, columns=['jump'], prefix='jump', drop_first=True)
df = df[[col for col in df.columns if col not in ['next_jump']] + ['next_jump']]

for col, dtype in zip(df.columns, df.dtypes):
    print(col, dtype)

df.head(10)

date float64
open float64
high float64
low float64
close float64
volume float64
base_asset_volume float64
no_trades int64
taker_buy_vol float64
taker_buy_base_asset_vol float64
return float64
atr float64
ema float64
VWAP float64
dow_Monday bool
dow_Saturday bool
dow_Sunday bool
dow_Thursday bool
dow_Tuesday bool
dow_Wednesday bool
open_1 float64
high_1 float64
low_1 float64
close_1 float64
volume_1 float64
atr_1 float64
ema_1 float64
VWAP_1 float64
open_2 float64
high_2 float64
low_2 float64
close_2 float64
volume_2 float64
atr_2 float64
ema_2 float64
VWAP_2 float64
open_3 float64
high_3 float64
low_3 float64
close_3 float64
volume_3 float64
atr_3 float64
ema_3 float64
VWAP_3 float64
open_4 float64
high_4 float64
low_4 float64
close_4 float64
volume_4 float64
atr_4 float64
ema_4 float64
VWAP_4 float64
open_5 float64
high_5 float64
low_5 float64
close_5 float64
volume_5 float64
atr_5 float64
ema_5 float64
VWAP_5 float64
jump_neutral bool
jump_up bool
next_jump object


Unnamed: 0,date,open,high,low,close,volume,base_asset_volume,no_trades,taker_buy_vol,taker_buy_base_asset_vol,...,high_5,low_5,close_5,volume_5,atr_5,ema_5,VWAP_5,jump_neutral,jump_up,next_jump
19,1577910000000.0,132.04,132.16,131.62,131.86,2111.21443,278355.7,1995,997.52946,131502.5,...,132.37,131.24,131.96,7325.25762,0.785,131.029332,130.561825,True,False,neutral
20,1577910000000.0,131.86,132.25,131.7,132.18,2014.79285,266048.4,1988,1021.42474,134902.8,...,132.4,131.6,132.08,5361.06926,0.113214,131.228799,130.641168,True,False,neutral
21,1577910000000.0,132.17,132.37,131.68,131.78,4879.42025,644006.0,2410,1841.37772,242979.2,...,132.95,131.78,132.85,6915.20906,0.091658,131.488372,130.7643,True,False,down
22,1577920000000.0,131.82,131.82,129.9,130.27,14876.06749,1943372.0,6386,5520.77235,720974.1,...,133.05,132.27,132.34,5424.00732,0.062261,131.701365,130.851473,False,False,neutral
23,1577920000000.0,130.28,130.87,129.74,130.77,3865.45991,503554.6,3232,2025.11315,263935.9,...,132.46,131.57,132.04,5707.7934,0.068019,131.765758,130.90863,True,False,neutral
24,1577920000000.0,130.72,130.78,130.27,130.67,3772.6667,492526.7,2565,2094.53022,273427.3,...,132.16,131.62,131.86,2111.21443,0.04343,131.788607,130.925844,True,False,neutral
25,1577930000000.0,130.66,130.67,130.12,130.15,3684.51912,480344.1,2414,1879.43297,245012.0,...,132.25,131.7,132.18,2014.79285,0.042388,131.839552,130.944429,True,False,down
26,1577930000000.0,130.14,130.16,128.89,129.72,19078.42209,2469243.0,8599,10251.27762,1326941.0,...,132.37,131.68,131.78,4879.42025,0.052313,131.860308,130.983103,False,False,down
27,1577930000000.0,129.71,129.71,128.77,129.1,11950.18634,1544526.0,5294,6776.08848,875907.7,...,131.82,129.9,130.27,14876.06749,0.14088,131.620913,130.949343,False,False,neutral
28,1577940000000.0,129.09,129.87,128.69,129.55,8931.67759,1156161.0,4813,4789.83655,620041.2,...,130.87,129.74,130.77,3865.45991,0.090777,131.388731,130.936277,True,False,neutral


In [9]:
n_big_down = list(df['next_jump']).count('big_down')
n_small_down = list(df['next_jump']).count('small_down')
n_neutral = list(df['next_jump']).count('neutral')
n_small_up = list(df['next_jump']).count('small_up')
n_big_up = list(df['next_jump']).count('big_up')

print('num big down:', n_big_down)
print('num small down:', n_small_down)
print('num neutral:', n_neutral)
print('num small up:', n_small_up)
print('num big up:', n_big_up)

n_down = list(df['next_jump']).count('down')
n_up = list(df['next_jump']).count('up')

print('num down', n_down)
print('num up', n_up)

num big down: 0
num small down: 0
num neutral: 27788
num small up: 0
num big up: 0
num down 9598
num up 6411


In [6]:
# xgboost specific preprocessing

xgbDF = df.copy()

jump_lookup = {
    'down':0,
    'neutral':1,
    'up':2
}

xgbDF['next_jump'] = xgbDF['next_jump'].map(jump_lookup)

X = xgbDF.drop(['next_jump', 'date'], axis=1).copy()
y = xgbDF['next_jump'].copy()
m = xgb.DMatrix(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False) 

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [11]:
# baseline prediction with all set to neutral
y_pred = np.zeros(X_test.shape[0])
for idx in range(len(y_pred)):
    y_pred[idx] = '1'

classification_summary(y_pred, y_test)


------------ Classification Report ------------
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1701
           1       0.69      1.00      0.82      6087
           2       0.00      0.00      0.00       972

    accuracy                           0.69      8760
   macro avg       0.23      0.33      0.27      8760
weighted avg       0.48      0.69      0.57      8760



-------------- Confusion Matrix --------------
[[   0 1701    0]
 [   0 6087    0]
 [   0  972    0]]


***
XGBoost 1
***

Naive Implementation using custom weighted F1-score evaluation metric and softprob objective function

In [17]:
def softmax(x):
    '''Softmax function with x as input vector.'''
    e = np.exp(x)
    return e / np.sum(e)

def softprob_obj(predt: np.ndarray, data: xgb.DMatrix):
    '''Loss function.  Computing the gradient and approximated hessian (diagonal).
    Reimplements the `multi:softprob` inside XGBoost.

    '''
    labels = data.get_label()
    kRows = predt.shape[0]
    kClasses = 5

    if data.get_weight().size == 0:
        # Use 1 as weight if we don't have custom weight.
        weights = np.ones((kRows, 1), dtype=float)
    else:
        weights = data.get_weight()

    # The prediction is of shape (rows, classes), each element in a row
    # represents a raw prediction (leaf weight, hasn't gone through softmax
    # yet).
    assert predt.shape == (kRows, kClasses)

    grad = np.zeros((kRows, kClasses), dtype=float)
    hess = np.zeros((kRows, kClasses), dtype=float)

    eps = 1e-6

    # compute the gradient and hessian, slow iterations in Python, only
    # suitable for demo.  Also the one in native XGBoost core is more robust to
    # numeric overflow as we don't do anything to mitigate the `exp` in
    # `softmax` here.
    for r in range(predt.shape[0]):
        target = labels[r]
        p = softmax(predt[r, :])
        for c in range(predt.shape[1]):
            assert target >= 0 or target <= kClasses
            g = p[c] - 1.0 if c == target else p[c]
            g = g * weights[r]
            h = max((2.0 * p[c] * (1.0 - p[c]) * weights[r]).item(), eps)
            grad[r, c] = g
            hess[r, c] = h

    return grad, hess

def f1_weighted_eval(predt: np.ndarray, dtrain: xgb.DMatrix):
    y_true = dtrain.get_label()
    # Convert raw logits to predicted class
    y_pred = np.argmax(predt, axis=1)
    
    f1 = f1_score(y_true, y_pred, average='macro')
    return 'f1_Weighted', f1

model = xgb.train(
    {
        'num_class':5,
        'disable_default_eval_metric':True
    },
    dtrain=dtrain,
    num_boost_round=10,
    obj=softprob_obj,
    custom_metric=f1_weighted_eval
)

KeyboardInterrupt: 

In [9]:
y_pred = model.predict(dtest)

print(y_pred)

classification_summary(y_pred, y_test)

[1. 1. 1. ... 1. 1. 1.]

------------ Classification Report ------------
              precision    recall  f1-score   support

           0       0.31      0.02      0.04       728
           1       0.89      0.99      0.94      7747
           2       0.53      0.07      0.13       285

    accuracy                           0.88      8760
   macro avg       0.58      0.36      0.37      8760
weighted avg       0.83      0.88      0.84      8760



-------------- Confusion Matrix --------------
[[  16  708    4]
 [  31 7701   15]
 [   4  260   21]]


***
XGBoost Model 2
***

Tuned using `Optuna`. See https://www.kaggle.com/code/para24/xgboost-stepwise-tuning-using-optuna/notebook#7.-Stepwise-Hyperparameter-Tuning

We separate the 6 most effective and commonly altered XGBoost hyperparameters into 3 groups as follows:

Group 1: `max_depth`, `min_child_weight`

Group 2: `subsample`, `colsample_bytree`

Group 3: `learning_rate`, `num_boost_round`

We then tune each group sequentially, finding the optimal value for each group using previous findings (with `learning_rate` and `num_boost_round` set to some default values to begin with before being the final group to be optimised).

In [7]:
X = df.drop(['date', 'next_jump'], axis=1).copy()
y = df['next_jump'].copy()
y = y.map(jump_lookup)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [14]:
from importlib import reload

import optimise_xgb as oxgb
reload(oxgb)
import eval_metrics as em
reload(em)

eval_metric = em.f1_weighted_eval

params = oxgb.stepwise_optimization(
    X_train, y_train, 3, eval_metric, trials=9
)

STUDY NAME:  xgboost
-------------------------------------------------------
EVALUATION METRIC:  f1_weighted_eval
-------------------------------------------------------
BEST CV SCORE:  0.4421094
-------------------------------------------------------
OPTIMAL GROUP - 1 PARAMS:  {'max_depth': 21, 'min_child_weight': 17.096825780771223}
-------------------------------------------------------
BEST TRIAL FrozenTrial(number=440, state=TrialState.COMPLETE, values=[0.4421094], datetime_start=datetime.datetime(2025, 1, 23, 11, 57, 20, 959711), datetime_complete=datetime.datetime(2025, 1, 23, 11, 57, 30, 361412), params={'max_depth': 21, 'min_child_weight': 17.096825780771223}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.4421094, 1: 0.4436256, 2: 0.442991, 3: 0.44254619999999995, 4: 0.442382, 5: 0.4435868, 6: 0.4445568, 7: 0.4443122, 8: 0.44416500000000003, 9: 0.4437308000000001}, distributions={'max_depth': IntDistribution(high=30, log=False, low=2, step=1), 'min_child_weight': F

In [11]:
from optuna import create_study, logging
from optuna.pruners import MedianPruner
from optuna.integration import XGBoostPruningCallback

def objective(trial, X, y, group, score, params=dict()):
    dtrain = xgb.DMatrix(X, label=y)

    # initial learning params
    params['num_boost_round'] = 200
    params['learning_rate'] = 0.01
    params['objective'] = 'multi:softprob'
    params['num_class'] = 3

    if group == '1':
        params['max_depth'] = trial.suggest_int('max_depth', 2, 30)
        params['min_child_weight'] = trial.suggest_loguniform('min_child_weight', 1e-10, 1e10)
    
    if group == '2':
        params['subsample'] = trial.suggest_uniform('subsample', 0, 1)
        params['colsample_bytree'] = trial.suggest_uniform('colsample_bytree', 0, 1)

    if group == '3':
        num_boost_round = trial.suggest_int('num_boost_round', 100, 400)
        learning_rate = trial.suggest_uniform('learning_rate', 0.005, 0.1)
        params['num_boost_round'] = num_boost_round
        params['learning_rate'] = learning_rate

    pruning_callback = XGBoostPruningCallback(trial, 'test-' + score.__name__)

    cv_scores = xgb.cv(params, dtrain, nfold=5,
                       stratified=True,
                       feval=score,
                       num_boost_round=params['num_boost_round'],
                       early_stopping_rounds=10,
                       callbacks=[pruning_callback])
    
    return cv_scores['test-' + score.__name__ + '-mean'].values[-1]

def execute_optimization(study_name, group, score, trials, params=dict(), direction='maximize'):
    logging.set_verbosity(logging.ERROR)

    ## use pruner to skip trials that aren't doing so well
    pruner = MedianPruner(n_warmup_steps=5)

    study = create_study(
        direction=direction,
        study_name=study_name,
        storage='sqlite:///optuna.db',
        load_if_exists=True,
        pruner=pruner
    )

    study.optimize(
        lambda trial: objective(trial, X_train, y_train, group, score, params),
        n_trials=trials,
        n_jobs=1
    )

    print('STUDY NAME: ', study_name)
    print('-------------------------------------------------------')
    print('EVALUATION METRIC: ', score.__name__)
    print('-------------------------------------------------------')
    print('BEST CV SCORE: ', study.best_value)
    print('-------------------------------------------------------')
    print(f'OPTIMAL GROUP - {group} PARAMS: ', study.best_params)
    print('-------------------------------------------------------')
    print('BEST TRIAL', study.best_trial)
    print('-------------------------------------------------------')

    return study.best_params

def f1_weighted_eval(predt: np.ndarray, dtrain: xgb.DMatrix):
    y_true = dtrain.get_label()
    y_pred = np.argmax(predt, axis=1)
    
    f1 = f1_score(y_true, y_pred, average='macro')
    return 'f1_weighted_eval', f1


In [12]:
def stepwise_optimization(trials=9):
    final_params = dict()
    for g in ['1', '2', '3']:
        print(f'====== Optimizing Group {g} ======')
        update_params = execute_optimization(
            'xgboost', g, f1_weighted_eval, trials, params=final_params, direction='maximize'
        )
        final_params.update(update_params)
        print(f'Params after updating group {g}: ', final_params)
        print('\n\n')

    print(f'====== Final Optimal Parameters ======')
    print(final_params)

    return final_params

xgb2_params = stepwise_optimization()

STUDY NAME:  xgboost
-------------------------------------------------------
EVALUATION METRIC:  f1_weighted_eval
-------------------------------------------------------
BEST CV SCORE:  0.4421094
-------------------------------------------------------
OPTIMAL GROUP - 1 PARAMS:  {'max_depth': 21, 'min_child_weight': 17.096825780771223}
-------------------------------------------------------
BEST TRIAL FrozenTrial(number=440, state=TrialState.COMPLETE, values=[0.4421094], datetime_start=datetime.datetime(2025, 1, 23, 11, 57, 20, 959711), datetime_complete=datetime.datetime(2025, 1, 23, 11, 57, 30, 361412), params={'max_depth': 21, 'min_child_weight': 17.096825780771223}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.4421094, 1: 0.4436256, 2: 0.442991, 3: 0.44254619999999995, 4: 0.442382, 5: 0.4435868, 6: 0.4445568, 7: 0.4443122, 8: 0.44416500000000003, 9: 0.4437308000000001}, distributions={'max_depth': IntDistribution(high=30, log=False, low=2, step=1), 'min_child_weight': F

In [10]:
print(xgb2_params)

xgb2_params['num_class'] = 3

model = xgb.train(
    params=xgb2_params,
    dtrain=dtrain,
    num_boost_round=xgb2_params['num_boost_round']
)

model

{'num_boost_round': 291, 'learning_rate': 0.06725325239041732, 'objective': 'multi:softprob', 'num_class': 3, 'max_depth': 21, 'min_child_weight': 17.096825780771223, 'subsample': 0.7323724775919658, 'colsample_bytree': 0.7775431848586021}


KeyboardInterrupt: 

In [24]:
y_pred_prob = model.predict(xgb.DMatrix(X_test, label=y_test))
y_pred = y_pred_prob.argmax(axis=1)

print(y_pred)

print('\n----------------- classification report ----------------\n')
print(classification_report(y_test, y_pred))

print('\n------------------- confusion matrix -------------------\n')
print(confusion_matrix(y_test, y_pred))


[1 1 1 ... 1 1 1]

----------------- classification report ----------------

              precision    recall  f1-score   support

           0       0.42      0.08      0.14      1701
           1       0.71      0.97      0.82      6087
           2       0.53      0.06      0.11       972

    accuracy                           0.70      8760
   macro avg       0.55      0.37      0.36      8760
weighted avg       0.64      0.70      0.61      8760


------------------- confusion matrix -------------------

[[ 142 1539   20]
 [ 134 5920   33]
 [  62  851   59]]
