This is a watered-down version of project 4 from the [fintech bootcamp at home](https://github.com/whitgroves/fintech-bootcamp-at-home), combined with some models from my [Optiver Kaggle submission](https://github.com/whitgroves/optiver-trading-at-the-close) to test the ensemble's performance. Feel free to treat this as a tutorial if you need an example and/or a use case.

In [1]:
!echo $PATH # if /usr/local/cuda/bin is missing, re-launch VS Code from a terminal

/mnt/Data/Repos/clique-ml/.cuda/bin:/usr/local/cuda/bin:/home/whitgroves/.local/bin:/usr/local/cuda/bin:/home/whitgroves/.local/bin:/usr/local/cuda/bin:/home/whitgroves/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin


In [2]:
!ptxas --version # expecting 12.2

ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Jun_13_19:13:58_PDT_2023
Cuda compilation tools, release 12.2, V12.2.91
Build cuda_12.2.r12.2/compiler.32965470_0


In [3]:
import pandas as pd
import random as r

# Read the full bar data, then keep only the rows of one symbol chosen at random.
# The choice is unseeded, so a different stock may be picked on each run.
all_bars = pd.read_csv('./stock_bars.csv')
chosen_symbol = r.choice(all_bars.symbol.unique())
data = all_bars.loc[all_bars.symbol == chosen_symbol]
data

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
516,NKLA,2022-01-03 05:00:00+00:00,10.0200,10.3191,9.7500,10.2400,12750229.0,61421.0,10.077616
517,NKLA,2022-01-04 05:00:00+00:00,10.2200,10.3900,9.8350,10.3200,11373741.0,54861.0,10.120397
518,NKLA,2022-01-05 05:00:00+00:00,10.6800,11.5100,9.8500,9.8800,36341451.0,159097.0,10.662359
519,NKLA,2022-01-06 05:00:00+00:00,10.2340,10.8000,9.6450,10.2100,26383343.0,112858.0,10.185134
520,NKLA,2022-01-07 05:00:00+00:00,10.2100,10.7550,10.1100,10.6200,14584682.0,65934.0,10.487543
...,...,...,...,...,...,...,...,...,...
1027,NKLA,2024-01-17 05:00:00+00:00,0.6298,0.6357,0.6034,0.6332,56436368.0,71343.0,0.622183
1028,NKLA,2024-01-18 05:00:00+00:00,0.6551,0.6700,0.5825,0.6218,101971452.0,107485.0,0.620318
1029,NKLA,2024-01-19 05:00:00+00:00,0.6218,0.6700,0.6028,0.6521,82251930.0,85288.0,0.640258
1030,NKLA,2024-01-22 05:00:00+00:00,0.6400,0.6880,0.6301,0.6517,75830657.0,78571.0,0.659761


In [4]:
def standardize(data:pd.DataFrame, skip_cols:'list[str]|None'=None) -> pd.DataFrame:
    """Z-score every column of `data` except those named in `skip_cols`.

    Each standardized column has its own mean subtracted and is divided by its
    population standard deviation (ddof=0). Missing values are then forward-filled,
    and anything still missing is set to 0.

    Args:
        data: Frame to standardize. Columns that are not skipped must support
            `mean()` / `std()`.
        skip_cols: Names of columns to pass through untouched. Names that are not
            present in `data` are ignored. Defaults to skipping nothing.

    Returns:
        A new frame with the skipped columns first, followed by the standardized
        columns. `data` itself is not modified.
    """
    if skip_cols is None: # sentinel instead of a mutable default list
        skip_cols = []
    skip_cols = [col for col in skip_cols if col in data.columns]
    skip = data[skip_cols]
    temp = data.drop(columns=skip_cols)
    temp = (temp - temp.mean()) / temp.std(ddof=0) # standardize; a zero-variance column becomes all-NaN here
    temp = temp.ffill().fillna(0) # impute: carry the last value forward, then 0 for whatever is left
    return pd.concat([skip, temp], axis=1, join='inner')

# Standardize every column except the identifier columns, which are passed through unchanged.
norm = standardize(data, ['symbol', 'timestamp'])
norm

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
516,NKLA,2022-01-03 05:00:00+00:00,2.255504,2.227298,2.308932,2.354789,-0.499379,-0.131881,2.288845
517,NKLA,2022-01-04 05:00:00+00:00,2.326172,2.251452,2.340183,2.383214,-0.526871,-0.240642,2.304003
518,NKLA,2022-01-05 05:00:00+00:00,2.488709,2.633016,2.345697,2.226875,-0.028207,1.487530,2.496032
519,NKLA,2022-01-06 05:00:00+00:00,2.331119,2.391132,2.270329,2.344129,-0.227094,0.720914,2.326941
520,NKLA,2022-01-07 05:00:00+00:00,2.322639,2.375801,2.441286,2.489809,-0.462741,-0.057058,2.434091
...,...,...,...,...,...,...,...,...,...
1027,NKLA,2024-01-17 05:00:00+00:00,-1.062431,-1.071668,-1.053818,-1.058651,0.373136,0.032620,-1.061422
1028,NKLA,2024-01-18 05:00:00+00:00,-1.053492,-1.059983,-1.061502,-1.062702,1.282578,0.631833,-1.062083
1029,NKLA,2024-01-19 05:00:00+00:00,-1.065258,-1.059983,-1.054039,-1.051936,0.888733,0.263820,-1.055017
1030,NKLA,2024-01-22 05:00:00+00:00,-1.058827,-1.053851,-1.044002,-1.052078,0.760485,0.152456,-1.048107


In [5]:
# Features: every standardized column; the identifier columns are dropped.
X = norm.drop(columns=['symbol', 'timestamp'])

# Target: the change in standardized low from this row to the next, low[t+1] - low[t].
# The final row has no successor, so its target is NaN.
low = norm['low']
y = low.shift(-1) - low

In [6]:
X

Unnamed: 0,open,high,low,close,volume,trade_count,vwap
516,2.255504,2.227298,2.308932,2.354789,-0.499379,-0.131881,2.288845
517,2.326172,2.251452,2.340183,2.383214,-0.526871,-0.240642,2.304003
518,2.488709,2.633016,2.345697,2.226875,-0.028207,1.487530,2.496032
519,2.331119,2.391132,2.270329,2.344129,-0.227094,0.720914,2.326941
520,2.322639,2.375801,2.441286,2.489809,-0.462741,-0.057058,2.434091
...,...,...,...,...,...,...,...
1027,-1.062431,-1.071668,-1.053818,-1.058651,0.373136,0.032620,-1.061422
1028,-1.053492,-1.059983,-1.061502,-1.062702,1.282578,0.631833,-1.062083
1029,-1.065258,-1.059983,-1.054039,-1.051936,0.888733,0.263820,-1.055017
1030,-1.058827,-1.053851,-1.044002,-1.052078,0.760485,0.152456,-1.048107


In [7]:
y

516     0.031250
517     0.005515
518    -0.075368
519     0.170957
520    -0.125001
          ...   
1027   -0.007684
1028    0.007463
1029    0.010037
1030    0.000735
1031         NaN
Name: low, Length: 516, dtype: float64

In [8]:
# cutoff = int(len(X)*0.8) # 80:20 train/test split
# X_train, X_test = X[:cutoff], X[cutoff:-1]
# y_train, y_test = y[:cutoff], y[cutoff:-1]

# print(len(X), len(X_train), len(X_test))
# print(len(y), len(y_train), len(y_test))

In [9]:
# import clique
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import tensorflow as tf

# Short aliases for the Keras pieces used when building the models.
layers = tf.keras.layers
Sequential = tf.keras.Sequential
# L1 penalty with factor 0.001; in the visible notebook it is only referenced from commented-out layer definitions.
regularizer = tf.keras.regularizers.l1(0.001)
# Fix the Keras random seed (same value the tree models use for random_state) so runs are repeatable.
tf.keras.utils.set_random_seed(626)

2025-07-19 15:59:48.568220: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-19 15:59:48.576479: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752958788.585129  103644 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752958788.587816  103644 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752958788.594752  103644 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [10]:
# N_FEATURES = len(X.columns)
# ACTIVATION_1 = 'tanh' # inputs are standardized so keep negative range
# ACTIVATION_2 = 'relu' # performed better than tanh, sigmoid
# DROPOUT = 0.5         # performed better than 0.3, 0.4

# Hyperparameters are grouped by which libraries take them, so each model below
# is built by unpacking only the groups that apply to it.
shared_kw = dict(random_state=626, learning_rate=0.2, max_depth=3, subsample=0.8) # XGBoost, LightGBM and CatBoost
xgb_lgb_kw = dict(n_jobs=16, colsample_bytree=0.85, reg_alpha=500) # XGBoost and LightGBM
xgb_cat_kw = dict(early_stopping_rounds=5) # XGBoost and CatBoost
lgb_cat_kw = dict(num_leaves=8, min_child_samples=2000) # LightGBM and CatBoost

# Complete XGBoost configuration, shared by the three identically-configured XGBRegressor entries.
xgb_kw = dict(**shared_kw, **xgb_lgb_kw, **xgb_cat_kw, eval_metric='mae', tree_method='hist', gamma=0.2)

models = [ # order matters if limit is set; frontloading stronger models will cause more rejections; the reverse will oversaturate
    *(xgb.XGBRegressor(**xgb_kw) for _ in range(3)), # three separate instances, not one object repeated
    lgb.LGBMRegressor(**shared_kw, **xgb_lgb_kw, **lgb_cat_kw, early_stopping_round=5, metric='l1', min_split_gain=0.001, verbosity=-1),
    cat.CatBoostRegressor(**shared_kw, **xgb_cat_kw, **lgb_cat_kw, eval_metric='MAE'),
    Sequential([ # N -> 1
        layers.Input(shape=(len(X.columns),)), # explicit Input layer; passing input_shape= to Dense is deprecated
        layers.Dense(1, activation='tanh'),
    ], name='linear'),
    # Sequential([ # N -> N/2 -> 1
    #     layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
    #     layers.Dropout(DROPOUT),
    #     layers.BatchNormalization(),
    #     layers.Dense(N_FEATURES//2, kernel_regularizer=regularizer, activation=ACTIVATION_2),
    #     layers.Dropout(DROPOUT),
    #     layers.BatchNormalization(),
    #     layers.Dense(1)
    # ], name='net'),
]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
import clique
# clique.ModelProfile(models[3])
# clique.Clique(models=models)
# pass
# Wrap the model list in a Clique, then print each model id next to the profile stored under it.
# NOTE(review): this cell's execution count (21) is out of sequence with the cells around it —
# confirm the notebook still runs top to bottom on a fresh kernel.
very_exclusive_group = clique.Clique(models=models)
for model_id in very_exclusive_group.keys():
    print(model_id, '\t', very_exclusive_group[model_id])


XGBRegressor 	 <ModelProfile (XGBRegressor)>
XGBRegressor_0 	 <ModelProfile (XGBRegressor)>
XGBRegressor_1 	 <ModelProfile (XGBRegressor)>
LGBMRegressor 	 <ModelProfile (LGBMRegressor)>
CatBoostRegressor 	 <ModelProfile (CatBoostRegressor)>
Sequential 	 <ModelProfile (Sequential)>


In [12]:
clique.Clique(models=models)

<Clique (6 model(s); limit: 6)>

In [13]:
repr(clique.ModelProfile(models[3]))

'<ModelProfile (LGBMRegressor)>'

In [14]:
clique.ModelProfile(models[3])

0,1,2
,boosting_type,'gbdt'
,num_leaves,8
,max_depth,3
,learning_rate,0.2
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.001
,min_child_weight,0.001


In [15]:

# # try: ensemble = e.load_ensemble(X_test=X, y_test=y)
# # except FileNotFoundError: 
# ensemble = clique.train_ensemble(models, X, y, folds=5, limit=3)
# ensemble

In [16]:
# ensemble.best_model

In [17]:
# ensemble.predict(X)

In [18]:
len(dict())

0