In [1]:
!echo $PATH # if /usr/local/cuda/bin is missing, re-run code from terminal

/mnt/Data/Repos/ensemble/.cuda/bin:/usr/local/cuda/bin:/home/whitgroves/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/snap/bin


In [2]:
# Print the version banner of whichever ptxas binary is found on PATH.
!ptxas --version # expecting 12.2

ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Jun_13_19:13:58_PDT_2023
Cuda compilation tools, release 12.2, V12.2.91
Build cuda_12.2.r12.2/compiler.32965470_0


In [3]:
import pandas as pd
import random as r

# Load every bar in the CSV, then keep only the rows of one randomly drawn symbol.
# No seed is set for `random` in this cell, so the chosen stock can change from run to run.
data = (
    pd.read_csv('.data/stock_bars.csv')
    .pipe(lambda bars: bars[bars['symbol'] == r.choice(bars['symbol'].unique())])
)
data

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,BB,2022-01-03 05:00:00+00:00,9.32,9.3900,9.1000,9.37,5267494.0,26372.0,9.268915
1,BB,2022-01-04 05:00:00+00:00,9.37,9.4900,9.1200,9.33,6360118.0,29899.0,9.271508
2,BB,2022-01-05 05:00:00+00:00,9.27,9.3471,8.7800,8.78,9620376.0,39131.0,9.026742
3,BB,2022-01-06 05:00:00+00:00,8.79,9.0200,8.4700,8.78,11505077.0,47090.0,8.771278
4,BB,2022-01-07 05:00:00+00:00,8.97,9.2300,8.8000,8.86,6509053.0,29903.0,8.953267
...,...,...,...,...,...,...,...,...,...
511,BB,2024-01-17 05:00:00+00:00,3.33,3.3500,3.2748,3.34,3264012.0,9790.0,3.322546
512,BB,2024-01-18 05:00:00+00:00,3.37,3.4200,3.3450,3.39,3564418.0,9148.0,3.377668
513,BB,2024-01-19 05:00:00+00:00,3.38,3.4550,3.3450,3.43,7640902.0,13106.0,3.423459
514,BB,2024-01-22 05:00:00+00:00,3.60,3.6000,3.6000,3.60,2332.0,15.0,3.569584


In [4]:
def normalize(data:pd.DataFrame, skip_cols:list[str]=[]) -> pd.DataFrame:
    """Z-score every column of `data` except the ones listed in `skip_cols`.

    Args:
        data: Frame to standardize. It is not modified; a new frame is returned.
        skip_cols: Column names to carry over untouched. Names that are not
            present in `data` are silently ignored.

    Returns:
        A frame with the skipped columns first (in `skip_cols` order), followed
        by the remaining columns scaled to zero mean and unit population
        standard deviation. Missing values are forward-filled after scaling and
        any that remain are set to 0.
    """
    # The shared default list is only ever read here, never mutated.
    present = []
    for name in skip_cols:
        if name in data.columns:
            present.append(name)

    untouched = data[present]
    values = data.drop(columns=present)

    # Standardize with the population std (ddof=0).
    centered = values - values.mean()
    scaled = centered.div(values.std(ddof=0))

    # Impute: carry the previous value forward, then zero-fill whatever is left
    # (leading gaps, and zero-variance columns where 0/0 produced NaN).
    imputed = scaled.ffill().fillna(0)

    return pd.concat([untouched, imputed], axis=1, join='inner')

# Standardize the bar columns; the two identifier columns are passed through unscaled.
norm = normalize(data, skip_cols=['symbol', 'timestamp'])
norm

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,BB,2022-01-03 05:00:00+00:00,3.230217,3.114700,3.236637,3.282115,-0.326388,0.809768,3.194762
1,BB,2022-01-04 05:00:00+00:00,3.268630,3.189574,3.252447,3.251254,-0.062726,1.157147,3.196758
2,BB,2022-01-05 05:00:00+00:00,3.191805,3.082579,2.983688,2.826914,0.724010,2.066417,3.008389
3,BB,2022-01-06 05:00:00+00:00,2.823042,2.837666,2.738644,2.826914,1.178808,2.850308,2.811788
4,BB,2022-01-07 05:00:00+00:00,2.961328,2.994901,2.999498,2.888637,-0.026787,1.157541,2.951844
...,...,...,...,...,...,...,...,...,...
511,BB,2024-01-17 05:00:00+00:00,-1.371634,-1.407688,-1.367984,-1.370193,-0.809850,-0.823411,-1.381475
512,BB,2024-01-18 05:00:00+00:00,-1.340904,-1.355277,-1.312493,-1.331617,-0.737359,-0.886643,-1.339053
513,BB,2024-01-19 05:00:00+00:00,-1.333221,-1.329071,-1.312493,-1.300756,0.246341,-0.496815,-1.303813
514,BB,2024-01-22 05:00:00+00:00,-1.164205,-1.220503,-1.110925,-1.169596,-1.596929,-1.786162,-1.191358


In [5]:
# Features: every standardized column, with the two identifier columns removed.
X = norm.drop(columns=['symbol', 'timestamp'])
# Target: since the data is normalized, learn the step-to-step change rather than the level --
# the next row's `low` minus the current row's `low`. The final row has no successor, so its target is NaN.
y = norm['low'].shift(-1).sub(norm['low'])

In [6]:
# Display the feature frame.
X

Unnamed: 0,open,high,low,close,volume,trade_count,vwap
0,3.230217,3.114700,3.236637,3.282115,-0.326388,0.809768,3.194762
1,3.268630,3.189574,3.252447,3.251254,-0.062726,1.157147,3.196758
2,3.191805,3.082579,2.983688,2.826914,0.724010,2.066417,3.008389
3,2.823042,2.837666,2.738644,2.826914,1.178808,2.850308,2.811788
4,2.961328,2.994901,2.999498,2.888637,-0.026787,1.157541,2.951844
...,...,...,...,...,...,...,...
511,-1.371634,-1.407688,-1.367984,-1.370193,-0.809850,-0.823411,-1.381475
512,-1.340904,-1.355277,-1.312493,-1.331617,-0.737359,-0.886643,-1.339053
513,-1.333221,-1.329071,-1.312493,-1.300756,0.246341,-0.496815,-1.303813
514,-1.164205,-1.220503,-1.110925,-1.169596,-1.596929,-1.786162,-1.191358


In [7]:
# Display the target series.
y

0      0.015809
1     -0.268758
2     -0.245044
3      0.260854
4     -0.229235
         ...   
511    0.055491
512    0.000000
513    0.201569
514   -0.047428
515         NaN
Name: low, Length: 516, dtype: float64

In [8]:
# cutoff = int(len(X)*0.8) # 80:20 train/test split
# X_train, X_test = X[:cutoff], X[cutoff:-1]
# y_train, y_test = y[:cutoff], y[cutoff:-1]

# print(len(X), len(X_train), len(X_test))
# print(len(y), len(y_train), len(y_test))

In [13]:
import ensemble as e
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

In [15]:
# Build the five candidate models (three gradient-boosted regressors and two Keras
# networks) and hand them to `e.train_ensemble` together with the full X / y.
N_FEATURES = len(X.columns)
ACTIVATION_1 = 'tanh' # inputs are standardized so keep negative range
ACTIVATION_2 = 'relu' # performed better than tanh, sigmoid
DROPOUT = 0.5         # performed better than 0.3, 0.4
RANDOM_STATE = 25     # funnier than 24

# TensorFlow/Keras is reached through the `ensemble` module's `tf` attribute rather than imported directly.
layers = e.tf.keras.layers
Sequential = e.tf.keras.Sequential
regularizer = e.tf.keras.regularizers.l1(0.001) # one L1 regularizer instance, reused by both hidden Dense layers of 'net'
e.tf.keras.utils.set_random_seed(RANDOM_STATE) # seed before any layer below is constructed

# Keyword groups are named after the libraries whose constructors receive them below.
shared_kw = dict(random_state=RANDOM_STATE, learning_rate=0.2, max_depth=3, subsample=0.8) # XGBoost, LightGBM and CatBoost
# NOTE(review): reg_alpha=500 is a very strong L1 penalty for targets of this magnitude (see the `y` display);
# possibly why Model 1 is stopped for "guessing a constant value" -- confirm.
xgb_lgb_kw = dict(n_jobs=16, colsample_bytree=0.85, reg_alpha=500) # XGBoost and LightGBM
xgb_cat_kw = dict(early_stopping_rounds=5) # XGBoost and CatBoost
# NOTE(review): min_child_samples=2000 is larger than the number of rows shown for `y` (516), so presumably
# no split can satisfy it; likely why Model 2 (LightGBM) is stopped for "guessing a constant value" -- confirm.
lgb_cat_kw = dict(num_leaves=8, min_child_samples=2000) # LightGBM and CatBoost

models = [ # order matters if limit is set; frontloading stronger models will cause more rejections; the reverse will oversaturate

    xgb.XGBRegressor(**shared_kw, **xgb_lgb_kw, **xgb_cat_kw, eval_metric='mae', tree_method='hist', gamma=0.2), #, nthread=1),
    # LightGBM is given its own `early_stopping_round` (singular) keyword instead of `xgb_cat_kw`.
    lgb.LGBMRegressor(**shared_kw, **xgb_lgb_kw, **lgb_cat_kw, early_stopping_round=5, metric='l1', min_split_gain=0.001, verbosity=-1),
    cat.CatBoostRegressor(**shared_kw, **xgb_cat_kw, **lgb_cat_kw, eval_metric='MAE'),
    # Single Dense unit; note it applies ACTIVATION_1 (tanh), so despite the name the output is not a purely linear map.
    Sequential([layers.Dense(1, activation=ACTIVATION_1, input_shape=[N_FEATURES])], name='linear'), # N -> 1
    Sequential([ # N -> N/2 -> 1
        # Dense layer widths are N_FEATURES, N_FEATURES//2, 1; each hidden layer is followed by dropout then batch norm.
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES//2, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='net'),
]

# NOTE(review): `y` ends with a NaN (it is built with shift(-1)); the commented-out split cell sliced with
# `[cutoff:-1]` to exclude that row, but the full X / y are passed here -- confirm train_ensemble handles it.
# try: ensemble = e.load_ensemble(X_test=X, y_test=y)
# except FileNotFoundError: 
ensemble = e.train_ensemble(models, X, y, folds=3, limit=5)
ensemble

Pre-training setup...Complete (0.0s)
Model 1/5: Fold 1/3: Stopped: PredictionError: Model is guessing a constant value. -- 3      
Model 2/5: Fold 1/3: Stopped: PredictionError: Model is guessing a constant value. -- 3       
Model 3/5: Fold 1/3: Accepted with score: 0.08048629 (0.1s) (CatBoostRegressor_1731694420_0)          
Model 3/5: Fold 2/3: Accepted with score: 0.08028630 (0.0s) (CatBoostRegressor_1731694420_1)          
Model 3/5: Fold 3/3: Accepted with score: 0.07973444 (0.0s) (CatBoostRegressor_1731694420_2)          
Model 4/5: Fold 1/3: Training linear...                                                

I0000 00:00:1731694420.632649   39264 service.cc:148] XLA service 0x7f6cdc003fc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731694420.632663   39264 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
I0000 00:00:1731694420.632666   39264 service.cc:156]   StreamExecutor device (1): NVIDIA GeForce GTX 1660 SUPER, Compute Capability 7.5
I0000 00:00:1731694420.658844   39264 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1731694420.759967   39264 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Model 4/5: Fold 1/3: Accepted with score: 0.82537044 (0.5s) (linear_1731694420_0)          
Model 4/5: Fold 2/3: Accepted with score: 0.82489204 (0.2s) (linear_1731694421_1)          
Model 4/5: Fold 3/3: Rejected with score: 0.82374809 (0.3s)                            
Model 5/5: Fold 1/3: Rejected with score: 0.43292238 (1.4s)                         
Model 5/5: Fold 2/3: Rejected with score: 0.42504210 (0.8s)          
Model 5/5: Fold 3/3: Rejected with score: 0.40824837 (1.6s)          


<SelectiveEnsemble (5 model(s)>