In [None]:
!echo $PATH # if /usr/local/cuda/bin is missing, re-launch VS Code from a terminal

In [None]:
!ptxas --version # expecting release 12.2; any other version means a different toolkit is first on PATH

In [None]:
import pandas as pd
import random as r

data = pd.read_csv('./stock_bars.csv')
# Keep the bars of one randomly chosen symbol. No seed is set before this call in the
# cells shown, so a different stock may be selected on each run (intentional: pick a
# single stock from the test data each time).
data = data[data['symbol'].eq(r.choice(data['symbol'].unique()))]
data

In [None]:
def normalize(data:pd.DataFrame, skip_cols:list[str]=[]) -> pd.DataFrame:
    """Z-score every column of `data` except the ones listed in `skip_cols`.

    Args:
        data: Input frame. Columns not listed in `skip_cols` are assumed to be numeric.
        skip_cols: Column names to pass through untouched. Names that are not
            present in `data` are ignored.

    Returns:
        A new frame holding the skipped columns first, followed by the
        standardized and imputed remaining columns, on the same index.
    """
    # Only keep the requested pass-through names that actually exist.
    present = [name for name in skip_cols if name in data.columns]
    passthrough = data[present]
    features = data.drop(columns=present)

    # Standardize with the population standard deviation (ddof=0).
    centered = features - features.mean()
    scaled = centered / features.std(ddof=0)

    # Impute: carry the last seen value forward, then zero whatever is still missing
    # (leading gaps, and constant columns whose 0/0 division produced NaN).
    imputed = scaled.ffill().fillna(0)

    return pd.concat([passthrough, imputed], axis=1, join='inner')

# symbol and timestamp are passed through unscaled; every other column is standardized.
norm = normalize(data, skip_cols=['symbol', 'timestamp'])
norm

In [None]:
# Target: change in the normalized low from the current bar to the next one,
#   y[t] = low[t+1] - low[t]
# (since the data is normalized we learn/predict the step-to-step difference, not the level).
# shift(-1) has no successor for the final bar, so that row's target would be NaN.
# Drop that last row from BOTH y and X so no NaN label reaches training and the
# two stay row-aligned.
y = (norm['low'].shift(-1) - norm['low']).iloc[:-1]
X = norm.drop(['symbol', 'timestamp'], axis=1).iloc[:-1]

In [None]:
X # feature matrix: the normalized columns without symbol/timestamp

In [None]:
y # regression target: next-bar change in the normalized low

In [None]:
# cutoff = int(len(X)*0.8) # 80:20 train/test split
# X_train, X_test = X[:cutoff], X[cutoff:-1]
# y_train, y_test = y[:cutoff], y[cutoff:-1]

# print(len(X), len(X_train), len(X_test))
# print(len(y), len(y_train), len(y_test))

In [None]:
import clique
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import tensorflow as tf

In [None]:
# --- neural-network settings ---
N_FEATURES = X.shape[1]   # one input unit per feature column
ACTIVATION_1 = 'tanh'     # inputs are standardized so keep negative range
ACTIVATION_2 = 'relu'     # performed better than tanh, sigmoid
DROPOUT = 0.5             # performed better than 0.3, 0.4
RANDOM_STATE = 25         # funnier than 24

# Short aliases for the Keras pieces used when building the models.
layers = tf.keras.layers
Sequential = tf.keras.Sequential
regularizer = tf.keras.regularizers.l1(0.001)
# Seeds Python's `random`, NumPy and TensorFlow in one call; must run before any model is built.
tf.keras.utils.set_random_seed(RANDOM_STATE)

# --- gradient-boosting keyword groups, named after the libraries that receive them ---
# Passed to all three boosters.
shared_kw = {
    'random_state': RANDOM_STATE,
    'learning_rate': 0.2,
    'max_depth': 3,
    'subsample': 0.8,
}
# Passed to XGBoost and LightGBM.
xgb_lgb_kw = {
    'n_jobs': 16,
    'colsample_bytree': 0.85,
    'reg_alpha': 500,
}
# Passed to XGBoost and CatBoost.
xgb_cat_kw = {'early_stopping_rounds': 5}
# Passed to LightGBM and CatBoost.
lgb_cat_kw = {
    'num_leaves': 8,
    'min_child_samples': 2000,
}

models = [ # order matters if limit is set; frontloading stronger models will cause more rejections; the reverse will oversaturate
    # XGBoost: histogram tree method, MAE as the evaluation metric.
    xgb.XGBRegressor(
        **shared_kw, **xgb_lgb_kw, **xgb_cat_kw,
        eval_metric='mae',
        tree_method='hist',
        gamma=0.2,
    ), #, nthread=1),
    # LightGBM: L1 (MAE) metric; early stopping is spelled `early_stopping_round` here.
    lgb.LGBMRegressor(
        **shared_kw, **xgb_lgb_kw, **lgb_cat_kw,
        early_stopping_round=5,
        metric='l1',
        min_split_gain=0.001,
        verbosity=-1,
    ),
    # CatBoost: MAE as the evaluation metric.
    cat.CatBoostRegressor(
        **shared_kw, **xgb_cat_kw, **lgb_cat_kw,
        eval_metric='MAE',
    ),
    # Single Dense unit. With ACTIVATION_1 = 'tanh' its output is bounded to (-1, 1).
    # NOTE(review): the model is handed over uncompiled; presumably clique.train_ensemble
    # compiles it — confirm.
    Sequential(
        [layers.Dense(1, activation=ACTIVATION_1, input_shape=[N_FEATURES])],
        name='linear',
    ), # N -> 1
    # Sequential([ # N -> N/2 -> 1
    #     layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
    #     layers.Dropout(DROPOUT),
    #     layers.BatchNormalization(),
    #     layers.Dense(N_FEATURES//2, kernel_regularizer=regularizer, activation=ACTIVATION_2),
    #     layers.Dropout(DROPOUT),
    #     layers.BatchNormalization(),
    #     layers.Dense(1)
    # ], name='net'),
]

# try: ensemble = e.load_ensemble(X_test=X, y_test=y)
# except FileNotFoundError: 
# Train on the full X / y with 5 folds and limit=3 (see the ordering note on `models`).
ensemble = clique.train_ensemble(
    models,
    X,
    y,
    folds=5,
    limit=3,
)
ensemble

In [None]:
ensemble.best_model # presumably the member clique ranked highest during training — confirm against clique

In [None]:
ensemble.predict(X) # in-sample predictions: X is the same frame that was passed to train_ensemble