Another rewrite because I had to rebuild my local environment for cudf + tensorflow to work.

In [35]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # https://stackoverflow.com/questions/40426502/is-there-a-way-to-suppress-the-messages-tensorflow-prints
import gc
import typing as t
import warnings
warnings.filterwarnings('ignore')
from itertools import combinations
try: # got tired of changing code between local and kaggle setup
    import cudf.pandas
    cudf.pandas.install() # must be called before pandas import
except ModuleNotFoundError:
    print('cudf not installed. Continuing with CPU dataframes.')
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import tensorflow as tf # https://github.com/tensorflow/tensorflow/issues/62075
keras = tf.keras # https://github.com/microsoft/pylance-release/issues/1066
from keras import Sequential, layers, regularizers

In [36]:
# Report whether TensorFlow can see at least one GPU; nothing is printed otherwise.
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    print('GPU present.')

GPU present.


In [37]:
# Local data files; load_vars() tries these first.
DATA_TRAIN, DATA_TEST_X, DATA_TEST_Y = (
    '.data/train.csv',
    '.data/test.csv',
    '.data/revealed_targets.csv',
)

# Kaggle copies of the same files; load_vars() falls back to these on FileNotFoundError.
KAGGLE_DATA_TRAIN, KAGGLE_DATA_TEST_X, KAGGLE_DATA_TEST_Y = (
    '/kaggle/input/optiver-trading-at-the-close/train.csv',
    '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv',
    '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv',
)

# Columns removed by preprocess() whenever they are present in the frame.
DROPS = [
    'index',
    'time_id',
    'currently_scored',
    'time_id_x',
    'time_id_y',
    'revealed_date_id',
    'revealed_time_id',
]
# Row ordering keys; preprocess() sorts by these and then drops them.
SORTS = [
    'date_id',
    'seconds_in_bucket',
    'stock_id',
]
# Column used as the frame index.
INDEX = 'row_id'

# Width of the network input layer. Must be kept in step with the number of
# columns preprocess() returns: update if/as features are engineered.
N_FEATURES = 11 + 0
# +0  features => TBD <--- training score
# +2  features => TBD ('volume', 'mid_price')
# +5  features => TBD ('volume', 'mid_price', 'liquidity_imbalance', 'matched_imbalance', 'size_imbalance')
# +20 features => TBD (previous 5 + combinations)

# https://www.kaggle.com/code/verracodeguacas/high-speed-predictions-no-gpu
def add_features(data:pd.DataFrame) -> pd.DataFrame: # data arrives sorted and pruned
    """Return a copy of ``data`` with five derived columns appended.

    The input frame is not modified. Columns read: ``ask_size``, ``bid_size``,
    ``ask_price``, ``bid_price``, ``imbalance_size`` and ``matched_size``.

    Columns added, in this order:
        volume              -- ask_size + bid_size
        mid_price           -- mean of ask_price and bid_price
        liquidity_imbalance -- (bid_size - ask_size) / (bid_size + ask_size)
        matched_imbalance   -- (imbalance_size - matched_size) / (matched_size + imbalance_size)
        size_imbalance      -- bid_size / ask_size
    """
    bid, ask = data.bid_size, data.ask_size
    matched, imbalance = data.matched_size, data.imbalance_size

    # Disabled experiment: pairwise imbalance features over every price-like column.
    # size_cols = [x for x in data.columns if 'size' in x]
    # price_cols = [x for x in list(set(data.columns)-set(size_cols)) if 'p' in x] # filter for '_price' and 'wap'
    # for c in combinations(price_cols, 2):
    #     df[f'{c[0]}_{c[1]}_imbalance'] = (df[c[0]]-df[c[1]]) / (df[c[0]]+df[c[1]])
    # # print(len(df.columns))

    return data.copy().assign(
        volume=ask + bid,
        mid_price=(data.ask_price + data.bid_price) / 2,
        liquidity_imbalance=(bid - ask) / (bid + ask),
        matched_imbalance=(imbalance - matched) / (matched + imbalance),
        size_imbalance=bid / ask,
    )

def preprocess(data:pd.DataFrame) -> pd.DataFrame: # separate for submission compat
    """Turn a raw frame into the model's feature matrix.

    Steps, in order:
      1. Re-index the rows by ``INDEX`` (any existing index becomes a column first).
      2. Drop every ``DROPS`` column that is present.
      3. Sort by ``SORTS`` and then drop those key columns.
      4. Min-max scale each remaining column using the min and max of *this*
         frame, so two different inputs are scaled independently of each other.
      5. Forward-fill missing values, then replace anything still missing with 0.

    Returns a new frame; ``data`` itself is not modified.
    """
    frame = data.reset_index().set_index(INDEX)

    unwanted = [name for name in DROPS if name in frame.columns]
    frame = frame.drop(columns=unwanted)

    frame = frame.sort_values(by=SORTS).drop(columns=SORTS)
    # frame = add_features(frame)

    lowest = frame.min()
    spread = frame.max() - lowest
    scaled = (frame - lowest) / spread

    return scaled.ffill().fillna(0)

def load_vars(testing:bool=False) -> tuple[pd.DataFrame, pd.Series]:
    """Load the features and target for either the training or the test data.

    Args:
        testing: when False (default) read the training CSV and use the
            ``target`` column; when True merge the test features with the
            revealed targets on ``SORTS`` and use ``revealed_target``.

    Returns:
        ``(X, y)`` where ``X`` is the output of ``preprocess`` and ``y`` is the
        target column, both in ``SORTS`` order. Rows with a null target are
        removed before the split.

    Raises:
        FileNotFoundError: if neither the local nor the Kaggle files exist.
    """

    def read_data(train, test_x, test_y):
        # Returns the raw frame together with the name of its target column.
        if testing:
            frames = [pd.read_csv(path) for path in (test_x, test_y)]
            data = pd.merge(*frames, on=SORTS) # https://stackoverflow.com/a/32041277/3178898
            ycol = 'revealed_target'
        else:
            data = pd.read_csv(train, index_col=INDEX)
            ycol = 'target'
        return data, ycol

    try: # tired of switching local/kaggle setup
        data, ycol = read_data(DATA_TRAIN, DATA_TEST_X, DATA_TEST_Y)
    except FileNotFoundError:
        data, ycol = read_data(KAGGLE_DATA_TRAIN, KAGGLE_DATA_TEST_X, KAGGLE_DATA_TEST_Y)

    data = data.dropna(subset=[ycol]) # some targets are null

    # preprocess() re-orders the feature rows by SORTS. Apply the same ordering
    # here, before the split, so that y is in the same row order as X even when
    # the source file is not already sorted. Previously y kept the file order.
    data = data.sort_values(by=SORTS)

    X = preprocess(data.drop(ycol, axis=1))
    y = data[ycol]
    return X, y

In [38]:
class Model(t.Protocol): # interface for any sklearn-API model
    """Structural type listing the estimator methods this notebook calls."""

    def fit(self, X, y, sample_weight=None):
        """Train the model on features ``X`` and targets ``y``."""

    def predict(self, X):
        """Produce predictions for features ``X``."""

    def get_params(self, deep=True):
        """Expose the model's parameters (train_ensemble tests key membership on the result)."""

class ModelProfile: # wrapper to ensure model info for Ensemble
    """A model bundled with its score and the kwargs to use when predicting.

    Attributes:
        model: the wrapped model object.
        score: the score recorded for it (Ensemble treats lower as better).
        predict_kw: keyword arguments forwarded to ``model.predict``.
    """

    def __init__(self, model:"Model", score:float, predict_kw:t.Optional[dict]=None) -> None:
        """Store the model, its score and its predict kwargs.

        ``predict_kw`` defaults to a fresh empty dict per instance. A literal
        ``{}`` default would be created once and shared by every profile built
        without the argument, so mutating one would mutate all of them.
        A dict passed in explicitly is stored as-is (not copied).
        """
        self.model = model
        self.score = score
        self.predict_kw = {} if predict_kw is None else predict_kw

class Ensemble: # https://www.kaggle.com/code/iqmansingh/optiver-4-fold-time-series-split-ensemble
    """A collection of scored models whose predictions are averaged.

    Each member is a ModelProfile-like object exposing ``model``, ``score`` and
    ``predict_kw``. Lower scores are treated as better: ``best_score`` is the
    minimum, and ``add``/``prune`` keep members at or below the mean.
    """

    def __init__(self, models:"t.Optional[list[ModelProfile]]"=None, limit:t.Optional[int]=None) -> None:
        """Create an ensemble.

        Args:
            models: initial members; copied, so later ``add`` calls never
                modify the caller's list.
            limit: soft size target used by ``add``; ``None`` disables it.
        """
        self.models = list(models) if models else []
        self.limit = limit # see add()

    @property
    def best_score(self) -> t.Optional[float]:
        """Lowest member score, or ``None`` when the ensemble is empty."""
        return min(m.score for m in self.models) if len(self) > 0 else None

    @property
    def mean_score(self) -> t.Optional[float]:
        """Arithmetic mean of the member scores, or ``None`` when empty."""
        return sum(m.score for m in self.models) / len(self) if len(self) > 0 else None

    def add(self, model:"ModelProfile") -> bool:
        """Add a model to the collection and report whether it was accepted.

        If ``limit`` is set and the ensemble already holds at least that many
        members, a candidate scoring worse (higher) than the current mean is
        rejected. The limit is not a hard cap: a candidate at or better than
        the mean is still appended to a full ensemble.
        """
        if self.limit and len(self) >= self.limit and model.score > self.mean_score:
            return False
        self.models.append(model)
        return True

    def prune(self, recurse:int=1) -> "Ensemble":
        """Return a new ensemble holding only members at or better than the mean.

        Args:
            recurse: number of pruning passes; each pass uses the mean of the
                ensemble produced by the previous pass.

        The returned ensemble has no ``limit``; this one is left untouched.
        """
        cutoff = self.mean_score # computed once per pass rather than once per member
        new = Ensemble([m for m in self.models if m.score <= cutoff])
        return new.prune(recurse-1) if recurse > 1 else new

    def predict(self, X:pd.DataFrame) -> pd.DataFrame:
        """Average the predictions of every member.

        Args:
            X: feature frame handed to each member's ``predict`` together with
                that member's ``predict_kw``.

        Returns:
            A frame indexed like ``X`` with a single float column ``pred``.

        Raises:
            ValueError: if the ensemble has no members. Previously this case
                returned a column of NaN (0 / 0) without any error.
        """
        if len(self) == 0:
            raise ValueError('Cannot predict with an empty Ensemble.')
        # Start from a float column so the running sum never needs an int -> float upcast.
        y = pd.DataFrame({'pred': 0.0}, index=X.index)
        for profile in self.models:
            m_pred = profile.model.predict(X, **profile.predict_kw)
            y['pred'] = y['pred'] + m_pred.reshape(-1) # tensorflow
        y['pred'] = y['pred'] / len(self)
        return y

    def __len__(self) -> int:
        """Number of members."""
        return len(self.models)

    def __repr__(self) -> str:
        return f'<Ensemble ({len(self)} model(s); mean_score={self.mean_score}; best_score={self.best_score}; target_size={self.limit})>'

In [39]:
# Accepts a list of Models and returns an ensemble of the best performers.
# An existing Ensemble can also be passed in, which will be updated and returned instead.
def train_ensemble(models:list[Model], folds:int=5, ensemble:Ensemble=Ensemble()) -> Ensemble:

    print(f'Pre-training setup...', end='\r')
    cv = TimeSeriesSplit(folds)
    X, y = load_vars()
    X_test, y_test = load_vars(testing=True)
    
    for model in models:
        
        # customize fit() and predict() kwargs for each model type
        fit_kw = dict()
        predict_kw = dict()
        early_stop = False
        model_class = type(model).__name__
        match model_class:
            case 'Sequential':
                model.compile(optimizer='adam', loss='mae')
                keras_kw = dict(batch_size=256, verbose=0)
                fit_kw.update(keras_kw) #dict(epochs=10//folds, **keras_kw))
                predict_kw.update(keras_kw)
            case 'LGBMRegressor':
                fit_kw.update(dict(verbose=False))
                early_stop = 'early_stopping_round' in model.get_params()
            case 'XGBRegressor':
                fit_kw.update(dict(verbose=0))
                early_stop = 'early_stopping_rounds' in model.get_params()
        
        # k-fold cross-validation
        model_fails = 0
        for i, (train, valid) in enumerate(cv.split(X)):
            try: # sometimes a training round can fail, but I don't want to give up on the whole ensemble
                
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Running...', end='\r')
                X_valid, y_valid = X.iloc[valid, :], y[valid]

                fold_kw = {} # some kwargs rely on data that changes per fold
                if early_stop:
                    fold_kw['eval_set'] = [(X_valid, y_valid)]
                    if model_class == 'LGBMRegressor': fold_kw['eval_metric'] = 'l1'
                if model_class == 'Sequential':
                    fold_kw['validation_data'] = (X_valid, y_valid)
                fit_kw.update(fold_kw)

                try: # sometimes the keywords work, sometimes they don't
                    model.fit(X.iloc[train, :], y[train], **fit_kw)
                except:
                    model.fit(X.iloc[train, :], y[train], **fold_kw) # regardless, always want the early stop

                mae_train = mean_absolute_error(y_valid, model.predict(X_valid, **predict_kw))
                mae_test = mean_absolute_error(y_test, model.predict(X_test, **predict_kw))
                
                del X_valid, y_valid
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Complete.  \n\tTrain MAE:  {mae_train}\n\tTest MAE:   {mae_test}')

                ensemble.add(ModelProfile(model, mae_test, predict_kw))
                print(f'{ensemble}')
            
            except Exception as e:
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Error: {e.args}')
                model_fails += 1
                if model_fails > 1: break # consecutive failures are likely a misconfig on the model
            
            finally: # otherwise it's likely an out of memory error and we can move on
                gc.collect()
    
    return ensemble

In [40]:
NN_DROPOUT = 0.3
RANDOM_STATE = 25 # funnier than 24
keras.utils.set_random_seed(RANDOM_STATE)

# Shared gradient-boosting settings: lgb and xgb have some overlap
gb_params = {
    'random_state': RANDOM_STATE,
    'n_jobs': 16,
    'learning_rate': 0.2,
    'max_depth': 3,
    'colsample_bytree': 0.85,
    'subsample': 0.8,
    'reg_alpha': 500,
}


def _dense_net(first_units:int, hidden_activation:str) -> Sequential:
    """Build one of the notebook's dense regressors.

    Layout: Dense(first_units, no activation) -> Dropout -> BatchNorm ->
    Dense(N_FEATURES, hidden_activation) -> Dropout -> BatchNorm -> Dense(1).
    Both wide Dense layers use an L1 kernel regularizer of 0.001 and both
    Dropout layers use NN_DROPOUT.
    """
    return Sequential([
        layers.Dense(first_units, kernel_regularizer=regularizers.l1(0.001), input_shape=[N_FEATURES]),
        layers.Dropout(NN_DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES, kernel_regularizer=regularizers.l1(0.001), activation=hidden_activation),
        layers.Dropout(NN_DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ])


# current test: 10 epochs per fold vs 10 epochs overall vs <fold> epochs (i.e., 1 epoch per fold)
# note this is with NO additional features, whereas benchmark has +5 (and 10/f epochs)
# result: 
models = [
    # benchmark: 5.4043550502492765 (2/5); 10e best: 5.401457811465988; 10/f best: 5.395060764482284; 1/f best: 5.390486159321856
    _dense_net(N_FEATURES, 'relu'),
    # benchmark: 5.43891581273165 (1/5); 10e best: 6.478754201861217; 10/f best: 5.803956092486219; 1/f best: 5.457737969527803
    # <<--tanh is not the move, change back to N_FEATURES//2, relu
    _dense_net(N_FEATURES*2, 'tanh'),
    # benchmark: 5.4263829906077365 (1/5); 10e best: 5.390621362794841 ; 10/f best: 5.390174899445438; 1/f best: 5.390578563622552
    _dense_net(N_FEATURES*2, 'relu'),
    # LGBMR benchmark: 5.421243718047972 (3/5); 10e best: 5.456184417608288; 10/f best: 5.425122550420957; 1/f best: 5.424118845379246
    LGBMRegressor(
        **gb_params,
        early_stopping_round=5,
        metric='l1',
        num_leaves=8,
        min_child_samples=2000,
        min_split_gain=0.001,
        verbosity=-1,
    ),
    # XGBRegressor(**gb_params, early_stopping_rounds=5, eval_metric='mae', tree_method='hist', gamma=0.2, verbose=0),
]

ensemble = train_ensemble(models, folds=2)

Pre-training setup...

Training Sequential: Fold 1/2 - Complete.  
	Train MAE:  6.544870035261917
	Test MAE:   5.390486159321856
<Ensemble (1 model(s); mean_score=5.390486159321856; best_score=5.390486159321856; target_size=None)>
Training Sequential: Fold 2/2 - Complete.  
	Train MAE:  6.204855920523244
	Test MAE:   5.39063359790567
<Ensemble (2 model(s); mean_score=5.390559878613763; best_score=5.390486159321856; target_size=None)>
Training Sequential: Fold 1/2 - Complete.  
	Train MAE:  6.549382497583903
	Test MAE:   5.457737969527803
<Ensemble (3 model(s); mean_score=5.412952575585109; best_score=5.390486159321856; target_size=None)>
Training Sequential: Fold 2/2 - Complete.  
	Train MAE:  6.203263369442382
	Test MAE:   5.495910518966307
<Ensemble (4 model(s); mean_score=5.433692061430408; best_score=5.390486159321856; target_size=None)>
Training Sequential: Fold 1/2 - Complete.  
	Train MAE:  6.545958775086783
	Test MAE:   5.390578563622552
<Ensemble (5 model(s); mean_score=5.425069361868837; best_score

In [41]:
ensemble

<Ensemble (8 model(s); mean_score=5.4241638501601175; best_score=5.390486159321856; target_size=None)>

In [42]:
# One pruning pass: keep only the members scoring at or better than the ensemble mean.
pruned = ensemble.prune(recurse=1)
pruned

<Ensemble (5 model(s); mean_score=5.39735625080939; best_score=5.390486159321856; target_size=None)>

In [43]:
# Two pruning passes: the second pass uses the mean of the already-pruned members.
twice_pruned = ensemble.prune(recurse=2)
twice_pruned

<Ensemble (4 model(s); mean_score=5.390665602166926; best_score=5.390486159321856; target_size=None)>

In [44]:
def test_model(model:Model|Ensemble, data:t.Optional[tuple]=None) -> float:
    """Return the mean absolute error of ``model`` on the test data.

    Args:
        model: anything with a ``predict(X)`` method.
        data: optional ``(X_test, y_test)`` pair as returned by
            ``load_vars(testing=True)``. When omitted the test data is loaded
            on every call, exactly as before; passing it lets several models
            be scored without re-reading and re-processing the CSV files.

    Returns:
        ``mean_absolute_error(y_test, model.predict(X_test))``.
    """
    X_test, y_test = load_vars(testing=True) if data is None else data
    y_pred = model.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

# Load the test data once and score all three ensembles against it.
test_data = load_vars(testing=True)
e_score = test_model(ensemble, test_data)
p_score = test_model(pruned, test_data)
t_score = test_model(twice_pruned, test_data)

In [45]:
e_score

5.402608749320398

In [46]:
p_score

5.391107050004356

In [47]:
t_score

5.390769963287764

In [48]:
# Pick the ensemble with the lowest test MAE. Ties go to twice_pruned first,
# and a tie between the pruned and unpruned ensembles goes to the unpruned one.
if t_score == min(e_score, p_score, t_score):
    model = twice_pruned
elif p_score < e_score:
    model = pruned
else:
    model = ensemble

In [49]:
model

<Ensemble (4 model(s); mean_score=5.390665602166926; best_score=5.390486159321856; target_size=None)>

In [50]:
# submission compat check: run the selected ensemble through the competition's
# iterator API, one batch at a time, handing each batch's predictions back to the env.
import optiver2023 # NOTE(review): imported here rather than in the first cell — presumably only available inside the Kaggle competition environment; confirm
env = optiver2023.make_env()
iter_test = env.iter_test()

# Each item is unpacked as a 3-tuple; only the first element (the feature batch) is used below.
for (test, revealed_targets, _) in iter_test:
    X_test = preprocess(test) # NOTE: preprocess() min-max scales with this batch's own min/max, not the training data's
    y_pred = model.predict(X_test) # Ensemble.predict returns a frame with one 'pred' column, indexed like X_test (row_id, in SORTS order)
    submission = test[['row_id']].set_index('row_id') # needed to match rows
    submission['target'] = y_pred # NOTE(review): relies on pandas aligning the single-column frame on row_id — confirm
    submission = submission.reset_index() # convert back for final CSV write
    env.predict(submission)

In [51]:
def _read_submission() -> pd.DataFrame:
    """Load the written submission: the Kaggle working dir first, then the local copy."""
    try:
        return pd.read_csv('/kaggle/working/submission.csv') # sanity check
    except FileNotFoundError:
        return pd.read_csv('./.data/submission.csv')

res = _read_submission()
res

Unnamed: 0,row_id,target
0,478_0_0,0.268041
1,478_0_1,0.266230
2,478_0_2,0.295883
3,478_0_3,0.267925
4,478_0_4,0.268709
...,...,...
32995,480_540_195,-0.042382
32996,480_540_196,-0.042386
32997,480_540_197,-0.042177
32998,480_540_198,-0.041872
