Another rewrite because I had to rebuild my local environment for cudf + tensorflow to work.

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # https://stackoverflow.com/questions/40426502/is-there-a-way-to-suppress-the-messages-tensorflow-prints
import gc
import sys
import typing as t
import warnings
warnings.filterwarnings('ignore')
try: # got tired of changing code between local and kaggle setup
    import cudf.pandas
    cudf.pandas.install() # must be called before pandas import
except ModuleNotFoundError:
    print('cudf not installed. Continuing with CPU dataframes.')
import pandas as pd
import tensorflow as tf # https://github.com/tensorflow/tensorflow/issues/62075
keras = tf.keras # https://github.com/microsoft/pylance-release/issues/1066
from keras import Sequential, layers
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

In [2]:
# Local copies of the competition files; load_vars() tries these first.
DATA_TRAIN = '.data/train.csv'
DATA_TEST_X = '.data/test.csv'
DATA_TEST_Y = '.data/revealed_targets.csv'

# Kaggle-hosted copies; load_vars() falls back to these when the local files are not found.
KAGGLE_DATA_TRAIN = '/kaggle/input/optiver-trading-at-the-close/train.csv'
KAGGLE_DATA_TEST_X = '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv'
KAGGLE_DATA_TEST_Y = '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv'

# Columns that preprocess() removes whenever they are present in the frame.
DROPS = ['index', 'time_id', 'currently_scored', 'time_id_x', 'time_id_y', 'revealed_date_id', 'revealed_time_id']
# Row ordering applied by preprocess() (the columns are dropped after sorting); also the merge keys for the two test files.
SORTS = ['date_id', 'seconds_in_bucket', 'stock_id']
# Column used as the DataFrame index.
INDEX = 'row_id'

def preprocess(data:pd.DataFrame, ycol:str=None) -> pd.DataFrame: # separate for submission compat
    """
    Turn a raw competition frame into the model's feature matrix.

    Steps: index by ``INDEX``, drop any ``DROPS`` columns that are present, order
    rows by ``SORTS`` (then drop those columns), min-max scale every remaining
    column using the min/max of the frame passed in, and finally forward-fill
    gaps, replacing whatever is still missing with 0.

    ``ycol`` is accepted for call compatibility but is not used.
    """
    indexed = data.reset_index().set_index(INDEX)
    unwanted = [name for name in DROPS if name in indexed.columns]
    ordered = indexed.drop(unwanted, axis=1).sort_values(by=SORTS)
    features = ordered.drop(SORTS, axis=1)
    lows, highs = features.min(), features.max()
    scaled = (features - lows) / (highs - lows)
    return scaled.ffill().fillna(0)

def load_vars(testing:bool=False) -> tuple[pd.DataFrame, pd.Series]:
    """
    Load the features and targets for training or for testing.

    Parameters
    ----------
    testing : bool
        When False, read the training CSV (target column ``'target'``).
        When True, merge the example test file with the revealed targets on
        ``SORTS`` (target column ``'revealed_target'``).

    Returns
    -------
    (X, y) : the preprocessed feature frame and the target series, in the same
    row order. Rows whose target is null are removed first.

    Local paths are tried first; on ``FileNotFoundError`` the Kaggle paths are used.
    """

    def read_data(train, test_x, test_y):
        if testing:
            frames = [pd.read_csv(path) for path in (test_x, test_y)]
            return pd.merge(*frames, on=SORTS), 'revealed_target' # https://stackoverflow.com/a/32041277/3178898
        return pd.read_csv(train, index_col=INDEX), 'target'

    try: # tired of switching local/kaggle setup
        data, ycol = read_data(DATA_TRAIN, DATA_TEST_X, DATA_TEST_Y)
    except FileNotFoundError:
        data, ycol = read_data(KAGGLE_DATA_TRAIN, KAGGLE_DATA_TEST_X, KAGGLE_DATA_TEST_Y)

    data = data.dropna(subset=[ycol]) # some targets are null
    # preprocess() re-orders rows by SORTS; sort before splitting so the targets
    # stay positionally aligned with the features even if the file is not pre-sorted
    data = data.sort_values(by=SORTS)
    return preprocess(data.drop(ycol, axis=1)), data[ycol]

# Width of the feature matrix; referenced as the input shape of the (currently commented-out) Sequential model.
N_FEATURES = 11 # update if/as features are engineered

In [3]:
# Scratch check of the `or` fallback chain used by Ensemble.prune():
# with both limits unset it falls through to half the length (at least 1).
s = ['a', 'b', 'c', 'd']
limit = None
slimit = None
(limit or slimit or max(len(s) // 2, 1))

2

In [12]:
class Model(t.Protocol):
    """Structural type for estimators used in this notebook: anything exposing fit, predict and get_params."""

    def fit(self, X, y, sample_weight=None):
        """Train on features ``X`` and targets ``y``."""
        ...

    def predict(self, X):
        """Return predictions for features ``X``."""
        ...

    def get_params(self, deep=True):
        """Return the estimator's parameters."""
        ...

class ModelProfile:
    def __init__(self, model:Model, score:float, predict_kw:dict={}) -> None:
        self.model = model
        self.score = score
        self.predict_kw = predict_kw

class Ensemble:
    def __init__(self, limit:int=None) -> None:
        self.models = list[ModelProfile]()
        self.limit = limit

    @property
    def best_score(self) -> float:
        return min(m.score for m in self.models) if len(self) > 0 else None
    
    @property
    def mean_score(self) -> float:
        return sum(m.score for m in self.models) / len(self) if len(self) > 0 else None

    def add(self, model: ModelProfile) -> bool:
        if self.limit and len(self) >= self.limit and model.score > self.mean_score:
            return False
        self.models.append(model)
        return True
        
    def prune(self, limit:int=None) -> int:
        self.limit = limit or self.limit or max(len(self) // 2, 1)
        while len(self) > self.limit:
            self.models = [m for m in self.models if m.score < self.mean_score]
        return len(self)
    
    def predict(self, X:pd.DataFrame) -> pd.DataFrame:
        y = pd.DataFrame(index=X.index)
        y['pred'] = 0
        for model in self.models:
            m_pred = model.model.predict(X, **model.predict_kw)
            y.pred += m_pred.reshape(-1) # tensorflow
        y.pred = y.pred / len(self)
        return y

    def __len__(self) -> int:
        return len(self.models)
    
    def __repr__(self) -> str:
        return f'<Ensemble ({len(self)} model(s); mean_score={self.mean_score}; best_score={self.best_score}; target_size={self.limit})>'

In [13]:
# Accepts a list of Models and returns an ensemble of the best performers.
# An existing Ensemble can also be passed in, which will be updated and returned instead.
def train_ensemble(models:list[Model], folds:int=5, ensemble:Ensemble=Ensemble()) -> Ensemble:

    print(f'Pre-training setup...', end='\r')
    cv = TimeSeriesSplit(folds)
    X, y = load_vars()
    X_test, y_test = load_vars(testing=True)
    
    for model in models:
        
        # customize fit() and predict() kwargs for each model type (and params)
        fit_kw = dict()
        predict_kw = dict()
        early_stop = False
        model_class = type(model).__name__
        match model_class:
            case 'Sequential':
                model.compile(optimizer='adam', loss='mae')
                keras_kw = dict(batch_size=256, verbose=0)
                fit_kw.update(dict(epochs=10//folds, **keras_kw))
                predict_kw.update(keras_kw)
            case 'LGBMRegressor':
                fit_kw.update(dict(verbose=False))
                early_stop = 'early_stopping_round' in model.get_params()
            case 'XGBRegressor':
                fit_kw.update(dict(verbose=0))
                early_stop = 'early_stopping_rounds' in model.get_params()
        
        # k-fold cross-validation
        model_fails = 0
        for i, (train, valid) in enumerate(cv.split(X)):
            try: # sometimes a training round can fail, but I don't want to give up on the whole ensemble
                
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Running...', end='\r')
                X_valid, y_valid = X.iloc[valid, :], y[valid]

                early_stop_kw = {}
                if early_stop:
                    early_stop_kw['eval_set'] = [(X_valid, y_valid)]
                    if model_class == 'LGBMRegressor': early_stop_kw['eval_metric'] = 'l1'
                if model_class == 'Sequential':
                    early_stop_kw['validation_data'] = (X_valid, y_valid)
                fit_kw.update(early_stop_kw)

                try: # some keywords work in local setup, some only work on kaggle
                    model.fit(X.iloc[train, :], y[train], **fit_kw)
                except:
                    model.fit(X.iloc[train, :], y[train], **early_stop_kw) # regardless, always want the early stop

                mae_train = mean_absolute_error(y_valid, model.predict(X_valid, **predict_kw))
                mae_test = mean_absolute_error(y_test, model.predict(X_test, **predict_kw))
                
                del X_valid, y_valid
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Complete.  \n\tTrain MAE:  {mae_train}\n\tTest MAE:   {mae_test}')

                if ensemble.add(ModelProfile(model, mae_test, predict_kw)):
                    print(f'Model accepted.')
                else:
                    print(f'Model rejected.')

                print(f'\tMean score: {ensemble.mean_score}\n\tBest score: {ensemble.best_score}')
            
            except Exception as e:
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Error: {e.args}')
                model_fails += 1
                if model_fails > 1: break # consecutive failures are likely a misconfig on the model
            
            finally: # otherwise it's likely an out of memory error and we can move on
                gc.collect()
    
    return ensemble

In [14]:
# hyperparameters common to both gradient-boosting libraries
shared_params = {
    'random_state': 25,
    'n_jobs': 16,
    'learning_rate': 0.2,
    'max_depth': 3,
    'colsample_bytree': 0.85,
    'subsample': 0.8,
    'reg_alpha': 500,
}
# library-specific additions (note the different spelling of the early-stopping key)
lgb_params = {
    **shared_params,
    'early_stopping_round': 5,
    'metric': 'l1',
    'num_leaves': 8,
    'min_child_samples': 2000,
    'min_split_gain': 0.001,
    'verbosity': -1,
}
xgb_params = {
    **shared_params,
    'early_stopping_rounds': 5,
    'eval_metric': 'mae',
    'tree_method': 'hist',
    'gamma': 0.2,
}

# candidates handed to train_ensemble(); the neural network is currently disabled
models = [
    # Sequential([
    #     layers.Dense(32, input_shape=[N_FEATURES]),
    #     layers.Dropout(0.3),
    #     layers.BatchNormalization(),
    #     layers.Dense(64, activation='tanh'),
    #     layers.Dropout(0.3),
    #     layers.BatchNormalization(),
    #     layers.Dense(1),
    # ]),
    LGBMRegressor(**lgb_params),
    XGBRegressor(**xgb_params),
]

ensemble = train_ensemble(models, folds=3)

Training LGBMRegressor: Fold 1/3 - Complete.  
	Train MAE:  7.034098842562557
	Test MAE:   5.845854514798698
Model accepted.
	Mean score: 5.845854514798698
	Best score: 5.845854514798698
Training LGBMRegressor: Fold 2/3 - Complete.  
	Train MAE:  6.21301784123645
	Test MAE:   5.429571765461806
Model accepted.
	Mean score: 5.637713140130252
	Best score: 5.429571765461806
Training LGBMRegressor: Fold 3/3 - Complete.  
	Train MAE:  6.0942163712552455
	Test MAE:   5.527554314988951
Model accepted.
	Mean score: 5.600993531749818
	Best score: 5.429571765461806
Training XGBRegressor: Fold 1/3 - Complete.  
	Train MAE:  7.038525480784746
	Test MAE:   6.182351412102538
Model accepted.
	Mean score: 5.746333001837998
	Best score: 5.429571765461806
Training XGBRegressor: Fold 2/3 - Complete.  
	Train MAE:  6.2139224280507985
	Test MAE:   5.408446094484484
Model accepted.
	Mean score: 5.678755620367295
	Best score: 5.408446094484484
Training XGBRegressor: Fold 3/3 - Complete.  
	Train MAE:  6.09810

In [15]:
# display the trained ensemble's summary line (Ensemble.__repr__)
ensemble

<Ensemble (6 model(s); mean_score=5.6362334034308175; best_score=5.408446094484484; target_size=None)>

In [16]:
# no limit is passed and none was set, so prune() targets half the current size (at least 1)
ensemble.prune()
ensemble

<Ensemble (3 model(s); mean_score=5.420546726231573; best_score=5.408446094484484; target_size=3)>

In [17]:
def test_model(model:Model|Ensemble) -> float:
    """Predict on the example test files with ``model`` and return the mean absolute error against the revealed targets."""
    features, targets = load_vars(testing=True)
    return mean_absolute_error(targets, model.predict(features))

test_model(ensemble)

5.445350865581263

In [18]:
# submission compat check
# Runs the ensemble through the competition's iterator API: one prediction call per served batch.
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

# each item is a 3-tuple; only the first element (the test batch) is used here
for (test, revealed_targets, _) in iter_test:
    X_test = preprocess(test) # same feature pipeline as training; scaling uses this batch's own min/max
    y_pred = ensemble.predict(X_test) # one-column ('pred') frame indexed like X_test
    submission = test[['row_id']].set_index('row_id') # needed to match rows
    submission['target'] = y_pred
    submission = submission.reset_index() # convert back for final CSV write
    env.predict(submission)

In [19]:
# Read the submission file back for a visual check: Kaggle path first, local path as the fallback.
try:
    res = pd.read_csv('/kaggle/working/submission.csv') # sanity check
except FileNotFoundError:
    res = pd.read_csv('./.data/submission.csv')
res

Unnamed: 0,row_id,target
0,478_0_0,0.574097
1,478_0_1,1.521065
2,478_0_2,30.435205
3,478_0_3,0.574097
4,478_0_4,1.020032
...,...,...
32995,480_540_195,-0.710971
32996,480_540_196,-0.710971
32997,480_540_197,-0.496850
32998,480_540_198,-0.080503
