Another rewrite because I had to rebuild my local environment for cudf + tensorflow to work.

In [17]:
#!!---------------!UPDATE!---------------!!
# Input CSV locations. Edit these three paths to match the environment the
# notebook is running in before executing the remaining cells.
DATA_TRAIN = '.data/train.csv' # training rows, read by load_vars()
DATA_TEST_X = '.data/test.csv' # test features, merged with DATA_TEST_Y by load_vars(testing=True)
DATA_TEST_Y = '.data/revealed_targets.csv' # test targets (load_vars reads its 'revealed_target' column)

In [16]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # https://stackoverflow.com/questions/40426502/is-there-a-way-to-suppress-the-messages-tensorflow-prints
import gc
import sys
import typing as t
import warnings
warnings.filterwarnings('ignore')
try: # got tired of changing code between local and kaggle setup
    import cudf.pandas
    cudf.pandas.install() # must be called before pandas import
except ModuleNotFoundError:
    print('cudf not installed. Continuing in CPU mode.')
    from catboost import CatBoostRegressor # CatBoost doesn't support cudf so we exclude it locally
import pandas as pd
import tensorflow as tf # https://github.com/tensorflow/tensorflow/issues/62075
keras = tf.keras # https://github.com/microsoft/pylance-release/issues/1066
from keras import Sequential, layers
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

In [18]:
# Columns removed by preprocess() whenever they are present in the frame.
# (time_id_x / time_id_y look like merge suffixes from load_vars - TODO confirm)
DROPS = [
    'index',
    'time_id',
    'currently_scored',
    'time_id_x',
    'time_id_y',
    'revealed_date_id',
    'revealed_time_id',
]
# Row-ordering keys; load_vars() also uses them as the merge keys for the test files.
SORTS = ['date_id', 'seconds_in_bucket', 'stock_id']
# Column that becomes the frame index.
INDEX = 'row_id'

def preprocess(data:pd.DataFrame, ycol:str=None) -> pd.DataFrame: # separate for submission compat
    """Turn a raw frame into a scaled feature matrix indexed by ``INDEX``.

    Steps, in order:
      1. move the current index into a column and index by ``INDEX``;
      2. drop every ``DROPS`` column that exists;
      3. sort rows by ``SORTS`` and then drop those key columns;
      4. min-max scale each remaining column using the min/max of *this* frame;
      5. forward-fill gaps in sorted row order, then replace what is left with 0.

    Args:
        data: frame containing an ``INDEX`` column (or index) and the ``SORTS`` columns.
        ycol: accepted for call compatibility; not used by the function body.

    Returns:
        A new frame; the input is not modified.
    """
    frame = data.reset_index().set_index(INDEX)
    unwanted = [name for name in DROPS if name in frame.columns]
    frame = frame.drop(columns=unwanted)
    frame = frame.sort_values(by=SORTS).drop(columns=SORTS)
    lowest, highest = frame.min(), frame.max()
    # a column whose min equals max becomes 0/0 here and is zeroed by the final fill
    scaled = (frame - lowest) / (highest - lowest)
    return scaled.ffill().fillna(0)

def load_vars(testing:bool=False) -> tuple[pd.DataFrame, pd.Series]:
    """Load a feature matrix and its target vector, row-aligned with each other.

    Args:
        testing: when False, read ``DATA_TRAIN`` and use its ``target`` column;
            when True, merge ``DATA_TEST_X`` with ``DATA_TEST_Y`` on ``SORTS``
            and use the ``revealed_target`` column.

    Returns:
        ``(X, y)`` where ``X`` is the output of ``preprocess()`` and ``y`` is the
        target Series. Both are indexed by ``INDEX`` and share the same row order.

    Raises:
        ValueError: from ``reindex`` if ``INDEX`` values are not unique, since
            the rows could not be matched unambiguously.
    """
    if testing:
        features, targets = (pd.read_csv(path) for path in (DATA_TEST_X, DATA_TEST_Y))
        data = pd.merge(features, targets, on=SORTS) # https://stackoverflow.com/a/32041277/3178898
        ycol = 'revealed_target'
    else:
        data = pd.read_csv(DATA_TRAIN, index_col=INDEX)
        ycol = 'target'
    data = data.dropna(subset=[ycol]) # some targets are null
    if data.index.name != INDEX:
        data = data.set_index(INDEX) # give both branches the same row labels
    X = preprocess(data.drop(ycol, axis=1))
    # preprocess() re-sorts the rows; returning data[ycol] untouched would only
    # line up with X when the file already happens to be in SORTS order
    y = data[ycol].reindex(X.index)
    return X, y

# Only referenced by the commented-out Sequential candidate (as its input_shape) in the model list cell.
N_FEATURES = 11 # update if/as features are engineered

In [29]:
class Model(t.Protocol):
    """Structural type for the estimators this notebook trains.

    Used purely as a type hint: any object providing these three methods
    matches, without inheriting from this class.
    """

    def fit(self, X, y, sample_weight=None):
        """Train on features ``X`` and targets ``y``."""

    def predict(self, X):
        """Produce predictions for ``X``."""

    def get_params(self, deep=True):
        """Report the estimator's parameters."""

class ModelProfile:
    """A model bundled with the score that was recorded for it."""

    def __init__(self, model:Model, score:float) -> None:
        # plain attribute storage; nothing is validated or copied
        self.model, self.score = model, score

class Ensemble:
    def __init__(self) -> None:
        self.models = list[ModelProfile]()

    @property
    def best_score(self) -> float:
        return min(m.score for m in self.models) if len(self) > 0 else None
    
    @property
    def mean_score(self) -> float:
        return sum(m.score for m in self.models) / len(self) if len(self) > 0 else None

    def add(self, model: ModelProfile) -> bool:
        if self.mean_score is not None and model.score > self.mean_score:
            return False
        self.models.append(model)
        return True
        
    def prune(self) -> int:
        self.models = [m for m in self.models if m.score < self.mean_score]
        return len(self)
    
    def predict(self, X:pd.DataFrame) -> pd.DataFrame:
        y = pd.DataFrame(index=X.index)
        y['pred'] = 0
        for model in self.models:
            y.pred += model.model.predict(X)
        y.pred = y.pred / len(self)
        return y

    def __len__(self) -> int:
        return len(self.models)
    
    def __repr__(self) -> str:
        return f'<Ensemble ({len(self)} model(s); mean_score={self.mean_score}, best_score={self.best_score})>'

# accepts a list of Models and/or (class, params) pairs and returns an ensemble of the best performers
# an existing Ensemble can also be passed in to update it.
def train_ensemble(models:list[Model|tuple[type, dict]], folds:int=5, ens:Ensemble=Ensemble()) -> Ensemble:

    print(f'Pre-training setup...', end='\r')
    cv = TimeSeriesSplit(folds)
    X, y = load_vars()
    X_test, y_test = load_vars(testing=True)
    
    for model in models:
        
        # instantiate (if not already)
        if isinstance(model, tuple):
            model = Model(model[0](**model[1]))
        model_class = type(model).__name__
        
        # customize fit() and predict() kwargs for each model type
        fit_kw = dict()
        predict_kw = dict()
        match model_class:
            case 'Sequential':
                model.compile(optimizer='adam', loss='mae')
                keras_kw = dict(batch_size=256, verbose=0)
                fit_kw.update(keras_kw)
                predict_kw.update(keras_kw)
            case 'LGBMRegressor':
                pass
            case 'XGBRegressor':
                fit_kw.update(dict(verbose=0))
            case 'CatBoost':
                fit_kw.update(dict(verbose=False))
        
        # early_stop = any(x in model.get_params() for x in ['early_stopping_rounds', 'early_stopping_round'])
        
        # k-fold cross-validation
        for i, (train, valid) in enumerate(cv.split(X)):
            try: # sometimes a training round can fail, but I don't want to give up on the whole ensemble
                
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Running...', end='\r')
                model.fit(X.iloc[train, :], y[train], **fit_kw)
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Complete.  ')

                mae_train = mean_absolute_error(y[valid], model.predict(X.iloc[valid, :], **predict_kw))
                mae_test = mean_absolute_error(y_test, model.predict(X_test, **predict_kw))
                print(f'\tTrain MAE:  {mae_train}\n\tTest MAE:   {mae_test}')

                if ens.add(ModelProfile(model, mae_test)):
                    print(f'Model accepted.')
                else:
                    print(f'Model rejected.')
                print(f'\tMean score: {ens.mean_score}\n\tBest score: {ens.best_score}')
            
            except Exception as e:
                print(f'Training {model_class}: Fold {i + 1}/{folds} - Error: {e}')
                break # if a round fails it's usually a model misconfig, so move on to the next model
            
            finally:
                gc.collect() # local setup runs out of memory
    
    return ens

In [31]:
# Candidates for this run. Disabled candidates, kept for reference:
#   Sequential([layers.Dense(units=1, input_shape=[N_FEATURES])])
#   XGBRegressor(n_jobs=16)
models = [LGBMRegressor(verbosity=-1)]
if 'catboost' in sys.modules:
    models += [CatBoostRegressor(silent=True)] # see imports

ens = train_ensemble(models, folds=2)

Training LGBMRegressor: Fold 1/2 - Complete.  
	Train MAE:  6.49258150865267
	Test MAE:   12.413096267303516
Model accepted.
	Mean score: 12.413096267303516
	Best score: 12.413096267303516
Training LGBMRegressor: Fold 2/2 - Complete.  
	Train MAE:  6.155308119798974
	Test MAE:   9.234917407387922
Model accepted.
	Mean score: 10.824006837345719
	Best score: 9.234917407387922


In [None]:
# show the ensemble summary produced by Ensemble.__repr__
ens

In [32]:
# prune the ensemble against its mean score, then show what is left
ens.prune()
ens

<Ensemble (1 model(s); mean_score=9.234917407387922, best_score=9.234917407387922)>

In [33]:
def test_model(model:Model|Ensemble) -> float:
    """Return the mean absolute error of ``model`` on the data from ``load_vars(testing=True)``."""
    features, truth = load_vars(testing=True)
    return mean_absolute_error(truth, model.predict(features))

test_model(ens)

9.234009679922847

In [34]:
# submission compat check
# Feeds every batch from the optiver2023 iterator through preprocess() and the
# ensemble, and hands one prediction frame back per batch via env.predict().
import optiver2023 # imported here rather than with the other imports; presumably only available on Kaggle - TODO confirm
env = optiver2023.make_env()
iter_test = env.iter_test()

# revealed_targets and the third tuple item are received but not used below
for (test, revealed_targets, _) in iter_test:
    # NOTE(review): preprocess() min-max scales with the min/max of the frame it is
    # given, i.e. of this batch only, while training data was scaled over the whole
    # training frame - confirm the two scales are comparable.
    X_test = preprocess(test)
    y_pred = ens.predict(X_test) # one-column ('pred') frame indexed by row_id, in preprocess()'s sorted order
    submission = test[['row_id']].set_index('row_id') # needed to match rows
    # presumably aligns y_pred to submission on row_id; verify on the pandas version in use
    submission['target'] = y_pred
    submission = submission.reset_index() # convert back for final CSV write
    env.predict(submission)

In [35]:
# TODO:
# https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBRegressor.feature_importances_