This started as a rewrite of v3, but enough changed that I'm making a different version. Notably, I'm forgoing the use of the regression imputer and lag features because (A) they don't work if each stock only has 1 row, and (B) they slow everything down when they do work.

I'll build around the sample submission first to see what can be engineered from that before plugging in the test data.

In [51]:
import gc
import typing
import warnings
warnings.filterwarnings('ignore')
# %load_ext cudf.pandas
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


In [52]:
def preprocess(data: pd.DataFrame) -> pd.DataFrame:
    """Turn a raw feature frame into a scaled, gap-free model input.

    The frame is re-indexed by ``row_id``, bookkeeping columns are removed,
    rows are ordered by (date_id, seconds_in_bucket, stock_id), and the
    remaining columns are min-max scaled using this frame's own extrema.
    Missing values are forward-filled in that row order; anything still
    missing (including columns with zero range, where scaling yields NaN)
    becomes 0.

    Args:
        data: Feature frame with ``row_id`` either as its index or as a
            column, and containing the three ordering columns. Must not
            contain ``target``.

    Returns:
        A new frame indexed by ``row_id``, in sorted row order, holding only
        the scaled feature columns.
    """
    assert 'target' not in data.columns # sanity check

    sort_keys = ['date_id', 'seconds_in_bucket', 'stock_id']
    features = (
        data.reset_index()
        .set_index('row_id')
        # 'index' only exists when row_id was a column rather than the index
        .drop(columns=['index', 'time_id', 'currently_scored'], errors='ignore')
        .sort_values(by=sort_keys)
        .drop(columns=sort_keys)
    )

    lowest = features.min()
    spread = features.max() - lowest
    scaled = (features - lowest) / spread # normalize

    return scaled.ffill().fillna(0) # imputation at home

In [53]:
# Load the training set, discard rows without a label, and split it into
# features (X) and target (y).
data = pd.read_csv('./.data/train.csv', index_col='row_id')
data = data.dropna(subset=['target'])
y = data.target
X = data.drop('target', axis=1)
del data # moore's law is a hell of a drug
X = preprocess(X)
# preprocess() re-sorts the rows, and train_model() pairs X and y by position,
# so put the labels in the same row_id order as the features. This is a no-op
# when the CSV is already sorted by (date_id, seconds_in_bucket, stock_id).
y = y.loc[X.index]

In [54]:
class Model(typing.Protocol):
    """Structural type for any estimator that offers ``fit`` and ``predict``."""

    def fit(self, X, y, sample_weight=None):
        """Train on features ``X`` and targets ``y``, optionally weighted."""

    def predict(self, X):
        """Produce predictions for features ``X``."""

def train_model(model: Model, X: pd.DataFrame, y: pd.Series, folds: int = 5) -> Model:
    """Fit ``model`` on successive time-ordered folds and report each fold's MAE.

    Rows are split with ``TimeSeriesSplit``; for every fold the model is fitted
    on the training slice and scored on the validation slice with mean
    absolute error. ``X`` and ``y`` are paired purely by position, so they
    must be in the same row order.

    If any fold raises, the error is printed, the loop stops, and the model is
    returned in whatever state the earlier folds left it (best effort).

    NOTE(review): for estimators whose ``fit`` starts from scratch on every
    call, the returned model reflects only the last successful fold's
    training slice -- confirm that this is the intended final model.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature frame, one row per sample.
        y: Targets, positionally aligned with ``X``.
        folds: Number of ``TimeSeriesSplit`` splits.

    Returns:
        The same ``model`` object after the last fold that was attempted.
    """
    for i, (i_train, i_valid) in enumerate(TimeSeriesSplit(n_splits=folds).split(X)):
        print(f'CV Fold {i + 1} of {folds} - Running...', end='\r')
        try:
            X_train, X_valid = X.iloc[i_train, :], X.iloc[i_valid, :]
            # The splitter yields integer positions, so index y positionally
            # too. Plain y[...] is label-based on an integer index and only
            # falls back to positions (deprecated) on other index types.
            y_train, y_valid = y.iloc[i_train], y.iloc[i_valid]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_valid)
            mae = mean_absolute_error(y_valid, y_pred)
            del X_train, X_valid, y_train, y_valid, y_pred # a stitch in time...
        except Exception as e:
            # Deliberate best effort: keep whatever was trained so far.
            print(f'CV Fold {i + 1} of {folds} - Failed: {e}')
            print(f'Returning undertrained model ({i} folds)')
            break
        finally:
            gc.collect() # ...saves the kernel from crashing - "Uncle" Ben Banklin
        print(f'CV Fold {i + 1} of {folds} - Complete. MAE: {mae}')
    return model

In [57]:
# Cross-validate and fit an XGBoost regressor (fixed seed of 25) with the
# default number of folds.
model = train_model(model=XGBRegressor(seed=25), X=X, y=y)

CV Fold 1 of 5 - Complete. MAE: 7.470980489574998
CV Fold 2 of 5 - Complete. MAE: 6.854041889494296
CV Fold 3 of 5 - Complete. MAE: 6.148848127515973
CV Fold 4 of 5 - Complete. MAE: 6.3991843911600945
CV Fold 5 of 5 - Complete. MAE: 5.962018910300458


In [58]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

# Score each batch served by the competition API and hand the predictions back.
for (test, revealed_targets, _) in iter_test:
    X_test = preprocess(test)
    # preprocess() returns rows sorted by (date_id, seconds_in_bucket,
    # stock_id) and indexed by row_id, so key the predictions by row_id
    # instead of assuming they line up positionally with `test`.
    y_pred = pd.Series(model.predict(X_test), index=X_test.index)
    # Copy so the new column is written to an independent frame rather than
    # to a slice of `test` (avoids chained-assignment ambiguity).
    submission = test[['row_id']].copy()
    submission['target'] = submission['row_id'].map(y_pred)
    env.predict(submission)