Initial submission scored a 31.4109 :(

Coming back to see if tweaking params can help before trying something new.

In [1]:
import gc
import typing
import warnings
warnings.filterwarnings('ignore')
# %load_ext cudf.pandas
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [2]:
def preprocess(data:pd.DataFrame) -> pd.DataFrame:
    """Turn a raw feature frame into a scaled, gap-free frame indexed by row_id.

    Steps, in order:
      1. index the rows by `row_id` (works whether `row_id` arrives as the
         index or as a column);
      2. drop the bookkeeping columns `index`, `time_id` and
         `currently_scored` when they are present;
      3. sort rows by (date_id, seconds_in_bucket, stock_id), then drop those
         three columns;
      4. min-max scale every remaining column using the min/max of the frame
         passed in;
      5. forward-fill missing values down the sorted rows and replace whatever
         is still missing with 0.

    Note that the returned rows are in sorted order, which may differ from the
    order of the input rows.

    Raises:
        AssertionError: if a `target` column is present.
    """
    assert 'target' not in data.columns # sanity check
    frame = data.reset_index().set_index('row_id')

    # Bookkeeping columns are optional, so only drop the ones that exist.
    extras = [name for name in ('index', 'time_id', 'currently_scored') if name in frame.columns]
    sort_keys = ['date_id', 'seconds_in_bucket', 'stock_id']
    frame = frame.drop(columns=extras).sort_values(by=sort_keys).drop(columns=sort_keys)

    # Min-max scaling; a constant column yields 0/0 = NaN here and ends up as 0 below.
    lowest, highest = frame.min(), frame.max()
    scaled = (frame - lowest) / (highest - lowest)

    # Forward-fill along the sorted rows, then zero-fill leading gaps.
    return scaled.ffill().fillna(0)

In [3]:
# Load the training set, indexed by row_id; rows without a target are dropped
# because they cannot be used for supervised training.
data = pd.read_csv('./.data/train.csv', index_col='row_id')
data = data.dropna(subset=['target'])
y = data.target
X = data.drop('target', axis=1)
del data # moore's law is a hell of a drug
X = preprocess(X)
# preprocess() re-sorts the rows by (date_id, seconds_in_bucket, stock_id),
# while y is still in file order. train_model pairs X and y by row position,
# so put y into the same row order as X before training.
if not X.index.equals(y.index):
    y = y.loc[X.index]
assert X.index.equals(y.index), 'features and targets are not row-aligned'

In [4]:
class Model(typing.Protocol):
    """Structural type for the estimators used in this notebook: anything exposing `fit` and `predict`."""

    def fit(self, X, y, sample_weight=None):
        """Train on features `X` and targets `y`, optionally with per-sample weights."""

    def predict(self, X):
        """Produce predictions for features `X`."""

def train_model(model:Model, X:pd.DataFrame, y:pd.Series, folds:int=5, early_stop:bool=False) -> Model:
    """Fit `model` fold by fold on time-ordered splits and print each fold's validation MAE.

    Rows are split with `TimeSeriesSplit(n_splits=folds)`. For every split the
    model is fitted on the training rows and scored on the validation rows.
    X and y are paired by row position, so they must be in the same row order.

    Args:
        model: estimator whose `fit` accepts the `verbose` and `eval_set`
            keyword arguments.
        X: feature frame.
        y: targets, positionally aligned with the rows of `X`.
        folds: number of splits.
        early_stop: when True, the validation fold is passed to `fit` as
            `eval_set`; otherwise `eval_set` is None.

    Returns:
        The same `model` object after the last fold that ran. If a fold raises,
        the error is printed and the loop stops early instead of propagating.
    """
    splitter = TimeSeriesSplit(n_splits=folds)
    for i, (i_train, i_valid) in enumerate(splitter.split(X)):
        print(f'Training: Fold {i + 1}/{folds} - Running...', end='\r')
        try:
            X_train, X_valid = X.iloc[i_train, :], X.iloc[i_valid, :]
            # The split indices are row positions, so y needs .iloc as well:
            # plain y[...] is a label lookup on integer-labelled Series and a
            # deprecated positional fallback on any other index.
            y_train, y_valid = y.iloc[i_train], y.iloc[i_valid]
            model.fit(X_train, y_train, verbose=False, eval_set=[(X_valid, y_valid)] if early_stop else None)
            y_pred = model.predict(X_valid)
            mae = mean_absolute_error(y_valid, y_pred)
            del X_train, X_valid, y_train, y_valid, y_pred # a stitch in time...
        except Exception as e:
            # Best effort: report the failure and hand back whatever was fitted so far.
            print(f'Training: Fold {i + 1}/{folds} - Failed: {e}')
            print(f'Returning undertrained model ({i} folds)')
            break
        finally:
            gc.collect() # ...saves the kernel from crashing - "Uncle" Ben Banklin
        print(f'Training: Fold {i + 1}/{folds} - Complete. MAE: {mae}')
    return model

In [27]:
# model_params = dict(seed=25, tree_method='hist', n_jobs=16, eval_metric='mae', eta=0.2, max_depth=3, gamma=0.2)
# model = train_model(XGBRegressor(**model_params), X, y, early_stop=('early_stopping_rounds' in model_params)) # local MAE: 5.936219912494401 (Kaggle score: ~31)

CV Fold 1 of 5 - Complete. MAE: 7.351825973729486
CV Fold 2 of 5 - Complete. MAE: 6.829925284195577
CV Fold 3 of 5 - Complete. MAE: 6.142346409136241
CV Fold 4 of 5 - Complete. MAE: 6.373074008358564
CV Fold 5 of 5 - Complete. MAE: 5.936219912494401


In [29]:
# Hyperparameters for the current run, passed straight to XGBRegressor.
model_params = {
    'seed': 25,
    'n_jobs': 8,
    'tree_method': 'hist',
    'eval_metric': 'mae',
    'eta': 0.2,
    'max_depth': 3,
    'gamma': 0.2,
    'colsample_bytree': 0.85,
    'subsample': 0.8,
    'reg_alpha': 500,
}
model = XGBRegressor(**model_params)

# Disabled grid search over reg_alpha, kept for reference:
# print('Starting grid search...', end='\r')
# grid_params = dict(reg_alpha=[0.01, 0.1, 1, 10, 100, 1000])
# grid_search = GridSearchCV(estimator=model, param_grid=grid_params, n_jobs=8)
# grid_search = grid_search.fit(X, y)
# print(f'Grid search complete. Best params: {grid_search.best_params_}')
# model = grid_search.best_estimator_

# The validation fold is only handed to fit() when early stopping is configured.
model = train_model(
    model,
    X,
    y,
    early_stop=('early_stopping_rounds' in model_params),
)

Grid search complete. Best params: {'reg_alpha': 1000}
CV Fold 1 of 5 - Complete. MAE: 7.342769416989973
CV Fold 2 of 5 - Complete. MAE: 6.827891261674777
CV Fold 3 of 5 - Complete. MAE: 6.140392080574695
CV Fold 4 of 5 - Complete. MAE: 6.369327561854159
CV Fold 5 of 5 - Complete. MAE: 5.933228711079479


In [None]:
# submission compat check
# optiver2023 is imported here rather than with the other imports, as in the
# original cell. NOTE(review): presumably it only exists inside the
# competition environment - confirm.
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

for (test, revealed_targets, _) in iter_test:
    # NOTE(review): preprocess() min-max scales with the min/max of the frame it
    # is given, so each test batch is scaled by its own range rather than the
    # training range - confirm this is intended.
    X_test = preprocess(test)
    # preprocess() returns rows indexed by row_id in its own sorted order, so
    # key the predictions by row_id instead of relying on row position.
    y_pred = pd.Series(model.predict(X_test), index=X_test.index)
    # Work on an explicit copy: assigning a column to a slice of `test` is a
    # chained assignment.
    submission = test[['row_id']].copy()
    submission['target'] = submission['row_id'].map(y_pred)
    env.predict(submission)

Latest submission (V2) clocked in at 8.4405 — we're making progress 🦾