Optimized XGBRegressor got us to 8.4405, let's see what LightGBM can do.

In [23]:
import gc
import typing as t
import warnings
warnings.filterwarnings('ignore')
# %load_ext cudf.pandas
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb

In [24]:
TRAINING_DATA = './.data/train.csv' # !!update for Kaggle!!

In [25]:
def preprocess(data:pd.DataFrame) -> pd.DataFrame:
    assert 'target' not in data.columns # sanity check
    data = data.reset_index().set_index('row_id')
    for drop_col in ['index', 'time_id', 'currently_scored']:
        if drop_col in data.columns:
            data = data.drop(drop_col, axis=1)
    id_cols = ['date_id', 'seconds_in_bucket', 'stock_id']
    data = data.sort_values(by=id_cols)
    data = data.drop(id_cols, axis=1)
    data = (data - data.min()) / (data.max() - data.min())
    data = data.ffill().fillna(0)
    return data

In [26]:
def load_training_vars(path:str=TRAINING_DATA) -> tuple[pd.DataFrame, pd.Series]:
    data = pd.read_csv(path, index_col='row_id')
    data = data.dropna(subset=['target'])
    y = data.target
    X = data.drop('target', axis=1)
    X = preprocess(X)
    return X, y

In [27]:
class Model(t.Protocol):
    def fit(self, X, y, sample_weight=None): ...
    def predict(self, X): ...

def train_model(model:type|Model, model_kw:dict={}, fold_kw:dict={}) -> Model:
    print(f'Pre-training...', end='\r')
    gc.collect() # just in case
    X, y = load_training_vars()    
    cv = TimeSeriesSplit(**fold_kw)
    folds = cv.get_n_splits()
    if isinstance(model, type):
        model = model(**model_kw)
    print(f'Pre-training - Done.')
    # early_stop = 'early_stopping_roungs' in model_kwargs
    for i, (i_train, i_valid) in enumerate(cv.split(X)):
        print(f'Training: Fold {i + 1}/{folds} - Running...', end='\r')
        try:
            model.fit(X.iloc[i_train, :], y[i_train], verbose=False) #, eval_set=[(X_valid, y_valid)] if early_stop else None)
            mae = mean_absolute_error(y[i_valid], model.predict(X.iloc[i_valid, :]))
        except Exception as e:
            print(f'Training: Fold {i + 1}/{folds} - Failed: {e}')
            print(f'Returning undertrained model ({i} folds)')
            break
        print(f'Training: Fold {i + 1}/{folds} - Done. MAE: {mae}')
    return model

In [28]:
train_model(lgb.LGBMRegressor)
pass # lgb baseline

Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.365128684211296
Training: Fold 2/5 - Done. MAE: 6.834012097098109
Training: Fold 3/5 - Done. MAE: 6.13850643808549
Training: Fold 4/5 - Done. MAE: 6.370388844816867
Training: Fold 5/5 - Done. MAE: 5.936786897874561


In [29]:
shared_params = dict(seed=25, n_jobs=16, learning_rate=0.2, max_depth=3, colsample_bytree=0.85, subsample=0.8, reg_alpha=500)

In [30]:
xgb_params = dict(tree_method='hist', eval_metric='mae', gamma=0.2, **shared_params)
train_model(xgb.XGBRegressor, xgb_params)
pass # tuned xgb

Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.345823785360344
Training: Fold 2/5 - Done. MAE: 6.827276161701868
Training: Fold 3/5 - Done. MAE: 6.142414891861338
Training: Fold 4/5 - Done. MAE: 6.370331139121156
Training: Fold 5/5 - Done. MAE: 5.934155244654084


In [31]:
lgb_params = dict(metric='l1', **shared_params)
train_model(lgb.LGBMRegressor, shared_params)
pass # compare to tuned xgb

Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.3418280122986275
Training: Fold 2/5 - Done. MAE: 6.828364364682563
Training: Fold 3/5 - Done. MAE: 6.140877500590833
Training: Fold 4/5 - Done. MAE: 6.369030754339805
Training: Fold 5/5 - Done. MAE: 5.9343541090902105


In [32]:
def grid_search(model:Model, param_grid:dict, n_jobs:int=8) -> Model:
    print('Starting grid search...', end='\r')
    X, y = load_training_vars()
    search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=n_jobs)
    search.fit(X, y)
    print(f'Grid search complete. Best params: {search.best_params_}')
    return search.best_estimator_

In [33]:
model = xgb.XGBRegressor(**xgb_params)
model = grid_search(model, param_grid={'max_leaves':[2**x for x in range(1, 5)]}) # best was 2
model = train_model(model)

Grid search complete. Best params: {'max_leaves': 2}
Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.380403400253767
Training: Fold 2/5 - Done. MAE: 6.855299064685574
Training: Fold 3/5 - Done. MAE: 6.154777026210802
Training: Fold 4/5 - Done. MAE: 6.383582022118538
Training: Fold 5/5 - Done. MAE: 5.947080130939923


In [34]:
model = lgb.LGBMRegressor(**lgb_params)
model = grid_search(model, param_grid={'num_leaves':[2**x for x in range(1, 5)]}) # best = 8
model = train_model(model)

Grid search complete. Best params: {'num_leaves': 16}
Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.3418280122986275
Training: Fold 2/5 - Done. MAE: 6.828364364682563
Training: Fold 3/5 - Done. MAE: 6.140877500590833
Training: Fold 4/5 - Done. MAE: 6.369030754339805
Training: Fold 5/5 - Done. MAE: 5.9343541090902105


In [13]:
# lgb_params = dict(metric='l1', num_leaves=8, **shared_params)
# model = lgb.LGBMRegressor(**lgb_params)
# model = grid_search(model, param_grid={'min_data_in_leaf':[10**x for x in range(2, 5)], # these are overriding aliases used by the python API
#                                        'min_gain_to_split': [0, 0.1, 1, 10]}) # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
# model = train_model(model)

In [14]:
lgb_params = dict(metric='l1', num_leaves=8, **shared_params)
model = lgb.LGBMRegressor(**lgb_params)
model = grid_search(model, param_grid={'min_child_samples':[2 * 10**x for x in range(2, 5)],
                                       'min_split_gain': [0, 0.01, 0.05, 0.1, 0.5]})
model = train_model(model) # 5.932219891077756 (first improvement over v5!)

Grid search complete. Best params: {'min_child_samples': 2000, 'min_split_gain': 0.01}
Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.341038789452177
Training: Fold 2/5 - Done. MAE: 6.8271654582570305
Training: Fold 3/5 - Done. MAE: 6.142087789705268
Training: Fold 4/5 - Done. MAE: 6.369221848953748
Training: Fold 5/5 - Done. MAE: 5.932219891077756


In [20]:
# refine prior results
model = lgb.LGBMRegressor(**lgb_params)
model = grid_search(model, param_grid={'min_child_samples':[500, 1000, 2000, 4000, 8000], # last run was middle of the range (2000) -> confirmed
                                       'min_split_gain': [0.001, 0.005, 0.01, 0.02]}) # similar result (0.01) -> 0.001
model = train_model(model)

Grid search complete. Best params: {'min_child_samples': 2000, 'min_split_gain': 0.001}
Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.341038789452177
Training: Fold 2/5 - Done. MAE: 6.8271654582570305
Training: Fold 3/5 - Done. MAE: 6.142087789705268
Training: Fold 4/5 - Done. MAE: 6.369221848953748
Training: Fold 5/5 - Done. MAE: 5.932219891077756


In [21]:
lgb_params = dict(metric='l1', num_leaves=8, min_child_samples=2000, min_split_gain=0.001, **shared_params)
model = lgb.LGBMRegressor(**lgb_params)
model = train_model(model)

Pre-training - Done.
Training: Fold 1/5 - Done. MAE: 7.341038789452177
Training: Fold 2/5 - Done. MAE: 6.8271654582570305
Training: Fold 3/5 - Done. MAE: 6.142087789705268
Training: Fold 4/5 - Done. MAE: 6.369221848953748
Training: Fold 5/5 - Done. MAE: 5.932219891077756


In [22]:
# submission compat check
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

for (test, revealed_targets, _) in iter_test:
    X_test = preprocess(test)
    y_pred = model.predict(X_test)
    submission = test[['row_id']]
    submission['target'] = y_pred
    env.predict(submission)