In [1]:
# !!UPDATE!!
TRAINING_DATA = './.data/train.csv' 
# TESTING_DATA = './.data/test.csv'

In [2]:
import gc
import typing as t
import warnings
warnings.filterwarnings('ignore')
# %load_ext cudf.pandas
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit #, GridSearchCV
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb

In [3]:
def preprocess(data:pd.DataFrame) -> pd.DataFrame:
    data = data.reset_index().set_index('row_id')
    for drop_col in ['index', 'time_id', 'currently_scored']:
        if drop_col in data.columns:
            data = data.drop(drop_col, axis=1)
    id_cols = ['date_id', 'seconds_in_bucket', 'stock_id']
    data = data.sort_values(by=id_cols)
    data = data.drop(id_cols, axis=1)
    data = (data - data.min()) / (data.max() - data.min())
    data = data.ffill().fillna(0)
    return data

In [4]:
def load_training_vars(path:str=TRAINING_DATA) -> tuple[pd.DataFrame, pd.Series]:
    data = pd.read_csv(path, index_col='row_id')
    data = data.dropna(subset=['target'])
    y = data.target
    X = data.drop('target', axis=1)
    X = preprocess(X)
    return X, y

In [5]:
X, y = load_training_vars()

In [6]:
shared_params = dict(seed=25, n_jobs=16, learning_rate=0.2, max_depth=3, colsample_bytree=0.85, subsample=0.8, reg_alpha=500)
xgb_params = dict(**shared_params, tree_method='hist', eval_metric='mae', gamma=0.2, early_stopping_rounds=5)
lgb_params = dict(**shared_params, metric='l1', num_leaves=8, min_child_samples=2000, min_split_gain=0.001, early_stopping_round=5)

In [7]:
class Model(t.Protocol):
    def fit(self, X, y, sample_weight=None): ...
    def predict(self, X): ...

class Ensemble(list[Model]):
    def __init__(self, models:list[Model]=None, limit:float=6.0779) -> None:
        if models: 
            self.extend(models)
        self.limit = limit
    def predict(self, X:pd.DataFrame) -> pd.DataFrame:
        y = pd.DataFrame(index=X.index)
        y['pred'] = 0
        for model in self:
            y.pred += model.predict(X)
        y.pred = y.pred / len(self)
        return y

def train_model(model:type|Model, model_kw:dict={}, cv_folds:int=5, ens:Ensemble|None=None) -> Model|Ensemble:
    gc.collect() # just in case 
    cv = TimeSeriesSplit(n_splits=cv_folds)
    cv_folds = cv.get_n_splits()
    if isinstance(model, type):
        model = model(**model_kw)
    model_class = type(model).__name__
    early_stop = any(x in model_kw for x in ['early_stopping_rounds', 'early_stopping_round'])
    for i, (i_train, i_valid) in enumerate(cv.split(X)):
        print(f'Training {model_class}: Fold {i + 1}/{cv_folds} - Running...', end='\r')
        try:
            model.fit(X.iloc[i_train, :], y[i_train], verbose=False,
                      eval_set=([(X.iloc[i_valid, :], y[i_valid])] if early_stop else None),
                      eval_metric=('l1' if isinstance(model, lgb.LGBMRegressor) else None))
            mae = mean_absolute_error(y[i_valid], model.predict(X.iloc[i_valid, :]))
            if ens is not None and mae < ens.limit:
                ens.append(model)
        except Exception as e:
            print(f'Training {model_class}: Fold {i + 1}/{cv_folds} - Failed: {e}')
            print(f'Returning undertrained model ({i} folds)')
            break
        print(f'Training {model_class}: Fold {i + 1}/{cv_folds} - Done. MAE: {mae}')
    return ens if ens else model

In [8]:
model = train_model(xgb.XGBRegressor,  xgb_params, ens=Ensemble(limit=7))
model = train_model(lgb.LGBMRegressor, lgb_params, ens=model)

Training XGBRegressor: Fold 1/5 - Done. MAE: 7.3667617513962975
Training XGBRegressor: Fold 2/5 - Done. MAE: 6.8421438828072665
Training XGBRegressor: Fold 3/5 - Done. MAE: 6.1462978627363825
Training XGBRegressor: Fold 4/5 - Done. MAE: 6.37172200017953
Training XGBRegressor: Fold 5/5 - Done. MAE: 5.936866471891241
Training LGBMRegressor: Fold 1/5 - Done. MAE: 7.341038789452177
Training LGBMRegressor: Fold 2/5 - Done. MAE: 6.84366909148948
Training LGBMRegressor: Fold 3/5 - Done. MAE: 6.146624723166877
Training LGBMRegressor: Fold 4/5 - Done. MAE: 6.369221848953748
Training LGBMRegressor: Fold 5/5 - Done. MAE: 5.9364673649185065


In [21]:
model.predict(X)

Unnamed: 0_level_0,pred
row_id,Unnamed: 1_level_1
0_0_0,-1.518468
0_0_1,1.170271
0_0_2,-1.936042
0_0_3,0.959451
0_0_4,-2.743284
...,...
480_540_195,0.061657
480_540_196,-0.314008
480_540_197,0.844217
480_540_198,0.094575


In [10]:
# def grid_search(model:Model, param_grid:dict, n_jobs:int=8) -> Model: # skip for now
#     print('Starting grid search...', end='\r')
#     X, y = load_training_vars()
#     search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=n_jobs)
#     search.fit(X, y)
#     print(f'Grid search complete. Best params: {search.best_params_}')
#     return search.best_estimator_

In [25]:
# submission compat check
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

for (test, revealed_targets, _) in iter_test:
    X_test = preprocess(test)
    y_pred = model.predict(X_test)
    submission = test[['row_id']].set_index('row_id') # needed to match rows
    submission['target'] = y_pred
    submission = submission.reset_index() # convert back for final CSV write
    env.predict(submission)

In [None]:
# res = pd.read_csv('/kaggle/working/submission.csv') # sanity check
# res