Largely built on [@iqmansingh's](https://www.kaggle.com/iqmansingh) notebook, [4-Fold Time-Series Split Ensemble](https://www.kaggle.com/code/iqmansingh/optiver-4-fold-time-series-split-ensemble), although this borrows the `reduce_mem_usage` and `imbalance_features` snippets as well. The core idea is still to build a voting ensemble on time series splits, but with score tracking so it can reject models that degrade performance.

*Note: I eventually learned that scikit-learn has a built-in [`VotingRegressor`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingRegressor.html) class, **(shout-out to [@chinzorigtganbat's](https://www.kaggle.com/chinzorigtganbat) [VotingRegressor + Boosters](https://www.kaggle.com/code/chinzorigtganbat/votingregressor-boosters))**, but it's different enough that I couldn't use it here without a rewrite.*

*Note2: I later discovered the [many selective ensemble papers](https://scholar.google.com/scholar?q=selective+ensemble+machine+learning&hl=en&as_sdt=0&as_vis=1&oi=scholart) put out in the last decade. At best this is a naive implementation of the concept, but I want to acknowldge the authors for their work.*

In [1]:
IS_TRAIN = True # true -> train ensemble; false -> load pretrained ensemble
SAVE_MODELS = False # true -> save copies of each model to MODEL_FOLDER during training
MEMORY_CAP = None # 16.8 # GiB; if malloc starts to runaway reign it in

In [2]:
import os
import gc
import time
import joblib
import psutil
# import typing
import warnings
import itertools
warnings.simplefilter('ignore') # ignore FutureWarnings; must precede pandas import
import pandas as pd
import numpy as np
import numba as nb
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
import sklearn.metrics as met
import sklearn.model_selection as sel
import typing_extensions as ext # used over vanilla typing since it backports 3.11+ features
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # ignore bugged CUDA errors; must precede tf import
import tensorflow as tf
tf.keras.utils.disable_interactive_logging() # ensemble will provide its own condensed version
print(('GPU available.' if len(tf.config.list_physical_devices('GPU')) > 0 else 'No GPU detected.'))

No GPU detected.


In [3]:
@nb.njit(parallel=True)
def compute_triplet_imbalance(values:np.ndarray, combo_indices:list[tuple[int, int, int]]) -> np.ndarray:
    num_rows = values.shape[0]
    num_combinations = len(combo_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in nb.prange(num_combinations): # enumerate() works but prange() lets us run in parallel
        a, b, c = combo_indices[i]
        for j in nb.prange(num_rows):
            _a, _b, _c = values[j, a], values[j, b], values[j, c]
            max_val = max(_a, _b, _c)
            min_val = min(_a, _b, _c)
            mid_val = sum([_a, _b, _c])-max_val-min_val
            imbalance_features[j, i] = np.nan if mid_val == min_val else (max_val-mid_val)/(mid_val-min_val)
    return imbalance_features   

def calculate_triplet_imbalance_numba(cols:list[str], data:pd.DataFrame) -> pd.DataFrame:
    values = data[cols].values
    combo_indices = []
    columns = []
    for a, b, c in itertools.combinations(cols, 3):
        combo_indices.append(tuple([cols.index(col) for col in [a, b, c]]))
        columns.append(f'{a}_{b}_{c}_imbalance')
    features_array = compute_triplet_imbalance(values, combo_indices)
    features = pd.DataFrame(features_array, columns=columns)
    return features

def imbalance_features(data:pd.DataFrame) -> pd.DataFrame:
    prices = [*[col for col in data.columns if 'price' in col], 'wap']
    sizes = [col for col in data.columns if 'size' in col]
    data['volume'] = data.eval('ask_size+bid_size')
    data['mid_price'] = data.eval('(ask_price+bid_price)/2')
    data['liquidity_imbalance'] = data.eval('(bid_size-ask_size)/volume')
    data['matched_imbalance'] = data.eval('(imbalance_size-matched_size)/(imbalance_size+matched_size)')
    data['size_imbalance'] = data.eval('bid_size/ask_size')
    data['imbalance_momentum'] = data.groupby(level='stock_id').imbalance_size.diff(periods=1) / data.matched_size
    data['price_spread'] = data.eval('ask_price-bid_price')
    data['spread_intensity'] = data.groupby(level='stock_id').price_spread.diff()
    data['price_pressure'] = data.eval('imbalance_size*price_spread')
    data['market_urgency'] = data.eval('price_spread*liquidity_imbalance')
    data['depth_pressure'] = data.eval('(ask_size-bid_size)*(far_price-near_price)')
    for cols in itertools.combinations(prices, 2):
        data[f'{cols[0]}_{cols[1]}_imbalance'] = data.eval(f'({cols[0]}-{cols[1]})/({cols[0]}+{cols[1]})')
    for cols in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(cols, data)
        data[triplet_feature.columns] = triplet_feature.values
    for func in ['mean', 'std', 'skew', 'kurt']:
        data[f'all_prices_{func}'] = data[prices].agg(func, axis=1)
        data[f'all_sizes_{func}'] = data[sizes].agg(func, axis=1)
    for win in [1, 2, 3, 5, 8, 13]:
        for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
            data[f'{col}_shift_{win}'] = data.groupby(level='stock_id')[col].shift(win)
            data[f'{col}_pct_{win}'] = data.groupby(level='stock_id')[col].pct_change(win)
        for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']:
            data[f'{col}_diff_{win}'] = data.groupby(level='stock_id')[col].diff(win)
    data = data.replace([np.inf, -np.inf], 0)
    return data

def reduce_mem_usage(data:pd.DataFrame, verbose:bool=False) -> pd.DataFrame: # 3.10+
    if verbose: mem_start = data.memory_usage().sum()
    for col in data.columns:
        match data[col].dtype:
            case 'object' | 'bool': continue
            case 'int32' | 'int64':
                for int_size in [np.int8, np.int16, np.int32]:
                    if data[col].min() > np.iinfo(int_size).min and data[col].max() < np.iinfo(int_size).max:
                        data[col] = data[col].astype(int_size)
            case 'float32' | 'float64':
                for float_size in [np.float16, np.float32]:
                    if data[col].min() > np.finfo(float_size).min and data[col].max() < np.finfo(float_size).max:
                        data[col] = data[col].astype(float_size)
            case _: raise Exception(data[col].dtype)
    if verbose:
        mem_end = data.memory_usage().sum()
        print(f'DataFrame memory reduced from {mem_start} to {mem_end}.')
    return data

In [4]:
LOCAL_DATA_TRAIN = '.data/train.csv'
LOCAL_DATA_TEST_X = '.data/test.csv'
LOCAL_DATA_TEST_Y = '.data/revealed_targets.csv'

KAGGLE_DATA_TRAIN = '/kaggle/input/optiver-trading-at-the-close/train.csv'
KAGGLE_DATA_TEST_X = '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv'
KAGGLE_DATA_TEST_Y = '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv'

DROPS = ['index', 'time_id', 'currently_scored', 'time_id_x', 'time_id_y', 'revealed_date_id', 'revealed_time_id', 'row_id']
SORTS = ['date_id', 'stock_id', 'seconds_in_bucket'] # order matters here
SKIPS = ['imbalance_buy_sell_flag', 'target']

def preprocess(data:pd.DataFrame) -> pd.DataFrame: # separate from load_data() for submission compat
    data = data.set_index(SORTS).sort_index()      # pushing these into a multi-index makes life easier down the road
    data = imbalance_features(data)                # must precede standardization; requires SKIPS in data
    skip = data[[col for col in SKIPS if col in data.columns]]
    data = data.drop([col for col in [*DROPS, *SKIPS] if col in data.columns], axis=1)
    data = data.groupby(level='stock_id').ffill()  # impute with last observation; groupby() ensures ffill() is per-stock, per-day
    data = (data - data.mean()) / data.std(ddof=0) # normalize/standardize (z-score)
    data = data.fillna(0)                          # clean columns that didn't ffill or with a stdev of 0 (i.e., only 1 unique value)
    data = pd.concat([skip, data], axis=1, join='inner') # re-join with skipped columns
    temp = data.index.to_frame().seconds_in_bucket       # encode seconds as sin/cos waves
    data['seconds_in_bucket_sin'] = np.sin((temp * 2 * np.pi / 540))
    data['seconds_in_bucket_cos'] = np.cos((temp * 2 * np.pi / 540))
    data = reduce_mem_usage(data)                  # 2nd pass doubles setup time but saves 2-3 GiB 
    return data

def load_vars(test:bool=False) -> tuple[pd.DataFrame, pd.Series]: # returns training (or test) data for either local or kaggle setup
    def read_data(train, test_x, test_y): # wrap call to read_csv() since test X and y values are stored separately and must be merged
        if test: return pd.merge(*[pd.read_csv(path) for path in [test_x, test_y]], on=SORTS).rename(columns={'revealed_target':'target'})
        else: return pd.read_csv(train)
    try: data = read_data(LOCAL_DATA_TRAIN, LOCAL_DATA_TEST_X, LOCAL_DATA_TEST_Y)
    except FileNotFoundError: data = read_data(KAGGLE_DATA_TRAIN, KAGGLE_DATA_TEST_X, KAGGLE_DATA_TEST_Y)
    data = data.dropna(subset=['target']) # some rows have null targets
    data = reduce_mem_usage(data) # 1st pass must precede preprocess() or kaggle will run out of memory
    data = preprocess(data)
    return data.drop('target', axis=1), data.target

In [5]:
class PredictionError(Exception): pass # specific for training feedback

class IModel(ext.Protocol): # partial wrapper for sklearn API
    def fit(self, X, y, **kwargs) -> ext.Self: ...
    def predict(self, X, **kwargs) -> np.ndarray: ...
    def get_params(self, deep=True) -> dict[str, ext.Any]: ...

class SelectiveEnsemble: # once len(models) >= limit, reject new models with scores above the mean
    def __init__(self, limit:int=None) -> None:
        self.limit = limit 
        self.models = dict[str, IModel]()
        self.scores = dict[str, float]()
        self.kwargs = dict[str, dict]()
        self.test_x, self.test_y = load_vars(test=True)
    
    @property
    def mean_score(self) -> float:
        return sum(self.scores[m] for m in self.models) / len(self) if len(self) > 0 else None
    
    @property
    def best_score(self) -> float:
        return min(self.scores[m] for m in self.models) if len(self) > 0 else None
    
    @property
    def best_model(self) -> tuple[IModel, str, dict]:
        return [(self.models[m], m, self.kwargs[m].copy()) for m in self.models if self.scores[m] == self.best_score][0]
    
    def add(self, model:IModel, name:str, kwargs:dict) -> tuple[bool, float]: # raises PredictionError
        if name in self.models: name = f'{name}(1)'
        pred = model.predict(self.test_x, **kwargs)
        if len(np.unique(pred)) == 1: raise PredictionError('Model is guessing a constant value.')
        if np.isnan(pred).any(): raise PredictionError('Model is guessing NaN.')
        score = met.mean_absolute_error(self.test_y, pred)
        if self.limit and len(self) >= self.limit and self.mean_score < score: return False, score
        self.models[name] = model
        self.scores[name] = score
        self.kwargs[name] = kwargs
        return True, score

    def prune(self, limit:int=None) -> ext.Self: # removes models with scores above the mean; recurses if limit is set
        pruned = SelectiveEnsemble(limit=(limit or self.limit))
        pruned.models = {m:self.models[m] for m in self.models if self.scores[m] <= self.mean_score}
        pruned.scores = {m:self.scores[m] for m in pruned.models}
        pruned.kwargs = {m:self.kwargs[m] for m in pruned.models}
        if pruned.limit and len(pruned) > pruned.limit > 1: return pruned.prune()
        return pruned
    
    def clone(self, limit:int=None) -> ext.Self:
        clone = SelectiveEnsemble(limit=(limit or self.limit))
        clone.models = self.models.copy()
        clone.scores = self.scores.copy()
        clone.kwargs = self.kwargs.copy()
        return clone
    
    def predict(self, X:pd.DataFrame, **kwargs) -> np.ndarray: # wrapper for soft voting; kwargs for compat
        y = np.zeros(len(X))
        for m in self.models:
            pred = self.models[m].predict(X, **self.kwargs[m])
            pred = pred.reshape(-1) # reshape needed for tensorflow output; doesn't impact other model types
            temp = np.ma.masked_invalid(pred) # mask NaN and +/- inf to find largest legit values; https://stackoverflow.com/a/41097911/3178898
            pred = np.nan_to_num(pred, posinf=temp.max()+temp.std(), neginf=temp.min()-temp.std()) # then use those to clamp the invalid ones
            y += pred
        y = y / len(self)
        return y

    def __len__(self) -> int:
        return len(self.models)
    
    def __repr__(self) -> str:
        return f'<SelectiveEnsemble ({len(self)} model(s); mean: {self.mean_score:.8f}; best: {self.best_score:.8f}; limit: {self.limit})>'

In [6]:
MODEL_FOLDER = '.models/'
if not os.path.exists(MODEL_FOLDER): os.makedirs(MODEL_FOLDER)
process = psutil.Process() # defaults to current process

# customize fit() and predict() kwargs for each model's type and params
def build_model_kwargs(model:IModel, val_data:tuple[pd.DataFrame, pd.Series]=None) -> tuple[dict, dict, dict]:
    fit_kw = dict()
    predict_kw = dict()
    early_stop_kw = dict()
    model_class = type(model).__name__
    match model_class:
        case 'Sequential':
            # model.compile(optimizer='adam', loss='mae')
            keras_kw = dict(batch_size=256, verbose=0)
            fit_kw.update(keras_kw)
            predict_kw.update(keras_kw)
            early_stop_kw['validation_data'] = val_data
        case 'LGBMRegressor':
            fit_kw.update(dict(verbose=False)) # verbose=0 throws an error
            if 'early_stopping_round' in model.get_params():
                early_stop_kw['eval_set'] = [val_data]
                early_stop_kw['eval_metric'] = 'l1'
        case 'XGBRegressor' | 'CatBoostRegressor':
            fit_kw.update(dict(verbose=0))
            if 'early_stopping_rounds' in model.get_params():
                early_stop_kw['eval_set'] = [val_data]
    fit_kw.update(early_stop_kw)
    return fit_kw, predict_kw, early_stop_kw

# builds an ensemble trained on the data from load_vars(). if an existing ensemble is provided, it will be updated instead.
def train_ensemble(models:list[IModel], folds:int=5, limit:int=None, ensemble:SelectiveEnsemble=None) -> SelectiveEnsemble:
    setup_start = time.time()
    print(f'Pre-training setup...', end='\r')
    ensemble = ensemble.clone(limit=(limit or len(ensemble))) if ensemble else SelectiveEnsemble(limit=(limit or len(models)))
    cv = sel.TimeSeriesSplit(folds)
    X, y = load_vars()
    setup_time = time.time() - setup_start
    print(f'Pre-training setup...Complete ({setup_time:.1f}s)')
    for j, model in enumerate(models): # each model gets its own ensemble, then the best fold will be added to the main
        name = type(model).__name__
        is_sequential = name == 'Sequential'
        if is_sequential:
            model.compile(optimizer='adam', loss='mae')
            name = model.name            
        _msg = f'Model {j+1}/{len(models)}:'
        for i, (i_train, i_valid) in enumerate(cv.split(X)):
            try: # fail gracefully instead of giving up on the whole ensemble
                fold_start = time.time()
                _name = f'{name}_{int(time.time())}'
                msg = f'{_msg} Fold {i+1}/{folds}:'
                print(f'{msg} Training {name}...'+' '*48, end='\r')
                X_valid, y_valid = X.iloc[i_valid, :], y.iloc[i_valid]
                fit_kw, predict_kw, early_stop_kw = build_model_kwargs(model, (X_valid, y_valid))
                try: model.fit(X.iloc[i_train, :], y.iloc[i_train], **fit_kw) # some kwargs fail on kaggle
                except: model.fit(X.iloc[i_train, :], y.iloc[i_train], **early_stop_kw) # fallback to early stop only
                del X_valid, y_valid
                mem_total = process.memory_info().rss / 1024**3 # B -> GiB
                if MEMORY_CAP and mem_total > MEMORY_CAP: raise MemoryError(f'High memory allocation ({mem_total:.1f} > {MEMORY_CAP:.1f} GiB)') # plug memory leak
                print(f'{msg} Adding {name} to ensemble...', end='\r')
                if is_sequential:
                    clone = tf.keras.models.clone_model(model)
                    clone.set_weights(model.get_weights())
                else: clone = None
                res, score = ensemble.add((clone or model), _name, predict_kw)
                if (res and SAVE_MODELS):
                    save_path = os.path.join(MODEL_FOLDER, f'{_name}.joblib')
                    joblib.dump(model, save_path)
                fold_time = time.time()-fold_start
                print(f'{msg} {("Accepted" if res else "Rejected")} with score: {score:.8f}'
                     +f' ({fold_time:.1f}s) ({mem_total:.1f} GiB)'+(f' ({_name})' if res else '')+' '*10)
            except Exception as e:
                print(f'{msg} Stopped: {type(e).__name__}: {e}')
                if isinstance(e, PredictionError): break # these tend not to improve, so move on to the next model
                if isinstance(e, MemoryError): break     # malloc resets with each model, so move on if exceeded
            finally:
                while gc.collect() > 0: pass # memory is at a premium
    return ensemble

def load_ensemble(model_dir:str=MODEL_FOLDER) -> SelectiveEnsemble:
    ensemble = SelectiveEnsemble()
    for file in os.listdir(model_dir):
        model = joblib.load(os.path.join(model_dir, file))
        name = file.split('.joblib')[0]
        kwargs = build_model_kwargs(model, (ensemble.test_x, ensemble.test_y))[1] # only need predict_kw
        ensemble.add(model, name, kwargs)
    if len(ensemble) == 0: raise FileNotFoundError(f'No models saved in {model_dir}.')
    return ensemble

In [7]:
N_FEATURES = len(load_vars(test=True)[0].columns)
ACTIVATION_1 = 'tanh' # inputs are standardized so keep negative range
ACTIVATION_2 = 'relu' # performed better than tanh, sigmoid
DROPOUT = 0.5         # performed better than 0.3, 0.4
RANDOM_STATE = 25     # funnier than 24

layers = tf.keras.layers
Sequential = tf.keras.Sequential
regularizer = tf.keras.regularizers.l1(0.001)
tf.keras.utils.set_random_seed(RANDOM_STATE)

shared_kw = dict(random_state=RANDOM_STATE, learning_rate=0.2, max_depth=3, subsample=0.8)
xgb_lgb_kw = dict(n_jobs=16, colsample_bytree=0.85, reg_alpha=500)                         
xgb_cat_kw = dict(early_stopping_rounds=5)
lgb_cat_kw = dict(num_leaves=8, min_child_samples=2000)

models = [ # order matters if limit is set; frontloading stronger models will cause more rejections; the reverse will oversaturate
    Sequential([ # 145 -> 18 -> 1
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES//8, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='octo'),
    Sequential([ # 145 -> 36 -> 1
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES//4, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='quad'),
    Sequential([ # 145 -> 72 -> 1
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES//2, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='duce'),
    Sequential([ # 145 -> 145 -> 1
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='mono'),
    xgb.XGBRegressor(**shared_kw, **xgb_lgb_kw, **xgb_cat_kw, eval_metric='mae', tree_method='hist', gamma=0.2), #, nthread=1),
    lgb.LGBMRegressor(**shared_kw, **xgb_lgb_kw, **lgb_cat_kw, early_stopping_round=5, metric='l1', min_split_gain=0.001, verbosity=-1),
    cat.CatBoostRegressor(**shared_kw, **xgb_cat_kw, **lgb_cat_kw, eval_metric='MAE'),
    Sequential([layers.Dense(1, activation=ACTIVATION_1, input_shape=[N_FEATURES])], name='linear'), # 145 -> 1
    Sequential([ # 145 -> 72 -> 36 -> 18 -> 1
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES//2, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES//4, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(N_FEATURES//8, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='deep'),
    Sequential([ # 145 -> 89 -> 13 -> 5 -> 1 
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(89, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(13, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(5 , kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='fib'),
    Sequential([ # 145 -> 29 -> 5 -> 1
        layers.Dense(N_FEATURES, kernel_regularizer=regularizer, activation=ACTIVATION_1, input_shape=[N_FEATURES]),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(29, kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(5 , kernel_regularizer=regularizer, activation=ACTIVATION_2),
        layers.Dropout(DROPOUT),
        layers.BatchNormalization(),
        layers.Dense(1)
    ], name='prime'),
]

ensemble = train_ensemble(models, limit=1, folds=5) if IS_TRAIN else load_ensemble()
ensemble

Pre-training setup...Complete (106.3s)
Model 1/11: Fold 1/5: Accepted with score: 5.45839691 (7.8s) (8.2 GiB) (octo_1703037944)          
Model 1/11: Fold 2/5: Rejected with score: 5.47021723 (10.8s) (10.1 GiB)              
Model 1/11: Fold 3/5: Accepted with score: 5.44078064 (15.3s) (12.9 GiB) (octo_1703037963)          
Model 1/11: Fold 4/5: Rejected with score: 5.45171499 (19.0s) (13.9 GiB)              
Model 1/11: Fold 5/5: Rejected with score: 5.45290136 (23.8s) (18.6 GiB)              
Model 2/11: Fold 1/5: Rejected with score: 5.45541906 (6.9s) (18.6 GiB)               
Model 2/11: Fold 2/5: Rejected with score: 5.45729303 (10.7s) (18.6 GiB)              
Model 2/11: Fold 3/5: Rejected with score: 5.45989752 (14.8s) (18.6 GiB)              
Model 2/11: Fold 4/5: Accepted with score: 5.43706799 (18.7s) (18.6 GiB) (quad_1703038055)          
Model 2/11: Fold 5/5: Accepted with score: 5.44366264 (23.3s) (18.6 GiB) (quad_1703038074)          
Model 3/11: Fold 1/5: Rejected with s

<SelectiveEnsemble (21 model(s); mean: 5.42662393; best: 5.40881443; limit: 1)>

In [8]:
cut1 = ensemble.prune()
cut2 = cut1.prune()
print(f'Ensemble: {ensemble.mean_score:.8f}, {len(ensemble)} models ({", ".join([m for m in ensemble.models])})')
print(f'1st Cut : {cut1.mean_score:.8f}, {len(cut1)} models ({", ".join([m for m in cut1.models])})')
print(f'2nd Cut : {cut2.mean_score:.8f}, {len(cut2)} models ({", ".join([m for m in cut2.models])})')

Ensemble: 5.42662393, 21 models (octo_1703037944, octo_1703037963, quad_1703038055, quad_1703038074, linear_1703038400, linear_1703038403, linear_1703038407, linear_1703038413, linear_1703038420, deep_1703038453, deep_1703038499, fib_1703038531, fib_1703038540, fib_1703038554, fib_1703038574, fib_1703038601, prime_1703038632, prime_1703038639, prime_1703038652, prime_1703038669, prime_1703038692)
1st Cut : 5.41876000, 11 models (linear_1703038400, linear_1703038413, fib_1703038531, fib_1703038540, fib_1703038554, fib_1703038574, fib_1703038601, prime_1703038632, prime_1703038639, prime_1703038652, prime_1703038692)
2nd Cut : 5.41309757, 5 models (fib_1703038531, fib_1703038574, fib_1703038601, prime_1703038632, prime_1703038692)


In [9]:
# raise Exception # stop for manual eval
model = cut1

In [10]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

for (test, _, _) in iter_test:
    X_test = preprocess(test)
    y_pred = model.predict(X_test, verbose=0)
    assert not np.isnan(y_pred).any() # sanity check 2
    submission = test[['row_id']]
    submission['target'] = y_pred
    env.predict(submission)

try:
    res = pd.read_csv('/kaggle/working/submission.csv') # sanity check
except FileNotFoundError:
    res = pd.read_csv('.data/submission.csv')
res

Unnamed: 0,row_id,target
0,478_0_0,-0.508882
1,478_0_1,0.375170
2,478_0_2,0.550890
3,478_0_3,-0.469055
4,478_0_4,-0.549514
...,...,...
32995,480_540_195,0.162991
32996,480_540_196,-0.685365
32997,480_540_197,0.206961
32998,480_540_198,0.406229
