In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # https://stackoverflow.com/questions/40426502/is-there-a-way-to-suppress-the-messages-tensorflow-prints
import gc
import typing as t
import warnings
warnings.filterwarnings('ignore')
from itertools import combinations
try: # got tired of changing code between local and kaggle setup
    import cudf.pandas
    cudf.pandas.install() # must be called before pandas import
except ModuleNotFoundError:
    print('cudf not installed. Continuing with CPU dataframes.')
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import tensorflow as tf # https://github.com/tensorflow/tensorflow/issues/62075
keras = tf.keras # https://github.com/microsoft/pylance-release/issues/1066
from keras import Sequential, layers, regularizers
from keras.models import clone_model

In [None]:
# Print a notice when TensorFlow reports at least one physical GPU device.
if tf.config.list_physical_devices('GPU'):
    print('GPU available.')

In [None]:
# Local copies of the competition files; load_vars() tries these first.
DATA_TRAIN, DATA_TEST_X, DATA_TEST_Y = (
    '.data/train.csv',
    '.data/test.csv',
    '.data/revealed_targets.csv',
)

# Kaggle input paths; load_vars() falls back to these on FileNotFoundError.
KAGGLE_DATA_TRAIN, KAGGLE_DATA_TEST_X, KAGGLE_DATA_TEST_Y = (
    '/kaggle/input/optiver-trading-at-the-close/train.csv',
    '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv',
    '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv',
)

# Columns removed by preprocess() whenever they are present in the frame.
DROPS = [
    'index',
    'time_id',
    'currently_scored',
    'time_id_x',
    'time_id_y',
    'revealed_date_id',
    'revealed_time_id',
]
# Row ordering keys (also the merge keys for the test files); dropped after sorting.
SORTS = [
    'date_id',
    'seconds_in_bucket',
    'stock_id',
]
# Column used as the frame index.
INDEX = 'row_id'

# Input width of every network defined below.
N_FEATURES = 11 # update if/as features are engineered

# https://www.kaggle.com/code/verracodeguacas/high-speed-predictions-no-gpu
def add_features(data:pd.DataFrame) -> pd.DataFrame: # data arrives sorted and pruned
    """Return a copy of ``data`` with five derived order-book columns appended.

    Added columns, in order:
        volume              ask_size + bid_size
        mid_price           (ask_price + bid_price) / 2
        liquidity_imbalance (bid_size - ask_size) / (bid_size + ask_size)
        matched_imbalance   (imbalance_size - matched_size) / (matched_size + imbalance_size)
        size_imbalance      bid_size / ask_size

    The input frame is not modified. Denominators are not guarded against zero.
    """
    bid, ask = data.bid_size, data.ask_size
    matched, imbalance = data.matched_size, data.imbalance_size

    # Disabled experiment: pairwise imbalance for every pair of price columns.
    # size_cols = [x for x in data.columns if 'size' in x]
    # price_cols = [x for x in list(set(data.columns)-set(size_cols)) if 'p' in x] # filter for '_price' and 'wap'
    # for c in combinations(price_cols, 2):
    #     df[f'{c[0]}_{c[1]}_imbalance'] = (df[c[0]]-df[c[1]]) / (df[c[0]]+df[c[1]])
    # # print(len(df.columns))

    # assign() works on a copy, so the caller's frame is left untouched.
    return data.assign(
        volume=ask + bid,
        mid_price=(data.ask_price + data.bid_price) / 2,
        liquidity_imbalance=(bid - ask) / (bid + ask),
        matched_imbalance=(imbalance - matched) / (matched + imbalance),
        size_imbalance=bid / ask,
    )

def preprocess(data:pd.DataFrame) -> pd.DataFrame: # separate for submission compat
    """Index, prune, sort, standardise and impute a raw frame.

    Steps, in order:
      1. move the current index into a column and index the frame by INDEX;
      2. drop every DROPS column that is present;
      3. sort rows by SORTS, then drop the SORTS columns;
      4. z-score each remaining column using the mean/std of *this* frame;
      5. forward-fill missing values and replace any that remain with 0.

    Returns a new frame; ``data`` itself is not modified.
    """
    indexed = data.reset_index().set_index(INDEX)
    unwanted = [col for col in DROPS if col in indexed.columns]
    ordered = indexed.drop(columns=unwanted).sort_values(by=SORTS).drop(columns=SORTS)
    # alternative kept from the original: min-max scaling, (df - df.min()) / (df.max() - df.min())
    zscored = (ordered - ordered.mean()) / ordered.std() # normalize
    # add_features() is intentionally not applied here (disabled in the original)
    return zscored.ffill().fillna(0) # "impute"

def load_vars(testing:bool=False) -> tuple[pd.DataFrame, pd.Series]:
    """Load features and targets from the local files, falling back to the Kaggle paths.

    Args:
        testing: when False, read the training CSV (target column ``target``);
            when True, merge the test features with the revealed targets on
            SORTS (target column ``revealed_target``).

    Returns:
        ``(X, y)`` where ``X`` is the preprocessed feature frame and ``y`` the
        target series, both in SORTS order so that row ``i`` of ``X`` belongs
        to row ``i`` of ``y``.

    Raises:
        FileNotFoundError: if neither the local nor the Kaggle files exist.
    """

    def read_data(train, test_x, test_y):
        if testing:
            data = pd.merge(*[pd.read_csv(path) for path in [test_x, test_y]], on=SORTS) # https://stackoverflow.com/a/32041277/3178898
            ycol = 'revealed_target'
        else:
            data = pd.read_csv(train, index_col=INDEX)
            ycol = 'target'
        return data, ycol

    try: # tired of switching local/kaggle setup
        data, ycol = read_data(DATA_TRAIN, DATA_TEST_X, DATA_TEST_Y)
    except FileNotFoundError:
        data, ycol = read_data(KAGGLE_DATA_TRAIN, KAGGLE_DATA_TEST_X, KAGGLE_DATA_TEST_Y)

    data = data.dropna(subset=[ycol]) # some targets are null
    X = preprocess(data.drop(ycol, axis=1))
    # preprocess() re-orders rows by SORTS; apply the same sort to the targets so
    # X and y stay row-aligned even when the file is not already in that order.
    y = data[SORTS + [ycol]].sort_values(by=SORTS)[ycol]
    return X, y

# load_vars()[0].describe()

In [None]:
class ModelProfile: # wrapper to ensure model info for Ensemble
    """Bundle a model with its score and the keyword arguments for its ``predict``.

    Attributes:
        model: the wrapped model.
        score: the model's score as supplied by the caller.
        predict_kw: keyword arguments forwarded to ``model.predict``.
    """

    def __init__(self, model:'Sequential', score:float, predict_kw:dict=None) -> None:
        self.model = model
        self.score = score
        # Fresh dict per instance: a `{}` default would be one object shared by
        # every profile created without explicit kwargs.
        self.predict_kw = dict(predict_kw) if predict_kw else {}

class Ensemble: # https://www.kaggle.com/code/iqmansingh/optiver-4-fold-time-series-split-ensemble
    """Collection of ModelProfile objects whose predictions are averaged.

    Scores are treated as errors (lower is better): ``best_score`` is the
    minimum, ``add`` rejects scores above the mean once ``target_size`` is
    reached, and ``prune`` discards scores above the mean.
    """

    def __init__(self, models:'list[ModelProfile]'=None, target_size:int=None) -> None:
        self.models = models or []
        self.target_size = target_size # see add()

    @property
    def mean_score(self) -> float:
        """Mean of the member scores, or None when the ensemble is empty."""
        return sum(m.score for m in self.models) / len(self) if len(self) > 0 else None

    @property
    def best_score(self) -> float:
        """Lowest member score, or None when the ensemble is empty."""
        return min(m.score for m in self.models) if len(self) > 0 else None

    @property
    def best_model(self) -> 'Sequential':
        """Model of the first member holding the lowest score.

        Raises:
            IndexError: if the ensemble is empty.
        """
        if not self.models:
            raise IndexError('Ensemble has no models.')
        return min(self.models, key=lambda m: m.score).model

    # adds a model to the collection. if limit is set, will reject new models below the mean (when full)
    def add(self, model:'ModelProfile') -> bool:
        """Append ``model`` unless the ensemble is full and the model scores worse than the mean.

        Returns True when the model was added, False when it was rejected.
        """
        if self.target_size and len(self) >= self.target_size and model.score > self.mean_score:
            return False
        self.models.append(model)
        return True

    # removes all models above the current mean. if limit is passed or target size is set, repeats until that size (or 1) is reached
    def prune(self, limit:int=None, inplace:bool=False) -> 'Ensemble':
        """Return a new Ensemble holding only members scoring at or below the mean.

        Args:
            limit: desired maximum size; defaults to ``target_size``. While more
                than ``limit`` members remain (and ``limit > 1``) the pruning
                pass is repeated. A pass may leave fewer than ``limit`` members.
            inplace: also replace this ensemble's member list with the result.

        Pruning stops as soon as a pass removes nothing (e.g. all remaining
        scores are equal), so the call always terminates.
        """
        limit = limit or self.target_size
        kept = list(self.models)
        while kept:
            mean = sum(m.score for m in kept) / len(kept)
            # max() guards against float rounding putting the mean just below every score
            cutoff = max(mean, min(m.score for m in kept))
            survivors = [m for m in kept if m.score <= cutoff]
            stalled = len(survivors) == len(kept)
            kept = survivors
            if stalled or limit is None or not (len(kept) > limit > 1):
                break
        pruned = Ensemble(kept)
        if inplace: self.models = pruned.models
        return pruned

    # wrapper for Model.predict(). calls each of the models and returns the average prediction. kwargs is only for compat
    def predict(self, X:pd.DataFrame, **kwargs) -> pd.DataFrame:
        """Average the members' predictions for ``X``.

        Each member is called with its own ``predict_kw``; ``kwargs`` is accepted
        for signature compatibility and ignored. Returns a frame indexed like
        ``X`` with a single ``pred`` column.
        """
        y = pd.DataFrame(index=X.index)
        y['pred'] = 0.0
        for m in self.models:
            m_pred = m.model.predict(X, **m.predict_kw)
            y['pred'] = y['pred'] + m_pred.reshape(-1) # tensorflow
        y['pred'] = y['pred'] / len(self)
        return y

    def __len__(self) -> int:
        return len(self.models)

    def __repr__(self) -> str:
        def fmt(score) -> str: # scores are None while the ensemble is empty
            return 'n/a' if score is None else f'{score:.6f}'
        return f'<Ensemble ({len(self)} model(s); mean_score={fmt(self.mean_score)}; best_score={fmt(self.best_score)}; target_size={self.target_size})>'

In [None]:
class TrainingError(Exception):
    """Raised by score_model when a model's predictions are constant or contain NaN."""

def score_model(model:Sequential|Ensemble, predict_kw:dict=None, X_test:pd.DataFrame=None, y_test:pd.Series=None) -> float:
    """Return the mean absolute error of ``model`` on a test set.

    Args:
        model: anything exposing ``predict(X, **kwargs)``.
        predict_kw: keyword arguments forwarded to ``model.predict``.
        X_test, y_test: evaluation data; when either is missing, both are
            loaded with ``load_vars(testing=True)``.

    Raises:
        TrainingError: if the predictions contain NaN or are all the same value.
    """
    if X_test is None or y_test is None:
        X_test, y_test = load_vars(testing=True)
    y_pred = model.predict(X_test, **(predict_kw or {}))
    values = np.asarray(y_pred) # ndarray from a Keras model, DataFrame from an Ensemble
    # sometimes the model predicts NaN for some rows, which breaks mean_absolute_error << -- only when using additional features
    # Checked first: np.unique can collapse NaNs into a single value, which would
    # otherwise report an all-NaN prediction as "constant".
    if np.isnan(values).any():
        raise TrainingError('Model is guessing NaN.')
    if len(np.unique(values)) == 1:
        raise TrainingError('Model is guessing a constant value.')
    return mean_absolute_error(y_test, y_pred)

# Accepts a list of models and returns an ensemble of the best performers.
# An existing Ensemble can be passed in as well, which will have its models updated inplace.
def train_ensemble(models:list[Sequential], folds:int=5, ensemble:Ensemble=None, ignore_errors:bool=True) -> Ensemble:
    """Train each model across time-series folds and collect per-fold snapshots.

    Every model is compiled once (Adam, MAE loss) and then fitted fold after
    fold on the same instance, so weights carry over between folds. After each
    fold a weight-copied clone is offered to the ensemble together with its
    MAE on the hold-out test set.

    Args:
        models: uncompiled models to train.
        folds: number of TimeSeriesSplit folds.
        ensemble: existing Ensemble to extend in place; a new one is created
            when omitted.
        ignore_errors: when True, non-TrainingError exceptions in a fold are
            printed and the fold is skipped; when False they are re-raised.

    Returns:
        The ensemble that received the accepted snapshots.

    NOTE(review): snapshots are ranked by their score on the hold-out test
    set, so that score is no longer an unbiased estimate - confirm this is
    intended.
    """

    print('Pre-training setup...', end='\r')
    # `ensemble or Ensemble()` would discard a caller-supplied *empty* Ensemble,
    # because Ensemble defines __len__ and an empty one is falsy.
    if ensemble is None:
        ensemble = Ensemble()
    cv = TimeSeriesSplit(folds)
    X, y = load_vars()
    X_test, y_test = load_vars(testing=True)

    for j, model in enumerate(models):
        model.compile(optimizer='adam', loss='mae')
        for i, (train, valid) in enumerate(cv.split(X)):
            msg = f'Training model: {(model.name or j)}: Fold {i + 1}/{folds}'

            try: # sometimes a training round can fail, but I don't want to give up on the whole ensemble
                print(f'{msg} - Running...', end='\r')
                # cv.split yields positions, so index positionally; y[...] would be
                # interpreted as labels on a label-indexed Series.
                X_valid, y_valid = X.iloc[valid, :], y.iloc[valid]

                keras_kw = dict(batch_size=256, verbose=0)
                model.fit(X.iloc[train, :], y.iloc[train], **keras_kw, validation_data=(X_valid, y_valid))

                mae_valid = score_model(model, keras_kw, X_valid, y_valid)
                mae_test = score_model(model, keras_kw, X_test, y_test)

                del X_valid, y_valid
                print(f'{msg} - Complete.  \n\tValid MAE:  {mae_valid}\n\tTest MAE:   {mae_test}')

                clone = clone_model(model) # https://stackoverflow.com/a/48552179/3178898
                clone.set_weights(model.get_weights())
                clone._name = f'{model.name}_{i}' # https://stackoverflow.com/a/63853924/3178898

                if ensemble.add(ModelProfile(clone, mae_test, keras_kw)):
                    print(f'Model accepted: {ensemble}')
                else:
                    print(f'Model rejected: {ensemble}')

            except TrainingError as e: # these don't get better, stop trying to train this model
                print(f'{msg} - Stopped: {e}')
                break

            except Exception as e: # these are usually out of memory errors, but generally want to skip these
                if not ignore_errors: raise # ...generally
                print(f'{msg} - Error: {type(e).__name__}: {e}')

            finally: gc.collect() # memory is still at a premium

    return ensemble

In [None]:
NN_DROPOUT = 0.5
NN_ACTIVATION_1 = 'tanh'
NN_ACTIVATION_2 = 'relu'
keras.utils.set_random_seed(25)

def _dense_block(units, activation, **dense_kw):
    """One stage of the network: L1-regularised Dense -> Dropout -> BatchNormalization."""
    return [
        layers.Dense(units, kernel_regularizer=regularizers.l1(0.001), activation=activation, **dense_kw),
        layers.Dropout(NN_DROPOUT),
        layers.BatchNormalization(),
    ]

def _build_mlp(name, hidden_units):
    """Build a Sequential: an N_FEATURES-wide input stage, one stage per entry of ``hidden_units``, then Dense(1)."""
    stack = _dense_block(N_FEATURES, NN_ACTIVATION_1, input_shape=[N_FEATURES])
    for units in hidden_units:
        stack += _dense_block(units, NN_ACTIVATION_2)
    stack.append(layers.Dense(1))
    return Sequential(stack, name=name)

# Hidden-stage widths per architecture, in the order the models are built and trained.
_HIDDEN_LAYOUTS = {
    'octo': [N_FEATURES//8],
    'quad': [N_FEATURES//4],
    'duce': [N_FEATURES//2],
    'mono': [N_FEATURES],
    'deep': [N_FEATURES//2, N_FEATURES//4, N_FEATURES//8],
}

models = [_build_mlp(name, hidden) for name, hidden in _HIDDEN_LAYOUTS.items()]

ensemble = train_ensemble(models)

In [None]:
pruned = ensemble.prune()
top_3 = ensemble.prune(3)

# Load the hold-out set once and reuse it: without explicit data, score_model()
# calls load_vars(testing=True) again on every invocation.
X_holdout, y_holdout = load_vars(testing=True)

e_score = score_model(ensemble, X_test=X_holdout, y_test=y_holdout)
p_score = score_model(pruned, X_test=X_holdout, y_test=y_holdout)
t_score = score_model(top_3, X_test=X_holdout, y_test=y_holdout)
b_score = score_model(ensemble.best_model, X_test=X_holdout, y_test=y_holdout)

In [None]:
# Hold-out MAE of the full, unpruned ensemble (e_score from the scoring cell).
print(f'Ensemble Score:\t{e_score}')
# [m.model.name for m in ensemble.models]

In [None]:
# Hold-out MAE of the ensemble after one prune() pass, followed by the names of the surviving models.
print(f'Pruned Score:\t{p_score}')
[m.model.name for m in pruned.models]

In [None]:
# Hold-out MAE of the ensemble pruned with limit 3, followed by the names of its models.
print(f'Top 3 Score:\t{t_score}')
[m.model.name for m in top_3.models]

In [None]:
# Hold-out MAE of the single lowest-scoring model, followed by its name.
print(f'Best Model Score:\t{b_score}')
ensemble.best_model.name

In [None]:
# Manual gate before the submission loop: after reviewing the scores above, set
# `model` to the predictor to submit (ensemble, pruned, top_3 or ensemble.best_model).
# While it is left as None, execution stops here for manual evaluation.
model = None
if model is None:
    raise RuntimeError('Stopped for manual eval: assign `model` in this cell before running the submission loop.')

In [None]:
# submission compat check
import optiver2023 # competition API; presumably only importable inside the Kaggle environment, hence not in the import cell
env = optiver2023.make_env()
iter_test = env.iter_test()

for (test, revealed_targets, _) in iter_test:
    X_test = preprocess(test)
    y_pred = model.predict(X_test, verbose=0)
    # preprocess() sorts the rows and indexes them by row_id, so predictions come
    # back in that order. Map them to the submission by row_id rather than by
    # position, which would only be correct if `test` were already sorted.
    # np.asarray + reshape handles both a Keras (n, 1) array and an Ensemble frame.
    preds = pd.Series(np.asarray(y_pred).reshape(-1), index=X_test.index)
    submission = test[['row_id']].reset_index(drop=True)
    submission['target'] = preds.reindex(submission['row_id']).to_numpy()
    env.predict(submission)

In [None]:
# Read back the submission file: Kaggle working directory first, local .data directory as fallback.
# A FileNotFoundError from the fallback read is not caught.
try:
    res = pd.read_csv('/kaggle/working/submission.csv') # sanity check
except FileNotFoundError:
    res = pd.read_csv('./.data/submission.csv')
res # last expression, shown as the cell output