In [1]:
# TODO:
#   detect whether local or kaggle and skip training on the latter
#   save/load models from file while training
#   support generic model protocol
#   use the engineered features everyone else is
#   auto-select model by score
#   (stretch) implement windowing with the engineered features
#   (stretch) implement RNN https://www.tensorflow.org/tutorials/structured_data/time_series#recurrent_neural_network
#   (stretch) voting ensemble
#   (stretch) stacking ensemble https://scikit-learn.org/stable/modules/ensemble.html#stacked-generalization

In [2]:
import os
import gc
import time
import typing
import joblib
import warnings
import itertools
warnings.simplefilter('ignore') # should precede pandas import
import pandas as pd
import numpy as np
import numba as nb
import lightgbm as lgb
import sklearn.svm as svm
import sklearn.impute as imp
import sklearn.metrics as met
import sklearn.model_selection as sel
import matplotlib.pyplot as plt
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # must precede tf import
import tensorflow as tf
tf.keras.utils.disable_interactive_logging()
tf.keras.utils.set_random_seed(25)
if len(tf.config.list_physical_devices('GPU')) == 0: print('No GPU detected.')

In [3]:
def reduce_mem_usage(df:pd.DataFrame, verbose:bool=False) -> pd.DataFrame:
    if verbose: mem_start = df.memory_usage().sum()
    for col in df.columns:
        match df[col].dtype:
            case 'object':
                continue
            case 'int32' | 'int64':
                for int_size in [np.int8, np.int16, np.int32]:
                    if df[col].min() > np.iinfo(int_size).min and df[col].max() < np.iinfo(int_size).max:
                        df[col] = df[col].astype(int_size)
            case 'float32' | 'float64':
                for float_size in [np.float16, np.float32]:
                    if df[col].min() > np.finfo(float_size).min and df[col].max() < np.finfo(float_size).max:
                        df[col] = df[col].astype(float_size)
            case _:
                raise Exception(df[col].dtype)
    if verbose:
        mem_end = df.memory_usage().sum()
        print(f'DataFrame memory reduced from {mem_start} to {mem_end}.')
    return df

try: # load data and identify local or kaggle setup
    df = pd.read_csv('.data/train.csv')
    LOCAL = True
except FileNotFoundError:
    df = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
    LOCAL = False
df = df.dropna(subset=['target'])
df = df.reset_index(drop=True)
df = reduce_mem_usage(df)
df

In [5]:
@nb.njit(parallel=True)
def compute_triplet_imbalance(df_values:np.ndarray, combo_indices:list[tuple[int, int, int]]) -> np.ndarray:
    num_rows = df_values.shape[0]
    num_combinations = len(combo_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in nb.prange(num_combinations): # used instead of enumerate(combo_indices) to run in parallel
        a, b, c = combo_indices[i]
        for j in nb.prange(num_rows):
            _a, _b, _c = df_values[j, a], df_values[j, b], df_values[j, c]
            max_val = max(_a, _b, _c)
            min_val = min(_a, _b, _c)
            mid_val = sum([_a, _b, _c])-max_val-min_val
            imbalance_features[j, i] = np.nan if mid_val == min_val else (max_val-mid_val)/(mid_val-min_val)
    return imbalance_features   

def calculate_triplet_imbalance_numba(cols:list[str], df:pd.DataFrame) -> pd.DataFrame:
    df_values = df[cols].values
    combo_indices = []
    columns = []
    for a, b, c in itertools.combinations(cols, 3):
        combo_indices.append(tuple([cols.index(col) for col in [a, b, c]]))
        columns.append(f'{a}_{b}_{c}_imbalance')
    features_array = compute_triplet_imbalance(df_values, combo_indices)
    features = pd.DataFrame(features_array, columns=columns)
    return features

def imbalance_features(df:pd.DataFrame) -> pd.DataFrame:
    prices = [*[col for col in df.columns if 'price' in col], 'wap']
    sizes = [col for col in df.columns if 'size' in col]
    df['volume'] = df.eval('ask_size+bid_size')
    df['mid_price'] = df.eval('(ask_price+bid_price)/2')
    df['liquidity_imbalance'] = df.eval('(bid_size-ask_size)/volume')
    df['matched_imbalance'] = df.eval('(imbalance_size-matched_size)/(imbalance_size+matched_size)')
    df['size_imbalance'] = df.eval('bid_size/ask_size')
    df['imbalance_momentum'] = df.groupby('stock_id').imbalance_size.diff(periods=1) / df.matched_size
    df['price_spread'] = df.eval('ask_price-bid_price')
    df['spread_intensity'] = df.groupby('stock_id').price_spread.diff()
    df['price_pressure'] = df.eval('imbalance_size*price_spread')
    df['market_urgency'] = df.eval('price_spread*liquidity_imbalance')
    df['depth_pressure'] = df.eval('(ask_size-bid_size)*(far_price-near_price)')
    for cols in itertools.combinations(prices, 2):
        df[f'{cols[0]}_{cols[1]}_imbalance'] = df.eval(f'({cols[0]}-{cols[1]})/({cols[0]}+{cols[1]})')
    for cols in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(cols, df)
        df[triplet_feature.columns] = triplet_feature.values
    for func in ['mean', 'std', 'skew', 'kurt']:
        df[f'all_prices_{func}'] = df[prices].agg(func, axis=1)
        df[f'all_sizes_{func}'] = df[sizes].agg(func, axis=1)
    for win in [1, 2, 3, 10]:
        for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
            df[f'{col}_shift_{win}'] = df.groupby('stock_id')[col].shift(win)
            df[f'{col}_pct_{win}'] = df.groupby('stock_id')[col].pct_change(win)
        for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']:
            df[f'{col}_diff_{win}'] = df.groupby('stock_id')[col].diff(win)
    df = df.replace([np.inf, -np.inf], 0)
    return df

def other_features(df:pd.DataFrame) -> pd.DataFrame:
    df['weekday'] = df.eval('date_id%5')
    df['seconds'] = df.eval('seconds_in_bucket%60')
    df['minutes'] = df.eval('seconds_in_bucket//60')
    # TODO: add sin/cos features (priority -1)
    gdf = df.groupby('stock_id')
    global_stock_id_feats = dict[str, pd.DataFrame]()
    for feat in ['size', 'price']:
        bid, ask = f'bid_{feat}', f'ask_{feat}'
        global_stock_id_feats[f'median_{feat}'] = gdf[bid].median()-gdf[ask].median()
        global_stock_id_feats[f'std_{feat}'] = gdf[bid].std()-gdf[ask].std()
        global_stock_id_feats[f'ptp_{feat}'] = gdf[bid].max()-gdf[ask].min() # TODO: why 'ptp'?
    for k, v in global_stock_id_feats.items():
        df[f'global_{k}'] = df.stock_id.map(v.to_dict())
    return df

def generate_features(df:pd.DataFrame) -> pd.DataFrame:
    cols = [col for col in df.columns if col not in ['row_id', 'time_id', 'target']]
    df = df[cols]
    print('Building imbalance features...', end='\r')
    df = imbalance_features(df)
    print('Building other features...    ', end='\r')
    df = other_features(df)
    gc.collect()
    features = [col for col in df.columns if col not in ['row_id', 'time_id', 'target', 'date_id']]
    df = df[features]
    print(f'Done. Total features in dataset: {len(df.columns)}')
    return df

generate_features(df)

In [9]:
model = tf.keras.Model()

In [10]:
import optiver2023 # submission compat check
env = optiver2023.make_env()
iter_test = env.iter_test()

for i, (_test, _, _) in enumerate(iter_test):
    X_test = generate_features(_test)
    y_pred = model.predict(X_test, verbose=0)
    submission = pd.DataFrame(columns=['target'], index=X_test.index)
    submission['target'] = y_pred
    # TODO: reformat for submission
    assert all(x in submission.columns for x in ['row_id', 'target']) # sanity check 2
    env.predict(submission)

try:
    res = pd.read_csv('/kaggle/working/submission.csv') # sanity check 1
except FileNotFoundError:
    res = pd.read_csv('./.data/submission.csv')
res

Done. Total features in dataset: 124


NotImplementedError: in user code:

    File "/mnt/Data/Repos/kaggle/.cuda/lib/python3.10/site-packages/keras/src/engine/training.py", line 2440, in predict_function  *
        return step_function(self, iterator)
    File "/mnt/Data/Repos/kaggle/.cuda/lib/python3.10/site-packages/keras/src/engine/training.py", line 2425, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/mnt/Data/Repos/kaggle/.cuda/lib/python3.10/site-packages/keras/src/engine/training.py", line 2413, in run_step  **
        outputs = model.predict_step(data)
    File "/mnt/Data/Repos/kaggle/.cuda/lib/python3.10/site-packages/keras/src/engine/training.py", line 2381, in predict_step
        return self(x, training=False)
    File "/mnt/Data/Repos/kaggle/.cuda/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/mnt/Data/Repos/kaggle/.cuda/lib/python3.10/site-packages/keras/src/engine/training.py", line 617, in call
        raise NotImplementedError(

    NotImplementedError: Exception encountered when calling layer 'model' (type Model).
    
    Unimplemented `tf.keras.Model.call()`: if you intend to create a `Model` with the Functional API, please provide `inputs` and `outputs` arguments. Otherwise, subclass `Model` with an overridden `call()` method.
    
    Call arguments received by layer 'model' (type Model):
      • inputs=tf.Tensor(shape=(None, 124), dtype=float32)
      • training=False
      • mask=None


^ stopping here again. Going to start fresh from my best attempts (v13/14) then try to add these rewrites back in.