In [1]:
import gc
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # https://stackoverflow.com/q/40426502/3178898
try: # got tired of changing code between local and kaggle setup
    import cudf.pandas
    cudf.pandas.install() # must be called before pandas import
except ModuleNotFoundError:
    print('Module `cudf` not installed. Continuing with CPU dataframes.')
import pandas as pd
import numpy as np
import tensorflow as tf # https://github.com/tensorflow/tensorflow/issues/62075
tf.keras.utils.disable_interactive_logging() # https://stackoverflow.com/a/76863355/3178898
tf.keras.utils.set_random_seed(25)
if len(tf.config.list_physical_devices('GPU')) == 0: print('No GPU detected.')
from sklearn.model_selection import TimeSeriesSplit

In [2]:
DATA_TRAIN = '.data/train.csv'
DATA_TEST_X = '.data/test.csv'
DATA_TEST_Y = '.data/revealed_targets.csv'

KAGGLE_DATA_TRAIN = '/kaggle/input/optiver-trading-at-the-close/train.csv'
KAGGLE_DATA_TEST_X = '/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv'
KAGGLE_DATA_TEST_Y = '/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv'

DROPS = ['index', 'time_id', 'currently_scored', 'time_id_x', 'time_id_y', 'revealed_date_id', 'revealed_time_id']
SORTS = ['stock_id', 'date_id', 'seconds_in_bucket'] # order matters here
INDEX = 'row_id'
SKIPS = ['imbalance_buy_sell_flag']
TEMPS = list(zip(SORTS[-2:], [252, 540])) # 252 trading days/year, 540 seconds/bucket

def preprocess(data:pd.DataFrame) -> pd.DataFrame: # separate from load_data() for submission compat
    data = data.dropna(subset=['target'])                # some rows have null targets
    data = data.reset_index().set_index(INDEX)           # setup index for re-join
    skip = data[[*SORTS, 'target', *SKIPS]]              # some columns we don't want to process
    data = data.drop([col for col in [*DROPS, *SKIPS, 'target'] if col in data.columns], axis=1)
    data = data.groupby(SORTS).ffill().fillna(0)         # "impute" - groupby() ensures ffill() is per-stock, per-day; also removes SORTS from data
    data = (data - data.mean()) / data.std()             # normalize - done after impute so NaN isn't set to the mean
    data = pd.concat([skip, data], axis=1, join='inner') # re-join; adds back SORTS/TEMPS
    data = data.sort_values(by=SORTS[1:])                # re-sort by date
    for feature, period in TEMPS:                        # encode time feature(s) as sin/cos waves
        temp = data.pop(feature)                         # pop() removes TEMPS from data
        data[f'{feature}_sin'] = np.sin((temp * 2 * np.pi / period))
        data[f'{feature}_cos'] = np.cos((temp * 2 * np.pi / period))
    return data

def load_data(test:bool=False) -> pd.DataFrame: # tries to load local data, or defaults to kaggle setup. test=True fetches the test set
    def read_data(train, test_x, test_y):
        if test:
            data = pd.merge(*[pd.read_csv(path) for path in [test_x, test_y]], on=SORTS) # revealed_targets.csv doesn't have row_id
            data = data.rename(columns={'revealed_target':'target'}) # makes life easier
        else:
            data = pd.read_csv(train, index_col=INDEX)
        return data
    try:
        data = read_data(DATA_TRAIN, DATA_TEST_X, DATA_TEST_Y)
    except FileNotFoundError:
        data = read_data(KAGGLE_DATA_TRAIN, KAGGLE_DATA_TEST_X, KAGGLE_DATA_TEST_Y)
    data = preprocess(data)
    return data

In [3]:
data = load_data()
data

Unnamed: 0_level_0,stock_id,target,imbalance_buy_sell_flag,imbalance_size,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,date_id_sin,date_id_cos,seconds_in_bucket_sin,seconds_in_bucket_cos
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0_0_0,0,-3.029704,1,-0.123542,-0.028161,-0.226823,-0.646266,-0.912726,0.019781,0.079311,-0.037994,-0.348522,0.005930,0.00000,1.000000,0.000000e+00,1.0
0_0_1,1,-5.519986,-1,-0.270454,-0.013221,-0.310762,-0.646266,-0.912726,0.034764,-0.436012,0.074943,-0.254889,0.005930,0.00000,1.000000,0.000000e+00,1.0
0_0_2,2,-8.389950,-1,-0.263812,-0.072802,-0.309495,-0.646266,-0.912726,-0.053170,-0.124378,0.010458,-0.267336,0.005930,0.00000,1.000000,0.000000e+00,1.0
0_0_3,3,-4.010200,-1,0.302332,0.035689,-0.191000,-0.646266,-0.912726,0.053135,-0.444163,-0.004505,3.289022,0.005930,0.00000,1.000000,0.000000e+00,1.0
0_0_4,4,-7.349849,-1,-0.256760,-0.077960,-0.194784,-0.646266,-0.912726,-0.054775,-0.317073,-0.039776,-0.410822,0.005930,0.00000,1.000000,0.000000e+00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480_540_195,195,2.310276,-1,-0.159606,0.061655,-0.120272,0.795246,1.095470,0.109855,-0.175525,0.034685,2.058545,0.064429,-0.56332,0.826239,-2.449294e-16,1.0
480_540_196,196,-8.220077,-1,-0.261539,0.119636,-0.256804,0.795816,1.096780,0.168001,1.375794,0.117696,0.307805,0.152000,-0.56332,0.826239,-2.449294e-16,1.0
480_540_197,197,1.169443,0,-0.278575,-0.743668,-0.231506,0.789558,1.087546,-0.697776,-0.314334,-0.776008,0.977623,-0.743682,-0.56332,0.826239,-2.449294e-16,1.0
480_540_198,198,-1.540184,1,-0.229788,-0.135229,0.355222,0.794491,1.094417,-0.130401,0.662501,-0.183352,4.764486,-0.170995,-0.56332,0.826239,-2.449294e-16,1.0


In [4]:
test = load_data(test=True)
test

Unnamed: 0_level_0,stock_id,target,imbalance_buy_sell_flag,imbalance_size,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,date_id_sin,date_id_cos,seconds_in_bucket_sin,seconds_in_bucket_cos
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
478_0_160,160,0.449419,1,2.509415,-0.270133,0.033432,-0.899687,-0.912680,-0.114105,-0.404918,-0.119834,-0.379340,-0.083726,-0.603804,0.797133,0.000000e+00,1.0
478_0_161,161,-5.189776,-1,-0.334367,-1.063632,-0.295342,-0.899687,-0.912680,-0.228545,-0.506445,0.218157,-0.355971,-0.083726,-0.603804,0.797133,0.000000e+00,1.0
478_0_162,162,-23.009777,1,-0.213917,-0.207712,-0.250358,-0.899687,-0.912680,-0.538723,-0.139692,-0.214928,-0.466671,-0.083726,-0.603804,0.797133,0.000000e+00,1.0
478_0_163,163,1.720190,1,0.041598,0.105985,-0.032865,-0.899687,-0.912680,-0.095982,-0.097991,-0.019849,0.044164,-0.083726,-0.603804,0.797133,0.000000e+00,1.0
478_0_164,164,1.579523,1,-0.077026,-0.451051,-0.254508,-0.899687,-0.912680,-0.147246,-0.418682,-0.119834,-0.399967,-0.083726,-0.603804,0.797133,0.000000e+00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480_540_178,178,-1.140237,-1,-0.278900,-2.257584,-0.197262,1.104102,1.089601,-2.088059,-0.081346,-2.378175,3.971272,-2.317272,-0.563320,0.826239,-2.449294e-16,1.0
480_540_179,179,-1.929998,1,0.118839,-2.200981,1.393631,1.106823,1.091206,-2.084952,3.088303,-2.399910,0.598491,-2.280817,-0.563320,0.826239,-2.449294e-16,1.0
480_540_169,169,-1.569986,0,-0.359370,0.109159,-0.221418,1.114593,1.099786,0.228695,-0.482723,0.157840,-0.053187,0.143709,-0.563320,0.826239,-2.449294e-16,1.0
480_540_173,173,2.059937,-1,-0.343101,0.272091,-0.243136,1.115110,1.100303,0.335368,2.658583,0.252390,-0.153669,0.370601,-0.563320,0.826239,-2.449294e-16,1.0


In [5]:
class WindowGenerator: # https://www.tensorflow.org/tutorials/structured_data/time_series#data_windowing
    def __init__(self, lags:int, train:pd.DataFrame, valid:pd.DataFrame, test:pd.DataFrame) -> None:
        self._train = train
        self._valid = valid
        self._test = test
        self._column_indices = {name: i for i, name in enumerate(train.columns)}
        self._size = lags + 1 # prediction is for t+1
        self._input_slice = slice(0, lags)
        self._label_slice = slice(lags, None)
        indices = np.arange(self._size)
        self._input_indices = indices[self._input_slice]
        self._label_indices = indices[self._label_slice]
        del indices
    
    def __repr__(self) -> str:
        return '\n'.join([f'Window size: {self._size}', f'Input indices: {self._input_indices}', f'Label indices: {self._label_indices}'])
    
    def split_features(self, features:tf.Tensor) -> tuple[tf.Tensor, tf.Tensor]:
        inputs = features[:, self._input_slice, :]
        labels = features[:, self._label_slice, :]
        labels = tf.stack([labels[:, :, self._column_indices['target']]], axis=-1)
        labels.set_shape([None, 1, None])
        inputs.set_shape([None, self._size - 1, None])
        return inputs, labels
    
    def make_dataset(self, df:pd.DataFrame) -> tf.data.Dataset:
        data = np.array(df, dtype=np.float32)
        data = tf.keras.utils.timeseries_dataset_from_array(data=data, targets=None, sequence_length=self._size, shuffle=True, batch_size=200)
        data = data.map(self.split_features)
        return data
    
    @property
    def train(self) -> tf.data.Dataset:
        return self.make_dataset(self._train)
    
    @property
    def valid(self) -> tf.data.Dataset:
        return self.make_dataset(self._valid)
    
    @property
    def test(self) -> tf.data.Dataset:
        return self.make_dataset(self._test)
    
    @property
    def example(self) -> tuple[tf.Tensor, tf.Tensor]:
        res = getattr(self, '_example', None) # check cache
        if res is None:                       # if empty,
            res = next(iter(self.train))      # ...generate example
            self._example = res               # ...and cache it
        return res

In [6]:
nn = tf.keras.Sequential
layers = tf.keras.layers

def compile_model(model:tf.keras.Model, patience:int=2) -> dict: # returns kwargs for `model.fit()`
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='min') # TODO: customize
    model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=tf.keras.optimizers.Adam(), metrics=[tf.keras.metrics.MeanAbsoluteError()]) # TODO: customize
    return dict(callbacks=[early_stop])

models = [
    nn([layers.Dense(1)], name='linear'),
    nn([
        layers.Dense(64, activation='tanh'),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)
    ], name='dense'),
    nn([
        layers.Flatten(), # shape: (time, features) => (time * features)
        layers.Dense(32, activation='tanh'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
        layers.Reshape([1, -1]) # shape: (outputs) => (1, outputs)
    ], name='dense_multi'),
    nn([ 
        layers.Conv1D(32, kernel_size=(200,), activation='tanh'), # kernel_size must match the window lags
        layers.Dense(32, activation='relu'),
        layers.Dense(1)
    ], name='conv'),
    nn([ 
        layers.LSTM(32, return_sequences=True),
        layers.Dense(1)
    ], name='lstm'),
]

fit_kw = {m.name: compile_model(m) for m in models}
scores = [] # https://stackoverflow.com/a/17496530/3178898
splits = 5
for i, (i_train, i_valid) in enumerate(TimeSeriesSplit(splits).split(data)):
    print(f'Training: Fold {i+1}/{splits}...', end='\r')
    win = WindowGenerator(lags=200, train=data.iloc[i_train, :], valid=data.iloc[i_valid, :], test=test)
    for j, m in enumerate(models):
        m.fit(win.train, validation_data=win.valid, **fit_kw[m.name])
        v_mse, v_mae = m.evaluate(win.valid)
        t_mse, t_mae = m.evaluate(win.test)
        scores.append({'model_name': m.name, 'model_index': j,'fold': i+1, 'valid_mse': v_mse, 'valid_mae': v_mae, 'test_mse': t_mse, 'test_mae': t_mae})
print(f'Training: Complete. Model scores:')
pd.DataFrame(scores).sort_values(by='model_index')

Training: Fold 1/5...

I0000 00:00:1701058573.120934   52685 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Training: Complete. Model scores:


Unnamed: 0,model_name,model_index,fold,valid_mse,valid_mae,test_mse,test_mae
0,linear,0,1,117.633308,7.478696,64.435913,5.395017
5,linear,0,2,100.535522,6.908837,64.429153,5.394599
10,linear,0,3,79.547737,6.187143,64.319267,5.386766
15,linear,0,4,90.379677,6.427221,64.426735,5.394623
20,linear,0,5,79.970467,5.990577,64.369118,5.39068
1,dense,1,1,117.385651,7.46521,64.23674,5.381098
6,dense,1,2,100.489479,6.903286,64.285072,5.383888
11,dense,1,3,79.558762,6.187398,64.29837,5.384478
16,dense,1,4,90.309212,6.421356,64.296951,5.383964
21,dense,1,5,79.949661,5.988527,64.301178,5.385251
