Just a refactor of the code everyone else is using.

In [1]:
import gc
import os
import time
import typing
import joblib
import warnings
import itertools
import pandas as pd
import numpy as np
import numba as nb
import lightgbm as lgb
import sklearn.svm as svm
import sklearn.impute as imp
import sklearn.metrics as met
import sklearn.model_selection as sel
warnings.simplefilter('ignore')

In [2]:
# Run-mode switches read by the cells below.
is_offline = False # True: rows with date_id > split_day are held out as df_valid; False: every row goes to df_train
is_train = True # True: build the training feature matrix (df_train_feats)
is_infer = True # True: run the inference cell at the end of the notebook
split_day = 435 # last date_id kept in df_train when is_offline is True

In [3]:
# Read the training set from the local data folder, falling back to the Kaggle input path.
try:
    df = pd.read_csv('.data/train.csv')
except FileNotFoundError:
    df = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
# Rows without a target are dropped and the index is renumbered from 0.
df = df.dropna(subset=['target']).reset_index(drop=True)
df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237887,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237888,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237889,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237890,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


In [73]:
def reduce_mem_usage(df:pd.DataFrame, verbose:int=0) -> pd.DataFrame:
    for col in df.columns:
        match df[col].dtype:
            case 'object':
                continue
            case 'int32' | 'int64':
                for int_size in [np.int8, np.int16, np.int32]:
                    if df[col].min() > np.iinfo(int_size).min and df[col].max() < np.iinfo(int_size).max:
                        df[col] = df[col].astype(int_size)
            case 'float32' | 'float64':
                for float_size in [np.float16, np.float32]:
                    if df[col].min() > np.finfo(float_size).min and df[col].max() < np.finfo(float_size).max:
                        df[col] = df[col].astype(float_size)
            case _:
                raise Exception(df[col].dtype)
    return df

In [74]:
@nb.njit(parallel=True)
def compute_triplet_imbalance(df_values:np.ndarray, combo_indices:list[tuple[int, int, int]]) -> np.ndarray:
    """Compute (max-mid)/(mid-min) over three columns, for every row and every triplet.

    Args:
        df_values: 2-D array indexed as [row, column].
        combo_indices: one (a, b, c) tuple of column positions in `df_values` per output column.

    Returns:
        Array with one row per row of `df_values` and one column per triplet. An entry is
        NaN when the middle and the smallest of the three values are equal.
    """
    n_rows = df_values.shape[0]
    n_combos = len(combo_indices)
    out = np.empty((n_rows, n_combos))
    for k in nb.prange(n_combos): # index loop rather than enumerate(combo_indices) so it can run in parallel
        col_a, col_b, col_c = combo_indices[k]
        for r in nb.prange(n_rows):
            x = df_values[r, col_a]
            y = df_values[r, col_b]
            z = df_values[r, col_c]
            hi = max(x, y, z)
            lo = min(x, y, z)
            mid = x + y + z - hi - lo # whatever is left once the largest and smallest are removed
            if mid == lo:
                out[r, k] = np.nan
            else:
                out[r, k] = (hi - mid) / (mid - lo)
    return out

def calculate_triplet_imbalance_numba(cols:list[str], df:pd.DataFrame) -> pd.DataFrame:
    """Build one triplet-imbalance column for every 3-combination of `cols`.

    Args:
        cols: names of the columns of `df` to combine.
        df: frame holding those columns.

    Returns:
        A new frame with one column per combination, named '<a>_<b>_<c>_imbalance',
        filled by compute_triplet_imbalance. It carries a default index, not `df`'s.
    """
    first_position = {}
    for position, name in enumerate(cols):
        first_position.setdefault(name, position) # first occurrence, as cols.index(name) would give
    triplets = list(itertools.combinations(cols, 3))
    combo_indices = [(first_position[a], first_position[b], first_position[c]) for a, b, c in triplets]
    columns = [f'{a}_{b}_{c}_imbalance' for a, b, c in triplets]
    features_array = compute_triplet_imbalance(df[cols].values, combo_indices)
    return pd.DataFrame(features_array, columns=columns)

@nb.njit(fastmath=True)
def rolling_average(arr:np.ndarray, window:int) -> np.ndarray:
    """Placeholder: the body only raises NotImplementedError.

    NOTE(review): judging by the name and signature this is presumably meant to return a
    rolling mean of `arr` over `window` elements - TODO confirm when implementing.
    """
    raise NotImplementedError # TODO: figure out why pd.rolling() wasn't used here instead

@nb.njit(parallel=True)
def compute_rolling_averages(df_values:np.ndarray, window_sizes:list[int]) -> np.ndarray:
    """Placeholder: the body only raises NotImplementedError.

    NOTE(review): presumably meant to apply a rolling average to `df_values` once per
    entry of `window_sizes` - TODO confirm the intended output layout when implementing.
    """
    raise NotImplementedError # TODO: again, figure out why pd.rolling() wasn't used

In [75]:
def imbalance_features(df:pd.DataFrame) -> pd.DataFrame:
    """Add order-book imbalance, spread, row-statistic and lag features to `df`.

    The new columns are written onto `df` itself, in a fixed order. The price and size
    column lists are taken from the columns present on entry, before anything is added.

    NOTE(review): every diff/shift/pct_change below groups by stock_id only, so
    neighbouring rows of a stock are compared regardless of their date_id - confirm
    that this is intended.

    Args:
        df: frame with the raw order-book columns and stock_id.

    Returns:
        A frame with all added columns in which +inf and -inf have been replaced by 0.
    """
    price_cols = [name for name in df.columns if 'price' in name] + ['wap']
    size_cols = [name for name in df.columns if 'size' in name]

    # single-expression book quantities; later expressions read columns created by earlier ones
    for name, expr in [
        ('volume', 'ask_size+bid_size'),
        ('mid_price', '(ask_price+bid_price)/2'),
        ('liquidity_imbalance', '(bid_size-ask_size)/volume'),
        ('matched_imbalance', '(imbalance_size-matched_size)/(imbalance_size+matched_size)'),
        ('size_imbalance', 'bid_size/ask_size'),
    ]:
        df[name] = df.eval(expr)

    # (x-y)/(x+y) for every pair of price columns
    for left, right in itertools.combinations(price_cols, 2):
        df[f'{left}_{right}_imbalance'] = df.eval(f'({left}-{right})/({left}+{right})')

    # triplet ratios, first over four price columns and then over the size columns
    for group in (['ask_price', 'bid_price', 'wap', 'reference_price'], size_cols):
        triplets = calculate_triplet_imbalance_numba(group, df)
        df[triplets.columns] = triplets.values

    df['imbalance_momentum'] = df.groupby('stock_id')['imbalance_size'].diff(periods=1) / df['matched_size']
    df['price_spread'] = df.eval('ask_price-bid_price')
    df['spread_intensity'] = df.groupby('stock_id')['price_spread'].diff()
    for name, expr in [
        ('price_pressure', 'imbalance_size*price_spread'),
        ('market_urgency', 'price_spread*liquidity_imbalance'),
        ('depth_pressure', '(ask_size-bid_size)*(far_price-near_price)'),
    ]:
        df[name] = df.eval(expr)

    # row-wise statistics across the price columns and across the size columns
    for stat in ('mean', 'std', 'skew', 'kurt'):
        df[f'all_prices_{stat}'] = df[price_cols].agg(stat, axis=1)
        df[f'all_sizes_{stat}'] = df[size_cols].agg(stat, axis=1)

    # per-stock lags, relative changes and differences over several window lengths
    lagged_cols = ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']
    diffed_cols = ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']
    for window in (1, 2, 3, 10):
        for name in lagged_cols:
            df[f'{name}_shift_{window}'] = df.groupby('stock_id')[name].shift(window)
            df[f'{name}_ret_{window}'] = df.groupby('stock_id')[name].pct_change(window) # why 'ret'?
        for name in diffed_cols:
            df[f'{name}_diff_{window}'] = df.groupby('stock_id')[name].diff(window)

    return df.replace([np.inf, -np.inf], 0)

def other_features(df:pd.DataFrame) -> pd.DataFrame:
    df['weekday'] = df.eval('date_id%5')
    df['seconds'] = df.eval('seconds_in_bucket%60')
    df['minutes'] = df.eval('seconds_in_bucket//60')
    gdf = df.groupby('stock_id')
    global_stock_id_feats = dict[str, pd.DataFrame]()
    for feat in ['size', 'price']:
        bid, ask = f'bid_{feat}', f'ask_{feat}'
        global_stock_id_feats[f'median_{feat}'] = gdf[bid].median()-gdf[ask].median()
        global_stock_id_feats[f'std_{feat}'] = gdf[bid].std()-gdf[ask].std()
        global_stock_id_feats[f'ptp_{feat}'] = gdf[bid].max()-gdf[ask].min() # TODO: why 'ptp'?
    for k, v in global_stock_id_feats.items():
        df[f'global_{k}'] = df.stock_id.map(v.to_dict())
    return df
        
def generate_all_features(df:pd.DataFrame) -> pd.DataFrame:
    """Run the whole feature pipeline and return only the feature columns.

    row_id, time_id and target are removed before the features are built; date_id is
    kept while they are built and removed from the result.

    Args:
        df: raw frame as loaded from train.csv.

    Returns:
        A frame holding the input columns that remain plus every engineered feature.
    """
    non_inputs = ['row_id', 'time_id', 'target']
    df = df[[name for name in df.columns if name not in non_inputs]]
    print('Building imbalance features...', end='\r')
    df = imbalance_features(df)
    print('Building other features...    ', end='\r')
    df = other_features(df)
    gc.collect()
    non_features = [*non_inputs, 'date_id']
    df = df[[name for name in df.columns if name not in non_features]]
    print(f'Done. Total features in dataset: {len(df.columns)}')
    return df

In [76]:
# Maps position 0..199 to a weight.
# NOTE(review): presumably per-stock index weights keyed by stock_id - TODO confirm;
# no visible cell reads this mapping.
weights = dict(enumerate([
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]))

In [77]:
# Choose the training rows: everything when online, only the early dates when validating offline.
if not is_offline:
    df_train = df
    print('Online mode')
else:
    df_train = df[df.date_id <= split_day]
    df_valid = df[df.date_id > split_day]
    print('Offline mode')
    print(f'train: {df_train.shape}, valid: {df_valid.shape}')

Online mode


In [78]:
# Build the training feature matrix and downcast it straight away.
if is_train:
    df_train_feats = reduce_mem_usage(generate_all_features(df_train))

Done. Total features in dataset: 124


In [79]:
# Cross-validate an RBF SVR on date-blocked folds, then fit a final model on every row.
# Fold i trains on the date block [i*fold_size, (i+1)*fold_size) and validates on the block after it.
# NOTE(review): scikit-learn documents SVR fit time as more than quadratic in the number of
# samples, so on the full training set this cell may not finish - consider subsampling.
model_save_path = os.path.join(os.getcwd(), '.models')
os.makedirs(model_save_path, exist_ok=True)

svr_params = {'kernel': 'rbf', 'C':1, 'gamma':'scale'}

imputer = imp.SimpleImputer(strategy='mean')
date_ids = df_train.date_id.values
num_folds = 5
fold_size = 480 // num_folds # width of one fold, in date_ids
gap = 5
purge = 2 # extra date_ids removed from training next to a train/validation boundary
models = []
scores = []
if is_train: # df_train_feats only exists when the feature cell ran with is_train set
    for i in range(num_folds): # TODO: reimplement with KFold or TimeSeriesSplit
        # train/test split
        start = i * fold_size
        end = start + fold_size
        valid_end = end + fold_size
        test_indices = (date_ids >= end) & (date_ids < valid_end)
        if i < num_folds - 1: # purge on all but the last fold # TODO: why?
            purged_start = end - purge
            purged_end = valid_end + gap + purge
            # Later dates are used for training too, but only beyond the validation block plus
            # the gap/purge margin. Resuming at end+gap+2 (as before) put most of the validation
            # block into the training set, so the fold was scored on rows it had been fitted on.
            train_indices = ((date_ids >= start) & (date_ids < purged_start)) | (date_ids > purged_end)
        else:
            train_indices = (date_ids >= start) & (date_ids < end)
        if not (train_indices.any() and test_indices.any()):
            # e.g. offline mode, where df_train stops at split_day and late folds have no rows
            print(f'Skipping fold {i+1}/{num_folds}: empty training or validation window.')
            continue
        df_fold_train = df_train_feats[train_indices]
        df_fold_train_target = df_train['target'][train_indices]
        df_fold_valid = df_train_feats[test_indices]
        df_fold_valid_target = df_train['target'][test_indices]
        # impute: column means are learned on the training rows only and reused on the validation rows
        df_fold_train_imputed = imputer.fit_transform(df_fold_train)
        df_fold_valid_imputed = imputer.transform(df_fold_valid)
        print(f'Training fold {i+1}/{num_folds}...', end='\r')
        # train model(s) # TODO: add generic model protocol
        svr_model = svm.SVR(**svr_params)
        svr_model.fit(df_fold_train_imputed, df_fold_train_target)
        models.append(svr_model)
        # save and score
        model_filename = os.path.join(model_save_path, f'svr_model_fold_{i+1}.joblib')
        joblib.dump(svr_model, model_filename)
        fold_predictions = svr_model.predict(df_fold_valid_imputed)
        fold_score = met.mean_absolute_error(df_fold_valid_target, fold_predictions)
        scores.append(fold_score)
        print(f'Model saved to {model_filename}. MAE: {fold_score}.')
        # free memory
        del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target, df_fold_valid_imputed, df_fold_train_imputed
        gc.collect()

    print('Training final model...', end='\r')
    # The feature matrix still holds NaN (missing far/near prices, leading rows of every
    # shift/diff), which SVR.fit rejects, so the final fit needs mean imputation as well.
    final_imputer = imp.SimpleImputer(strategy='mean')
    final_model = svm.SVR(**svr_params) # TODO: why?
    final_model.fit(final_imputer.fit_transform(df_train_feats), df_train['target'])
    # the imputer is saved next to the model: inputs must pass through it before final_model.predict
    final_imputer_filename = os.path.join(model_save_path, 'imputer_final.joblib')
    joblib.dump(final_imputer, final_imputer_filename)
    final_model_filename = os.path.join(model_save_path, 'svr_model_final.joblib')
    joblib.dump(final_model, final_model_filename)
    print(f'Final model saved to file: {final_model_filename}. Average MAE: {np.mean(scores)}')

Training fold 1/5...

In [None]:
def load_models_from_folder(model_save_path:str, num_folds:int=5) -> list:
    """Placeholder: calling it raises NotImplementedError.

    NOTE(review): presumably meant to load the saved models found in `model_save_path`;
    the intended role of `num_folds` is not visible here - TODO confirm when implementing.
    """
    raise NotImplementedError # TODO: [f for f in os.listdir(model_save_path)] to traverse the folder and pull everything

folders = [] # TODO: remove once func is updated
# Gather the models of every listed folder into one flat list (no folders are listed yet).
all_loaded_models = []
all_loaded_models.extend(
    model
    for folder in folders
    for model in load_models_from_folder(folder)
)

In [None]:
def zero_sum(prices, volumes): # TODO: return type
    """Placeholder: calling it raises NotImplementedError."""
    raise NotImplementedError

# Inference has not been ported yet: while is_infer is set, this cell stops the run with NotImplementedError.
if is_infer:
    raise NotImplementedError # the code is so inefficient I think it's malicious