In [1]:
import pandas as pd
import numpy as np

from itertools import groupby
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
from itertools import combinations
from sklearn.decomposition import PCA
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
revealed_targets = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv')
test = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv')
sample_submission = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv')

Add median MedianVolV2.csv from the [Optiver|Baseline|Models](https://www.kaggle.com/code/ravi20076/optiver-baseline-models)

In [3]:
median_vol = pd.read_csv("/kaggle/input/optiver-memoryreduceddatasets/MedianVolV2.csv")
median_vol.index.name = "stock_id";
median_vol = median_vol[['overall_medvol', "first5min_medvol", "last5min_medvol"]]

In [4]:
median_sizes = train.groupby('stock_id')['bid_size'].median() + train.groupby('stock_id')['ask_size'].median()
std_sizes = train.groupby('stock_id')['bid_size'].median() + train.groupby('stock_id')['ask_size'].median()

In [5]:
def feat_eng(df):
    
    cols = [c for c in df.columns if c not in ['row_id', 'time_id']]
    df = df[cols]
    df = df.merge(median_vol, how = "left", left_on = "stock_id", right_index = True)
    
    df['bid_plus_ask_sizes'] = df['bid_size'] + train['ask_size']
#     df['median_size'] = df['stock_id'].map(median_sizes.to_dict())
    df['std_size'] = df['stock_id'].map(std_sizes.to_dict())
#     df['high_volume'] = np.where(df['bid_plus_ask_sizes'] > df['median_size'], 1, 0) 
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    
    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    df['ask_x_size'] = df.eval('ask_size*ask_price')
    df['bid_x_size'] = df.eval('bid_size*bid_price')
        
    df['ask_minus_bid'] = df['ask_x_size'] - df['bid_x_size'] 
    
    df["bid_size_over_ask_size"] = df["bid_size"].div(df["ask_size"])
    df["bid_price_over_ask_price"] = df["bid_price"].div(df["ask_price"])
    
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    for c in combinations(prices, 2):
        
        df[f'{c[0]}_minus_{c[1]}'] = (df[f'{c[0]}'] - df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_times_{c[1]}'] = (df[f'{c[0]}'] * df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]}-{c[1]})/({c[0]}+{c[1]})')

    for c in combinations(prices, 3):
        
        max_ = df[list(c)].max(axis=1)
        min_ = df[list(c)].min(axis=1)
        mid_ = df[list(c)].sum(axis=1)-min_-max_

        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_-mid_)/(mid_-min_)
    
        
    df.drop(columns=[
        'date_id', 
        'reference_price_far_price_imb',
        'reference_price_minus_near_price',
        'reference_price_near_price_imb',
        'far_price_near_price_imb',
        'far_price_ask_price_imb',
        'far_price_bid_price_imb',
        'far_price_minus_wap',
        'std_size',
        'bid_size_over_ask_size',
        'ask_price_bid_price_imb',
        'near_price_times_wap'
    ], inplace=True)
        
    gc.collect()
    
    return df

In [6]:
%%time

y = train['target'].values
X = feat_eng(train.drop(columns='target'))

CPU times: user 1min 2s, sys: 17.3 s, total: 1min 19s
Wall time: 1min 19s


In [7]:
prices = [c for c in train.columns if 'price' in c]
pca_prices = PCA(n_components=1)
X['pca_prices'] = pca_prices.fit_transform(X[prices].fillna(1))

In [8]:
X

Unnamed: 0,stock_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,...,far_price_near_price_bid_price_imb2,far_price_near_price_wap_imb2,far_price_ask_price_bid_price_imb2,far_price_ask_price_wap_imb2,far_price_bid_price_wap_imb2,near_price_ask_price_bid_price_imb2,near_price_ask_price_wap_imb2,near_price_bid_price_wap_imb2,ask_price_bid_price_wap_imb2,pca_prices
0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.50,...,-1.000000e+00,-1.000000e+00,-1.000214e+00,-1.000026e+00,-1.000188,-1.000214e+00,-1.000026e+00,-1.000188,0.138298,-0.000766
1,1,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,...,-1.000000e+00,-1.000000e+00,-1.000764e+00,-1.000660e+00,-1.000104,-1.000764e+00,-1.000660e+00,-1.000104,6.346154,-0.000766
2,2,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.00,...,-1.000000e+00,-1.000000e+00,-1.000896e+00,-1.000298e+00,-1.000597,-1.000896e+00,-1.000298e+00,-1.000597,0.499162,-0.000766
3,3,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.90,...,-1.000000e+00,-1.000000e+00,-1.000215e+00,-1.000214e+00,-1.000001,-1.000215e+00,-1.000214e+00,-1.000001,214.000000,-0.000766
4,4,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,...,-1.000000e+00,-1.000000e+00,-1.000622e+00,-1.000016e+00,-1.000606,-1.000622e+00,-1.000016e+00,-1.000606,0.026403,-0.000766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,...,5.251197e+12,-1.783425e+12,2.006861e-01,1.784512e-01,0.018868,2.006861e-01,1.784512e-01,0.018868,9.636364,-0.001032
5237976,196,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,...,1.000000e+00,1.684825e+00,5.000000e-01,1.173913e-01,0.342412,1.000000e+00,1.870670e-01,0.684825,0.460227,-0.000637
5237977,197,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,...,-1.000000e+00,inf,-2.822256e+11,1.075000e+01,inf,-2.822256e+11,1.075000e+01,inf,10.750000,-0.004980
5237978,198,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,...,-9.251859e-13,-1.099231e-12,-9.251859e-13,-1.099231e-12,5.315789,-9.251859e-13,-1.099231e-12,5.315789,5.315789,-0.001557


In [9]:
%%time

m = lgb.LGBMRegressor(learning_rate=0.018052307589575444, max_depth=9, n_estimators=700,
              num_leaves=442, objective='mae', random_state=42,
              reg_alpha=0.02216069565875271, reg_lambda=0.01223572246957101)
m.fit(X, y)

CPU times: user 54min 12s, sys: 36.8 s, total: 54min 49s
Wall time: 16min 34s


In [10]:
feat_imp = pd.Series(m.feature_importances_, index=X.columns).sort_values()
print('Columns with poor contribution', feat_imp[feat_imp<100].index)
fig = px.bar(x=feat_imp, y=feat_imp.index, orientation='h')
fig.show()

Columns with poor contribution Index([], dtype='object')


In [11]:
feat_imp.sort_values()

far_price_wap_imb                        848
far_price_times_wap                      867
ask_price_wap_imb                        901
far_price_bid_price_wap_imb2             927
reference_price_near_price_wap_imb2      940
                                       ...  
imbalance_ratio                         5984
stock_id                                6132
imbalance_size                          7888
matched_size                           10461
seconds_in_bucket                      15191
Length: 81, dtype: int32

In [12]:
test = feat_eng(test)
test['pca_prices'] = pca_prices.transform(test[prices].fillna(1))

In [13]:
test.shape

(33000, 81)

In [14]:
m.predict(test)

array([-0.88394646,  1.86271992,  3.52227879, ...,  0.03889031,
        0.98769131, -2.47166108])

In [15]:
def zero_sum(prices, volumes):
    
#    I got this idea from https://github.com/gotoConversion/goto_conversion/
    
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    
    return out

In [16]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [17]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    
    feat = feat_eng(test)
    feat['pca_prices'] = pca_prices.transform(feat[prices].fillna(1))
    sample_prediction['target'] = m.predict(feat)
    
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
    
    env.predict(sample_prediction)
    
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
