# Setup

In [17]:
import sys
# sys.path.append("/kaggle/input/python-helpers/") # Comment this out when you are not in Kaggle kernel
sys.path.append("../") # Comment this out when you are not in local environment
import utils
from utils import (
    # General Functions
    check_memory_usage, check_memory_by_global_variable, # For memory handling
    get_time_now, cprint, # For logging
    get_cols, downcast_to_32bit, my_concat, my_power, my_log, list_diff, # For preprocessing
    plot_feature_importance, plot_scatterplot, # For visualization
    get_feature_summary, # For feature engineering & feature selection
    check_auc, plot_heatmap, # For EDA
    # Competition Constants
    META_COLUMNS, 
    # Competition Functions
    filter_df, sort_df, check_target_dependency, # EDA
    calculate_psi, # Validation
    train_lgbm, # Model Training
    clean_df, get_price_clippers, get_volume_clippers, clip_df, # Pre-processing: Clean & Clip
    calc_robust_scale, calc_std_scale, scale_base_columns,  # Pre-processing: Scaling
    setup_validation_zip, # Simulation
    lgbm_inference_by_batch, # Inference
    zero_sum, # Post-processing
)
from utils import (
    get_master_daily_target_data, generate_interday_target_features, 
    get_master_daily_price_data, generate_interday_price_features, 
    get_master_daily_volume_data, generate_interday_volume_features
)

In [18]:
import gc
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import chain, repeat
from tqdm import tqdm
tqdm.pandas()

In [19]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# This should be equivalent to '/kaggle/input' in the Kaggle environment.
# In other words, the local working data folder should mirror the Kaggle datasets
# directory structure to make our life easier.
# NOTE(review): the trailing slash here, combined with the f"{DATA_DIR}/..." pattern used in
# later cells, produces paths with a double slash (e.g. '../data//optiver-preprocess-helpers/...').
DATA_DIR = '../data/'

In [21]:
%%time
# Load the raw competition training set from CSV into `train`
train = pd.read_csv(f"{DATA_DIR}/optiver-trading-at-the-close/train.csv")

CPU times: user 3.76 s, sys: 1.09 s, total: 4.85 s
Wall time: 5.29 s


In [22]:
%%time
# Save a parquet copy of the raw frame before any cleaning is applied
train.to_parquet(f'{DATA_DIR}/optiver-train-data/raw_train.parquet')

CPU times: user 1.7 s, sys: 115 ms, total: 1.81 s
Wall time: 1.65 s


# Prepare Data Zip for loop data pipeline

## For validation data

In [23]:
# NOTE(review): this cell is currently disabled. Presumably iter_val_subset.pkl was generated
# once and is reused; uncomment both lines to rebuild it — TODO confirm.
# val_iter_zip = setup_validation_zip(data_dir=DATA_DIR, val_start_date=435, val_end_date=480)
# joblib.dump(val_iter_zip, f'{DATA_DIR}/optiver-train-data/iter_val_subset.pkl')

# Simple Cleaning
Steps:
- Remove columns
- Downcast DataType
- Rename columns
- Remove stock-date pairs with missing data (exclude far_price and near_price)

Further Enhancements:
- It is unclear whether row_id & time_id could ease our preprocessing; for now, both are simply dropped

In [24]:
# Shape and first rows of the raw frame, before cleaning
print(train.shape)
train.head()

(5237980, 17)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [25]:
%%time
# Apply the cleaning helper imported from utils.
# NOTE: `train` is overwritten, so re-running this cell without reloading the raw data
# passes the already-cleaned frame back into clean_df.
train = clean_df(train)

CPU times: user 132 ms, sys: 307 ms, total: 439 ms
Wall time: 513 ms


In [26]:
# Shape and first rows after cleaning, for comparison with the raw frame
print(train.shape)
train.head()

(5237980, 15)


Unnamed: 0,stock_id,date_id,seconds,imb_size,imb_flag,ref_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wa_price,target
0,0,0,0,3180603.0,1,0.999812,13380277.0,,,0.999812,60651.5,1.000026,8493.030273,1.0,-3.029704
1,1,0,0,-166603.9,-1,0.999896,1642214.25,,,0.999896,3233.040039,1.00066,20605.089844,1.0,-5.519986
2,2,0,0,-302879.9,-1,0.999561,1819368.0,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995
3,3,0,0,-11917680.0,-1,1.000171,18389746.0,,,0.999999,2324.899902,1.000214,479032.40625,1.0,-4.010201
4,4,0,0,-447550.0,-1,0.999532,17860614.0,,,0.999394,16485.539062,1.000016,434.100006,1.0,-7.349849


In [27]:
# Display the cleaned frame; pandas abbreviates the middle rows in the rendered output
train

Unnamed: 0,stock_id,date_id,seconds,imb_size,imb_flag,ref_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wa_price,target
0,0,0,0,3.180603e+06,1,0.999812,13380277.00,,,0.999812,60651.500000,1.000026,8493.030273,1.000000,-3.029704
1,1,0,0,-1.666039e+05,-1,0.999896,1642214.25,,,0.999896,3233.040039,1.000660,20605.089844,1.000000,-5.519986
2,2,0,0,-3.028799e+05,-1,0.999561,1819368.00,,,0.999403,37956.000000,1.000298,18995.000000,1.000000,-8.389950
3,3,0,0,-1.191768e+07,-1,1.000171,18389746.00,,,0.999999,2324.899902,1.000214,479032.406250,1.000000,-4.010201
4,4,0,0,-4.475500e+05,-1,0.999532,17860614.00,,,0.999394,16485.539062,1.000016,434.100006,1.000000,-7.349849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,-2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,1.000434,319862.406250,1.000328,2.310276
5237976,196,480,540,-3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,1.000900,93393.070312,1.000819,-8.220077
5237977,197,480,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,0.995883,180038.312500,0.995797,1.169443
5237978,198,480,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,0.999210,669893.000000,0.999008,-1.540184


- Simple cleaning is done successfully :)

In [28]:
%%time
# Checkpoint the cleaned frame as parquet
train.to_parquet(f'{DATA_DIR}/optiver-train-data/cleaned_train.parquet')

CPU times: user 1.18 s, sys: 55.8 ms, total: 1.23 s
Wall time: 1.11 s


# Simple Clipping

Steps:
- Clip price columns from both tails
- Clip volume columns from upper tail (the logged bounds are symmetric, so the signed imb_size is clipped on both tails)
- Clip target column from both tails
- Create 2 extra binary target columns (Not sure if these are useful)

Further Enhancements:
- Implement stock-based clipping instead of global clipping, since some stocks might have higher trading volumes or higher volatility?
- Define the clip bounds using an outlier factor instead of a simple percentile for all price / volume columns?

In [29]:
# Collect the columns selected by contains="price" and derive their global clip bounds
# (get_price_clippers logs one bound per column, see the output below)
base_price_cols = get_cols(train, contains="price")
price_clippers = get_price_clippers(train, base_price_cols)

[34m[1mFor ref_price, the global clip bound is[0m [32m[1m(0.9798, 1.0246)[0m
[34m[1mFor far_price, the global clip bound is[0m [32m[1m(0.7866, 1.3063)[0m
[34m[1mFor near_price, the global clip bound is[0m [32m[1m(0.8942, 1.1092)[0m
[34m[1mFor bid_price, the global clip bound is[0m [32m[1m(0.9793, 1.0223)[0m
[34m[1mFor ask_price, the global clip bound is[0m [32m[1m(0.9814, 1.0249)[0m
[34m[1mFor wa_price, the global clip bound is[0m [32m[1m(0.9802, 1.0233)[0m


In [30]:
# Collect the columns selected by contains="size" and derive their global clip bounds
# (get_volume_clippers logs one symmetric bound per column, see the output below)
base_volume_cols = get_cols(train, contains="size")
volume_clippers = get_volume_clippers(train, base_volume_cols)

[34m[1mFor imb_size, the global clip bound is[0m [32m[1m(-412,151,000, 412,151,000)[0m
[34m[1mFor matched_size, the global clip bound is[0m [32m[1m(-4,502,560,000, 4,502,560,000)[0m
[34m[1mFor bid_size, the global clip bound is[0m [32m[1m(-2,392,000, 2,392,000)[0m
[34m[1mFor ask_size, the global clip bound is[0m [32m[1m(-2,755,000, 2,755,000)[0m


In [31]:
# Show floats as comma-grouped whole numbers while the volume columns are inspected
pd.options.display.float_format = "{:,.0f}".format

In [32]:
# Summary statistics of the "size" columns before clipping
train.loc[:, get_cols(train, "size")].describe()

Unnamed: 0,imb_size,matched_size,bid_size,ask_size
count,5237760,5237760,5237980,5237980
mean,-252466,45100236,51814,53576
std,21295616,139841296,111421,129355
min,-2982027776,4317,0,0
25%,-1217970,5279576,7375,7824
50%,0,12882638,21969,23018
75%,1017940,32700130,55832,57878
max,1228661248,7713681920,30287840,54405000


In [33]:
%%time
# Clip the frame using the price and volume bounds computed above.
# NOTE: `train` is overwritten, so re-running this cell clips the already-clipped frame.
train = clip_df(train, price_clippers=price_clippers, volume_clippers=volume_clippers)

CPU times: user 182 ms, sys: 149 ms, total: 330 ms
Wall time: 329 ms


In [34]:
# Summary statistics of the "size" columns after clipping, to compare min/max with the bounds
train.loc[:, get_cols(train, "size")].describe()

Unnamed: 0,imb_size,matched_size,bid_size,ask_size
count,5237760,5237760,5237980,5237980
mean,-228197,44978084,51576,53208
std,19100268,135028233,94601,98628
min,-412151000,4317,0,0
25%,-1217970,5279575,7375,7824
50%,0,12882638,21969,23018
75%,1017940,32700130,55832,57878
max,412151000,4502560000,2392000,2755000


- Simple clipping is done successfully :)

In [35]:
# Restore pandas' default float display now that the volume summaries are done
pd.reset_option("display.float_format")

In [36]:
%%time
# Checkpoint the clipped frame as parquet
train.to_parquet(f'{DATA_DIR}/optiver-train-data/clipped_train.parquet')

CPU times: user 1.5 s, sys: 54.9 ms, total: 1.55 s
Wall time: 1.38 s


In [37]:
# Persist both clipper objects with joblib so the same bounds can be reloaded later
# (presumably by the inference pipeline — TODO confirm).
# The last call's return value (list of written file names) is shown as the cell output.
joblib.dump(price_clippers, f"{DATA_DIR}/optiver-preprocess-helpers/price_clippers.pkl")
joblib.dump(volume_clippers, f"{DATA_DIR}/optiver-preprocess-helpers/volume_clippers.pkl")

['../data//optiver-preprocess-helpers/volume_clippers.pkl']