# Setup

In [1]:
import sys
# sys.path.append("/kaggle/input/python-helpers/") # Comment this out when you are not in Kaggle kernel
sys.path.append("../") # Comment this out when you are not in local environment
import utils
from utils import (
    # General Functions
    check_memory_usage, check_memory_by_global_variable, # For memory handling
    get_time_now, cprint, # For logging
    get_cols, downcast_to_32bit, my_concat, my_power, my_log, list_diff, # For preprocessing
    plot_feature_importance, plot_scatterplot, # For visualization
    get_feature_summary, # For feature engineering & feature selection
    check_auc, plot_heatmap, # For EDA
    # Competition Constants
    META_COLUMNS, 
    # Competition Functions
    filter_df, sort_df, check_target_dependency, # EDA
    calculate_psi, # Validation
    train_lgbm, # Model Training
    clean_df, get_price_clippers, get_volume_clippers, clip_df, # Pre-processing: Clean & Clip
    calc_robust_scale, calc_std_scale, scale_base_columns,  # Pre-processing: Scaling
    setup_validation_zip, # Simulation
    lgbm_inference_by_batch, # Inference
    zero_sum, # Post-processing
)
from utils import (
    get_master_daily_target_data, generate_interday_target_features, 
    get_master_daily_price_data, generate_interday_price_features, 
    get_master_daily_volume_data, generate_interday_volume_features
)

In [2]:
import gc
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import chain, repeat
from tqdm import tqdm
tqdm.pandas()

In [3]:
%load_ext autoreload
%autoreload

In [4]:
# Root folder for all data read and written by this notebook.
# It plays the role of '/kaggle/input' in the Kaggle environment: the local working data
# folder mirrors the Kaggle datasets structure, so the same relative paths work in both places.
# NOTE: because of the trailing slash, the f"{DATA_DIR}/..." pattern used in later cells
# produces paths with a double slash ('../data//...'); this worked in the recorded run.
DATA_DIR = '../data/'

In [5]:
%%time
train = pd.read_csv(f"{DATA_DIR}/optiver-trading-at-the-close/train.csv")

CPU times: user 4.18 s, sys: 955 ms, total: 5.14 s
Wall time: 6.68 s


In [6]:
%%time
train.to_parquet(f'{DATA_DIR}/optiver-train-data/raw_train.parquet')

CPU times: user 1.9 s, sys: 176 ms, total: 2.08 s
Wall time: 2.19 s


# Prepare Data Zip for loop data pipeline

## For validation data

In [7]:
# Disabled step, kept for reference: uncomment to rebuild the validation iterator for the
# val_start_date=435 .. val_end_date=480 window with `setup_validation_zip` and persist it
# with joblib. NOTE(review): presumably left disabled because the pickle already exists — confirm.
# val_iter_zip = setup_validation_zip(data_dir=DATA_DIR, val_start_date=435, val_end_date=480)
# joblib.dump(val_iter_zip, f'{DATA_DIR}/optiver-train-data/iter_val_subset.pkl')

# Simple Cleaning
Steps:
- Remove columns
- Downcast DataType
- Rename columns
- Remove stock-date pairs with missing data (exclude far_price and near_price)

Further Enhancements:
- Not sure whether row_id & time_id could simplify our preprocessing; for now, both are simply dropped

In [8]:
# Load the precomputed `missing_stock_dates` helper that is handed to `clean_df` below.
# The pickle is a prerequisite artifact: no cell shown in this notebook creates it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only load trusted artifacts.
missing_stock_dates = joblib.load(f"{DATA_DIR}/optiver-preprocess-helpers/missing_stock_dates.pkl")

In [9]:
# Inspect the raw frame before cleaning: dimensions first, then a preview of the first rows.
print(train.shape)
train.head()

(5237980, 17)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [10]:
%%time
# Run the project's `clean_df` helper with the loaded `missing_stock_dates`.
# `train` is rebound to the result, so the name no longer refers to the raw frame after
# this cell; the raw data remains available in raw_train.parquet written earlier.
# NOTE(review): re-running this cell feeds already-cleaned data back into `clean_df` —
# restart from the CSV/parquet load if the cell must be re-executed.
train = clean_df(train, missing_stock_dates=missing_stock_dates)

CPU times: user 278 ms, sys: 430 ms, total: 708 ms
Wall time: 906 ms


In [11]:
# Re-inspect after cleaning to compare the row count and column set with the raw preview above.
print(train.shape)
train.head()

(5237760, 17)


Unnamed: 0,stock_id,date_id,seconds,imb_size,imb_flag,ref_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wa_price,target,c_size,c_price
0,0,0,0,3180603.0,1,0.999812,13380277.0,,,0.999812,60651.5,1.000026,8493.030273,1.0,-3.029704,69144.53125,0.999838
1,1,0,0,166603.9,-1,0.999896,1642214.25,,,0.999896,3233.040039,1.00066,20605.089844,1.0,-5.519986,23838.128906,1.000556
2,2,0,0,302879.9,-1,0.999561,1819368.0,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,56951.0,0.999701
3,3,0,0,11917680.0,-1,1.000171,18389746.0,,,0.999999,2324.899902,1.000214,479032.40625,1.0,-4.010201,481357.3125,1.000213
4,4,0,0,447550.0,-1,0.999532,17860614.0,,,0.999394,16485.539062,1.000016,434.100006,1.0,-7.349849,16919.638672,0.99941


- Simple cleaning is done successfully :)

In [12]:
%%time
train.to_parquet(f'{DATA_DIR}/optiver-train-data/cleaned_train.parquet')

CPU times: user 1.58 s, sys: 65 ms, total: 1.65 s
Wall time: 1.82 s


# Simple Clipping

Steps:
- ~Clip price columns from both tails~ Since I think the benchmark is wap=1 at seconds 0, the prices are already relative, so clipping is probably unnecessary. (Note: the cells below still compute and apply global price clippers.)
- ~Clip volume columns from upper tail~
- Clip target column from both tails
- Create 2 extra binary target columns (Not sure if these are useful)

Further Enhancements:
- Consider per-stock clipping instead of global clipping, since some stocks may have much higher trading volumes or volatility.
- Consider defining the bounds with an outlier factor instead of a simple percentile for all price / volume columns.

In [13]:
# Collect the columns matched by contains="price" and derive clip bounds for each of them.
# `get_price_clippers` reports the global (lower, upper) bound per column in its output.
base_price_cols = get_cols(train, contains="price")
price_clippers = get_price_clippers(train, base_price_cols)

[34m[1mFor ref_price, the global clip bound is[0m [32m[1m(0.9759, 1.0289)[0m
[34m[1mFor far_price, the global clip bound is[0m [32m[1m(0.7097, 1.6328)[0m
[34m[1mFor near_price, the global clip bound is[0m [32m[1m(0.8922, 1.1146)[0m
[34m[1mFor bid_price, the global clip bound is[0m [32m[1m(0.9747, 1.0268)[0m
[34m[1mFor ask_price, the global clip bound is[0m [32m[1m(0.9770, 1.0297)[0m
[34m[1mFor wa_price, the global clip bound is[0m [32m[1m(0.9761, 1.0282)[0m
[34m[1mFor c_price, the global clip bound is[0m [32m[1m(0.9762, 1.0282)[0m


In [14]:
# Volume clipping is currently disabled (matches the struck-through step in the list above);
# the code is kept for reference only.
# base_volume_cols = get_cols(train, contains="size")
# volume_clippers = get_volume_clippers(train, base_volume_cols)

In [15]:
# Show floats with thousands separators and 4 decimal places in the tables that follow
pd.options.display.float_format = "{:,.4f}".format

In [16]:
# Summary statistics of the price columns before clipping (baseline for the comparison below).
train[get_cols(train, "price")].describe()

Unnamed: 0,ref_price,far_price,near_price,bid_price,ask_price,wa_price,c_price
count,5237760.0,2343638.0,2380800.0,5237760.0,5237760.0,5237760.0,5237760.0
mean,1.0,1.0017,0.9997,0.9997,1.0003,1.0,1.0
std,0.0025,0.7215,0.0122,0.0025,0.0025,0.0025,0.0025
min,0.9353,0.0001,0.787,0.9349,0.9398,0.938,0.9367
25%,0.9988,0.9963,0.9971,0.9985,0.999,0.9988,0.9988
50%,1.0,0.9999,0.9999,0.9997,1.0002,1.0,1.0
75%,1.0012,1.0033,1.0026,1.0009,1.0014,1.0011,1.0012
max,1.0775,437.9531,1.3097,1.0775,1.0778,1.0777,1.0776


In [17]:
%%time
# Apply the price clip bounds computed above via the project's `clip_df` helper.
# `train` is rebound to the clipped result; the pre-clip data remains in cleaned_train.parquet.
train = clip_df(train, price_clippers=price_clippers)

CPU times: user 226 ms, sys: 161 ms, total: 387 ms
Wall time: 428 ms


In [18]:
# Same summary after clipping: each column's min/max should now sit at its clip bounds.
train[get_cols(train, "price")].describe()

Unnamed: 0,ref_price,far_price,near_price,bid_price,ask_price,wa_price,c_price
count,5237760.0,2343638.0,2380800.0,5237760.0,5237760.0,5237760.0,5237760.0
mean,1.0,0.9997,0.9997,0.9997,1.0003,1.0,1.0
std,0.0025,0.0194,0.0121,0.0025,0.0025,0.0025,0.0025
min,0.9759,0.7097,0.8922,0.9747,0.977,0.9761,0.9762
25%,0.9988,0.9963,0.9971,0.9985,0.999,0.9988,0.9988
50%,1.0,0.9999,0.9999,0.9997,1.0002,1.0,1.0
75%,1.0012,1.0033,1.0026,1.0009,1.0014,1.0011,1.0012
max,1.0289,1.6328,1.1146,1.0268,1.0297,1.0282,1.0282


In [19]:
# Restore pandas' default float display so later output is not affected by the format set earlier.
pd.reset_option("display.float_format")

- Simple clipping is done successfully :)

In [20]:
%%time
train.to_parquet(f'{DATA_DIR}/optiver-train-data/clipped_train.parquet')

CPU times: user 1.82 s, sys: 60.4 ms, total: 1.88 s
Wall time: 2.09 s


In [21]:
# Persist the fitted price clip bounds so the same clipping can be reapplied outside this
# notebook (presumably at validation/inference time — confirm against the consumers of this file).
joblib.dump(price_clippers, f"{DATA_DIR}/optiver-preprocess-helpers/price_clippers.pkl")
# Volume clippers are not computed in this notebook (their cell is commented out), so the dump stays disabled.
# joblib.dump(volume_clippers, f"{DATA_DIR}/optiver-preprocess-helpers/volume_clippers.pkl")

['../data//optiver-preprocess-helpers/price_clippers.pkl']