# Setup

In [18]:
import sys
# sys.path.append("/kaggle/input/python-helpers/") # Comment this out when you are not in Kaggle kernel
sys.path.append("../") # Comment this out when you are not in local environment
import utils
from utils import (
    # General Functions
    check_memory_usage, check_memory_by_global_variable, # For memory handling
    get_time_now, cprint, # For logging
    get_cols, downcast_to_32bit, my_concat, my_power, my_log, list_diff, # For preprocessing
    plot_feature_importance, plot_scatterplot, # For visualization
    get_feature_summary, # For feature engineering & feature selection
    check_auc, plot_heatmap, # For EDA
    # Competition Constants
    META_COLUMNS, 
    # Competition Functions
    filter_df, sort_df, check_target_dependency, # EDA
    calculate_psi, # Validation
    train_lgbm, # Model Training
    clean_df, get_price_clippers, get_volume_clippers, clip_df, # Pre-processing: Clean & Clip
    calc_robust_scale, calc_std_scale, scale_base_columns,  # Pre-processing: Scaling
    setup_validation_zip, # Simulation
    lgbm_inference_by_batch, # Inference
    zero_sum, # Post-processing
)
from utils import (
    get_master_daily_target_data, generate_interday_target_features, 
    get_master_daily_price_data, generate_interday_price_features, 
    get_master_daily_volume_data, generate_interday_volume_features
)

In [19]:
import gc
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import chain, repeat
from tqdm import tqdm
tqdm.pandas()

In [20]:
%load_ext autoreload
%autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# This should be equivalent to '/kaggle/input' in the Kaggle environment.
# In other words, keep the local working data folder laid out like the Kaggle datasets
# structure to make our life easier.
# NOTE: the value already ends with '/', and the f-strings in later cells add another '/',
# so the resulting paths contain '//' (e.g. '../data//optiver-train-data/...').
DATA_DIR = '../data/'

In [22]:
%%time
# Read train.csv from the optiver-trading-at-the-close folder under DATA_DIR into `train`.
train = pd.read_csv(f"{DATA_DIR}/optiver-trading-at-the-close/train.csv")

CPU times: user 3.89 s, sys: 1.11 s, total: 5 s
Wall time: 5.75 s


In [23]:
%%time
# Persist the frame exactly as loaded (before any cleaning) as raw_train.parquet.
train.to_parquet(f'{DATA_DIR}/optiver-train-data/raw_train.parquet')

CPU times: user 1.75 s, sys: 139 ms, total: 1.89 s
Wall time: 1.73 s


# Prepare Data Zip for loop data pipeline

## For validation data

In [24]:
# val_iter_zip = setup_validation_zip(data_dir=DATA_DIR, val_start_date=435, val_end_date=480)
# joblib.dump(val_iter_zip, f'{DATA_DIR}/optiver-train-data/iter_val_subset.pkl')

# Simple Cleaning
Steps:
- Remove columns
- Downcast DataType
- Rename columns
- Remove stock-date pairs with missing data (missing values in far_price and near_price are not counted)

Further Enhancements:
- It is unclear whether row_id and time_id could simplify preprocessing; for now, both are dropped

In [25]:
# Load the saved `missing_stock_dates` helper that is handed to clean_df in a later cell.
# Presumably the stock-date pairs with missing data mentioned in the steps above — TODO confirm.
# NOTE(review): joblib.load unpickles the file, so only load helper files you produced or trust.
missing_stock_dates = joblib.load(f"{DATA_DIR}/optiver-preprocess-helpers/missing_stock_dates.pkl")

In [26]:
# Show the shape and first rows of the frame before cleaning, as a baseline for comparison.
print(train.shape)
train.head()

(5237980, 17)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180603,1,1,13380277,,,1,60652,1,8493,1,-3,0,0_0_0
1,1,0,0,166604,-1,1,1642214,,,1,3233,1,20605,1,-6,0,0_0_1
2,2,0,0,302880,-1,1,1819368,,,1,37956,1,18995,1,-8,0,0_0_2
3,3,0,0,11917682,-1,1,18389746,,,1,2325,1,479032,1,-4,0,0_0_3
4,4,0,0,447550,-1,1,17860615,,,1,16486,1,434,1,-7,0,0_0_4


In [27]:
%%time
# Run the cleaning helper from utils, passing the loaded `missing_stock_dates`.
# NOTE: this rebinds `train`, so re-running the cell feeds the already-cleaned frame
# back into clean_df; reload the raw data first if a clean re-run is needed.
train = clean_df(train, missing_stock_dates=missing_stock_dates)

CPU times: user 199 ms, sys: 295 ms, total: 494 ms
Wall time: 519 ms


In [28]:
# Re-check shape and first rows after cleaning; in the saved run the frame went from
# (5237980, 17) to (5237760, 15) and several columns were renamed.
print(train.shape)
train.head()

(5237760, 15)


Unnamed: 0,stock_id,date_id,seconds,imb_size,imb_flag,ref_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wa_price,target
0,0,0,0,3180603,1,1,13380277,,,1,60652,1,8493,1,-3
1,1,0,0,-166604,-1,1,1642214,,,1,3233,1,20605,1,-6
2,2,0,0,-302880,-1,1,1819368,,,1,37956,1,18995,1,-8
3,3,0,0,-11917682,-1,1,18389746,,,1,2325,1,479032,1,-4
4,4,0,0,-447550,-1,1,17860614,,,1,16486,1,434,1,-7


- Simple cleaning is done successfully :)

In [29]:
%%time
# Checkpoint the cleaned frame as cleaned_train.parquet.
train.to_parquet(f'{DATA_DIR}/optiver-train-data/cleaned_train.parquet')

CPU times: user 1.17 s, sys: 53.2 ms, total: 1.22 s
Wall time: 1.13 s


# Simple Clipping

Steps:
- ~Clip price columns from both tails~ I think the benchmark is wap = 1 at seconds 0, so the prices are already relative and probably do not need clipping
- ~Clip volume columns from upper tail~
- Clip target column from both tails
- Create 2 extra binary target columns (Not sure if these are useful)

Further Enhancements:
- Implement stock-based clipping instead of global clipping, since some stocks might have higher trading volumes or higher volatility?
- Define the bounds using an outlier factor instead of a simple percentile for all [price / volume] columns?

In [30]:
# base_price_cols = get_cols(train, contains="price")
# price_clippers = get_price_clippers(train, base_price_cols)

In [31]:
# base_volume_cols = get_cols(train, contains="size")
# volume_clippers = get_volume_clippers(train, base_volume_cols)

In [39]:
# Display floats as comma-grouped whole numbers (no decimals) in pandas output.
# This is a global display option; it stays active until it is reset.
pd.set_option("display.float_format", "{:,.0f}".format)

In [40]:
# Summary statistics for the columns returned by get_cols(train, "target")
# (presumably the columns whose name contains "target" — confirm in utils).
# NOTE(review): the saved output already lists clipped_target / is_positive_target /
# is_mild_target although clip_df is only called in a later cell, which suggests this cell
# was re-run after clipping. Restart the kernel and run all cells to refresh the output.
train[get_cols(train, "target")].describe()

Unnamed: 0,target,clipped_target,is_positive_target,is_mild_target
count,5237760,5237760,5237760,5237760
mean,0,0,0,1
std,9,9,0,0
min,-385,-100,0,0
25%,-5,-5,0,0
50%,0,0,0,1
75%,4,4,1,1
max,446,100,1,1


In [41]:
%%time
# Run the clipping helper from utils and rebind `train` to its result.
# In the saved outputs the result carries clipped_target (bounded to [-100, 100]) plus the
# binary is_positive_target and is_mild_target columns next to the original target.
train = clip_df(train)

CPU times: user 29.5 ms, sys: 28.2 ms, total: 57.7 ms
Wall time: 57.3 ms


In [43]:
# Summary statistics of the target-related columns after clipping
# (still rendered as whole numbers because the custom float format is active).
train[get_cols(train, "target")].describe()

Unnamed: 0,target,clipped_target,is_positive_target,is_mild_target
count,5237760,5237760,5237760,5237760
mean,0,0,0,1
std,9,9,0,0
min,-385,-100,0,0
25%,-5,-5,0,0
50%,0,0,0,1
75%,4,4,1,1
max,446,100,1,1


In [45]:
# Restore pandas' default float rendering so later output shows full precision again.
pd.reset_option("display.float_format")

In [46]:
# Same summary as before, now with default float formatting so the decimals are visible.
train[get_cols(train, "target")].describe()

Unnamed: 0,target,clipped_target,is_positive_target,is_mild_target
count,5237760.0,5237760.0,5237760.0,5237760.0
mean,-0.04758888,-0.04879543,0.4957493,0.5009888
std,9.452816,9.354238,0.499982,0.4999991
min,-385.2898,-100.0,0.0,0.0
25%,-4.559755,-4.559755,0.0,0.0
50%,-0.06020069,-0.06020069,0.0,1.0
75%,4.409552,4.409552,1.0,1.0
max,446.0704,100.0,1.0,1.0


- Simple clipping is done successfully :)

In [47]:
%%time
# Checkpoint the clipped frame (including the added target columns) as clipped_train.parquet.
train.to_parquet(f'{DATA_DIR}/optiver-train-data/clipped_train.parquet')

CPU times: user 1.38 s, sys: 38.2 ms, total: 1.41 s
Wall time: 1.3 s


In [48]:
# joblib.dump(price_clippers, f"{DATA_DIR}/optiver-preprocess-helpers/price_clippers.pkl")
# joblib.dump(volume_clippers, f"{DATA_DIR}/optiver-preprocess-helpers/volume_clippers.pkl")