In [1]:
%load_ext autoreload
%autoreload 2
import gc  
import os  
import joblib
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
import lightgbm as lgb  
from lightgbm import Booster
if hasattr(Booster, '__deepcopy__'):
    del Booster.__deepcopy__
from copy import deepcopy
import numpy as np  
import pandas as pd  
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import KFold, TimeSeriesSplit  
import polars as pl
from utils_kgl.utils import *
from utils_kgl.model_selection import *
from utils_kgl.processing import *
from datetime import datetime
from tqdm import tqdm
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)



# Setup: runtime switches and filesystem locations used by the cells below.
with_gpu = False
is_local = True

# Input data lives in a local folder or in the Kaggle input mount.
if is_local:
    path = 'data/optiver-trading-at-the-close/'
else:
    path = '/kaggle/input/optiver-trading-at-the-close/'

# Directory that receives the pickled models; created on first run.
model_save_path = 'models'
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

# Hold-out evaluation: rows with date_id > split_day form the test part.
is_testing = True
split_day = 400
save_test_model = True

# Cross validation
is_CV = False
save_model_CV = True

# Final model (fit on the full frame)
is_train_final = True
save_final_model = True


In [2]:
# Data Loading: read the training CSV, drop rows without a target,
# renumber the index and shrink dtypes with reduce_mem_usage.
df = reduce_mem_usage(
    pd.read_csv(path + "train.csv")
    .dropna(subset=["target"])
    .reset_index(drop=True)
)

if not is_testing:
    # Online mode: every row is used for training; df_test is NOT defined here.
    df_train = df
    print(f"Dataset : {df_train.shape}")
    print("Online mode")
else:
    # Testing mode: split chronologically on date_id at split_day.
    df_train = df[df["date_id"] <= split_day]
    df_test = df[df["date_id"] > split_day]
    print("Testing mode")
    print(f"train : {df_train.shape}, valid : {df_test.shape}")

Testing mode
train : (4357893, 17), valid : (879999, 17)


# **Model Training**

In [3]:
# Model Params
# Key order is kept as-is: the model file name below is derived from this dict.
# Smaller values previously tried for quick runs: n_estimators=10,
# num_leaves=16, max_depth=4; earlier settings: learning_rate=0.00871,
# reg_alpha=0.1.
lgb_params = {
    "objective": "mae",
    "n_estimators": 6000,
    "num_leaves": 256,
    "subsample": 0.6,
    "colsample_bytree": 0.8,
    "learning_rate": 0.01,
    "max_depth": 11,
    "n_jobs": -1,
    "verbosity": -1,
    "importance_type": "gain",
    "reg_alpha": 0.2,
    "reg_lambda": 3.25,
}

# Optional GPU training, appended after the base parameters.
if with_gpu:
    lgb_params["device"] = "gpu"

# Unfitted templates; the training cells deepcopy these before fitting.
prepro = Preprocessing()
model = lgb.LGBMRegressor(**lgb_params)

# One pickle path per training mode, all sharing the parameter-derived base name.
model_base_name = f'lgb_para_{get_name_from_param_dict(lgb_params)}'
model_filename_CV, model_filename_test, model_filename_final = (
    os.path.join(model_save_path, f'{model_base_name}_{suffix}.pkl')
    for suffix in ('CV', 'test', 'final')
)


In [4]:
# Cross validation: score the template model with scoreCV on df_train and
# optionally persist the per-fold models and preprocessors.
if is_CV:
    num_folds = 5
    gap = 5
    y_train = df_train['target']
    date_ids = df_train['date_id'].values

    # Work on a copy so the shared `prepro` template stays unfitted.
    prepro_CV = deepcopy(prepro)
    scores, modelsCV, preprosCV = scoreCV(
        df_train,
        y_train,
        date_ids,
        model,
        prepro_CV,
        num_folds,
        gap,
        verbose=True,
    )

    if save_model_CV:
        # The pickle holds a 2-tuple: (list of models, list of preprocessors).
        joblib.dump((modelsCV, preprosCV), model_filename_CV)

In [5]:
import math
def random_split_date(df, frac=0.8, seed=None):
    """Randomly split ``df`` into two parts along whole ``date_id`` values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date_id`` column.
    frac : float, default 0.8
        Fraction of the distinct ``date_id`` values assigned to the first
        (train) part; the count is floored.
    seed : int or None, default None
        When given, the draw uses a dedicated ``RandomState(seed)`` so the
        split is reproducible. When ``None`` the global NumPy random state is
        used, as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_frac_train, df_frac_test)``: rows whose ``date_id`` was sampled,
        and all remaining rows.
    """
    # Sample from the dates of the frame that was passed in, not from a
    # notebook-level global.
    date_ids = np.unique(df["date_id"])
    train_size = math.floor(len(date_ids) * frac)

    rng = np.random if seed is None else np.random.RandomState(seed)
    # replace=False: each date is drawn at most once, so exactly `train_size`
    # distinct dates land in the train part.
    sampled_dates = rng.choice(date_ids, size=train_size, replace=False)

    in_train = df["date_id"].isin(sampled_dates)
    df_frac_train = df[in_train]
    df_frac_test = df[~in_train]
    return df_frac_train, df_frac_test


In [6]:
# Training: fit `n_model` independent models, each on its own random
# date-level split of df_train (90% of dates requested for fitting, the rest
# used as the early-stopping validation set).
n_model = 5
models, prepros = [], []

for i in tqdm(range(n_model)):
    # Fresh copies so the shared templates are never fitted.
    testing_model = deepcopy(model)
    prepro_testing = deepcopy(prepro)

    df_frac_train, df_frac_valid = random_split_date(df_train, frac=0.9)

    # Targets are taken from the raw splits before feature generation.
    df_fold_train_target = df_frac_train['target']
    df_fold_valid_target = df_frac_valid['target']

    # Features generation: fit the preprocessor on train, reuse it on valid.
    df_fold_train = prepro_testing.fit_transform(df_frac_train)
    df_fold_valid = prepro_testing.transform(df_frac_valid)

    testing_model.fit(
        df_fold_train,
        df_fold_train_target,
        eval_set=[(df_fold_valid, df_fold_valid_target)],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ],
    )
    gc.collect()

    prepros.append(prepro_testing)
    models.append(testing_model)

# NOTE(review): this writes the (models, prepros) lists to model_filename_test,
# the same path the hold-out Testing cell dumps a single (model, prepro) pair
# to - confirm which artifact is meant to live there.
joblib.dump((models, prepros), model_filename_test)


  0%|          | 0/5 [00:00<?, ?it/s]

DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=31)
           2	LOAD_FAST(arg=0, lineno=33)
           4	LOAD_ATTR(arg=0, lineno=33)
           6	LOAD_CONST(arg=1, lineno=33)
           8	BINARY_SUBSCR(arg=None, lineno=33)
          10	STORE_FAST(arg=2, lineno=33)
          12	LOAD_GLOBAL(arg=1, lineno=34)
          14	LOAD_FAST(arg=1, lineno=34)
          16	CALL_FUNCTION(arg=1, lineno=34)
          18	STORE_FAST(arg=3, lineno=34)
          20	LOAD_GLOBAL(arg=2, lineno=35)
          22	LOAD_METHOD(arg=3, lineno=35)
          24	LOAD_FAST(arg=2, lineno=35)
          26	LOAD_FAST(arg=3, lineno=35)
          28	BUILD_TUPLE(arg=2, lineno=35)
          30	CALL_METHOD(arg=1, lineno=35)
          32	STORE_FAST(arg=4, lineno=35)
          34	LOAD_GLOBAL(arg=4, lineno=36)
          36	LOAD_FAST(arg=3, lineno=36)
          38	CALL_FUNCTION(arg=1, lineno=36)
          40	GET_ITER(arg=None, lineno=36)
>         42	FOR_ITER(arg=98, lineno=36)
          44	STORE_FAST(arg

  0%|          | 0/5 [00:08<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Testing: fit one model on df_train, score it on the held-out df_test and
# optionally pickle the (model, preprocessor) pair.
if is_testing:
    # Copies keep the shared templates unfitted.
    prepro_testing = deepcopy(prepro)
    testing_model = deepcopy(model)

    # The preprocessor is fitted on the train part only.
    df_train_feats = prepro_testing.fit_transform(df_train)
    df_test_feats = prepro_testing.transform(df_test)

    # No eval_set / early stopping here: all configured rounds are trained.
    testing_model.fit(
        df_train_feats,
        df_train['target'],
        callbacks=[lgb.callback.log_evaluation(period=100)],
    )

    test_score = score(testing_model, df_test_feats, df_test['target'])
    print(test_score)

    if save_test_model:
        joblib.dump((testing_model, prepro_testing), model_filename_test)

    

5.9469350761610835


In [None]:
# Final training: fit the preprocessor and the model on the complete frame
# `df` (train and test parts together) and optionally pickle the pair.
if is_train_final:
    final_model = deepcopy(model)
    prepro_final = deepcopy(prepro)

    df_feats = prepro_final.fit_transform(df)

    final_model.fit(
        df_feats,
        df['target'],
        callbacks=[lgb.callback.log_evaluation(period=100)],
    )

    if save_final_model:
        joblib.dump((final_model, prepro_final), model_filename_final)


# **Submission**

In [None]:

# Load the models used for inference from the CV pickle.
# NOTE(review): the CV cell dumps a (modelsCV, preprosCV) tuple to this path,
# so `models` may be that 2-tuple rather than a flat list of models - confirm.
models = joblib.load(model_filename_CV)

# Per-stock aggregates over the full frame; the frame is grouped once and the
# column selections are reused for every statistic.
by_stock = df.groupby("stock_id")
bid_size, ask_size = by_stock["bid_size"], by_stock["ask_size"]
bid_price, ask_price = by_stock["bid_price"], by_stock["ask_price"]

global_stock_id_feats_final = {
    "median_size": bid_size.median() + ask_size.median(),
    "std_size": bid_size.std() + ask_size.std(),
    # Range of bid_size only.
    "ptp_size": bid_size.max() - bid_size.min(),
    "median_price": bid_price.median() + ask_price.median(),
    "std_price": bid_price.std() + ask_price.std(),
    # Highest bid price minus lowest ask price (mixes the two columns,
    # unlike ptp_size).
    "ptp_price": bid_price.max() - ask_price.min(),
}

In [None]:
# Clipping bounds applied to the blended predictions.
y_min = -64
y_max = 64

# Blending weights computed by weighted_average from `models`; the bare name
# on the last line displays them as the cell output.
lgb_model_weights = weighted_average(models)
lgb_model_weights


[0.125, 0.125, 0.25, 0.5]

In [None]:
# Leftover debug call: the result is neither stored nor displayed.
weighted_average(range(3))

# Offline dry run of the inference logic on the held-out days: for every
# 10-second bucket, build features, blend the model predictions with
# lgb_model_weights, demean and clip.
for date in tqdm(np.sort(df_test.date_id.unique())):
    df_date = df_test[df_test.date_id == date]
    for sec in range(0, 541, 10):
        # Data setup
        test = df_date[df_date['seconds_in_bucket'] == sec].copy()
        y_test = test['target']  # kept as a notebook variable; not used below in this cell
        test = test.drop(columns='target')
        sample_prediction = pd.DataFrame({'target': np.nan}, index=test.index)

        # The features do not depend on the model, so build them once per
        # bucket instead of once per model.
        test_feat = generate_all_features(test, global_stock_id_feats_final)

        pred = np.zeros(len(test))
        # `fold_model` (not `model`) so the notebook-level template `model`,
        # which the training cells deepcopy, is not overwritten by a fitted one.
        for fold_model, weight in zip(models, lgb_model_weights):
            pred += weight * fold_model.predict(test_feat)

        # Centre the predictions on zero, then clip to [y_min, y_max].
        pred = pred - np.mean(pred)
        pred = np.clip(pred, y_min, y_max)

        sample_prediction['target'] = pred

    # Debug run: stop after the first date.
    break
    # TODO: submission variant sketched in the original cell:
    #   X_test = preprocess.transform(test)
    #   model.add_data()  # TO MODIFY
    #   sample_prediction['target'] = model.predict(X_test)



  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/45 [01:00<?, ?it/s]


In [None]:
# Display the feature frame from the last loop iteration, restricted to `feature_columns`.
# NOTE(review): `feature_columns` is not defined in any cell visible here - confirm it exists before this cell runs.
test_feat[feature_columns]

Unnamed: 0,stock_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,...,dow,seconds,minute,time_to_market_close,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,0,0,0.000000e+00,0,1.000268,12874820.00,,,0.999911,11182.000000,...,1,0,0,540,42739.160156,132986.919326,5.898989e+06,1.999695,0.003353,0.017414
1,1,0,1.378668e+06,1,0.999853,2806215.25,,,0.999853,801.200012,...,1,0,0,540,25548.500000,66444.908576,6.938986e+05,1.999827,0.005588,0.029370
2,2,0,0.000000e+00,0,0.999373,4873746.50,,,0.999373,8005.459961,...,1,0,0,540,26228.099609,75674.654277,1.069838e+06,2.000200,0.005333,0.051622
3,3,0,7.863030e+06,1,0.999576,46879784.00,,,0.999576,21946.099609,...,1,0,0,540,41667.000000,93875.770485,1.928848e+06,1.999980,0.002903,0.018551
4,4,0,4.599490e+06,-1,1.000425,15193377.00,,,0.999952,169.009995,...,1,0,0,540,34014.580078,80670.274575,1.604066e+06,1.999816,0.003717,0.017379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,0,5.944218e+06,1,0.999566,16517714.00,,,0.999675,9224.000000,...,1,0,0,540,51941.548828,98218.034911,2.761659e+06,1.999930,0.003051,0.014076
196,196,0,2.441752e+06,1,0.999332,3866432.00,,,0.999332,16768.000000,...,1,0,0,540,42476.951172,78070.064273,4.596574e+05,2.000042,0.003416,0.017398
197,197,0,4.641126e+05,-1,0.999984,2801073.25,,,0.999984,171.699997,...,1,0,0,540,30070.040039,71964.174970,1.575294e+06,1.999984,0.004696,0.020387
198,198,0,1.650333e+07,1,0.999477,68372056.00,,,0.999961,4134.000000,...,1,0,0,540,304739.250000,354682.788090,2.159163e+06,1.999917,0.003146,0.015738


In [None]:
# Display the full feature frame left over from the last iteration of the offline inference loop.
test_feat

Unnamed: 0,stock_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,...,dow,seconds,minute,time_to_market_close,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,0,0,0.000000e+00,0,1.000268,12874820.00,,,0.999911,11182.000000,...,1,0,0,540,42739.160156,132986.919326,5.898989e+06,1.999695,0.003353,0.017414
1,1,0,1.378668e+06,1,0.999853,2806215.25,,,0.999853,801.200012,...,1,0,0,540,25548.500000,66444.908576,6.938986e+05,1.999827,0.005588,0.029370
2,2,0,0.000000e+00,0,0.999373,4873746.50,,,0.999373,8005.459961,...,1,0,0,540,26228.099609,75674.654277,1.069838e+06,2.000200,0.005333,0.051622
3,3,0,7.863030e+06,1,0.999576,46879784.00,,,0.999576,21946.099609,...,1,0,0,540,41667.000000,93875.770485,1.928848e+06,1.999980,0.002903,0.018551
4,4,0,4.599490e+06,-1,1.000425,15193377.00,,,0.999952,169.009995,...,1,0,0,540,34014.580078,80670.274575,1.604066e+06,1.999816,0.003717,0.017379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,0,5.944218e+06,1,0.999566,16517714.00,,,0.999675,9224.000000,...,1,0,0,540,51941.548828,98218.034911,2.761659e+06,1.999930,0.003051,0.014076
196,196,0,2.441752e+06,1,0.999332,3866432.00,,,0.999332,16768.000000,...,1,0,0,540,42476.951172,78070.064273,4.596574e+05,2.000042,0.003416,0.017398
197,197,0,4.641126e+05,-1,0.999984,2801073.25,,,0.999984,171.699997,...,1,0,0,540,30070.040039,71964.174970,1.575294e+06,1.999984,0.004696,0.020387
198,198,0,1.650333e+07,1,0.999477,68372056.00,,,0.999961,4134.000000,...,1,0,0,540,304739.250000,354682.788090,2.159163e+06,1.999917,0.003146,0.015738


In [None]:
# Display the last blended prediction vector (demeaned and clipped in the offline inference loop).
pred

array([ 0.00372014,  0.12091547, -0.2335647 , -0.20005453,  0.28919267,
       -0.07303482, -0.24893318, -0.35595845, -0.12437316, -0.2335647 ,
        0.05778488,  0.39755979,  0.01802448,  0.36466701,  0.22805028,
       -0.23845915,  0.39754376, -0.11948629,  0.02879339,  0.38217582,
       -0.14783484, -0.14114001,  0.33601693, -0.14783484, -0.14783484,
       -0.14783484,  0.05033691, -0.14866659, -0.20659555, -0.2335647 ,
        0.0515903 , -0.15154023,  0.08667565, -0.00783628,  0.24518352,
       -0.29399148, -0.05993284, -0.35595845, -0.30333634, -0.145917  ,
       -0.00637755, -0.14590054,  0.2941225 ,  0.03869534, -0.35595845,
       -0.06573168,  0.37235659, -0.13035243, -0.01152377,  0.0456894 ,
        0.20504763, -0.01500222, -0.11235186, -0.0270448 , -0.35595845,
        0.08469761,  0.30408525,  0.3355455 ,  0.04976161,  0.38313664,
        0.11255045, -0.06410775,  0.40448684,  0.08137458,  0.40448684,
       -0.0018346 , -0.35595845, -0.13278826, -0.35595845, -0.00

In [None]:
# Kaggle submission loop: iterate over the optiver2023 test environment,
# build features from a rolling cache of recent rows, blend the model
# predictions and submit them batch by batch.
# NOTE(review): `is_infer`, `LGB` and `feature_columns` are not defined in any
# cell visible here - confirm they are set before this cell runs.
if is_infer:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    y_min, y_max = -64, 64
    # qps collects the wall-clock seconds spent on each batch.
    qps, predictions = [], []
    # Rolling history of test rows used as input for feature generation.
    cache = pd.DataFrame()

    # Weights for each fold model
    if LGB:
        lgb_model_weights = weighted_average(models)
        #cbt_model_weights = weighted_average(models_cbt)
    
    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            # After the first batch, keep only the 21 most recent rows per stock.
            cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        # Features are computed on the whole cache; only the rows of the
        # current batch (the last len(test) rows) are kept.
        # NOTE(review): called with one argument here, while the offline loop
        # passes (test, global_stock_id_feats_final) - confirm the signature.
        feat = generate_all_features(cache)[-len(test):]
        print(f"Feat Shape is: {feat.shape}")

        # Generate predictions for each model and calculate the weighted average
        if LGB:
            lgb_predictions = np.zeros(len(test))
            for model, weight in zip(models, lgb_model_weights):
                lgb_predictions += weight * model.predict(feat[feature_columns])

        # NOTE(review): if LGB is falsy, lgb_predictions is never assigned in
        # this cell and this line raises NameError.
        predictions = lgb_predictions
        
        #Using mean predictions rather than zero sum
        final_predictions = predictions - np.mean(predictions)
        clipped_predictions = np.clip(final_predictions, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    # Scales the mean per-batch seconds by 1.146 and reports it as hours.
    # NOTE(review): the derivation of 1.146 is not shown here - confirm.
    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")
