In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
from sklearn.model_selection import TimeSeriesSplit
import optiver2023
import warnings
from warnings import simplefilter
# Silence every warning for the rest of the session: with no category given,
# filterwarnings('ignore') applies to the base Warning class and all subclasses.
warnings.filterwarnings('ignore')
# Also mute pandas' PerformanceWarning explicitly.
# NOTE(review): this looks redundant given the blanket filter above; it only
# matters if that blanket filter is later removed or narrowed.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
# Read the competition training set into `df` and preview the first rows.
train_csv_path = '/kaggle/input/optiver-trading-at-the-close/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Number of rows with a missing target. Series.sum() is vectorized; the
# builtin sum() would iterate over the Series one element at a time in Python.
print(df['target'].isna().sum())
# Summary statistics of the target column (displayed as the cell output).
df['target'].describe()

In [None]:
# Keep only rows that have a target value; rebinding `df` instead of mutating
# in place leaves the same rows in the same order.
df = df.dropna(subset=['target'])

In [None]:
# Feature Engineering function
def generate_features(df):
    """Add engineered features to ``df`` and list the model input columns.

    The frame is modified in place (columns are appended) and is also
    returned, so callers may use either reference.

    Feature groups, in the order their columns are created:
      1. ``imb_s1`` / ``imb_s2``: normalised bid/ask and imbalance/matched
         size imbalances.
      2. For every unordered pair of price columns: ``{a}_{b}_diff`` and
         ``{a}_{b}_delta`` (difference divided by sum).
      3. For every triple of price columns: ``{a}_{b}_{c}_imb2`` =
         (max - mid) / (mid - min) of the three values in each row.
      4. Row-wise mean/std/skew/kurt over all price and all size columns.
      5. Per-``stock_id`` lagged values and percentage changes for a few
         columns over windows 1, 2, 4, 8 and 16 rows.
      6. Everything added by ``additional_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``stock_id`` plus the price, size and
        ``imbalance_buy_sell_flag`` columns referenced below.

    Returns
    -------
    (pandas.DataFrame, list[str])
        The augmented frame and the names of all of its columns except
        ``row_id``, ``time_id`` and ``target``, in column order.

    Notes
    -----
    * Ratios are not guarded against a zero denominator, so they can be
      ``inf`` or ``NaN``.
    * Lags are taken over consecutive rows of each ``stock_id`` as they appear
      in ``df``.
      NOTE(review): if ``df`` spans several trading sessions, the lags cross
      session boundaries; if it holds one row per stock, they are all NaN.
      Confirm this is intended.
    """
    prices = ['reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # Imbalance features
    df['imb_s1'] = (df['bid_size'] - df['ask_size']) / (df['bid_size'] + df['ask_size'])
    df['imb_s2'] = (df['imbalance_size'] - df['matched_size']) / (df['matched_size'] + df['imbalance_size'])

    # Pairwise price differences, raw and normalised by the pair's sum.
    for i, a in enumerate(prices):
        for b in prices[i + 1:]:
            gap = df[a] - df[b]
            df[f'{a}_{b}_diff'] = gap
            df[f'{a}_{b}_delta'] = gap / (df[a] + df[b])

    # Triplet imbalance for every index triple i > j > k. Iterating the index
    # ranges directly visits the same triples, in the same order, as filtering
    # the full 6x6x6 product, so the resulting column order is unchanged.
    for i in range(len(prices)):
        for j in range(i):
            for k in range(j):
                a, b, c = prices[i], prices[j], prices[k]
                trio = df[[a, b, c]]
                max_ = trio.max(axis=1)
                min_ = trio.min(axis=1)
                # The middle value is what remains after removing min and max.
                mid_ = trio.sum(axis=1) - min_ - max_

                df[f'{a}_{b}_{c}_imb2'] = (max_ - mid_) / (mid_ - min_)

    # Row-wise summary statistics across all prices and across all sizes.
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # Build the per-stock grouping once and reuse it for every lag feature
    # instead of regrouping the frame for each column/window combination.
    by_stock = df.groupby('stock_id')
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 4, 8, 16]:
            df[f"{col}_shift_{window}"] = by_stock[col].shift(window)
            df[f"{col}_ret_{window}"] = by_stock[col].pct_change(window, fill_method=None)

    # Relies on 'ask_price_bid_price_diff' and 'imb_s1' created above.
    df = additional_features(df)

    feature_names = [w for w in df.columns if w not in ["row_id", "time_id", "target"]]

    return df, feature_names

def additional_features(df):
    """Append size-, spread- and per-stock difference features to ``df``.

    The frame is modified in place and returned. Besides the raw size and
    price columns, ``stock_id``, ``ask_price_bid_price_diff`` and ``imb_s1``
    must already be present.

    Ratios are computed without zero-denominator guards and may be ``inf`` or
    ``NaN``. The ``*_first_derivative``, ``imbalance_momentum`` and
    ``spread_intensity`` columns are differences between consecutive rows of
    the same ``stock_id``, so each stock's first row is ``NaN``.
    """
    bid = df['bid_size']
    ask = df['ask_size']
    imbalance = df['imbalance_size']
    matched = df['matched_size']
    by_stock = df.groupby(['stock_id'])

    df['signed_imb_size'] = df['imbalance_buy_sell_flag'] * imbalance
    df['mid_price'] = (df['ask_price'] + df['bid_price']) / 2
    df['volume'] = bid + ask
    df['imb_ratio'] = imbalance / matched
    df['size_imbalance'] = bid / ask
    df['imb_bid_r'] = imbalance / bid
    df['imb_ask_r'] = imbalance / ask
    df['mat_bid_ask_r'] = matched / (bid + ask)
    df['imbalance_momentum'] = by_stock['imbalance_size'].diff(periods=1) / matched
    df['spread_intensity'] = by_stock['ask_price_bid_price_diff'].diff()
    df['price_pressure'] = imbalance * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['ask_price_bid_price_diff'] * df['imb_s1']
    # The +1 in the denominators keeps them away from zero.
    df['delta_bid_ask_size'] = (bid - ask) / (bid + ask + 1)
    df['bid_ask_matched_r'] = (bid - ask) / (matched + 1)

    # Discrete derivatives: prices first, then sizes, to keep column order.
    derivative_columns = [
        'reference_price', 'far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
        "matched_size", "bid_size", "ask_size", "imbalance_size",
    ]
    for column in derivative_columns:
        df[f'{column}_first_derivative'] = by_stock[column].diff()

    return df

In [None]:
# Engineer features on the training frame, then pull the model inputs and the
# target out as plain NumPy arrays.
df_train, feature_names = generate_features(df)
X, y = df_train[feature_names].to_numpy(), df_train['target'].to_numpy()
print(f"Number of Features: {len(feature_names)}")

In [None]:
# Model Training with Hyperparameter Tuning and Cross-Validation
from lightgbm import early_stopping

def train_model(X, y, n_splits=5, tr_folds=[3,4]):
    """Train one CatBoost MAE regressor per selected time-series fold.

    ``TimeSeriesSplit(n_splits)`` is applied to the rows of ``X`` in their
    given order; only the folds whose zero-based index is in ``tr_folds`` are
    trained, the others are skipped.

    Parameters
    ----------
    X, y : array-like
        Features and target. Both are indexed with integer index arrays
        (``X[train_index]``), so they must support NumPy-style indexing.
    n_splits : int
        Number of splits passed to ``TimeSeriesSplit``.
    tr_folds : sequence of int
        Fold indices to train on. The default list is only read, never
        mutated.

    Returns
    -------
    list
        The fitted ``CatBoostRegressor`` objects, in fold order.

    Notes
    -----
    * The models are configured with ``task_type="GPU"`` on device ``'0'``.
    * Each fold's held-out part is used both for early stopping and for the
      printed validation MAE, so that number is not an independent estimate.
    """
    models = []
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    for fold, (train_index, test_index) in enumerate(tscv.split(X)):
        if fold not in tr_folds:
            continue
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Disabled alternatives (LightGBM / XGBoost), kept for reference.
        # LightGBM model
#         model_lgb = lgb.LGBMRegressor(objective='regression_l1', n_estimators=500, learning_rate=0.05)
#         model_lgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], 
#                       callbacks=[early_stopping(stopping_rounds=50)])

#         # XGBoost model
#         print("Training with XGBoost")
#         model_xgb = xgb.XGBRegressor(objective='reg:squarederror', 
#                                      n_estimators=500, 
#                                      learning_rate=0.05, 
#                                      early_stopping_rounds=50)
#         model_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)] , verbose=20)

        # CatBoost model
        # f-string so the actual fold number is printed, not the literal "{fold}".
        print(f"Training with CatBoost: Fold {fold}")
        model_cbt = cbt.CatBoostRegressor(loss_function='MAE',
                                          eval_metric = 'MAE',
                                          iterations=1000, 
                                          learning_rate=5e-3, 
                                          early_stopping_rounds=50,
                                          task_type="GPU",
                                          devices='0',
                                          boosting_type='Plain',
                                          l2_leaf_reg=10,
                                          thread_count=-1,
                                          verbose=True)
        
        # The held-out part of the fold drives early stopping; progress is
        # logged every 20 iterations.
        model_cbt.fit(X_train, y_train, eval_set=[(X_test, y_test)],  verbose=20)
        
        # Mean absolute error on the same held-out part.
        yval_pred = model_cbt.predict(X_test)
        loss_val = np.mean(np.abs(yval_pred - y_test))
        print(f"After fold {fold}, the validation loss is {loss_val:.3f}")

        models.append(model_cbt)

    return models

In [None]:
# Fit the fold models on the engineered training matrix (default folds).
models = train_model(X=X, y=y)

In [None]:
# Create the competition environment from the `optiver2023` module.
# NOTE(review): presumably this can only be created once per kernel session;
# confirm before re-running this cell.
env = optiver2023.make_env()
# Iterator consumed by the inference loop; each item is unpacked there as
# (test, revealed_targets, prediction).
iter_test = env.iter_test()

In [None]:
# Inference: engineer features for each served batch and submit predictions.
# NOTE(review): if each `test` batch holds a single row per stock, the
# per-stock lag/diff features from generate_features are all NaN here, unlike
# in training. Confirm, and cache recent batches if that is the case.
for (test, revealed_targets, prediction) in iter_test:
    # The feature list returned here is ignored; the training-time
    # `feature_names` fixes the column set and order the models expect.
    test_df, _ = generate_features(test)
    # Use a separate name so the training matrix `X` is not overwritten, and
    # pass a plain array to match how the models were fitted.
    X_test = test_df[feature_names].values
    # Average every trained fold model instead of using only the first one.
    prediction['target'] = np.mean([model.predict(X_test) for model in models], axis=0)
    env.predict(prediction)