# Version 1: Feature Selection

## 1 Download Data

## 2 Feature Selection

In [1]:
import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit
import polars as pl
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor, EShapCalcType, EFeaturesSelectionAlgorithm
from sklearn.metrics import mean_absolute_error

is_offline = True
is_train = True
is_infer = True
max_lookback = np.nan
split_day = 435 #The testing data constixtute 90% = 435 / 481



In [2]:
df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/optiver-trading-at-the-close/train.csv")
df = df.dropna(subset=["target"])
df.reset_index(drop=True, inplace=True)
df.shape

(5237892, 17)

In [3]:
#To reduce memory usage
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    return df


### 2.1 Imbalance Features

In [4]:
# Import Numba for just-in-time (JIT) compilation and parallel processing
from numba import njit, prange

# Function to compute triplet imbalance in parallel using Numba
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    # Loop through all combinations of triplets
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]

        # Loop through rows of the DataFrame
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val

            # Prevent division by zero
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

# Function to calculate triplet imbalance for given price data and a DataFrame
def calculate_triplet_imbalance_numba(price, df):
    # Convert DataFrame to numpy array for Numba compatibility
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

    # Calculate the triplet imbalance using the Numba-optimized function
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    # Create a DataFrame from the results
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)

    return features



In [5]:
# generate imbalance features
def imbalance_features(df):
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    # Create features for pairwise price imbalances
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
    # Calculate triplet imbalance features using the Numba-optimized function
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values


    # V2
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']

    #价量横截面统计特征（均值，标准差，偏度，峰度）
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)


    # V3
    # Calculate shifted and return features for specific columns
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)


    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size',
                'wap', 'near_price', 'far_price']:#'weighted_wap','price_spread'
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    #V4
    for window in [3, 5, 10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    #V5
    pl_df = pl.from_pandas(df)

    windows = [3, 5, 10]
    columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']

    group = ["stock_id"]
    expressions = []

    for window in windows:
        for col in columns:
            rolling_mean_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_mean(window)
                .over(group)
                .alias(f'rolling_diff_{col}_{window}')
            )

            rolling_std_expr = (
                pl.col(f"{col}_diff_{window}")
                .rolling_std(window)
                .over(group)
                .alias(f'rolling_std_diff_{col}_{window}')
            )

            expressions.append(rolling_mean_expr)
            expressions.append(rolling_std_expr)

    lazy_df = pl_df.lazy().with_columns(expressions)

    pl_df = lazy_df.collect()

    df = pl_df.to_pandas()
    gc.collect()

    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')

    for col in df.columns:
        df[col] = df[col].replace([np.inf, -np.inf], 0)

    return df

# generate time & stock features
def other_features(df):
    df["dow"] = df["date_id"] % 5
    df["dom"] = df["date_id"] % 20
    df["seconds"] = df["seconds_in_bucket"] % 60
    df["minute"] = df["seconds_in_bucket"] // 60

    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df


In [6]:
#Stock weights for calculating imbalnce features
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

weights = {int(k):v for k,v in enumerate(weights)}

### 2.2 Stock Classification Features

We first calculate ***correlation matrix*** of WAP, based on which we assign clusters to stocks. 1）After normalize the clusters, ranging from （-1, 1）, we try to construct ***stock classification features***. 2）We try to apply Stratifies K-Fold Cross Validation, so to speak carrying out CV by groups to train model ( LightGBM ) and fature selection ( REF ).

In [7]:
#Correlation Matrix of WAP
def calculate_daily_returns(stock_data):
    stock_data['return'] = stock_data['wap'].pct_change()
    return stock_data[['return', 'seconds_in_bucket']].dropna()  # Keep 'return' and 'seconds_in_bucket' columns

returns = df.groupby(['stock_id', 'date_id']).apply(calculate_daily_returns).reset_index()

# 2. Align the data for each stock by filling in the gaps (if any) and then concatenate the returns to form a matrix
# For this step, we will pivot the data so each stock has its own column, and each row represents a timestamp.
pivot_returns = returns.pivot_table(index=['date_id', 'seconds_in_bucket'],
                                    columns='stock_id',
                                    values='return')

# handle missing values by filling the average of all available
pivot_returns = pivot_returns.apply(lambda row: row.fillna(row.mean()), axis=1)

# 3. Compute the correlation matrix for all stocks
correlation_matrix = pivot_returns.corr()
#print(correlation_matrix)

In [8]:
# Hierarchical clustering
Z = linkage(correlation_matrix, 'ward')
clusters = fcluster(Z, 10, criterion = 'maxclust')

#Assign clusters to stocks
stock_clusters = pd.DataFrame({'stock_id': correlation_matrix.index, 'cluster': clusters})
print(stock_clusters)

     stock_id  cluster
0           0        4
1           1        7
2           2        7
3           3        4
4           4        4
..        ...      ...
195       195        5
196       196        9
197       197        3
198       198        5
199       199        9

[200 rows x 2 columns]


In [9]:
#Normalized stock cluster feature

def cluster_feature(df):
    #normalize
    scaler = MinMaxScaler(feature_range=(-1, 1))
    normalized_clusters = scaler.fit_transform(clusters.reshape(-1, 1))

    # Assign clusters to stocks

    # Assign clusters to stocks
    # 1. Extract cluster labels from hierarchical clustering
    df_clusters = pd.DataFrame({'stock_id': pivot_returns.columns, 'cluster_label': normalized_clusters.flatten()})

    # 2. Map cluster labels to each stock ID
    stock_id_to_cluster = dict(zip(df_clusters['stock_id'], df_clusters['cluster_label']))

    # 3. Add cluster labels to your original DataFrame
    df['cluster'] = df['stock_id'].map(stock_id_to_cluster)
    return df

### 2.3 PCA_Weighed Average Features * 4

In this section, we apply PCA on WAG and retrieve the first four elements as features for model training.

In [10]:
from sklearn.decomposition import PCA

pca = PCA()
principal_components = pca.fit_transform(correlation_matrix)

In [11]:
#PCA-weighed average price features
def pca_wap_feature(df):

    #Focus on the first 4 components and save as DataFrame

    #Create a pivot table for wap
    price_pivot = df.pivot_table(index=['date_id', 'seconds_in_bucket'], columns='stock_id', values='wap')

    #Generate principal DataFrame
    principal_df = pd.DataFrame(data=principal_components,
                                index=correlation_matrix.index,  # use stock_ids as the index
                                columns=['PC'+str(i) for i in range(1, principal_components.shape[1] + 1)])

    #Ensure the ordering of stock_id in price_pivot and principal_pca is consistent
    ordered_columns = price_pivot.columns
    principal_df = principal_df.loc[ordered_columns].reset_index()

    #Handle NaN values and replace with 0
    price_pivot.fillna(0, inplace=True)
    principal_df.fillna(0, inplace=True)

    #Initialize a dataframe to hold the PCA_WAP values
    pca_wap_df = pd.DataFrame(index=price_pivot.index)

    #Compute 4 WAPs using PCA
    for i in range(1,5):
        pca_wap_df[f'PCA_WAP_{i}'] = (price_pivot.values * principal_df.set_index('stock_id')[f'PC{i}'].values).sum(axis=1)

    #Resetting index for merging purposes
    pca_wap_df = pca_wap_df.reset_index()

    #Merging the PCA_WAP columns with the initial dataset df
    df = df.merge(pca_wap_df, on=['date_id', 'seconds_in_bucket'], how='left')

    return df

### 2.4 Feature Generation Function

In [12]:
# generate all features
def generate_all_features(df):
    cols = [c for c in df.columns if c not in ["row_id", "time_id", "target"]]
    df = df[cols]
    df = imbalance_features(df)
    df = other_features(df)
    df = cluster_feature(df)
    df = pca_wap_feature(df)

    gc.collect()

    feature_name = [i for i in df.columns if i not in ["row_id", "target", "time_id", "date_id"]]

    return df[feature_name]

## 3 Feature Generation

### 3.1 Dataset Splitting

When the code is running offline, the entire dataset will be used for training. When the code is running online, 90% data will be used for training and the rest will be used for validation.
Additionally, for memory saving, after data splitting, we delete the original dataset, so to speak, dataframe, ***df***.

In [13]:
# Check if the code is running in offline or online mode
if is_offline:
    # In offline mode, split the data into training and validation sets based on the split_day
    df_train = df[df["date_id"] <= split_day]
    df_valid = df[df["date_id"] > split_day]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")
else:
    # In online mode, use the entire dataset for training
    df_train = df
    print("Online mode")

del df
gc.collect()

Offline mode
train : (4742893, 17), valid : (494999, 17)


0

### 3.2 Feature Generation

In [14]:
%%time
if is_train:
    global_stock_id_feats = {
        "median_size": df_train.groupby("stock_id")["bid_size"].median() + df_train.groupby("stock_id")["ask_size"].median(),
        "std_size": df_train.groupby("stock_id")["bid_size"].std() + df_train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": df_train.groupby("stock_id")["bid_size"].max() - df_train.groupby("stock_id")["bid_size"].min(),
        "median_price": df_train.groupby("stock_id")["bid_price"].median() + df_train.groupby("stock_id")["ask_price"].median(),
        "std_price": df_train.groupby("stock_id")["bid_price"].std() + df_train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df_train.groupby("stock_id")["bid_price"].max() - df_train.groupby("stock_id")["ask_price"].min(),
    }
    if is_offline:
        df_train_feats = generate_all_features(df_train)
        print("Build Train Feats Finished.")
        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)
    else:
        df_train_feats = generate_all_features(df_train)
        print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

feature_name = list(df_train_feats.columns)
print(f"Feature length = {len(feature_name)}")

Build Train Feats Finished.
Build Valid Feats Finished.
Feature length = 184
CPU times: user 1min 28s, sys: 36 s, total: 2min 5s
Wall time: 1min 53s


## 4 Feature Selection

### 4.1 CatBoost for Feature Selection: reducing features from 184 -> 100

In [15]:
# %%time
# # Train procedure
# if is_train:
#     offline_split = df_train['date_id']>(split_day - 45)
#     df_offline_train = df_train_feats[~offline_split]
#     df_offline_valid = df_train_feats[offline_split]
#     df_offline_train_target = df_train['target'][~offline_split]
#     df_offline_valid_target = df_train['target'][offline_split]
#     df_train_target = df_train["target"]



#     ctb_params = dict(iterations=1200,
#                       learning_rate=1.0,
#                       depth=8,
#                       l2_leaf_reg=30,
#                       bootstrap_type='Bernoulli',
#                       subsample=0.66,
#                       loss_function='MAE',
#                       eval_metric = 'MAE',
#                       metric_period=100,
#                       od_type='Iter',
#                       od_wait=30,
#                       task_type='GPU',
#                       allow_writing_files=False,
#                       )

#     print("Feature Elimination Performing.")
#     ctb_model = CatBoostRegressor(**ctb_params)
#     summary = ctb_model.select_features(
#         df_offline_train[feature_name], df_offline_train_target,
#         eval_set=[(df_offline_valid[feature_name], df_offline_valid_target)],
#         features_for_select=feature_name,
#         num_features_to_select=len(feature_name)-96,    # Dropping from 184 to 100
#         steps=9,
#         algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
#         shap_calc_type=EShapCalcType.Regular,
#         train_final_model=False,
#         plot=True,
#     )

#     print("Valid Model Training on Selected Features Subset.")
#     ctb_model = CatBoostRegressor(**ctb_params)
#     ctb_model.fit(
#         df_offline_train[summary['selected_features_names']], df_offline_train_target,
#         eval_set=[(df_offline_valid[summary['selected_features_names']], df_offline_valid_target)],
#         use_best_model=True,
#     )

#     del df_offline_train, df_offline_valid, df_offline_train_target, df_offline_valid_target
#     gc.collect()

#     print("Infer Model Training on Selected Features Subset.")
#     infer_params = ctb_params.copy()
#     # CatBoost train best with Valid number of iterations
#     infer_params["iterations"] = ctb_model.best_iteration_
#     infer_ctb_model = CatBoostRegressor(**infer_params)
#     infer_ctb_model.fit(df_train_feats[summary['selected_features_names']], df_train_target)
#     print("Infer Model Training on Selected Features Subset Complete.")

#     if is_offline:
#         # Offline predictions
#         df_valid_target = df_valid["target"]
#         offline_predictions = infer_ctb_model.predict(df_valid_feats[summary['selected_features_names']])
#         offline_score = mean_absolute_error(offline_predictions, df_valid_target)
#         print(f"Offline Score {np.round(offline_score, 4)}")





In [16]:
# summary['eliminated_features_names']

In [17]:
# feat_importances = infer_ctb_model.get_feature_importance(prettified=True)

# plt.figure(figsize=(12, 20))
# sns.barplot(x="Importances", y="Feature Id", data=feat_importances)
# plt.title('CatBoost features importance:')
# plt.tight_layout()

In [18]:
# from catboost import EFstrType
# feat_interactions = infer_ctb_model.get_feature_importance(type=EFstrType.Interaction, prettified=True)
# top_interactions = feat_interactions[:10]
# top_interactions

In [19]:
# top_interactions['First Feature Index'] = top_interactions['First Feature Index'].apply(lambda x: summary['selected_features_names'][x])
# top_interactions['Second Feature Index'] = top_interactions['Second Feature Index'].apply(lambda x: summary['selected_features_names'][x])
# top_interactions.columns = ['First Feature', 'Second Feature', 'Interaction']
# top_interactions

In [20]:
del_list=['PCA_WAP_3',
 'global_std_price',
 'PCA_WAP_2',
 'dow',
 'near_price_wap_imb',
 'weighted_wap',
 'near_price_diff_10',
 'ask_price_wap_imb',
 'dom',
 'matched_size_ret_3',
 'reference_price_ret_10',
 'wap_diff_10',
 'matched_size_ret_5',
 'rolling_diff_bid_price_3',
 'reference_price_far_price_imb',
 'all_prices_skew',
 'global_ptp_size',
 'rolling_std_diff_ask_price_5',
 'PCA_WAP_4',
 'PCA_WAP_1',
 'cluster',
 'global_median_price',
 'all_prices_mean',
 'global_std_size',
 'all_prices_kurt',
 'stock_id',
 'matched_size_shift_10',
 'rolling_std_diff_bid_price_3',
 'rolling_std_diff_bid_price_5',
 'stock_weights',
 'rolling_std_diff_ask_size_3',
 'depth_pressure',
 'spread_depth_ratio',
 'bid_size_diff_10',
 'near_price_bid_price_imb',
 'global_ptp_price',
 'far_price_near_price_imb',
 'imbalance_buy_sell_flag_shift_3',
 'rolling_std_diff_bid_price_10',
 'ask_price_bid_price_reference_price_imb2',
 'size_imbalance',
 'mid_price',
 'near_price',
 'rolling_std_diff_ask_price_10',
 'ask_price_bid_price_imb',
 'rolling_diff_bid_size_10',
 'bid_price_diff_2',
 'matched_size_shift_5']

# ['PCA_WAP_3',
#  'global_std_price',
#  'PCA_WAP_2',
#  'dow',
#  'near_price_wap_imb',
#  'weighted_wap',
#  'near_price_diff_10',
#  'ask_price_wap_imb',
#  'dom',
#  'matched_size_ret_3',
#  'reference_price_ret_10',
#  'wap_diff_10',
#  'matched_size_ret_5',
#  'rolling_diff_bid_price_3',
#  'reference_price_far_price_imb',
#  'all_prices_skew',
#  'global_ptp_size',
#  'rolling_std_diff_ask_price_5',
#  'rolling_std_diff_bid_price_3',
#  'far_price_bid_price_imb',
#  'far_price_wap_imb',
#  'reference_price_ret_3',
#  'rolling_std_diff_bid_price_10',
#  'rolling_std_diff_ask_price_3',
#  'matched_size_shift_5',
#  'stock_weights',
#  'all_prices_kurt',
#  'bid_price_diff_2',
#  'rolling_diff_bid_size_5',
#  'ask_size_diff_2',
#  'rolling_std_diff_ask_price_10',
#  'global_std_size',
#  'volume',
#  'reference_price_shift_3',
#  'PCA_WAP_4',
#  'all_prices_mean',
#  'global_ptp_price',
#  'reference_price_near_price_imb',
#  'cluster',
#  'near_price_diff_1',
#  'bid_price_diff_3',
#  'size_imbalance',
#  'stock_id',
#  'rolling_std_diff_bid_price_5',
#  'matched_size_shift_10',
#  'far_price',
#  'far_price_diff_3',
#  'bid_price_diff_10',
#  'spread_depth_ratio',
#  'bid_size_diff_10',
#  'imbalance_size_shift_2',
#  'far_price_ask_price_imb',
#  'imbalance_buy_sell_flag_ret_1',
#  'imbalance_buy_sell_flag_ret_2',
#  'imbalance_buy_sell_flag_ret_5',
#  'imbalance_buy_sell_flag_ret_10',
#  'size_change_diff_5',
#  'rolling_diff_bid_size_3',
#  'mid_price_movement',
#  'bid_size_diff_5',
#  'matched_size_shift_3',
#  'global_median_price',
#  'near_price',
#  'ask_price_bid_price_reference_price_imb2',
#  'ask_price_diff_3',
#  'rolling_diff_bid_price_5',
#  'depth_pressure',
#  'ask_price_diff_10',
#  'matched_size',
#  'price_spread',
#  'matched_size_ret_10',
#  'bid_price_diff_5',
#  'ask_size_diff_5',
#  'reference_price',
#  'rolling_diff_bid_size_10',
#  'ask_size_diff_3',
#  'rolling_diff_ask_size_5',
#  'liquidity_imbalance',
#  'reference_price_ret_2',
#  'rolling_diff_ask_size_3',
#  'imbalance_buy_sell_flag_ret_3',
#  'minute',
#  'bid_size_diff_3',
#  'ask_size_diff_10']


In [21]:
df_train_feats.drop(del_list,axis=1,inplace=True)
df_valid_feats.drop(del_list,axis=1,inplace=True)
feature_name = list(df_train_feats.columns)
print(f"Feature length = {len(feature_name)}")

Feature length = 136


In [22]:
feature_name = list(df_train_feats.columns)
print(f"Feature length = {len(feature_name)}")

Feature length = 136


In [23]:
# # Set LightGBM parameters
# lgb_params = {
#     "objective": "mae",
#     "n_estimators": 5000, #less estimators
#     "num_leaves": 256, #more leaves
#     "subsample": 0.6,
#     "colsample_bytree": 0.6,
#     "learning_rate": 0.00871, #larger learning rate(from 0.00871 to 0.00005)
#     'max_depth': 11,
#     "n_jobs": 4,
#     "device": "gpu",
#     "verbosity": -1,
#     "importance_type": "gain",
# }
# # Get feature names
# feature_name = list(df_train_feats.columns)
# print(f"Feature length = {len(feature_name)}")

# # Set up cross-validation parameters
# num_folds = 5
# fold_size = 435 // num_folds #offline mode
# gap = 5

# # Initialize lists to store models and scores
# models = []
# scores = []

# # Set model save path
# model_save_path = 'modelitos_para_despues'
# if not os.path.exists(model_save_path):
#     os.makedirs(model_save_path)

# # Get date IDs from the training data
# date_ids = df_train['date_id'].values

# # Loop over folds for cross-validation
# for i in range(num_folds):
#     start = i * fold_size
#     end = start + fold_size
    
#     # Define the purged set ranges
#     purged_before_start = start - 2
#     purged_before_end = start + 2
#     purged_after_start = end - 2
#     purged_after_end = end + 2
    
#     # Exclude the purged ranges from the test set
#     purged_set = ((date_ids >= purged_before_start) & (date_ids <= purged_before_end)) | \
#                  ((date_ids >= purged_after_start) & (date_ids <= purged_after_end))
    
#     # Define test_indices excluding the purged set
#     test_indices = (date_ids >= start) & (date_ids < end) & ~purged_set
#     train_indices = ~test_indices & ~purged_set

#     # Create fold-specific training and validation sets
#     df_fold_train = df_train_feats[train_indices]
#     df_fold_train_target = df_train['target'][train_indices]
#     df_fold_valid = df_train_feats[test_indices]
#     df_fold_valid_target = df_train['target'][test_indices]

#     print(f"Fold {i+1} Model Training")

#     # Train a LightGBM model for the current fold
#     lgb_model = lgb.LGBMRegressor(**lgb_params)
#     lgb_model.fit(
#         df_fold_train[feature_name],
#         df_fold_train_target,
#         eval_set=[(df_fold_valid[feature_name], df_fold_valid_target)],
#         callbacks=[
#             lgb.callback.early_stopping(stopping_rounds=100),
#             lgb.callback.log_evaluation(period=100),
#         ],
#     )

#     models.append(lgb_model)

#     # Save the model to a file
#     model_filename = os.path.join(model_save_path, f'doblez_{i+1}.txt')
#     lgb_model.booster_.save_model(model_filename)
#     print(f"Model for fold {i+1} saved to {model_filename}")

#     # Evaluate model performance on the validation set
#     fold_predictions = lgb_model.predict(df_fold_valid[feature_name])
#     fold_score = mean_absolute_error(fold_predictions, df_fold_valid_target)
#     scores.append(fold_score)
#     print(f"Fold {i+1} MAE: {fold_score}")

#     # Free up memory by deleting fold-specific variables
#     del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
#     gc.collect()

# # Calculate the average best iteration from all regular folds
# average_best_iteration = int(np.mean([model.best_iteration_ for model in models]))

# # Update the lgb_params with the average best iteration
# final_model_params = lgb_params.copy()
# final_model_params['n_estimators'] = average_best_iteration

# print(f"Training final model with average best iteration: {average_best_iteration}")

# # Train the final model on the entire dataset
# final_model = lgb.LGBMRegressor(**final_model_params)
# final_model.fit(
#     df_train_feats[feature_name],
#     df_train['target'],
#     callbacks=[
#         lgb.callback.log_evaluation(period=100),
#     ],
# )
# # Append the final model to the list of models
# models.append(final_model)

# # Append the final model to the list of models
# models.append(final_model)

# # Save the final model to a file
# final_model_filename = os.path.join(model_save_path, 'doblez-conjunto.txt')
# final_model.booster_.save_model(final_model_filename)
# print(f"Final model saved to {final_model_filename}")

# # Now 'models' holds the trained models for each fold and 'scores' holds the validation scores
# print(f"Average MAE across all folds: {np.mean(scores)}")

# if is_offline:
#     # offline predictions
#     df_valid_target = df_valid["target"]
#     offline_predictions = final_model.predict(df_valid_feats[feature_name])
#     offline_score = mean_absolute_error(offline_predictions, df_valid_target)
#     print(f"Offline Score {np.round(offline_score, 4)}")

###4.2 Recursive Feature Elimination with Stratified K-Fold CV

In [24]:
# from sklearn.feature_selection import RFECV
# from sklearn.model_selection import StratifiedKFold


# # Train procedure
# if is_train:
#     offline_split = df_train['date_id']>(split_day - 45)
#     df_offline_train = df_train_feats[~offline_split]
#     df_offline_valid = df_train_feats[offline_split]
#     df_offline_train_target = df_train['target'][~offline_split]
#     df_offline_valid_target = df_train['target'][offline_split]
#     df_train_target = df_train["target"]
#     del df_train
#     gc.collect()

#     lgb_params = {
#         "objective" : "mae",
#         "n_estimators" : 3000,
#         "num_leaves" : 128,
#         "subsample" : 0.6,
#         "colsample_bytree" : 0.6,
#         "learning_rate" : 0.05,
#         "n_jobs" : 4,
#         "device" : "gpu",
#         "verbosity": -1,
#         "importance_type" : "gain",
#     }


#     min_features_to_select = 120 #Minimum number of features to consider
#     lgb = lgb.LGBMRegressor(**lgb_params)
#     cv = StratifiedKFold(10) #Assign 10 clusters to stocks


#     print("Feature Elimination Performing.")
#     rfecv = RFECV(
#         estimator = lgb,
#         step = 9, #greater than 1 corresponds to num of feas to remove, within (0.0, 1.0) then corresponds to removal percantage
#         cv = cv,
#         scoring = 'accuracy',
#         min_features_to_select = min_features_to_select,
#         n_jobs = 2, #number of cores to run
#     )


#     rfecv.fit(
#         df_offline_train,
#         df_offline_train_target,
#     )

#     print("Valid Model Training on Selected Features Subset.")
#     selected_feas = df_offline_train.columns[rfecv.support_] #slected features

#     lgb = lgb.LGBMRegrssor(**lgb_params)
#     lgb.fit(
#         df_offline_train[selected_feas],
#         df_offline_train_target,
#         eval_set=[(df_offline_valid[selected_feas], df_offline_valid_target)],
#         callbacks = [
#             lgb.callback.early_stopping(stopping_rounds=100),
#             lgb.callback.log_evaluation(period=100),
#         ]

#     )

#     del df_offline_train, df_offline_valid, df_offline_train_target, df_offline_valid_target
#     gc.collect()

#     print("Infer Model Trainning.")
#     infer_params = lgb_params.copy()
#     infer_params["n_estimators"] = int(1.2 * lgb.best_iteration_)
#     infer_lgb_model = lgb.LGBMRegressor(**infer_params)
#     infer_lgb_model.fit(df_train_feats[selected_feas], df_train_target)
#     print("Infer Model Training on Selected Features Subset Complete.")

#     if is_offline:
#         # Offline predictions
#         df_valid_target = df_valid["target"]
#         offline_predictions = infer_lgb_model.predict(df_valid_feats[selected_feats])
#         offline_score = mean_absolute_error(offline_predictions, df_valid_target)
#         print(f"Offline Score {np.round(offline_score, 4)}")

#         del df_valid, df_valid_feats
#         gc.collect()

#     del df_train_feats
#     gc.collect()


In [25]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# lgb_cv_params = {
#     'n_estimators':range(2000,5000),
# }

# lgb_params = {
# #     'n_estimators':300,
#     'max_depth':9, #一般这个值设为7-10即可
#     'num_leaves':256, #要小于2**max_depth，比如2**7也就是128，
#     'learning_rate':0.00871,
#     'subsample':0.6,
#     'feature_frequency':0.8,
#     'random_state':32,
#     'class_weight':'balanced',
#     'n_jobs':4,
#     'subsample_freq':2
# }

# lgb_cv = GridSearchCV(lgb.LGBMClassifier(**lgb_params), 
#                       lgb_cv_params,
#                       scoring='accuracy',
#                       cv=5,
#                       n_jobs=-1)

# lgb_cv.fit(df_train_feats, df_train['target'])

# print(lgb_cv.best_params_)
# print(lgb_cv.best_score_)

In [26]:
import numpy as np

oof_lgb = np.zeros(df_train_feats.shape[0])
oof_xgb = np.zeros(df_train_feats.shape[0])
oof_cat = np.zeros(df_train_feats.shape[0])

test_output_df = pd.DataFrame(columns=['lgb','xgb','cat'],index=range(df_valid_feats.shape[0]))
test_output_df = test_output_df.fillna(0)



In [27]:
lgb_reg = lgb.LGBMRegressor(max_depth=9,
                           n_estimators=5000,
                           subsample=0.6,
                           colsample_bytree=0.6,
                           random_state=32)

cat_reg = ctb.CatBoostRegressor(learning_rate=0.00871,
                               depth=9,
                               random_seed=32,
                               )

xgb_reg = xgb.XGBRegressor(max_depth=9,
                          learning_rate=0.00871,
                          n_estimators=5000,
                          n_jobs=4,
                          colsample_bytree=0.6,
                          subsample=0.6,
                          random_state=32,
                          )

In [28]:
# Set up cross-validation parameters
num_folds = 5
fold_size = 435 // num_folds #offline mode
gap = 5

# Initialize lists to store models and scores
model_lgb = []
model_xgb = []
model_cat = []
score_lgb = []
score_xgb = []
score_cat = []
# Set model save path
model_save_path = 'modelitos_para_despues'
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

# Get date IDs from the training data
date_ids = df_train['date_id'].values

# Loop over folds for cross-validation
for i in range(num_folds):
    start = i * fold_size
    end = start + fold_size
    
    # Define the purged set ranges
    purged_before_start = start - 2
    purged_before_end = start + 2
    purged_after_start = end - 2
    purged_after_end = end + 2
    
    # Exclude the purged ranges from the test set
    purged_set = ((date_ids >= purged_before_start) & (date_ids <= purged_before_end)) | \
                 ((date_ids >= purged_after_start) & (date_ids <= purged_after_end))
    
    # Define test_indices excluding the purged set
    test_indices = (date_ids >= start) & (date_ids < end) & ~purged_set
    train_indices = ~test_indices & ~purged_set

    # Create fold-specific training and validation sets
    train_x = df_train_feats[train_indices]
    train_y = df_train['target'][train_indices]
    valid_x = df_train_feats[test_indices]
    valid_y = df_train['target'][test_indices]

    print(f"Fold {i+1} Model Training")
    
    lgb_reg.fit(train_x,train_y, eval_set=[(train_x,train_y),(valid_x,valid_y)],early_stopping_rounds=100)
    xgb_reg.fit(train_x,train_y, eval_set=[(train_x,train_y),(valid_x,valid_y)],early_stopping_rounds=100)
    cat_reg.fit(train_x,train_y, eval_set=[(train_x,train_y),(valid_x,valid_y)],early_stopping_rounds=100)
    model_lgb.append(lgb_reg)
    model_xgb.append(xgb_reg)
    model_cat.append(cat_reg)
    # Save the model to a file
    model_filename = os.path.join(model_save_path, f'doblez_{i+1}.txt')
    lgb_reg.booster_.save_model(model_filename)
    xgb_reg.booster_.save_model(model_filename)
    cat_reg.booster_.save_model(model_filename)
    print(f"Models for fold {i+1} saved to {model_filename}")
    
    # Evaluate model performance on the validation set
    
    fold_score = mean_absolute_error(fold_predictions, df_fold_valid_target)
    scores.append(fold_score)
    print(f"Fold {i+1} MAE: {fold_score}")
    oof_lgb[valid_idx] = lgb_reg.predict(valid_x)
    oof_xgb[valid_idx] = xgb_reg.predict(valid_x)
    oof_cat[valid_idx] = cat_reg.predict(valid_x)
    fold_score_lgb = mean_absolute_error(oof_lgb[valid_idx], valid_y)
    fold_score_xgb = mean_absolute_error(oof_xgb[valid_idx], valid_y)
    fold_score_cat = mean_absolute_error(oof_cat[valid_idx], valid_y)
    score_lgb.append(fold_score_lgb)
    score_xgb.append(fold_score_xgb)
    score_cat.append(fold_score_cat)
    print(f"Fold {i+1} LGB MAE: {fold_score_lgb}")
    print(f"Fold {i+1} XGB MAE: {fold_score_xgb}")
    print(f"Fold {i+1} CAT MAE: {fold_score_cat}")
    
    test_output_df['lgb'] += lgb_reg.predict(df_valid_feats)
    test_output_df['xgb'] += xgb_reg.predict(df_valid_feats)
    test_output_df['cat'] += cat_reg.predict(df_valid_feats)
    
    del train_x, train_y, valid_x, valid_y
    gc.collect()
    

test_output_df['lgb'] = test_output_df['lgb'] / 5
test_output_df['xgb'] = test_output_df['xgb'] / 5
test_output_df['cat'] = test_output_df['cat'] / 5


Fold 1 Model Training
[1]	training's l2: 94.0501	valid_1's l2: 71.2627
[2]	training's l2: 93.6497	valid_1's l2: 70.8996
[3]	training's l2: 93.3092	valid_1's l2: 70.576
[4]	training's l2: 93.039	valid_1's l2: 70.3492
[5]	training's l2: 92.8095	valid_1's l2: 70.1314
[6]	training's l2: 92.6137	valid_1's l2: 69.9543
[7]	training's l2: 92.4489	valid_1's l2: 69.8166
[8]	training's l2: 92.2849	valid_1's l2: 69.6811
[9]	training's l2: 92.148	valid_1's l2: 69.5654
[10]	training's l2: 92.0267	valid_1's l2: 69.4653
[11]	training's l2: 91.9235	valid_1's l2: 69.3889
[12]	training's l2: 91.8379	valid_1's l2: 69.3229
[13]	training's l2: 91.7564	valid_1's l2: 69.2622
[14]	training's l2: 91.68	valid_1's l2: 69.2122
[15]	training's l2: 91.6115	valid_1's l2: 69.1651
[16]	training's l2: 91.5507	valid_1's l2: 69.122
[17]	training's l2: 91.4888	valid_1's l2: 69.0879
[18]	training's l2: 91.4263	valid_1's l2: 69.0479
[19]	training's l2: 91.373	valid_1's l2: 69.0146
[20]	training's l2: 91.3156	valid_1's l2: 68

AttributeError: 'XGBRegressor' object has no attribute 'booster_'

In [None]:
oof_df = pd.DataFrame({'lgb':oof_lgb,'xgb':oof_xgb,'cat':oof_cat})

In [None]:
from sklearn.linear_model import LogisticRegression

log_r = LogisticRegression()
log_r.fit(oof_df, df_train['target'])
final_pre1 = log_r.predict(test_output_df)
# Save the final model to a file
final_model_filename = os.path.join(model_save_path, 'doblez-conjunto.txt')
log_r.booster_.save_model(final_model_filename)
print(f"Final model saved to {final_model_filename}")

# Now 'models' holds the trained models for each fold and 'scores' holds the validation scores
print(f"Average LGB MAE across all folds: {np.mean(score_lgb)}")
print(f"Average XGB MAE across all folds: {np.mean(score_xgb)}")
print(f"Average CAT MAE across all folds: {np.mean(score_cat)}")

if is_offline:
    # offline predictions
    df_valid_target = df_valid["target"]
    offline_score = mean_absolute_error(final_pre1, df_valid_target)
    print(f"Offline Score {np.round(offline_score, 4)}")