In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import polars as pl
import xgboost as xgb
from itertools import combinations, product

In [3]:
# Load the raw training data and keep only rows that have a target.
train = pd.read_parquet('train.parquet')
train = train.loc[train['target'].notna()]
print(train.shape)
size_col = ['imbalance_size','matched_size','bid_size','ask_size']

# Scale each volume column by that stock's median so sizes are comparable across stocks.
per_stock_median = train.groupby('stock_id')[size_col].transform('median')
for size_name in size_col:
    train[f"scale_{size_name}"] = train[size_name] / per_stock_median[size_name]

# imbalance_buy_sell_flag: 1 = buy-side imbalance, -1 = sell-side imbalance, 0 = none.
# Both auction sides start at matched_size; the imbalance is added to the side the flag points to.
imbalance_flag = train['imbalance_buy_sell_flag']
train['auc_bid_size'] = train['matched_size'] + train['imbalance_size'].where(imbalance_flag == 1, 0)
train['auc_ask_size'] = train['matched_size'] + train['imbalance_size'].where(imbalance_flag == -1, 0)

# One weight per stock_id (0..199), listed in stock_id order.
stock_weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
weight_df = pd.DataFrame({'stock_id': list(range(200)), 'weight': stock_weights})

# Attach the per-stock weight to every row.
train = train.merge(weight_df, how='left', on=['stock_id'])


(5237892, 17)


In [4]:
# Display the prepared frame; the rendering is truncated to the first and last rows.
train

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,target,time_id,row_id,scale_imbalance_size,scale_matched_size,scale_bid_size,scale_ask_size,auc_bid_size,auc_ask_size,weight
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,...,-3.029704,0,0_0_0,1.547844,0.635182,3.001806,0.376896,16560879.33,13380276.64,0.004
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,...,-5.519986,0,0_0_1,1.031025,0.593159,0.261912,1.560460,1642214.25,1808818.16,0.001
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,...,-8.389950,0,0_0_2,0.945033,0.457058,3.063661,1.372570,1819368.03,2122247.90,0.002
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,...,-4.010200,0,0_0_3,2.187799,0.308522,0.114156,22.488728,18389745.62,30307427.89,0.006
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,...,-7.349849,0,0_0_4,0.223200,0.790134,0.982033,0.025198,17860614.95,18308164.91,0.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237887,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,...,2.310276,26454,480_540_195,1.033448,1.177623,1.259572,12.147279,28280361.74,30721084.63,0.004
5237888,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,...,-8.220077,26454,480_540_196,0.692713,1.753503,9.938564,4.276373,9187699.11,9537209.58,0.001
5237889,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,...,1.169443,26454,480_540_197,0.000000,1.324129,1.117586,11.965859,12725436.10,12725436.10,0.004
5237890,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,...,-1.540184,26454,480_540_198,0.132882,1.117509,0.828980,4.372968,95774169.89,94773271.05,0.006


In [5]:
# Largest date_id present in the training data.
train['date_id'].max()

480

In [6]:
def generate_features_no_hist_polars(df):
    """Engineer auction/order-book features with Polars and pick the model inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns read below (prices, sizes, ``wap``,
        ``imbalance_buy_sell_flag``, ``stock_id``, ``date_id``,
        ``seconds_in_bucket``, ``target``) plus the precomputed ``scale_*``,
        ``auc_bid_size``, ``auc_ask_size`` and ``weight`` columns.

    Returns
    -------
    (pandas.DataFrame, list[str])
        The input frame with every engineered column appended (including the
        intermediate ones that are not selected), and the ordered list of
        column names to feed the model.

    Notes
    -----
    * No sorting is done here. Every ``shift``/``diff``/``rolling_*``/``ewm_mean``
      is evaluated in the existing row order inside its ``over(...)`` group, so
      rows must already be in chronological order.
    * Many more columns are computed than are returned in the feature list;
      only the hand-picked subsets below are selected.
    * NOTE(review): the notebook run emits warnings on the ``rolling_*`` calls
      that pass ``min_periods``; newer Polars releases appear to rename it to
      ``min_samples`` - confirm against the pinned Polars version before
      changing the keyword.
    """
    df = pl.from_pandas(df)

    # ------------------------------------------------------------------
    # Stage 1: row-wise notional values, combined sizes, spreads and
    # signed-imbalance quantities.
    # ------------------------------------------------------------------
    df = df.with_columns([
        (pl.col('ask_size') * pl.col('ask_price')).alias("ask_money"),
        (pl.col('bid_size') * pl.col('bid_price')).alias("bid_money"),
        (pl.col('ask_size') + pl.col("auc_ask_size")).alias("ask_size_all"),
        (pl.col('bid_size') + pl.col("auc_bid_size")).alias("bid_size_all"),
        (pl.col('ask_size') + pl.col("auc_ask_size") + pl.col('bid_size') + pl.col("auc_bid_size")).alias("volumn_size_all"),
        (pl.col('reference_price') * pl.col('auc_ask_size')).alias("ask_auc_money"),
        (pl.col('reference_price') * pl.col('auc_bid_size')).alias("bid_auc_money"),
        (pl.col('ask_size') * pl.col('ask_price') + pl.col('bid_size') * pl.col('bid_price')).alias("volumn_money"),
        (pl.col('ask_size') + pl.col('bid_size')).alias('volume_cont'),
        (pl.col('ask_size') - pl.col('bid_size')).alias('diff_ask_bid_size'),
        (pl.col('imbalance_size') + 2 * pl.col('matched_size')).alias('volumn_auc'),
        ((pl.col('imbalance_size') + 2 * pl.col('matched_size')) * pl.col("reference_price")).alias('volumn_auc_money'),
        ((pl.col('ask_price') + pl.col('bid_price'))/2).alias('mid_price'),
        ((pl.col('near_price') + pl.col('far_price'))/2).alias('mid_price_near_far'),
        (pl.col('ask_price') - pl.col('bid_price')).alias('price_diff_ask_bid'),
        (pl.col('ask_price') / pl.col('bid_price')).alias('price_div_ask_bid'),
        (pl.col('imbalance_buy_sell_flag') * pl.col('scale_imbalance_size')).alias('flag_scale_imbalance_size'),
        (pl.col('imbalance_buy_sell_flag') * pl.col('imbalance_size')).alias('flag_imbalance_size'),
        (pl.col('imbalance_size') / pl.col('matched_size') * pl.col('imbalance_buy_sell_flag')).alias("div_flag_imbalance_size_2_balance"),
        ((pl.col('ask_price') - pl.col('bid_price')) * pl.col('imbalance_size')).alias('price_pressure'),
        ((pl.col('ask_price') - pl.col('bid_price')) * pl.col('imbalance_size') * pl.col('imbalance_buy_sell_flag')).alias('price_pressure_v2'),
        ((pl.col("ask_size") - pl.col("bid_size")) / (pl.col("far_price") - pl.col("near_price"))).alias("depth_pressure"),
        (pl.col("bid_size") / pl.col("ask_size")).alias("div_bid_size_ask_size"),
    ])

    # Plain ratios numerator / denominator -> "div_<num>_2_<den>".
    ratio_pairs = [
        ("imbalance_size","bid_size"),
        ("imbalance_size","ask_size"),
        ("matched_size","bid_size"),
        ("matched_size","ask_size"),
        ("imbalance_size","volume_cont"),
        ("matched_size","volume_cont"),
        ("auc_bid_size","bid_size"),
        ("auc_ask_size","ask_size"),
        ("bid_auc_money","bid_money"),
        ("ask_auc_money","ask_money"),
    ]
    df = df.with_columns([
        (pl.col(num) / pl.col(den)).alias(f"div_{num}_2_{den}")
        for num, den in ratio_pairs
    ])

    # ------------------------------------------------------------------
    # Stage 2: normalized imbalances (a - b) / (a + b) -> "imb1_<a>_<b>",
    # first for selected size/notional pairs, then for every pair of prices.
    # ------------------------------------------------------------------
    size_pairs = [
        ('ask_size','bid_size'),
        ('ask_money','bid_money'),
        ('volumn_money','volumn_auc_money'),
        ('volume_cont','volumn_auc'),
        ('imbalance_size','matched_size'),
        ('auc_ask_size','auc_bid_size'),
        ("ask_size_all",'bid_size_all'),
    ]
    df = df.with_columns([
        ((pl.col(a) - pl.col(b)) / (pl.col(a) + pl.col(b))).alias(f"imb1_{a}_{b}")
        for a, b in size_pairs
    ])

    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap","mid_price"]
    df = df.with_columns([
        ((pl.col(a) - pl.col(b)) / (pl.col(a) + pl.col(b))).alias(f"imb1_{a}_{b}")
        for a, b in combinations(prices, 2)
    ])

    # Products of the spread / price imbalance with the size imbalance.
    df = df.with_columns([
        ((pl.col("imb1_ask_size_bid_size") + 2) * (pl.col("imb1_ask_price_bid_price") + 2) * (pl.col("imb1_auc_ask_size_auc_bid_size")+2)).alias("market_urgency_v2"),
        (pl.col('price_diff_ask_bid') * (pl.col('imb1_ask_size_bid_size'))).alias('market_urgency'),
        (pl.col('imb1_ask_price_bid_price') * (pl.col('imb1_ask_size_bid_size'))).alias('market_urgency_v3'),
    ])

    # Rolling mean/std per (stock_id, date_id) for a handful of base columns.
    add_cols = []
    for col in ["bid_auc_money","imb1_reference_price_wap","bid_size_all",
                "imb1_auc_ask_size_auc_bid_size","div_flag_imbalance_size_2_balance",
                "imb1_ask_size_all_bid_size_all","flag_imbalance_size","imb1_reference_price_mid_price"]:
        for window in [3,6,18,36,60]:
            add_cols.append(pl.col(col).rolling_mean(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_mean_{col}'))
            add_cols.append(pl.col(col).rolling_std(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_std_{col}'))
    df = df.with_columns(add_cols)

    # The returned feature list starts from this hand-picked subset of the
    # columns built so far; everything appended below keeps this order.
    feas_list = ['imb1_wap_mid_price', 'imb1_ask_money_bid_money', 'imb1_volume_cont_volumn_auc',
                 'imb1_reference_price_ask_price', 'imb1_reference_price_mid_price',
                 'seconds_in_bucket', 'div_flag_imbalance_size_2_balance', 'ask_price',
                 'imb1_reference_price_bid_price', 'scale_matched_size', 'imb1_near_price_wap',
                 'volumn_auc_money', 'imb1_far_price_wap', 'bid_size', 'scale_bid_size', 'bid_size_all',
                 'rolling18_mean_imb1_auc_ask_size_auc_bid_size', 'rolling3_mean_div_flag_imbalance_size_2_balance',
                 'rolling60_std_div_flag_imbalance_size_2_balance', 'rolling36_mean_flag_imbalance_size',
                 'rolling3_std_imb1_auc_ask_size_auc_bid_size', 'rolling18_mean_imb1_ask_size_all_bid_size_all',
                 'rolling6_mean_div_flag_imbalance_size_2_balance', 'rolling6_std_imb1_auc_ask_size_auc_bid_size',
                 'rolling3_mean_imb1_auc_ask_size_auc_bid_size', 'rolling60_std_imb1_auc_ask_size_auc_bid_size',
                 'rolling6_std_bid_size_all', 'rolling3_std_bid_size_all', 'rolling3_mean_bid_size_all',
                 'rolling18_std_bid_auc_money', 'rolling36_mean_bid_auc_money',"rolling60_mean_imb1_reference_price_wap",
                 'rolling18_mean_imb1_reference_price_wap', 'rolling3_mean_imb1_reference_price_mid_price']

    # One-step changes of the signed imbalance and of the bid/ask spread.
    df = df.with_columns([
        pl.col("flag_imbalance_size").diff().over('stock_id','date_id').alias("imbalance_momentum_unscaled"),
        pl.col("price_diff_ask_bid").diff().over('stock_id','date_id').alias("spread_intensity"),
    ])
    feas_list.extend(["imbalance_momentum_unscaled","spread_intensity"])

    # Imbalance change expressed relative to the matched size.
    df = df.with_columns([
        (pl.col("imbalance_momentum_unscaled")/pl.col("matched_size")).alias("imbalance_momentum")
    ])
    feas_list.extend(["imbalance_momentum"])

    # n-step differences per (stock_id, date_id) -> "<col>_diff_<n>".
    add_cols = []
    for col in ['ask_price',
                'bid_price',
                'imb1_reference_price_near_price',
                'bid_size',
                'scale_bid_size',
                'mid_price',
                'ask_size',
                'price_div_ask_bid',
                'div_bid_size_ask_size',
                'market_urgency',
                'wap',
                'imbalance_momentum']:
        for window in [1, 2, 3, 10]:
            add_cols.append((pl.col(col).diff(window).over('stock_id','date_id')).alias(f"{col}_diff_{window}"))
            feas_list.append(f"{col}_diff_{window}")
    df = df.with_columns(add_cols)

    # ------------------------------------------------------------------
    # Mock targets: for each horizon p, compute wap[t+p] / wap[t], subtract
    # the weight-averaged value of that ratio over all rows sharing the same
    # (date_id, seconds_in_bucket), scale by 1e4, then shift the result back
    # by p rows so the value stored at row t only involves wap up to row t.
    # The helper columns (target_single, weight_tmp, index_target_mock,
    # target_mock) are overwritten on every iteration.
    # ------------------------------------------------------------------
    for mock_period in [1,3,12,6]:
        df = df.with_columns([
            pl.col("wap").shift(-mock_period).over("stock_id","date_id").alias(f"wap_shift_n{mock_period}")
        ])
        df = df.with_columns([
            (pl.col(f"wap_shift_n{mock_period}")/pl.col("wap")).alias("target_single")
        ])

        # Rows without a usable ratio (null or NaN) get weight 0 so they do
        # not count in the denominator of the weighted average. This is done
        # in Polars rather than by copying both columns out to pandas/numpy.
        df = df.with_columns([
            pl.when(pl.col("target_single").fill_nan(None).is_null())
            .then(pl.lit(0.0))
            .otherwise(pl.col("weight"))
            .alias("weight_tmp")
        ])

        df = df.with_columns([
            (((pl.col("weight_tmp") * pl.col("target_single")).sum().over("date_id","seconds_in_bucket")) / ((pl.col("weight_tmp")).sum().over("date_id","seconds_in_bucket"))).alias("index_target_mock")
        ])
        df = df.with_columns([
            ((pl.col("target_single") - pl.col("index_target_mock"))*10000).alias("target_mock")
        ])
        df = df.with_columns([
            pl.col("target_mock").shift(mock_period).over("stock_id","date_id").alias(f"target_mock_shift{mock_period}")
        ])

    # Rolling means of the shifted mock targets; only a few are selected.
    add_cols = []
    for col in ['target_mock_shift6','target_mock_shift1','target_mock_shift3','target_mock_shift12']:
        for window in [1, 3,6,12,24,48]:
            add_cols.append(pl.col(col).rolling_mean(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_mean_{col}'))
    df = df.with_columns(add_cols)
    feas_list.extend([
        'rolling48_mean_target_mock_shift3', 'rolling48_mean_target_mock_shift1', 'rolling48_mean_target_mock_shift12',
        'rolling1_mean_target_mock_shift6', 'rolling24_mean_target_mock_shift6','rolling24_mean_target_mock_shift12',
    ])

    # Lagged value, ratio to the lagged value and difference from the lagged
    # value for four columns:
    #   shift<n>_<col>      value n rows earlier
    #   div_shift<n>_<col>  current / value n rows earlier
    #   diff_shift<n>_<col> current - value n rows earlier
    # All of them are computed; only the subset listed below is selected.
    add_cols = []
    for col in ["imb1_auc_ask_size_auc_bid_size","flag_imbalance_size","price_pressure_v2","scale_matched_size"]:
        for window_size in [1,2,3,6,12]:
            lagged = pl.col(col).shift(window_size).over('stock_id','date_id')
            add_cols.append(lagged.alias(f'shift{window_size}_{col}'))
            add_cols.append((pl.col(col) / lagged).alias(f'div_shift{window_size}_{col}'))
            add_cols.append((pl.col(col) - lagged).alias(f'diff_shift{window_size}_{col}'))
    df = df.with_columns(add_cols)
    feas_list.extend(['div_shift6_imb1_auc_ask_size_auc_bid_size',
                      'diff_shift6_price_pressure_v2',
                      'shift1_price_pressure_v2',
                      'div_shift3_flag_imbalance_size',
                      'div_shift12_imb1_auc_ask_size_auc_bid_size',
                      'div_shift3_scale_matched_size',
                      'diff_shift6_flag_imbalance_size',
                      'shift12_imb1_auc_ask_size_auc_bid_size',
                      'div_shift12_price_pressure_v2',
                      'shift6_flag_imbalance_size',
                      'diff_shift3_imb1_auc_ask_size_auc_bid_size',
                      'div_shift12_flag_imbalance_size',
                      'shift12_flag_imbalance_size'])

    # "global_<col>": sum(col * weight) / sum(weight) over all rows sharing
    # the same (date_id, seconds_in_bucket).
    add_cols = []
    for col in ['imb1_ask_price_mid_price',
                'market_urgency',
                'market_urgency_diff_1',
                'imb1_ask_money_bid_money',
                'rolling18_mean_imb1_ask_size_all_bid_size_all',
                'rolling18_mean_imb1_auc_ask_size_auc_bid_size',
                'rolling18_mean_imb1_reference_price_wap',
                'ask_price_diff_3',
                'diff_shift1_price_pressure_v2',
                'diff_shift12_scale_matched_size',
                'diff_shift1_flag_imbalance_size',
                'imb1_ask_size_bid_size',
                'imb1_bid_price_mid_price',
                'rolling48_mean_target_mock_shift6']:
        add_cols.append((((pl.col(col) * pl.col("weight")).sum().over("date_id","seconds_in_bucket"))/(((pl.col("weight")).sum().over("date_id","seconds_in_bucket")))).alias(f"global_{col}"))
        feas_list.append(f"global_{col}")
    df = df.with_columns(add_cols)

    # ------------------------------------------------------------------
    # MACD-style features on three series:
    #   rolling_ewm_<w>_<col> : EWMA with span w
    #   dif_<col>_<w1>_<w2>   : fast EWMA minus slow EWMA
    #   dea_<col>_<w1>_<w2>   : span-9 EWMA of dif
    #   macd_<col>_<w1>_<w2>  : dif minus dea
    # Each step is a separate with_columns because it reads the previous one.
    # ------------------------------------------------------------------
    rsi_cols = ["mid_price_near_far","imb1_reference_price_wap","near_price",]
    span_pairs = list(zip((3,6,12,24),(6,12,24,48)))

    df = df.with_columns([
        pl.col(col).ewm_mean(span=window_size, adjust=False).over('stock_id','date_id').alias(f"rolling_ewm_{window_size}_{col}")
        for col in rsi_cols
        for window_size in [3,6,12,24,48]
    ])
    df = df.with_columns([
        (pl.col(f"rolling_ewm_{w1}_{col}") - pl.col(f"rolling_ewm_{w2}_{col}")).alias(f"dif_{col}_{w1}_{w2}")
        for col in rsi_cols
        for w1, w2 in span_pairs
    ])
    df = df.with_columns([
        pl.col(f"dif_{col}_{w1}_{w2}").ewm_mean(span=9, adjust=False).over('stock_id','date_id').alias(f"dea_{col}_{w1}_{w2}")
        for col in rsi_cols
        for w1, w2 in span_pairs
    ])
    df = df.with_columns([
        (pl.col(f"dif_{col}_{w1}_{w2}") - pl.col(f"dea_{col}_{w1}_{w2}")).alias(f"macd_{col}_{w1}_{w2}")
        for col in rsi_cols
        for w1, w2 in span_pairs
    ])
    feas_list.extend(['macd_imb1_reference_price_wap_12_24',
                      'dif_imb1_reference_price_wap_3_6',
                      'macd_mid_price_near_far_12_24',
                      'dif_near_price_3_6',
                      'macd_near_price_24_48',
                      'dea_imb1_reference_price_wap_12_24',
                      'macd_near_price_12_24',
                      'rolling_ewm_24_imb1_reference_price_wap',
                      'dif_near_price_6_12',
                      'dea_mid_price_near_far_6_12',
                      'dea_near_price_24_48',
                      'rolling_ewm_12_imb1_reference_price_wap',
                      'dif_imb1_reference_price_wap_12_24'])

    # Rolling mean/std of the target within each (stock_id, seconds_in_bucket)
    # group. The shift(1) keeps the current row's own target out of its
    # own window, so each row only sees targets from earlier rows of the group.
    add_cols = []
    for col in ["target"]:
        for window_size in [1,2,3,5,10,15,20,25,30,35,40,45,60]:
            add_cols.append(pl.col(col).shift(1).rolling_mean(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_mean_{window_size}_{col}_second'))
            add_cols.append(pl.col(col).shift(1).rolling_std(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_std_{window_size}_{col}_second'))
            feas_list.extend([f'rolling_mean_{window_size}_{col}_second',f'rolling_std_{window_size}_{col}_second',])
    df = df.with_columns(add_cols)

    return df.to_pandas(), feas_list



In [7]:
# First date_id held out for validation; earlier days are used for training.
VALID_START_DATE_ID = 390

train_feas_all, feas_list = generate_features_no_hist_polars(train)

# Fail early if a selected feature was not actually produced.
missing_feas = [name for name in feas_list if name not in train_feas_all.columns]
assert not missing_feas, f"features missing from engineered frame: {missing_feas}"

# Split by day so validation rows are strictly later than training rows.
valid_feas = train_feas_all[train_feas_all['date_id'] >= VALID_START_DATE_ID]
train_feas = train_feas_all[train_feas_all['date_id'] < VALID_START_DATE_ID]

# (training rows, number of model features) - computed from lengths so the
# feature sub-frame is not copied just to read its shape.
print((len(train_feas), len(feas_list)))

  add_cols.append(pl.col(col).rolling_mean(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_mean_{col}'))
  add_cols.append(pl.col(col).rolling_std(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_std_{col}'))
  add_cols.append(pl.col(col).rolling_mean(window_size=window,min_periods=1).over('stock_id','date_id').alias(f'rolling{window}_mean_{col}'))
  add_cols.append(pl.col(col).shift(1).rolling_mean(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_mean_{window_size}_{col}_second'))
  add_cols.append(pl.col(col).shift(1).rolling_std(window_size=window_size,min_periods=1).over('stock_id','seconds_in_bucket').alias(f'rolling_std_{window_size}_{col}_second'))


(4236893, 157)


In [8]:
from tqdm.auto import tqdm

# Replace missing values in both splits with a large negative sentinel.
train_feas = train_feas.fillna(-9e10)
valid_feas = valid_feas.fillna(-9e10)

# Bound every model feature to [-9e9, 9e9] in both splits.
for feature in tqdm(feas_list):
    for frame in (train_feas, valid_feas):
        frame[feature] = frame[feature].clip(lower=-9e9, upper=9e9)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 157/157 [00:03<00:00, 51.69it/s]


In [10]:
# XGBoost hyper-parameters (MAE objective, GPU training).
params = {
    'random_state': 47,
    'learning_rate':0.01,
    'n_estimators':10000,
    'n_jobs':-1,
    'objective':'reg:absoluteerror',
    "device": "gpu",
    'max_depth': 10,
     'min_child_weight': 8.860379669551103,
     'subsample': 0.7711820080525443,
     'colsample_bytree': 0.5348780216605801,
     'reg_alpha': 0.12854342791716195,
     'reg_lambda': 0.39326076062073634,
     'gamma': 0.24378704040107024
}

# n_estimators is only an upper bound: stop once the validation metric has not
# improved for this many rounds, so the cell finishes on its own instead of
# having to be interrupted by hand (which leaves `clf` without a fitted model).
# Kept out of `params` so the dict can still be reused without an eval_set.
EARLY_STOPPING_ROUNDS = 200

clf = xgb.XGBRegressor(**params, early_stopping_rounds=EARLY_STOPPING_ROUNDS)

clf.fit(
    train_feas[feas_list], train_feas['target'],
    eval_set=[(valid_feas[feas_list], valid_feas['target'])],
    verbose=200
)



  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-mae:6.02907
[200]	validation_0-mae:5.91408
[400]	validation_0-mae:5.89466
[600]	validation_0-mae:5.88516
[800]	validation_0-mae:5.88006
[1000]	validation_0-mae:5.87699
[1200]	validation_0-mae:5.87504
[1400]	validation_0-mae:5.87367
[1600]	validation_0-mae:5.87271
[1800]	validation_0-mae:5.87215


KeyboardInterrupt: 