In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit
import joblib
import lightgbm as lgb

import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter

# Silence every warning category for the rest of the session. This is a blanket
# filter, so pandas' SettingWithCopyWarning and library deprecation notices are
# hidden as well.
warnings.filterwarnings("ignore")
# Explicitly ignore pandas' PerformanceWarning too (already covered by the blanket filter above).
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
# Mount Google Drive so the input CSV and the model output path under
# /content/drive used by later cells can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preparation

In [None]:
# Load the training CSV and keep only rows where both `target` and `wap` are present.
# NOTE(review): the name `all` shadows Python's built-in all() for the rest of the
# notebook; later cells read it under this name, so it is kept here.
all = (
    pd.read_csv("/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Input/train.csv")
    .dropna(subset=['target','wap'])
)
# Remaining missing-value count per column.
all.isna().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                   0
imbalance_buy_sell_flag          0
reference_price                  0
matched_size                     0
far_price                  2894122
near_price                 2856960
bid_price                        0
bid_size                         0
ask_price                        0
ask_size                         0
wap                              0
target                           0
time_id                          0
row_id                           0
dtype: int64

In [None]:
# Restrict to rows with date_id above 450, then split chronologically:
# date_id below 475 is the training window, 475 and later is the hold-out window.
all_sample = all.loc[all["date_id"].gt(450)]
train = all_sample.loc[all_sample["date_id"].lt(475)]
test = all_sample.loc[all_sample["date_id"].ge(475)]

# Row/column counts of both splits, one per line.
print(train.shape, test.shape, sep="\n")
train.head()

(264000, 17)
(66000, 17)


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
4907980,0,451,0,101663700.0,-1,1.00157,27901114.55,,,0.999892,56601.0,1.00006,31225.16,1.0,12.749434,24805,451_0_0
4907981,1,451,0,1481195.0,1,0.999733,2922825.96,,,0.999782,20602.0,1.000218,20611.0,1.0,-0.090003,24805,451_0_1
4907982,2,451,0,0.0,0,0.999531,7211807.46,,,0.99948,19685.0,1.000546,20691.3,1.0,5.480051,24805,451_0_2
4907983,3,451,0,5426093.0,-1,1.000054,52742546.23,,,0.999902,10059.75,1.000104,10653.66,1.0,-3.87013,24805,451_0_3
4907984,4,451,0,6102518.0,-1,1.000013,10336604.95,,,0.998472,5346.0,1.000153,535.5,1.0,2.980232,24805,451_0_4


In [None]:
# Per-stock weights, listed positionally: entry i is the weight later looked up
# for stock_id i (imbalance_features maps df["stock_id"] through this table).
# NOTE(review): presumably these are the index weights published with the
# dataset — the source of the numbers is not recorded here; confirm.
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

# Re-key the list as {position: weight} so it can be used with Series.map.
# The name `weights` is rebound from list to dict here.
weights = {int(k):v for k,v in enumerate(weights)}

# Feature Engineering

In [None]:
# Per-stock summary statistics computed from the `train` frame as it exists when
# this cell runs. Each value is a Series indexed by stock_id; other_features()
# later maps them onto rows as "global_<key>" columns.
# NOTE(review): ptp_size only spans bid_size (max - min), and ptp_price is
# max(bid_price) - min(ask_price); the median/std entries combine bid and ask.
# Confirm this asymmetry is intended.
_by_stock = train.groupby("stock_id")
global_stock_id_feats = dict(
    median_size=_by_stock["bid_size"].median() + _by_stock["ask_size"].median(),
    std_size=_by_stock["bid_size"].std() + _by_stock["ask_size"].std(),
    ptp_size=_by_stock["bid_size"].max() - _by_stock["bid_size"].min(),
    median_price=_by_stock["bid_price"].median() + _by_stock["ask_price"].median(),
    std_price=_by_stock["bid_price"].std() + _by_stock["ask_price"].std(),
    ptp_price=_by_stock["bid_price"].max() - _by_stock["ask_price"].min(),
)
del _by_stock  # temporary helper; not part of the notebook namespace

In [None]:
from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Compute (max - mid) / (mid - min) for column triplets of a 2-D array.

    Parameters
    ----------
    df_values : 2-D array, indexed as ``df_values[row, column]``.
    comb_indices : sequence of 3-element column-index triplets.

    Returns
    -------
    Array of shape ``(rows, len(comb_indices))``; entry ``[r, t]`` is the ratio
    for row ``r`` and triplet ``t``, or NaN when the middle and minimum values
    coincide.
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    out = np.empty((n_rows, n_triplets))

    # Triplets are distributed over threads by prange; rows run sequentially.
    for t in prange(n_triplets):
        col_a, col_b, col_c = comb_indices[t]
        for r in range(n_rows):
            x = df_values[r, col_a]
            y = df_values[r, col_b]
            z = df_values[r, col_c]
            hi = max(x, y, z)
            lo = min(x, y, z)
            # The middle value is whatever remains after removing min and max.
            mid = x + y + z - lo - hi
            if mid != lo:
                out[r, t] = (hi - mid) / (mid - lo)
            else:
                # Denominator would be zero.
                out[r, t] = np.nan

    return out


def calculate_triplet_imbalance_numba(price, df):
    """Build triplet-imbalance columns for every 3-combination of ``price``.

    Parameters
    ----------
    price : list of column names present in ``df``.
    df : DataFrame supplying those columns.

    Returns
    -------
    DataFrame with a default index and one column per combination, named
    ``"<a>_<b>_<c>_imb2"``, in ``itertools.combinations`` order.
    """
    # Plain ndarray view of the selected columns, as required by the numba kernel.
    values = df[price].values

    triplets = list(combinations(price, 3))
    # Translate each name triplet into positions within `price`.
    index_triplets = [
        (price.index(first), price.index(second), price.index(third))
        for first, second, third in triplets
    ]

    raw = compute_triplet_imbalance(values, index_triplets)

    names = [f"{first}_{second}_{third}_imb2" for first, second, third in triplets]
    return pd.DataFrame(raw, columns=names)

In [None]:
# generate imbalance features
def imbalance_features(df):
    """Add order-book imbalance, momentum and lag features to ``df``.

    New columns are assigned onto ``df`` itself. The returned object is a new
    frame in which +/-inf values have been replaced by 0.

    Sections:
      V1 - volume, mid price, pairwise price imbalances, triplet imbalances.
      V2 - weighted-WAP momentum, spread/pressure features, row-wise statistics
           over the price and size columns.
      V3 - per-stock shifts, percentage changes and differences over windows
           of 1, 2, 3, 5 and 10 rows.

    NOTE(review): all lagged features group by ``stock_id`` only, so a lag can
    reach back across a ``date_id`` boundary - confirm this is intended.
    """
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Normalised difference for every pair of price columns.
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet imbalances; .values is used so the result is assigned positionally
    # (the helper returns a frame with a fresh default index).
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # V2
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    # Sign of the 5-step mid-price change, computed within each stock. An
    # ungrouped diff would subtract the mid price of a different stock, because
    # rows of different stocks are interleaved in the frame.
    df['mid_price_movement'] = (
        df.groupby('stock_id')['mid_price']
        .diff(periods=5)
        .apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))  # NaN (no history) -> 0
    )
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']

    # Row-wise distribution statistics across the price and size columns.
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # V3
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)

    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size',
                'wap', 'near_price', 'far_price']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    # Ratios above can divide by zero; map the resulting infinities to 0.
    return df.replace([np.inf, -np.inf], 0)

# generate time & stock features
def other_features(df):
    """Add calendar-style and per-stock global columns to ``df`` and return it.

    Adds ``dow`` (date_id mod 5), ``dom`` (date_id mod 20), ``seconds``
    (seconds_in_bucket mod 60), ``minute`` (seconds_in_bucket // 60), and one
    ``global_<key>`` column per entry of ``global_stock_id_feats``, looked up
    by ``stock_id``. Columns are assigned onto ``df`` itself.
    """
    derived = {
        "dow": df["date_id"] % 5,
        "dom": df["date_id"] % 20,
        "seconds": df["seconds_in_bucket"] % 60,
        "minute": df["seconds_in_bucket"] // 60,
    }
    for column, values in derived.items():
        df[column] = values

    # Broadcast each per-stock statistic onto the rows of that stock.
    for key, per_stock in global_stock_id_feats.items():
        lookup = per_stock.to_dict()
        df[f"global_{key}"] = df["stock_id"].map(lookup)

    return df

# generate all features
def generate_all_features(df):
    """Run the full feature pipeline on a raw frame and return model-ready columns.

    ``row_id`` and ``time_id`` are dropped up front; ``date_id`` is used by the
    feature builders and dropped from the result. The ``target`` column, if
    present, is passed through unchanged.

    The input frame is not modified: the feature builders assign columns in
    place, so they are given an explicit copy rather than a column slice of the
    caller's frame (which would be chained assignment on a possible view).
    """
    cols = [c for c in df.columns if c not in ["row_id", "time_id"]]
    df = df[cols].copy()
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()

    feature_name = [i for i in df.columns if i not in ["row_id", "time_id", "date_id"]]

    return df[feature_name]

In [None]:
# Generate data set
# NOTE: `train` and `test` are rebound from the raw splits to their feature
# frames (which no longer contain date_id), so this cell cannot be re-run on
# its own without first re-running the cell that creates the raw splits.
train = generate_all_features(train)
print("Build Train Finished.")
test = generate_all_features(test)
print("Build Test Finished.")

# Separate the label column from the feature columns.
y_train, y_test = train['target'], test['target']
X_train, X_test = train.drop(columns=['target']), test.drop(columns=['target'])

# Standardize: statistics are fitted on the training features only and then
# applied to the test features. Results are wrapped back into DataFrames with
# the original column names (and a fresh default index).
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

Build Train Finished.
Build Test Finished.


In [None]:
# Display the standardized training features for a visual sanity check.
X_train_scaled

Unnamed: 0,stock_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,bid_price,bid_size,ask_price,ask_size,...,dow,dom,seconds,minute,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,-1.723412,-1.70084,2.744234,-1.019090,0.825356,-0.176137,0.220618,-0.021494,0.069364,-0.265601,...,-0.783929,0.185164,-1.423025,-1.547132,0.254522,0.498949,-0.121411,-1.028397,-0.653712,-0.487893
1,-1.706091,-1.70084,-0.214852,1.144705,0.035576,-0.287537,0.171964,-0.266139,0.138022,-0.353967,...,-0.783929,0.185164,-1.423025,-1.547132,-0.584723,-0.446504,-0.155793,-0.590873,1.122376,0.755571
2,-1.688771,-1.70084,-0.258602,0.062807,-0.051270,-0.268409,0.038388,-0.272371,0.280554,-0.353299,...,-0.783929,0.185164,-1.423025,-1.547132,-0.420630,-0.287355,-0.159773,1.514656,-0.241122,-0.132555
3,-1.671450,-1.70084,-0.098332,-1.019090,0.173583,-0.065347,0.225041,-0.337783,0.088484,-0.436865,...,-0.783929,0.185164,-1.423025,-1.547132,-0.354166,-0.496265,-0.339539,1.093923,-0.653399,-0.737550
4,-1.654129,-1.70084,-0.078353,-1.019090,0.155956,-0.254473,-0.407457,-0.369817,0.109777,-0.521101,...,-0.783929,0.185164,-1.423025,-1.547132,-0.406117,-0.558376,-0.391308,-0.023677,-0.876765,-0.999924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263995,1.654129,1.70084,-0.080006,-1.019090,1.226911,-0.044697,1.375921,3.034905,1.183543,3.069554,...,1.386952,0.740656,-1.423025,1.856558,-0.274758,-0.316324,-0.165014,-0.165476,-0.861125,-1.002994
263996,1.671450,1.70084,-0.223781,1.144705,1.303438,-0.182645,1.339210,2.122332,1.208747,-0.467620,...,1.386952,0.740656,-1.423025,1.856558,-0.362182,-0.560625,-0.337133,-0.482658,-0.229730,-0.160911
263997,1.688771,1.70084,-0.240210,-1.019090,-0.970459,-0.091882,-0.884705,2.002697,-1.046554,1.125152,...,1.386952,0.740656,-1.423025,1.856558,-0.455767,-0.563364,-0.252552,-2.127339,0.056662,-0.231804
263998,1.706091,1.70084,-0.168766,1.144705,0.545042,0.906534,0.563847,3.882983,0.442205,7.688680,...,1.386952,0.740656,-1.423025,1.856558,2.655621,1.329000,0.523802,0.026699,-0.700633,-0.347131


In [None]:
# Keep only the columns that have no missing values in the training frame.
# The mask is computed on `train` (unscaled, still containing `target`) and
# applied by column label to both scaled frames.
# NOTE(review): the LightGBM cells further down are built from X_train / X_test,
# not from these filtered, scaled frames.
feature_no_missing = train.notna().all()
X_train_scaled = X_train_scaled.loc[:, feature_no_missing]
X_test_scaled = X_test_scaled.loc[:, feature_no_missing]

# Traditional Feature Engineering

In [None]:
# # Traditional Feature Setting
# features = ['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size','wap']
# target = 'target'

# # Preparing the data
# X_train = train[features]
# y_train = train[target]
# X_test = test[features]
# y_test = test[target]

# # Standardize the data
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
# X_test_scaled = scaler.transform(X_test)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [None]:
# Get Prediction result
def calculate_mae(X_train, X_test, y_train, y_test, model):
    """Return ``(train_mae, test_mae)`` for a fitted model.

    ``model`` only needs a ``predict(X)`` method. Its output is flattened to
    1-D before being compared with the true targets using mean absolute error.
    """
    # Predict on the training features first, then on the test features.
    train_predictions, test_predictions = (
        np.ravel(model.predict(features)) for features in (X_train, X_test)
    )

    train_mae = mean_absolute_error(y_train, train_predictions)
    test_mae = mean_absolute_error(y_test, test_predictions)

    return train_mae, test_mae

# Modeling

In [None]:
# Wrap the (unscaled) feature frames for LightGBM. The test set references the
# training set so both share the same feature binning.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

In [None]:
# LightGBM training parameters.
# `target` is a continuous value and the model is evaluated with mean absolute
# error, so this must be a regression setup. A 'binary' objective would reduce
# the label to positive/negative and predict probabilities in [0, 1], which are
# not comparable to the real-valued target.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',  # optimise absolute error directly
    'metric': 'l1',                # report MAE on the validation set
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}


In [None]:
# Train for a fixed number of boosting rounds, reporting the configured
# metric(s) on the hold-out set. No early stopping is configured, so all
# rounds are always run.
num_round = 1000
lgb_model = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[test_data],
)


[LightGBM] [Info] Number of positive: 129509, number of negative: 134491
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.313116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32659
[LightGBM] [Info] Number of data points in the train set: 264000, number of used features: 147
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490564 -> initscore=-0.037747
[LightGBM] [Info] Start training from score -0.037747


In [None]:
# Persist the trained booster to Google Drive in LightGBM's text format
# (requires the drive to be mounted).
lgb_model.save_model('/content/drive/MyDrive/Desktop/Advanced Analytics Edge/Project/Individual Work/Output/lgb_model.txt')

<lightgbm.basic.Booster at 0x7a3de37789a0>

In [None]:
# Raw model outputs for the hold-out features.
y_pred = lgb_model.predict(X_test)
# NOTE: y_pred is not reused below; calculate_mae() runs predict() itself.
# The target is continuous, so evaluate with an error metric such as MAE.

In [None]:
# Calculate MAE for the LightGBM model on the training and hold-out sets
lgb_train_mae, lgb_test_mae = calculate_mae(X_train, X_test, y_train, y_test, lgb_model)
print("Train MAE: ", lgb_train_mae)
print("Test MAE: ", lgb_test_mae)

Train MAE:  6.04354193654035
Test MAE:  5.36448578900237
