In [1]:
import gc  # Garbage collection for memory management
import os  # Operating system-related functions
import time  # Time-related functions
import warnings  # Handling warnings
from itertools import combinations  # For creating combinations of elements
from warnings import simplefilter  # Simplifying warning handling

# 📦 Importing machine learning libraries
import joblib  # For saving and loading models
import lightgbm as lgb  # LightGBM gradient boosting framework
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
from sklearn.metrics import mean_absolute_error  # Metric for evaluation
from sklearn.model_selection import KFold, TimeSeriesSplit  # Cross-validation techniques

# 🤐 Silence every warning category globally, and pandas PerformanceWarning explicitly.
# NOTE(review): the blanket "ignore" also hides deprecation warnings raised by the ML libraries.
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# 📊 Define flags and variables
# NOTE(review): the is_* flags and max_lookback are not read by any cell visible in this notebook.
is_offline = True  # Flag for online/offline mode
is_train = True  # Flag for training mode
is_infer = True  # Flag for inference mode
max_lookback = np.nan  # Maximum lookback (not specified)
split_day = 477  # Rows with date_id <= split_day are used for training; later days are held out


  from .autonotebook import tqdm as notebook_tqdm


## 📊 Data Loading and Preprocessing 📊

In [2]:
# 📂 Load the training CSV, keep only the rows that have a target value,
# and renumber the index from zero.
df = (
    pd.read_csv("optiver-trading-at-the-close/train.csv")
    .dropna(subset=["target"])
    .reset_index(drop=True)
)

# 📏 Record the (rows, columns) shape of the cleaned frame
df_shape = df.shape


## 🚀 Memory Optimization Function with Data Type Conversion 🧹

In [3]:
# 🧹 Function to reduce memory usage of a Pandas DataFrame
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the non-object columns of a DataFrame to smaller dtypes.

    Integer columns (dtype name starting with "int") are narrowed to the first
    of int8 / int16 / int32 / int64 whose range strictly contains the column's
    min and max. Every other non-object column is cast to float32. Object
    columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this same object.
    verbose : int or bool, default 0
        When truthy, print the memory usage before and after, and the
        percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    # 📏 Memory footprint (MB) before any conversion
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate integer widths, tried from narrowest to widest
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    # 🔄 Visit each column and pick its target dtype
    for col in df.columns:
        col_type = df[col].dtype

        # Object columns (e.g. strings) are skipped entirely
        if col_type == object:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Strict comparisons: a value sitting exactly on a type's limit is
            # pushed to the next wider type.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Everything that is neither object nor int ends up as float32
            # (float16 is deliberately not used).
            df[col] = df[col].astype(np.float32)

    # ℹ️ Report the saving. print() is used because no logger is defined in this notebook.
    if verbose:
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        # Guard the percentage against a zero-byte starting size
        decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Decreased by {decrease:.2f}%")

    # 🔄 Return the DataFrame with optimized memory usage
    return df


In [4]:
# 🏎️ Import Numba for just-in-time (JIT) compilation and parallel processing
from numba import njit, prange

# 📊 Function to compute triplet imbalance in parallel using Numba
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """
    Compute (max - mid) / (mid - min) for every row and every column triplet.

    df_values is indexed as a 2-D array [row, column]; comb_indices is a
    sequence of 3-element column-index groups. The result has one row per
    input row and one column per triplet. Where mid equals min the entry is
    NaN instead of dividing by zero.
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    result = np.empty((n_rows, n_triplets))

    # 🔁 Triplets are distributed across threads; rows are walked sequentially
    for t in prange(n_triplets):
        a, b, c = comb_indices[t]

        for r in range(n_rows):
            x = df_values[r, a]
            y = df_values[r, b]
            z = df_values[r, c]

            hi = max(x, y, z)
            lo = min(x, y, z)
            # The middle value is whatever remains after removing min and max
            mid = x + y + z - lo - hi

            if mid != lo:
                result[r, t] = (hi - mid) / (mid - lo)
            else:
                # 🚫 Denominator would be zero
                result[r, t] = np.nan

    return result

# 📈 Function to calculate triplet imbalance for given price data and a DataFrame
def calculate_triplet_imbalance_numba(price, df):
    """
    Build the triplet-imbalance feature frame for the columns named in `price`.

    Every 3-combination of `price` (in itertools.combinations order) yields one
    output column named "<a>_<b>_<c>_imb2". The returned DataFrame has a fresh
    default index, not the index of `df`.
    """
    # Enumerate the triplets once; both the index tuples and the column labels derive from them
    triplets = list(combinations(price, 3))

    # Plain ndarray view of the selected columns, in the order given by `price`
    values = df[price].values

    # Translate each name triplet into positions within `price`
    comb_indices = [tuple(price.index(name) for name in triplet) for triplet in triplets]

    # Delegate the row-wise arithmetic to the Numba kernel
    features_array = compute_triplet_imbalance(values, comb_indices)

    labels = ["{}_{}_{}_imb2".format(*triplet) for triplet in triplets]
    return pd.DataFrame(features_array, columns=labels)


In [5]:
# 📊 Function to generate imbalance features
def imbalance_features(df):
    """
    Add order-book imbalance features to ``df`` and return the result.

    New columns are assigned directly onto the frame that is passed in. The
    returned object is the output of ``DataFrame.replace`` with +inf / -inf
    mapped to 0; NaN values are left as they are.

    Columns read by this function: the six names in ``prices``, the four names
    in ``sizes``, plus ``stock_id`` and ``imbalance_buy_sell_flag``.

    The order in which columns are created here determines the column order of
    the returned frame.
    """
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1 features
    # Calculate various features using Pandas eval function
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    # A zero ask_size produces +/-inf here; those are zeroed by the final replace()
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    
    # Create features for pairwise price imbalances: (a - b) / (a + b) for each unordered pair
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Calculate triplet imbalance features using the Numba-optimized function,
    # once for a four-price subset and once for the size columns
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        # .values is used so the assignment is positional rather than index-aligned
        df[triplet_feature.columns] = triplet_feature.values
        
    # V2 features
    # Calculate additional features
    # NOTE(review): the per-stock diff/shift features below assume rows are in
    # chronological order within each stock_id — confirm against the input data.
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    # Calculate various statistical aggregation features, row-wise across the
    # price columns and across the size columns
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        
    # V3 features
    # Calculate shifted and return features for specific columns
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    # Replace infinite values with 0 (returns a new frame; NaN is not touched)
    return df.replace([np.inf, -np.inf], 0)

# 📅 Function to generate time and stock-related features
def other_features(df):
    """
    Add calendar-style and per-stock lookup columns to `df` in place.

    Creates "dow" (date_id modulo 5), "seconds" (seconds_in_bucket modulo 60),
    "minute" (seconds_in_bucket floor-divided by 60), and one "global_<key>"
    column for every entry of the notebook-level `global_stock_id_feats` dict,
    looked up by stock_id. Returns the same frame.
    """
    bucket_seconds = df["seconds_in_bucket"]

    df["dow"] = df["date_id"] % 5  # Position within a 5-day cycle
    df["seconds"] = bucket_seconds % 60  # Seconds within the minute
    df["minute"] = bucket_seconds // 60  # Whole minutes elapsed

    # Attach each per-stock statistic by looking up the row's stock_id
    stock_ids = df["stock_id"]
    for feat_key, per_stock_values in global_stock_id_feats.items():
        lookup = per_stock_values.to_dict()
        df[f"global_{feat_key}"] = stock_ids.map(lookup)

    return df

# 🚀 Function to generate all features by combining imbalance and other features
def generate_all_features(df):
    """
    Run the full feature pipeline on `df` and return only the model inputs.

    row_id, time_id and target are dropped before feature generation;
    imbalance_features and other_features are then applied in that order.
    The returned frame additionally omits date_id.
    """
    # Columns that must not be fed into feature generation
    dropped_inputs = ("row_id", "time_id", "target")
    df = df[[name for name in df.columns if name not in dropped_inputs]]

    # Imbalance features first, then the time / per-stock lookup features
    df = other_features(imbalance_features(df))
    gc.collect()  # Release intermediates created while building features

    # Identifier-like columns are excluded from the final feature set
    dropped_outputs = ("row_id", "target", "time_id", "date_id")
    feature_name = [name for name in df.columns if name not in dropped_outputs]

    return df[feature_name]


In [6]:
# Table of 200 hard-coded weights, one per position 0..199.
# NOTE(review): presumably one weight per stock_id — confirm the source of
# these values. No visible cell reads this table, and the name `weights` is
# rebound to a 3-element list in a later cell.
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

# Re-key the list as {position: weight} so entries can be looked up by integer id
weights = {int(k):v for k,v in enumerate(weights)}

## Data Splitting

In [7]:
# Time-based hold-out: days up to and including split_day train the models,
# every later day is kept for validation.
df_train = df.loc[df["date_id"].le(split_day)]
df_valid = df.loc[df["date_id"].gt(split_day)]

# Report the mode and the resulting frame shapes
print("Offline mode")
print(f"train : {df_train.shape}, valid : {df_valid.shape}")

Offline mode
train : (5204892, 17), valid : (33000, 17)


In [8]:
# Per-stock summary statistics, computed from the training rows only.
# The groupby is built once and reused for every statistic.
_by_stock = df_train.groupby("stock_id")
global_stock_id_feats = {
    "median_size": _by_stock["bid_size"].median() + _by_stock["ask_size"].median(),
    "std_size": _by_stock["bid_size"].std() + _by_stock["ask_size"].std(),
    "ptp_size": _by_stock["bid_size"].max() - _by_stock["bid_size"].min(),
    "median_price": _by_stock["bid_price"].median() + _by_stock["ask_price"].median(),
    "std_price": _by_stock["bid_price"].std() + _by_stock["ask_price"].std(),
    # NOTE(review): mixes bid max with ask min, unlike ptp_size which uses bid
    # for both ends — confirm this is intentional before changing it.
    "ptp_price": _by_stock["bid_price"].max() - _by_stock["ask_price"].min(),
}
del _by_stock

# Build the feature matrices; only the validation features are downcast afterwards.
df_train_feats = generate_all_features(df_train)
print("Build Train Feats Finished.")
df_valid_feats = generate_all_features(df_valid)
print("Build Valid Feats Finished.")
df_valid_feats = reduce_mem_usage(df_valid_feats)

Build Train Feats Finished.
Build Valid Feats Finished.


## Cross-Validation Strategy 📊

In [11]:
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.metrics import mean_absolute_error
import gc

# Assuming df_train_feats and df_train are already defined and df_train contains the 'date_id' column

# Set up parameters for LightGBM, Xgboost, Catboost

# LightGBM: MAE objective, trained on GPU.
# NOTE(review): LightGBM reportedly applies `subsample` only when
# `subsample_freq` is > 0; it is not set here — confirm whether row
# subsampling was intended.
lgb_params = {
    "objective": "mae",
    "n_estimators": 5000,
    "num_leaves": 256,
    "subsample": 0.6,
    "colsample_bytree": 0.6,
    "learning_rate": 0.00871,
    "n_jobs": 4,
    "device": "gpu",
    "verbosity": -1,
    "importance_type": "gain",
}

# XGBoost: squared-error objective (the other two models optimize MAE, and
# folds are scored with MAE).
# NOTE(review): "gpu_hist" / "gpu_id" are deprecated in recent XGBoost releases
# in favour of tree_method="hist" with device="cuda" — verify against the
# installed version.
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 5000,
    "max_depth": 6,
    "subsample": 0.6,
    "colsample_bytree": 0.6,
    "learning_rate": 0.00871,
    "n_jobs": 4,
    "tree_method": "gpu_hist",  
    "gpu_id": 0  
}

# CatBoost: MAE loss on GPU with Bernoulli row subsampling.
cb_params = {
    "loss_function": "MAE",
    "iterations": 5000,
    "depth": 8,
    "learning_rate": 0.00871,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.6,
    "task_type": "GPU",
    "devices": "0",  # Set the GPU device(s) you want to use
    "verbose": False,
}


# Every column of the training feature frame is used as a model input
feature_name = list(df_train_feats.columns)
print(f"Feature length = {len(feature_name)}")

# date_id is treated as spanning 480 days, cut into 5 contiguous folds of 96 days.
# NOTE: `gap` is declared here, but the purge band actually applied around each
# fold edge is hard-coded in the fold loop rather than derived from it.
num_folds = 5
fold_size = 480 // num_folds
gap = 5

# Fitted models per library (fold models first, final model appended later)
lgb_models = []
xgb_models = []
cat_models = []

# Per-fold validation MAE of the equal-weight three-model blend
scores = []

# Output directories for the saved models. exist_ok=True makes creation
# idempotent across re-runs and avoids the check-then-create race.
lgb_save_path = 'ensemble_lgb'  # Directory to save models
os.makedirs(lgb_save_path, exist_ok=True)
xgb_save_path = 'ensemble_xgb'  # Directory to save models
os.makedirs(xgb_save_path, exist_ok=True)
cat_save_path = 'ensemble_cat'  # Directory to save models
os.makedirs(cat_save_path, exist_ok=True)

# df_train_feats no longer carries date_id, so fold membership is derived from df_train
date_ids = df_train['date_id'].values

for i in range(num_folds):
    # Date window [start, end) that this fold validates on
    start = i * fold_size
    end = start + fold_size

    # Purge a band of two days either side of each fold edge so that rows
    # adjacent to the train/validation boundary are used by neither side
    purged_before_start = start - 2
    purged_before_end = start + 2
    purged_after_start = end - 2
    purged_after_end = end + 2

    purged_set = ((date_ids >= purged_before_start) & (date_ids <= purged_before_end)) | \
                 ((date_ids >= purged_after_start) & (date_ids <= purged_after_end))

    # Validation rows: inside the window and not purged.
    # Training rows: everything else that is not purged.
    test_indices = (date_ids >= start) & (date_ids < end) & ~purged_set
    train_indices = ~test_indices & ~purged_set

    df_fold_train = df_train_feats[train_indices]
    df_fold_train_target = df_train['target'][train_indices]
    df_fold_valid = df_train_feats[test_indices]
    df_fold_valid_target = df_train['target'][test_indices]

    print(f"Fold {i+1} Model Training")

    # --- LightGBM -----------------------------------------------------------
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        df_fold_train[feature_name],
        df_fold_train_target,
        eval_set=[(df_fold_valid[feature_name], df_fold_valid_target)],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ],
    )

    # --- XGBoost ------------------------------------------------------------
    # Early stopping is configured on the estimator: passing
    # early_stopping_rounds to fit() is deprecated and no longer accepted by
    # newer XGBoost releases. It is added here rather than to xgb_params so the
    # final full-data fit (which has no eval_set) is unaffected.
    xgb_model = xgb.XGBRegressor(**xgb_params, early_stopping_rounds=100)
    xgb_model.fit(
        df_fold_train[feature_name],
        df_fold_train_target,
        eval_set=[(df_fold_valid[feature_name], df_fold_valid_target)],
        verbose=100,
    )

    # --- CatBoost -----------------------------------------------------------
    cb_model = cb.CatBoostRegressor(**cb_params)
    cb_model.fit(
        df_fold_train[feature_name],
        df_fold_train_target,
        eval_set=(df_fold_valid[feature_name], df_fold_valid_target),
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    # Keep the fitted fold models for later blending
    lgb_models.append(lgb_model)
    xgb_models.append(xgb_model)
    cat_models.append(cb_model)

    # Persist each fold model to its library-specific directory
    model_filename = os.path.join(lgb_save_path, f'doblez_{i+1}.txt')
    lgb_model.booster_.save_model(model_filename)
    print(f"LightGBM for fold {i+1} saved to {model_filename}")

    model_filename = os.path.join(xgb_save_path, f'doblez_{i+1}.txt')
    xgb_model.save_model(model_filename)
    print(f"Xgboost for fold {i+1} saved to {model_filename}")

    model_filename = os.path.join(cat_save_path, f'doblez_{i+1}.cbm')
    cb_model.save_model(model_filename)
    print(f"Catboost for fold {i+1} saved to {model_filename}")

    # Score the equal-weight blend of the three models on this fold's validation rows
    fold_predictions = (
        lgb_model.predict(df_fold_valid[feature_name])
        + xgb_model.predict(df_fold_valid[feature_name])
        + cb_model.predict(df_fold_valid[feature_name])
    ) / 3
    # Arguments in the documented (y_true, y_pred) order; MAE is symmetric so the value is unchanged
    fold_score = mean_absolute_error(df_fold_valid_target, fold_predictions)
    scores.append(fold_score)
    print(f"Fold {i+1} MAE: {fold_score}")

    # Free up memory by deleting fold specific variables
    del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
    gc.collect()

# Calculate the average best iteration from all regular folds
# NOTE(review): the three libraries may not index "best iteration" the same way
# (count vs. zero-based index) — verify before using these as tree counts.
average_best_iteration_lgb = int(np.mean([model.best_iteration_ for model in lgb_models]))
average_best_iteration_xgb = int(np.mean([model.best_iteration for model in xgb_models]))
average_best_iteration_cat = int(np.mean([model.get_best_iteration() for model in cat_models]))

# Copy each parameter dict and replace the iteration budget with the fold average,
# since the final fits below have no validation set to early-stop on
final_lgb_params = lgb_params.copy()
final_lgb_params['n_estimators'] = average_best_iteration_lgb
print(f"Training final LightGBM with average best iteration: {average_best_iteration_lgb}")

final_xgb_params = xgb_params.copy()
final_xgb_params['n_estimators'] = average_best_iteration_xgb
print(f"Training final Xgboost with average best iteration: {average_best_iteration_xgb}")

final_cat_params = cb_params.copy()
final_cat_params['iterations'] = average_best_iteration_cat
print(f"Training final Catboost with average best iteration: {average_best_iteration_cat}")

# Train the final models on all of df_train (no eval_set, no early stopping)
final_lgb = lgb.LGBMRegressor(**final_lgb_params)
final_lgb.fit(
    df_train_feats[feature_name],
    df_train['target'],
    callbacks=[
        lgb.callback.log_evaluation(period=100),
    ],
)

final_xgb = xgb.XGBRegressor(**final_xgb_params)
final_xgb.fit(
    df_train_feats[feature_name],
    df_train['target'],
    verbose=100,
)

final_cb = cb.CatBoostRegressor(**final_cat_params)
final_cb.fit(
    df_train_feats[feature_name],
    df_train['target'],
    verbose_eval=100,
)
# Append the final models, so each list holds the fold models followed by the full-data model
lgb_models.append(final_lgb)
xgb_models.append(final_xgb)
cat_models.append(final_cb)

# Save the final models to files
final_model_filename = os.path.join(lgb_save_path, 'doblez-conjunto.txt')
final_lgb.booster_.save_model(final_model_filename)
print(f"Final LightGBM saved to {final_model_filename}")

final_model_filename = os.path.join(xgb_save_path, 'doblez-conjunto.txt')
final_xgb.save_model(final_model_filename)
print(f"Final Xgboost saved to {final_model_filename}")

# NOTE(review): the per-fold CatBoost files use a .cbm extension while this one
# uses .txt; save_model is called without a format argument in both places —
# confirm that whatever loads this file expects the same format.
final_model_filename = os.path.join(cat_save_path, 'doblez-conjunto.txt')
final_cb.save_model(final_model_filename)
print(f"Final Catboost saved to {final_model_filename}")

# lgb_models / xgb_models / cat_models now hold the trained models;
# 'scores' holds the per-fold validation MAE of the three-model blend
print(f"Average MAE across all folds: {np.mean(scores)}")


Feature length = 112
Fold 1 Model Training
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 5.60875
[200]	valid_0's l1: 5.57862
[300]	valid_0's l1: 5.56612
[400]	valid_0's l1: 5.55781
[500]	valid_0's l1: 5.55293
[600]	valid_0's l1: 5.54946
[700]	valid_0's l1: 5.54697
[800]	valid_0's l1: 5.54533
[900]	valid_0's l1: 5.54431
[1000]	valid_0's l1: 5.54356
[1100]	valid_0's l1: 5.54301
[1200]	valid_0's l1: 5.54246
[1300]	valid_0's l1: 5.54225
[1400]	valid_0's l1: 5.54192
[1500]	valid_0's l1: 5.54165
[1600]	valid_0's l1: 5.54144
[1700]	valid_0's l1: 5.54128
[1800]	valid_0's l1: 5.541
[1900]	valid_0's l1: 5.54078
[2000]	valid_0's l1: 5.54063
[2100]	valid_0's l1: 5.54049
[2200]	valid_0's l1: 5.54031
[2300]	valid_0's l1: 5.54018
[2400]	valid_0's l1: 5.54014
[2500]	valid_0's l1: 5.53999
[2600]	valid_0's l1: 5.53978
[2700]	valid_0's l1: 5.53974
[2800]	valid_0's l1: 5.53964
[2900]	valid_0's l1: 5.53954
[3000]	valid_0's l1: 5.53955
Early stopping, best iteration is:
[

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 6.5532847	test: 5.7210310	best: 5.7210310 (0)	total: 14ms	remaining: 1m 10s
100:	learn: 6.5461134	test: 5.7127732	best: 5.7127732 (100)	total: 1.36s	remaining: 1m 5s
200:	learn: 6.5395023	test: 5.7051515	best: 5.7051515 (200)	total: 2.69s	remaining: 1m 4s
300:	learn: 6.5333853	test: 5.6981032	best: 5.6981032 (300)	total: 4.02s	remaining: 1m 2s
400:	learn: 6.5276724	test: 5.6915379	best: 5.6915379 (400)	total: 5.4s	remaining: 1m 1s
500:	learn: 6.5223789	test: 5.6854254	best: 5.6854254 (500)	total: 6.75s	remaining: 1m
600:	learn: 6.5174648	test: 5.6797400	best: 5.6797400 (600)	total: 8.1s	remaining: 59.3s
700:	learn: 6.5128598	test: 5.6744501	best: 5.6744501 (700)	total: 9.46s	remaining: 58s
800:	learn: 6.5085917	test: 5.6695006	best: 5.6695006 (800)	total: 10.9s	remaining: 57s
900:	learn: 6.5045825	test: 5.6648737	best: 5.6648737 (900)	total: 12.3s	remaining: 55.8s
1000:	learn: 6.5008501	test: 5.6605547	best: 5.6605547 (1000)	total: 13.6s	remaining: 54.5s
1100:	learn: 6.497337

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 6.1763583	test: 7.1436647	best: 7.1436647 (0)	total: 14.2ms	remaining: 1m 11s
100:	learn: 6.1692258	test: 7.1343001	best: 7.1343001 (100)	total: 1.34s	remaining: 1m 5s
200:	learn: 6.1626816	test: 7.1255808	best: 7.1255808 (200)	total: 2.69s	remaining: 1m 4s
300:	learn: 6.1566456	test: 7.1174311	best: 7.1174311 (300)	total: 4.03s	remaining: 1m 2s
400:	learn: 6.1510480	test: 7.1098451	best: 7.1098451 (400)	total: 5.41s	remaining: 1m 1s
500:	learn: 6.1458728	test: 7.1026966	best: 7.1026966 (500)	total: 6.75s	remaining: 1m
600:	learn: 6.1410738	test: 7.0960666	best: 7.0960666 (600)	total: 8.1s	remaining: 59.3s
700:	learn: 6.1366296	test: 7.0898827	best: 7.0898827 (700)	total: 9.49s	remaining: 58.2s
800:	learn: 6.1325095	test: 7.0840566	best: 7.0840566 (800)	total: 10.9s	remaining: 57.1s
900:	learn: 6.1286600	test: 7.0786193	best: 7.0786193 (900)	total: 12.3s	remaining: 55.8s
1000:	learn: 6.1250908	test: 7.0735231	best: 7.0735231 (1000)	total: 13.6s	remaining: 54.4s
1100:	learn: 6

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 6.3730717	test: 6.4454254	best: 6.4454254 (0)	total: 13.4ms	remaining: 1m 7s
100:	learn: 6.3650941	test: 6.4387452	best: 6.4387452 (100)	total: 1.37s	remaining: 1m 6s
200:	learn: 6.3577464	test: 6.4326086	best: 6.4326086 (200)	total: 2.71s	remaining: 1m 4s
300:	learn: 6.3509324	test: 6.4269339	best: 6.4269339 (300)	total: 4.1s	remaining: 1m 4s
400:	learn: 6.3446219	test: 6.4216949	best: 6.4216949 (400)	total: 5.45s	remaining: 1m 2s
500:	learn: 6.3387694	test: 6.4168515	best: 6.4168515 (500)	total: 6.79s	remaining: 1m
600:	learn: 6.3333459	test: 6.4123726	best: 6.4123726 (600)	total: 8.13s	remaining: 59.5s
700:	learn: 6.3282984	test: 6.4082194	best: 6.4082194 (700)	total: 9.51s	remaining: 58.3s
800:	learn: 6.3235842	test: 6.4043700	best: 6.4043700 (800)	total: 10.9s	remaining: 56.9s
900:	learn: 6.3191915	test: 6.4007902	best: 6.4007902 (900)	total: 12.2s	remaining: 55.6s
1000:	learn: 6.3151036	test: 6.3974794	best: 6.3974794 (1000)	total: 13.6s	remaining: 54.2s
1100:	learn: 6.

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 6.4095746	test: 6.4333852	best: 6.4333852 (0)	total: 13.8ms	remaining: 1m 8s
100:	learn: 6.4016063	test: 6.4267400	best: 6.4267400 (100)	total: 1.33s	remaining: 1m 4s
200:	learn: 6.3942602	test: 6.4206584	best: 6.4206584 (200)	total: 2.64s	remaining: 1m 2s
300:	learn: 6.3874466	test: 6.4150735	best: 6.4150735 (300)	total: 3.95s	remaining: 1m 1s
400:	learn: 6.3811253	test: 6.4099232	best: 6.4099232 (400)	total: 5.32s	remaining: 1m 1s
500:	learn: 6.3752520	test: 6.4051602	best: 6.4051602 (500)	total: 6.65s	remaining: 59.7s
600:	learn: 6.3697880	test: 6.4007708	best: 6.4007708 (600)	total: 7.99s	remaining: 58.5s
700:	learn: 6.3646962	test: 6.3966988	best: 6.3966988 (700)	total: 9.31s	remaining: 57.1s
800:	learn: 6.3599786	test: 6.3929665	best: 6.3929665 (800)	total: 10.7s	remaining: 56.2s
900:	learn: 6.3555561	test: 6.3894785	best: 6.3894785 (900)	total: 12.1s	remaining: 55s
1000:	learn: 6.3514451	test: 6.3862583	best: 6.3862583 (1000)	total: 13.4s	remaining: 53.7s
1100:	learn: 

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 6.4948223	test: 6.0786163	best: 6.0786163 (0)	total: 12.9ms	remaining: 1m 4s
100:	learn: 6.4869205	test: 6.0721874	best: 6.0721874 (100)	total: 1.31s	remaining: 1m 3s
200:	learn: 6.4796156	test: 6.0662869	best: 6.0662869 (200)	total: 2.62s	remaining: 1m 2s
300:	learn: 6.4728370	test: 6.0608531	best: 6.0608531 (300)	total: 3.98s	remaining: 1m 2s
400:	learn: 6.4665477	test: 6.0558288	best: 6.0558288 (400)	total: 5.31s	remaining: 1m
500:	learn: 6.4607125	test: 6.0512211	best: 6.0512211 (500)	total: 6.63s	remaining: 59.5s
600:	learn: 6.4552716	test: 6.0469452	best: 6.0469452 (600)	total: 7.96s	remaining: 58.3s
700:	learn: 6.4502071	test: 6.0429904	best: 6.0429904 (700)	total: 9.35s	remaining: 57.4s
800:	learn: 6.4455216	test: 6.0393673	best: 6.0393673 (800)	total: 10.7s	remaining: 56.1s
900:	learn: 6.4411166	test: 6.0359695	best: 6.0359695 (900)	total: 12.1s	remaining: 54.9s
1000:	learn: 6.4370207	test: 6.0328315	best: 6.0328315 (1000)	total: 13.4s	remaining: 53.6s
1100:	learn: 6

Default metric period is 5 because MAE is/are not implemented for GPU


0:	learn: 6.4145934	total: 15.5ms	remaining: 1m 17s
100:	learn: 6.4069794	total: 1.64s	remaining: 1m 19s
200:	learn: 6.3999672	total: 3.31s	remaining: 1m 19s
300:	learn: 6.3934610	total: 4.93s	remaining: 1m 16s
400:	learn: 6.3874444	total: 6.6s	remaining: 1m 15s
500:	learn: 6.3818396	total: 8.22s	remaining: 1m 13s
600:	learn: 6.3766564	total: 9.89s	remaining: 1m 12s
700:	learn: 6.3718125	total: 11.6s	remaining: 1m 10s
800:	learn: 6.3673229	total: 13.2s	remaining: 1m 9s
900:	learn: 6.3631161	total: 14.9s	remaining: 1m 7s
1000:	learn: 6.3592032	total: 16.6s	remaining: 1m 6s
1100:	learn: 6.3555436	total: 18.3s	remaining: 1m 4s
1200:	learn: 6.3521164	total: 20s	remaining: 1m 3s
1300:	learn: 6.3489202	total: 21.8s	remaining: 1m 1s
1400:	learn: 6.3459415	total: 23.5s	remaining: 1m
1500:	learn: 6.3431622	total: 25.2s	remaining: 58.8s
1600:	learn: 6.3405262	total: 26.9s	remaining: 57.1s
1700:	learn: 6.3380773	total: 28.6s	remaining: 55.5s
1800:	learn: 6.3357695	total: 30.3s	remaining: 53.8s
19

## Predict

Ensemble

In [12]:
def zero_sum(prices, volumes):
    """
    Shift `prices` so that they sum to zero, spreading the correction in
    proportion to sqrt(volumes).

    Each element is reduced by sqrt(volume) * sum(prices) / sum(sqrt(volumes)).
    """
    root_volume = np.sqrt(volumes)
    # Correction per unit of sqrt(volume)
    per_unit = np.sum(prices) / np.sum(root_volume)
    return prices - root_volume * per_unit


def predict(pred_models):
    """
    Blend the given models on the validation features and post-process.

    Every model in `pred_models` receives the same weight (1 / number of
    models). The blended vector is passed through `zero_sum`, using
    bid_size + ask_size from `df_valid` as the volumes, and the result is
    clipped to [-64, 64].

    Reads the notebook-level `df_valid` and `df_valid_feats`.
    """
    lower_bound, upper_bound = -64, 64

    # Equal weight per model; an empty model list raises ZeroDivisionError here
    equal_weight = 1 / len(pred_models)

    # Accumulate the weighted predictions model by model
    blended = np.zeros(len(df_valid))
    for member in pred_models:
        blended += equal_weight * member.predict(df_valid_feats)

    total_size = df_valid['bid_size'] + df_valid['ask_size']
    adjusted = zero_sum(blended, total_size)

    return np.clip(adjusted, lower_bound, upper_bound)


# Equal-weight average of the three model families, scored with MAE on the hold-out days.
# Each predict() call itself averages every model in the given list.
ensemble_predictions = (predict(lgb_models) + predict(xgb_models) + predict(cat_models))/3
final_score = mean_absolute_error(ensemble_predictions, df_valid['target'])

print(f'Final score: {final_score}')

Final score: 5.192273304789406


In [18]:
# Hand-picked blend weights in the order [LightGBM, XGBoost, CatBoost].
# NOTE(review): this rebinds `weights`, which an earlier cell defined as a
# 200-entry dict; the earlier table is no longer reachable after this cell runs.
weights = [0.6, 0.2, 0.2]

# Weighted average across the three prediction vectors (axis 0 = model family)
ensemble_predictions_1 = np.average([predict(lgb_models), predict(xgb_models), predict(cat_models)], axis=0, weights=weights)
final_score_1 = mean_absolute_error(ensemble_predictions_1, df_valid['target'])

print(f'Final score: {final_score_1}')

Final score: 5.184976785104516


LightGBM

In [13]:
# Hold-out MAE of the LightGBM models alone (all models in lgb_models, equally weighted by predict)
lgb_predictions = predict(lgb_models)
lgb_score = mean_absolute_error(lgb_predictions, df_valid['target'])

print(f'LightGBM score: {lgb_score}')

LightGBM score: 5.180033128821017


Xgboost

In [15]:
# Hold-out MAE of the XGBoost models alone (all models in xgb_models, equally weighted by predict)
xgb_predictions = predict(xgb_models)
xgb_score = mean_absolute_error(xgb_predictions, df_valid['target'])

print(f'Xgboost score: {xgb_score}')

Xgboost score: 5.215549938892475


Catboost

In [16]:
# Hold-out MAE of the CatBoost models alone (all models in cat_models, equally weighted by predict)
cat_predictions = predict(cat_models)
cat_score = mean_absolute_error(cat_predictions, df_valid['target'])

print(f'Catboost score: {cat_score}')

Catboost score: 5.219583261324764


In [21]:
from scipy.optimize import minimize

def objective_function(weights, model_predictions=None):
    """
    Return the hold-out MAE of a weighted blend of the three model families.

    Parameters
    ----------
    weights : sequence of 3 floats
        Blend weights in the order [LightGBM, XGBoost, CatBoost]. np.average
        normalizes by their sum, so only the ratios matter.
    model_predictions : sequence of 3 array-likes, optional
        Precomputed per-family predictions in the same order. When omitted,
        they are computed by calling predict() on each model list, which is
        expensive; pass them in when calling this function repeatedly.

    Returns
    -------
    float
        Mean absolute error against df_valid['target'].
    """
    if model_predictions is None:
        model_predictions = [predict(lgb_models), predict(xgb_models), predict(cat_models)]
    ensemble_predictions = np.average(model_predictions, axis=0, weights=weights)
    score = mean_absolute_error(ensemble_predictions, df_valid['target'])
    return score


def find_weight():
    """
    Search for the blend weights that minimize hold-out MAE.

    Runs SLSQP from equal weights with each weight bounded to [0, 1], then
    rescales the result so the weights sum to 1.

    Note that the weights are tuned on df_valid, so a score computed on
    df_valid with these weights is an optimistic estimate.
    """
    # The per-family predictions do not depend on the blend weights, so compute
    # them once here instead of on every objective evaluation.
    model_predictions = [
        np.asarray(predict(models)) for models in (lgb_models, xgb_models, cat_models)
    ]

    initial_weight = np.ones(3)/3
    bounds = [(0, 1)] * 3
    result = minimize(
        objective_function,
        initial_weight,
        args=(model_predictions,),
        bounds=bounds,
        method='SLSQP',
    )
    optimized_weights = result.x
    optimized_weights /= np.sum(optimized_weights)
    return optimized_weights

# Optimized blend weights in the order [LightGBM, XGBoost, CatBoost], rescaled to sum to 1
best_weight = find_weight()

print(f'Optimized weight: ', best_weight)

Optimized weight:  [9.04947418e-01 7.97515162e-17 9.50525816e-02]


LGBM + XGB + CAT

In [22]:
# Blend the three model families with the optimized weights and score on the hold-out days.
# NOTE(review): best_weight was fitted on this same df_valid, so this MAE is
# optimistic compared with a score on data the weights have not seen.
ensemble_predictions = np.average([predict(lgb_models), predict(xgb_models), predict(cat_models)], axis=0, weights=best_weight)
final_score = mean_absolute_error(ensemble_predictions, df_valid['target'])

final_score

5.179568085464417