In [None]:
import gc  # Garbage collection for memory management
import os  # Operating system-related functions
import time  # Time-related functions
import warnings  # Handling warnings
from itertools import combinations  # For creating combinations of elements
from warnings import simplefilter  # Simplifying warning handling

# 📦 Importing machine learning libraries
import joblib  # For saving and loading models
import xgboost as xgb
import lightgbm as lgb  # LightGBM gradient boosting framework
import catboost as ctb
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
# from pandarallel import pandarallel
from sklearn.metrics import mean_absolute_error  # Metric for evaluation
from sklearn.model_selection import KFold, TimeSeriesSplit  # Cross-validation techniques

# 🤐 Silence every warning, including pandas' PerformanceWarning
# (the latter is registered explicitly on top of the blanket filter).
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", pd.errors.PerformanceWarning)
# pandarallel.initialize(nb_workers=4)

# 📊 Notebook-wide settings
# Mode flags kept for reference, currently disabled:
# is_offline = True  # Flag for online/offline mode
# is_train = False  # Flag for training mode
# is_infer = False  # Flag for inference mode
max_lookback = np.nan  # No lookback limit configured (NaN placeholder)
split_day = 435  # date_id used as the time-series split point
N_STOCKS = 200
MAX_N_NEIGHBOURS = 10
NEIGHBOUR_CORR_THRESHOLD = 0.3

In [None]:
# infer_lgb_model = joblib.load("/kaggle/input/model-1201-1/lgbm.model")

In [None]:
# 📂 Load the training CSV, 🧹 drop every row whose "target" or "wap" is
# missing, and 🔁 renumber the remaining rows from 0.
df = (
    pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
    .dropna(subset=["target", "wap"])
    .reset_index(drop=True)
)

# 📏 (rows, columns) of the cleaned frame
df_shape = df.shape

# `df_train` is an alias of `df` (same object, not a copy)
df_train = df

In [None]:
# 🧹 Function to reduce memory usage of a Pandas DataFrame
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the non-object columns of a dataframe, in place, to save memory.

    Columns whose dtype name starts with "int" are cast to the smallest of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. Every other non-object column is cast to float32. Object columns are
    left untouched.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        verbose: When truthy, log the memory usage before/after via the
            standard ``logging`` module (INFO level).

    Returns:
        The same DataFrame object, with downcast columns.
    """
    # Imported locally so the function does not depend on a module-level
    # `logger` being defined elsewhere in the notebook.
    import logging

    # 📏 Calculate the initial memory usage of the DataFrame (in MB)
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate integer widths, narrowest first
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    # 🔄 Iterate through each column in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype

        # Object columns are not numeric: skip them
        if col_type == object:
            continue

        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            # Strict comparisons are kept on purpose so the resulting dtypes
            # match the previous behaviour (a value equal to a type's limit
            # is promoted to the next wider type).
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Every remaining dtype ends up as float32 regardless of its
            # value range, so no min/max scan is needed here.
            df[col] = df[col].astype(np.float32)

    # ℹ️ Provide memory optimization information if 'verbose' is True
    if verbose:
        log = logging.getLogger(__name__)
        end_mem = df.memory_usage().sum() / 1024**2
        log.info("Memory usage of dataframe is %.2f MB", start_mem)
        log.info("Memory usage after optimization is: %.2f MB", end_mem)
        if start_mem:
            decrease = 100 * (start_mem - end_mem) / start_mem
            log.info("Decreased by %.2f%%", decrease)

    # 🔄 Return the DataFrame with optimized memory usage
    return df


In [None]:
# 🏎️ Import Numba for just-in-time (JIT) compilation and parallel processing
from numba import njit, prange

# 📊 Numba-compiled kernel: triplet imbalance for every row and column triplet
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """
    For each row and each column triplet, compute (max - mid) / (mid - min).

    df_values is indexed as [row, column]; comb_indices is a sequence of
    (a, b, c) column positions. The result has one row per input row and one
    column per triplet; entries where mid == min (which includes the case
    max == min) are NaN instead of dividing by zero.
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    out = np.empty((n_rows, n_triplets))

    # 🔁 Triplets are distributed across threads by prange
    for k in prange(n_triplets):
        a, b, c = comb_indices[k]

        # 🔁 Sequential pass over the rows for this triplet
        for r in range(n_rows):
            first = df_values[r, a]
            second = df_values[r, b]
            third = df_values[r, c]

            # Order the first two values
            if first < second:
                lo = first
                hi = second
            else:
                hi = first
                lo = second

            # Slot the third value into the ordered pair
            if lo < third:
                if third < hi:
                    mid = third
                else:
                    mid = hi
                    hi = third
            else:
                mid = lo
                lo = third

            # 🚫 Prevent division by zero
            if hi == lo or mid == lo:
                out[r, k] = np.nan
            else:
                out[r, k] = (hi - mid) / (mid - lo)

    return out

# 📈 Build the "<a>_<b>_<c>_imb2" feature frame for every 3-column combination
def calculate_triplet_imbalance_numba(price, df):
    """
    Compute triplet imbalance features for all 3-combinations of `price`.

    `price` is a list of column names present in `df`. The returned DataFrame
    has a fresh default index and one column per combination, named
    "<a>_<b>_<c>_imb2" in `itertools.combinations` order.
    """
    # Enumerate the triplets once; both the indices and the names derive from it
    triplets = list(combinations(price, 3))

    # Positions of each triplet's columns inside `price` (and thus `df_values`)
    comb_indices = [tuple(price.index(name) for name in triplet) for triplet in triplets]

    # Plain numpy array, as required by the Numba kernel
    df_values = df[price].values
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    column_names = ["{}_{}_{}_imb2".format(*triplet) for triplet in triplets]
    return pd.DataFrame(features_array, columns=column_names)


In [None]:
# 📊 Function to generate imbalance features
def imbalance_features(df: pd.DataFrame):
    """
    Add order-book / auction imbalance features to `df` and return the result.

    New columns are written onto `df` itself; the returned object is the
    output of `df.replace([inf, -inf], 0)`, i.e. a new DataFrame in which
    infinite values have been replaced by 0.

    Relies on the module-level `weights` mapping (stock_id -> weight) and on
    the sibling helper `calculate_triplet_imbalance_numba`.

    Two groupings are used:
      * (date_id, seconds_in_bucket): all stocks at the same time step;
      * (stock_id, date_id): one stock's rows within one day, in row order
        (assumes rows are already sorted by seconds_in_bucket — TODO confirm).
    """

    # Cross-sectional grouping: every row sharing a (date_id, seconds_in_bucket)
    stock_groups = df.groupby(["date_id", "seconds_in_bucket"])
    # Index WAP: sum over the cross-section of weight * wap
    df["wwap"] = df.stock_id.map(weights) * df.wap
    df["iwap"] = stock_groups["wwap"].transform(lambda x: x.sum())
    del df["wwap"]

    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1 features
    # Calculate various features using Pandas eval function
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["all_size"] = df.eval("matched_size + imbalance_size")  # add
    df["imbalance_size_for_buy_sell"] = df.eval("imbalance_size * imbalance_buy_sell_flag")  # add

    # Cross-sectional quantiles (same value for every stock at a time step)
    cols = ['wap', 'imbalance_size_for_buy_sell', "bid_size", "ask_size"]
    for q in [0.25, 0.5, 0.75]:  # Try more/different q
        df[[f'all_{col}_quantile_{q}' for col in cols]] = stock_groups[cols].transform(lambda x: x.quantile(q))

    df["1/bid_size"]= 1/df.bid_size
    df["1/sqrt(bid_size)"] = 1/np.sqrt(df.bid_size)
    
    # Create features for pairwise price imbalances
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Pairwise size ratios, named "<a>/<b>"
    for c in combinations(sizes, 2):
        df[f"{c[0]}/{c[1]}"] = df.eval(f"({c[0]})/({c[1]})")

    # Calculate triplet imbalance features using the Numba-optimized function
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        # .values: assign positionally, the helper returns a fresh default index
        df[triplet_feature.columns] = triplet_feature.values
        
    # V2 features
    # Calculate additional features
    # From here on `stock_groups` is the per-stock, per-day grouping
    stock_groups = df.groupby(['stock_id', 'date_id'])
    df["imbalance_momentum"] = stock_groups['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = stock_groups['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * df["price_spread"]
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['wap_advantage'] = df.wap - df.iwap  # add
    
    # Calculate various statistical aggregation features
    # Row-wise statistics across the price columns and across the size columns
    df_prices = df[prices]
    df_sizes = df[sizes]
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df_prices.agg(func, axis=1)
        df[f"all_sizes_{func}"] = df_sizes.agg(func, axis=1)
        
    # V3 features
    # Calculate shifted and return features for specific columns
    cols = ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag', "wap", "iwap"]
    stock_groups_cols = stock_groups[cols]
    for window in [1, 2, 3, 6, 10]:
        df[[f"{col}_shift_{window}" for col in cols]] = stock_groups_cols.shift(window)

    cols = ['matched_size', 'imbalance_size', 'reference_price', "wap", "iwap"]
    stock_groups_cols = stock_groups[cols]
    for window in [1, 2, 3, 6, 10]:
        df[[f"{col}_ret_{window}" for col in cols]] = stock_groups_cols.pct_change(window)

    # Calculate diff features for specific columns
    cols = ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'wap', 'near_price', 'far_price', 'imbalance_size_for_buy_sell']
    stock_groups_cols = stock_groups[cols]
    for window in [1, 2, 3, 6, 10]:
        df[[f"{col}_diff_{window}" for col in cols]] = stock_groups_cols.diff(window)

    # V4 features
    # flag_change is 1 where the buy/sell flag differs from the previous row of
    # the same (stock, day); the first row of each group also gets 1 because
    # its diff is NaN and NaN != 0.
    df['flag_change'] = stock_groups['imbalance_buy_sell_flag'].diff().ne(0).astype(int)
    # Use cumsum to build a run identifier that increments each time the flag changes
    df['group'] = df.groupby(['stock_id', 'date_id'])['flag_change'].cumsum()
    # Within each run, subtract the run's first 'seconds_in_bucket' to get the
    # time elapsed since the flag last changed
    group_min = df.groupby(['stock_id', 'date_id', 'group'])['seconds_in_bucket'].transform('min')
    df['time_since_last_imbalance_change'] = df['seconds_in_bucket'] - group_min
    # Force the value to 0 on rows where `flag_change` is 1
    df['time_since_last_imbalance_change'] *= (1 - df['flag_change'])
    df.drop(columns=['flag_change', 'group'], inplace=True)

    # V5 features
    # Rolling z-score of the signed imbalance size within each (stock, day)
    cols = ['imbalance_size_for_buy_sell']
    stock_groups_cols = stock_groups[cols]
    for window in [5, 10]:
        mean_col = stock_groups_cols.transform(lambda x: x.rolling(window=window).mean())
        std_col = stock_groups_cols.transform(lambda x: x.rolling(window=window).std())
        df[[f'z_score_{col}_{window}' for col in cols]] = (df[cols] - mean_col) / std_col
    
    # ------
    
    # For each pair: first difference (per stock/day) of the spread a - b
    combin = [['ask_price','wap'],['ask_size','imbalance_size'],['bid_price','wap'],['bid_size','ask_size'],['bid_size','imbalance_size'],
    ['matched_size','imbalance_size'],['matched_size','ask_size'],['matched_size','bid_size'],['reference_price','ask_price'],['reference_price','bid_price'],
    ['reference_price','imbalance_size_for_buy_sell'],['reference_price','wap'],['wap','imbalance_size_for_buy_sell'],['wap','iwap']]
    for c in combin:
        spread_col = f'{c[0]}_{c[1]}_spread'
        df[spread_col] = df[c[0]] - df[c[1]]
        df[f'{spread_col}_diff'] = stock_groups[spread_col].diff(1)
        del df[spread_col]  # Delete the temporary column to save memory

    # NOTE(review): same formula as the "bid_size/ask_size" column created above
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    df['ask_size_mean'] = df.groupby(["date_id", "seconds_in_bucket"])['ask_size'].transform(lambda x: x.mean())
    df['wap_std'] = df.groupby(["date_id", "seconds_in_bucket"])['wap'].transform(lambda x: x.std())

    df['ask_value'] = df.eval('ask_price * ask_size')
    df['bid_value'] = df.eval('bid_price * bid_size')
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')
    # Sign (+1 / -1 / 0) of the 5-row change in mid_price.
    # NOTE(review): this diff runs over the whole frame in row order, not per
    # (stock_id, date_id) group like the other diffs — confirm this is intended.
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x : 1 if x > 0 else (-1 if x < 0 else 0))
    # Temporary column: 6-step wap return minus 6-step iwap return, scaled by 1e4;
    # only its cross-sectional mean absolute value is kept.
    df["my_target"] = (df["wap_ret_6"] - df["iwap_ret_6"]) * 10000
    df["my_target_abs_mean"] = df.groupby(['date_id', 'seconds_in_bucket'])["my_target"].transform(lambda x: x.abs().mean())
    del df["my_target"]

    # NOTE(review): 'order_flow' and 'size_spread' are the same expression
    df['order_flow'] = df['bid_size'] - df['ask_size']
    df['size_spread'] = df.eval("bid_size - ask_size")
    df["reference_price_momentum"] = stock_groups['reference_price'].diff(periods=1) / df['matched_size']
    df['relative_matched_size'] = df['matched_size'] / (df['ask_size'] + df['bid_size'])
    df['spread_roc'] = stock_groups['price_spread'].pct_change()
    df['spread_weighted_by_imbalance'] = df['price_spread'] * df['imbalance_buy_sell_flag']
    # iwap normalised by the total weight of the stocks present at that time step
    df['stock_weight'] = df.stock_id.map(weights)
    df['weight_sum'] = df.groupby(["date_id", "seconds_in_bucket"])['stock_weight'].transform(lambda x: x.sum())
    df["w_iwap"] =  df["iwap"] / df['weight_sum']   # add
    del df['weight_sum'], df['stock_weight']

    df['w_iwap_advantage'] = df['wap'] - df['w_iwap'] #add

    # Rolling higher-moment statistics within each (stock, day)
    df['rolling_kurt_5_wap'] = stock_groups['wap'].transform(lambda x: x.rolling(5).agg('kurt'))
    df['rolling_skew_5_wap'] = stock_groups['wap'].transform(lambda x: x.rolling(5).agg('skew'))
    df['rolling_kurt_5_reference'] = stock_groups['reference_price'].transform(lambda x: x.rolling(5).agg('kurt'))
    df['rolling_std_30_reference'] = stock_groups['reference_price'].transform(lambda x: x.rolling(30).agg('std'))
    df['rolling_skew_5_reference'] = stock_groups['reference_price'].transform(lambda x: x.rolling(5).agg('skew'))
    df['rolling_kurt_30_reference'] = stock_groups['reference_price'].transform(lambda x: x.rolling(30).agg('kurt'))

    # MACD
    # EMA(span=12) - EMA(span=26) per (stock, day), plus a span-9 EMA of that difference
    cols = ["far_price", "near_price", "reference_price", "wap"]
    stock_groups_cols= stock_groups[cols]
    df[[f'{col}_ema_12' for col in cols]] = stock_groups_cols.transform(lambda x: x.ewm(span=12, adjust=False).mean())
    df[[f'{col}_ema_26' for col in cols]] = stock_groups_cols.transform(lambda x: x.ewm(span=26, adjust=False).mean())
    df[[f'{col}_macd' for col in cols]] = df[[f'{col}_ema_12' for col in cols]].values - df[[f'{col}_ema_26' for col in cols]].values
    df[[f"{col}_macd_signal" for col in cols]] = stock_groups[[f'{col}_macd' for col in cols]].transform(lambda x: x.ewm(span=9, adjust=False).mean())

    # Replace infinite values with 0
    return df.replace([np.inf, -np.inf], 0)

# 📅 Add time-of-bucket columns and the precomputed per-stock statistics
def other_features(df):
    """
    Attach time features and global lookup features to `df` (in place).

    Adds "seconds" / "minute" derived from `seconds_in_bucket`, one
    "global_<key>" column per entry of `global_stock_id_feats` (looked up by
    stock_id) and one "global_seconds_<key>_stock" column per entry of
    `global_seconds_feats_stock` (looked up by "<stock_id>_<seconds_in_bucket>").
    Returns the same DataFrame.
    """
    # Day-of-week feature is intentionally disabled:
    # df["dow"] = df["date_id"] % 5
    df["seconds"] = df["seconds_in_bucket"] % 60  # remainder within the minute
    df["minute"] = df["seconds_in_bucket"] // 60  # whole minutes elapsed

    # Per-stock statistics, keyed by stock_id
    for stat_name, per_stock in global_stock_id_feats.items():
        lookup = per_stock.to_dict()
        df[f"global_{stat_name}"] = df["stock_id"].map(lookup)

    # Per-(stock, time step) statistics use a temporary string key column
    stock_as_text = df['stock_id'].astype(str)
    seconds_as_text = df['seconds_in_bucket'].astype(str)
    df["stock_id&seconds_in_bucket"] = stock_as_text + '_' + seconds_as_text
    for stat_name, per_slot in global_seconds_feats_stock.items():
        lookup = per_slot.to_dict()
        df[f"global_seconds_{stat_name}_stock"] = df["stock_id&seconds_in_bucket"].map(lookup)
    del df["stock_id&seconds_in_bucket"]

    return df

def _lag_by_day_then_lead(frame, base_cols, lead_steps):
    """Add "<col>_last_1day" columns, then their "_future_<k>" leads, in place."""
    lagged_cols = [f"{name}_last_1day" for name in base_cols]

    # Previous row of the same stock at the same seconds_in_bucket
    same_slot = frame.groupby(['stock_id', 'seconds_in_bucket'])[base_cols]
    frame[lagged_cols] = same_slot.shift(1)

    # Later rows of the same stock on the same day
    same_day = frame.groupby(['stock_id', 'date_id'])[lagged_cols]
    for step in lead_steps:
        frame[[f"{name}_future_{step}" for name in lagged_cols]] = same_day.shift(-step)


def last_days_features(df: pd.DataFrame, feat_last=None, target_last=None):
    """
    Add previous-row-per-time-slot ("last day") features and their leads.

    Without `feat_last` the columns are added to `df` itself, which is
    returned. With a non-empty `feat_last`, a working frame is built as
    [feat_last rows, df rows, 6 padded copies of df's last time step at
    +10..+60 seconds]; features are computed on it and only the slice
    holding the original `df` rows is returned.

    Side effects when `target_last` is given: `feat_last["target"]` is set
    to `target_last` and `df["target"]` is set to 0.
    """
    n_current = None
    if feat_last is not None and len(feat_last) > 0:
        history_names = set(feat_last.columns)
        shared = [name for name in df.columns if name in history_names]
        if target_last is not None:
            shared.append("target")
            feat_last["target"] = target_last
            df["target"] = 0

        # Rows of the latest time step are cloned into six future time steps
        # so that the negative shifts below have rows to land on.
        last_second = df.seconds_in_bucket.max()
        tail_rows = df[df.seconds_in_bucket == last_second]
        n_current = len(df)
        n_padding = len(tail_rows) * 6
        future_pads = []
        for second in range(last_second + 10, last_second + 70, 10):
            pad = tail_rows.copy()
            pad["seconds_in_bucket"] = second
            future_pads.append(pad)
        df = pd.concat([feat_last[shared], df, *future_pads])

    # Add Last days features
    _lag_by_day_then_lead(df, ['near_price', 'far_price', 'depth_pressure'], [1, 2, 3, 6])

    if 'target' in df.columns:
        _lag_by_day_then_lead(df, ['target'], [1, 2])

    if not n_current:
        return df
    # Keep only the rows that came from the original `df`
    return df[-(n_current + n_padding):-n_padding]

# 🚀 Run the full feature pipeline: imbalance -> last-day -> time/stock features
def generate_all_features(df, feat_last=None, target_last=None):
    """
    Build the complete feature frame from raw rows.

    "row_id" and "time_id" are dropped from the input, the three feature
    stages are applied in order, and every resulting column except
    "row_id", "target" and "time_id" is returned.
    """
    # Identifier columns never take part in feature generation
    raw_inputs = df[[name for name in df.columns if name not in {"row_id", "time_id"}]]

    enriched = imbalance_features(raw_inputs)
    enriched = last_days_features(enriched, feat_last, target_last)
    enriched = other_features(enriched)

    gc.collect()  # Release intermediate frames before handing back the result

    non_features = {"row_id", "target", "time_id"}
    return enriched[[name for name in enriched.columns if name not in non_features]]

In [None]:
# Per-stock weights, listed by position (200 values). After the conversion
# below, `weights` maps the integer position -> weight; imbalance_features
# looks it up by stock_id to build the weighted index WAP ("iwap").
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

# Re-key the list as {position: weight} so it can be used with Series.map
weights = {int(k):v for k,v in enumerate(weights)}

# Temporary helper columns on df_train (removed again at the end of the cell).
# NOTE(review): "depth_pressure" here is far_price - near_price only, which is
# not the formula used for the column of the same name in imbalance_features.
df_train["imbalance_size_for_buy_sell"] = df_train.eval("imbalance_size * imbalance_buy_sell_flag")  # add
df_train["depth_pressure"] = df_train["far_price"] - df_train["near_price"]

# Per-stock statistics over the whole training frame, one Series per key
_by_stock = df_train.groupby("stock_id")
_bid_size, _ask_size = _by_stock["bid_size"], _by_stock["ask_size"]
_bid_price, _ask_price = _by_stock["bid_price"], _by_stock["ask_price"]
global_stock_id_feats = {
    "median_size": _bid_size.median() + _ask_size.median(),
    "std_size": _bid_size.std() + _ask_size.std(),
    "ptp_size": _bid_size.max() - _bid_size.min(),

    "median_price": _bid_price.median() + _ask_price.median(),
    "std_price": _bid_price.std() + _ask_price.std(),
    "ptp_price": _bid_price.max() - _ask_price.min(),
    "median_far_price": _by_stock["far_price"].median(),
    "median_near_price": _by_stock["near_price"].median(),
    "median_imbalance_size_for_buy_sell": _by_stock["imbalance_size_for_buy_sell"].median(),
    "median_matched_size": _by_stock["matched_size"].median(),
    "median_depth_pressure": _by_stock["depth_pressure"].median(),
}
del _by_stock, _bid_size, _ask_size, _bid_price, _ask_price

# Per-(stock, time step) medians, keyed by the string "<stock_id>_<seconds_in_bucket>"
df_train["stock_id&seconds_in_bucket"] = df_train['stock_id'].astype(str) + '_' + df_train['seconds_in_bucket'].astype(str)
_by_slot = df_train.groupby("stock_id&seconds_in_bucket")
global_seconds_feats_stock = {
    "median_imbalance_size_for_buy_sell": _by_slot["imbalance_size_for_buy_sell"].median(),
    "median_matched_size": _by_slot["matched_size"].median(),
}
del _by_slot

# Drop the helper columns so df_train is left as it was loaded
for _helper_col in ("stock_id&seconds_in_bucket", "imbalance_size_for_buy_sell", "depth_pressure"):
    del df_train[_helper_col]
del _helper_col

In [None]:
def golden_section_search(f, a, b, epsilon):
    """
    Locate a minimiser of `f` on the interval between `a` and `b`.

    The bracket is shrunk by the golden ratio each iteration until its width
    is at most `epsilon`; the midpoint of the final bracket is returned.
    Each iteration reuses one interior point (and its function value) from
    the previous iteration, so `f` is evaluated once per iteration rather
    than twice.

    Args:
        f: Callable of one float returning a comparable value.
        a, b: Interval end points.
        epsilon: Target bracket width; must be strictly positive.

    Returns:
        Midpoint of the final bracket.

    Raises:
        ValueError: If `epsilon` is not strictly positive (the loop could
            otherwise never terminate).
    """
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")

    phi = (1 + 5 ** 0.5) / 2  # golden ratio

    # Bracket already tight enough: nothing to evaluate
    if abs(b - a) <= epsilon:
        return (b + a) / 2

    c = b - (b - a) / phi
    d = a + (b - a) / phi
    fc = f(c)
    fd = f(d)

    while abs(b - a) > epsilon:
        if fc < fd:
            # Minimum lies in [a, d]; the old c becomes the new d
            b, d, fd = d, c, fc
            c = b - (b - a) / phi
            fc = f(c)
        else:
            # Minimum lies in [c, b]; the old d becomes the new c
            a, c, fc = c, d, fd
            d = a + (b - a) / phi
            fd = f(d)

    return (b + a) / 2

In [None]:
# Ordered list of feature column names; declared as a global inside train_lgb.
# Presumably the input columns of the LightGBM model — TODO confirm against
# the training/inference cells. The names match columns produced by
# imbalance_features, last_days_features and other_features.
feature_name_lgb = \
['stock_id',
 'seconds_in_bucket',
 'imbalance_size',
 'imbalance_buy_sell_flag',
 'reference_price',
 'matched_size',
 'far_price',
 'near_price',
 'bid_price',
 'bid_size',
 'ask_price',
 'ask_size',
 'wap',
 'iwap',
 'volume',
 'mid_price',
 'liquidity_imbalance',
 'matched_imbalance',
 'all_size',
 'imbalance_size_for_buy_sell',
 'all_wap_quantile_0.25',
 'all_imbalance_size_for_buy_sell_quantile_0.25',
 'all_bid_size_quantile_0.25',
 'all_ask_size_quantile_0.25',
 'all_wap_quantile_0.5',
 'all_imbalance_size_for_buy_sell_quantile_0.5',
 'all_bid_size_quantile_0.5',
 'all_ask_size_quantile_0.5',
 'all_wap_quantile_0.75',
 'all_imbalance_size_for_buy_sell_quantile_0.75',
 'all_bid_size_quantile_0.75',
 'all_ask_size_quantile_0.75',
 'reference_price_far_price_imb',
 'reference_price_near_price_imb',
 'reference_price_ask_price_imb',
 'reference_price_bid_price_imb',
 'reference_price_wap_imb',
 'far_price_near_price_imb',
 'far_price_ask_price_imb',
 'far_price_bid_price_imb',
 'far_price_wap_imb',
 'near_price_ask_price_imb',
 'near_price_bid_price_imb',
 'near_price_wap_imb',
 'ask_price_bid_price_imb',
 'ask_price_wap_imb',
 'bid_price_wap_imb',
 'matched_size/bid_size',
 'matched_size/ask_size',
 'matched_size/imbalance_size',
 'bid_size/ask_size',
 'bid_size/imbalance_size',
 'ask_size/imbalance_size',
 'ask_price_bid_price_wap_imb2',
 'ask_price_bid_price_reference_price_imb2',
 'ask_price_wap_reference_price_imb2',
 'bid_price_wap_reference_price_imb2',
 'matched_size_bid_size_ask_size_imb2',
 'matched_size_bid_size_imbalance_size_imb2',
 'matched_size_ask_size_imbalance_size_imb2',
 'bid_size_ask_size_imbalance_size_imb2',
 'imbalance_momentum',
 'price_spread',
 'spread_intensity',
 'price_pressure',
 'market_urgency',
 'depth_pressure',
 'wap_advantage',
 'all_prices_mean',
 'all_sizes_mean',
 'all_prices_std',
 'all_sizes_std',
 'all_prices_skew',
 'all_sizes_skew',
 'all_prices_kurt',
 'all_sizes_kurt',
 'matched_size_shift_1',
 'imbalance_size_shift_1',
 'reference_price_shift_1',
 'imbalance_buy_sell_flag_shift_1',
 'wap_shift_1',
 'iwap_shift_1',
 'matched_size_shift_2',
 'imbalance_size_shift_2',
 'reference_price_shift_2',
 'imbalance_buy_sell_flag_shift_2',
 'wap_shift_2',
 'iwap_shift_2',
 'matched_size_shift_3',
 'imbalance_size_shift_3',
 'reference_price_shift_3',
 'imbalance_buy_sell_flag_shift_3',
 'wap_shift_3',
 'iwap_shift_3',
 'matched_size_shift_6',
 'imbalance_size_shift_6',
 'reference_price_shift_6',
 'imbalance_buy_sell_flag_shift_6',
 'wap_shift_6',
 'iwap_shift_6',
 'matched_size_shift_10',
 'imbalance_size_shift_10',
 'reference_price_shift_10',
 'imbalance_buy_sell_flag_shift_10',
 'wap_shift_10',
 'iwap_shift_10',
 'matched_size_ret_1',
 'imbalance_size_ret_1',
 'reference_price_ret_1',
 'wap_ret_1',
 'iwap_ret_1',
 'matched_size_ret_2',
 'imbalance_size_ret_2',
 'reference_price_ret_2',
 'wap_ret_2',
 'iwap_ret_2',
 'matched_size_ret_3',
 'imbalance_size_ret_3',
 'reference_price_ret_3',
 'wap_ret_3',
 'iwap_ret_3',
 'matched_size_ret_6',
 'imbalance_size_ret_6',
 'reference_price_ret_6',
 'wap_ret_6',
 'iwap_ret_6',
 'matched_size_ret_10',
 'imbalance_size_ret_10',
 'reference_price_ret_10',
 'wap_ret_10',
 'iwap_ret_10',
 'ask_price_diff_1',
 'bid_price_diff_1',
 'ask_size_diff_1',
 'bid_size_diff_1',
 'wap_diff_1',
 'near_price_diff_1',
 'far_price_diff_1',
 'imbalance_size_for_buy_sell_diff_1',
 'ask_price_diff_2',
 'bid_price_diff_2',
 'ask_size_diff_2',
 'bid_size_diff_2',
 'wap_diff_2',
 'near_price_diff_2',
 'far_price_diff_2',
 'imbalance_size_for_buy_sell_diff_2',
 'ask_price_diff_3',
 'bid_price_diff_3',
 'ask_size_diff_3',
 'bid_size_diff_3',
 'wap_diff_3',
 'near_price_diff_3',
 'far_price_diff_3',
 'imbalance_size_for_buy_sell_diff_3',
 'ask_price_diff_6',
 'bid_price_diff_6',
 'ask_size_diff_6',
 'bid_size_diff_6',
 'wap_diff_6',
 'near_price_diff_6',
 'far_price_diff_6',
 'imbalance_size_for_buy_sell_diff_6',
 'ask_price_diff_10',
 'bid_price_diff_10',
 'ask_size_diff_10',
 'bid_size_diff_10',
 'wap_diff_10',
 'near_price_diff_10',
 'far_price_diff_10',
 'imbalance_size_for_buy_sell_diff_10',
 'time_since_last_imbalance_change',
 'z_score_imbalance_size_for_buy_sell_5',
 'z_score_imbalance_size_for_buy_sell_10',
 'near_price_last_1day',
 'far_price_last_1day',
 'depth_pressure_last_1day',
 'target_last_1day',
 'near_price_last_1day_future_1',
 'far_price_last_1day_future_1',
 'depth_pressure_last_1day_future_1',
 'near_price_last_1day_future_2',
 'far_price_last_1day_future_2',
 'depth_pressure_last_1day_future_2',
 'near_price_last_1day_future_3',
 'far_price_last_1day_future_3',
 'depth_pressure_last_1day_future_3',
 'near_price_last_1day_future_6',
 'far_price_last_1day_future_6',
 'depth_pressure_last_1day_future_6',
 'seconds',
 'minute',
 'global_median_size',
 'global_std_size',
 'global_ptp_size',
 'global_median_price',
 'global_std_price',
 'global_ptp_price',
 'global_median_far_price',
 'global_median_near_price',
 'global_median_imbalance_size_for_buy_sell',
 'global_median_matched_size']

In [None]:
# Ordered list of feature column names, a different selection from
# feature_name_lgb. Presumably the input columns of the CatBoost model
# (catboost is imported as `ctb`) — TODO confirm against the training cells.
feature_name_ctb = ['seconds_in_bucket',
 'imbalance_size',
 'matched_size',
 'bid_size',
 'ask_price',
 'ask_size',
 'wap',
 'volume',
 'liquidity_imbalance',
 'matched_imbalance',
 'all_size',
 'imbalance_size_for_buy_sell',
 'imbalance_momentum',
 'price_pressure',
 'market_urgency',
 'depth_pressure',
 'size_imbalance',
 '1/bid_size',
 '1/sqrt(bid_size)',
 'rolling_skew_5_wap',
 'rolling_skew_5_reference',
 'rolling_kurt_5_wap',
 'rolling_kurt_5_reference',
 'rolling_std_30_reference',
 'rolling_kurt_30_reference',
 'size_spread',
 'bid_value',
 'ask_value',
 'mid_price_movement',
 'order_flow',
 'reference_price_momentum',
 'spread_roc',
 'relative_matched_size',
 'spread_weighted_by_imbalance',
 'w_iwap_advantage',
 'reference_price_bid_price_spread_diff',
 'reference_price_ask_price_spread_diff',
 'reference_price_wap_spread_diff',
 'reference_price_imbalance_size_for_buy_sell_spread_diff',
 'bid_price_wap_spread_diff',
 'ask_price_wap_spread_diff',
 'wap_iwap_spread_diff',
 'wap_imbalance_size_for_buy_sell_spread_diff',
 'matched_size_bid_size_spread_diff',
 'matched_size_ask_size_spread_diff',
 'matched_size_imbalance_size_spread_diff',
 'bid_size_ask_size_spread_diff',
 'bid_size_imbalance_size_spread_diff',
 'ask_size_imbalance_size_spread_diff',
 'all_ask_size_quantile_0.5',
 'all_bid_size_quantile_0.75',
 'all_ask_size_quantile_0.75',
 'ask_size_mean',
 'wap_std',
 'reference_price_near_price_imb',
 'reference_price_ask_price_imb',
 'reference_price_bid_price_imb',
 'reference_price_wap_imb',
 'bid_price_wap_imb',
 'matched_size/bid_size',
 'matched_size/ask_size',
 'matched_size/imbalance_size',
 'bid_size/ask_size',
 'ask_price_bid_price_wap_imb2',
 'ask_price_bid_price_reference_price_imb2',
 'matched_size_bid_size_ask_size_imb2',
 'matched_size_bid_size_imbalance_size_imb2',
 'matched_size_ask_size_imbalance_size_imb2',
 'bid_size_ask_size_imbalance_size_imb2',
 'all_sizes_skew',
 'all_sizes_kurt',
 'matched_size_shift_1',
 'imbalance_size_shift_1',
 'reference_price_shift_1',
 'imbalance_buy_sell_flag_shift_1',
 'matched_size_shift_2',
 'imbalance_size_shift_2',
 'imbalance_buy_sell_flag_shift_2',
 'wap_shift_2',
 'matched_size_shift_3',
 'reference_price_shift_3',
 'imbalance_buy_sell_flag_shift_3',
 'wap_shift_3',
 'matched_size_shift_6',
 'imbalance_buy_sell_flag_shift_6',
 'wap_shift_6',
 'imbalance_buy_sell_flag_shift_10',
 'matched_size_ret_1',
 'imbalance_size_ret_1',
 'iwap_ret_1',
 'imbalance_size_ret_2',
 'reference_price_ret_2',
 'wap_ret_2',
 'iwap_ret_2',
 'matched_size_ret_3',
 'imbalance_size_ret_3',
 'reference_price_ret_3',
 'iwap_ret_3',
 'imbalance_size_ret_6',
 'reference_price_ret_6',
 'imbalance_size_ret_10',
 'ask_price_diff_1',
 'bid_price_diff_1',
 'ask_size_diff_1',
 'bid_size_diff_1',
 'near_price_diff_1',
 'imbalance_size_for_buy_sell_diff_1',
 'ask_size_diff_2',
 'bid_size_diff_2',
 'wap_diff_2',
 'near_price_diff_2',
 'imbalance_size_for_buy_sell_diff_2',
 'ask_price_diff_3',
 'bid_size_diff_3',
 'wap_diff_3',
 'near_price_diff_3',
 'imbalance_size_for_buy_sell_diff_3',
 'ask_size_diff_6',
 'bid_size_diff_6',
 'wap_diff_6',
 'near_price_diff_6',
 'imbalance_size_for_buy_sell_diff_6',
 'ask_size_diff_10',
 'imbalance_size_for_buy_sell_diff_10',
 'z_score_imbalance_size_for_buy_sell_5',
 'z_score_imbalance_size_for_buy_sell_10',
 'harmonic_imbalance',
 'my_target_abs_mean',
 'target_last_1day',
 'near_price_last_1day_future_6',
 'target_last_1day_future_1',
 'target_last_1day_future_2',
 'seconds',
 'minute',
 'global_median_size',
 'global_median_far_price',
 'global_median_imbalance_size_for_buy_sell',
 'global_median_matched_size',
 'global_median_depth_pressure',
 'global_seconds_median_imbalance_size_for_buy_sell_stock',
 'global_seconds_median_matched_size_stock',
 ]

In [None]:
def train_lgb(df_train_feats, targets, split_day):
    """Train LightGBM: a validation fit with early stopping, then a full refit.

    The function works in three stages:

    1. Recompute the per-stock and per-(stock, second) aggregates from
       ``df_train_feats`` and write them back as ``global_*`` columns.
    2. Fit a validation model on rows with ``date_id <= split_day - 45`` and
       early-stop it on the remaining (most recent) rows.
    3. Refit on all rows with ``1.2 * best_iteration`` trees.

    Side effects:
        * Rebinds the module-level ``global_stock_id_feats`` and
          ``global_seconds_feats_stock`` dictionaries.
        * Adds/overwrites the ``global_*`` columns of ``df_train_feats``
          in place.

    Args:
        df_train_feats: Feature frame. Must contain ``stock_id``, ``date_id``,
            ``seconds_in_bucket``, the raw columns aggregated below and every
            column named in the module-level ``feature_name_lgb``.
        targets: pandas Series of targets sharing the index of
            ``df_train_feats`` (it is filtered with a boolean mask built
            from the frame).
        split_day: Reference day; rows with ``date_id > split_day - 45`` form
            the validation set.

    Returns:
        tuple: ``(infer_lgb_model, pre_lgb)`` where ``infer_lgb_model`` is the
        model refit on all rows and ``pre_lgb`` is a NumPy array of validation
        predictions, re-centred per ``(date_id, seconds_in_bucket)`` group to
        the overall target mean.
    """
    global global_stock_id_feats, global_seconds_feats_stock

    target_mean = targets.values.mean()

    # Group once and reuse the column selections instead of rebuilding the
    # same groupby for every aggregate.
    by_stock = df_train_feats.groupby("stock_id")
    bid_size, ask_size = by_stock["bid_size"], by_stock["ask_size"]
    bid_price, ask_price = by_stock["bid_price"], by_stock["ask_price"]
    global_stock_id_feats = {
        "median_size": bid_size.median() + ask_size.median(),
        "std_size": bid_size.std() + ask_size.std(),
        # Range of bid_size only (ask_size is not part of this feature).
        "ptp_size": bid_size.max() - bid_size.min(),

        "median_price": bid_price.median() + ask_price.median(),
        "std_price": bid_price.std() + ask_price.std(),
        # Max bid minus min ask, kept exactly as the feature was defined.
        "ptp_price": bid_price.max() - ask_price.min(),
        "median_far_price": by_stock["far_price"].median(),
        "median_near_price": by_stock["near_price"].median(),
        "median_imbalance_size_for_buy_sell": by_stock["imbalance_size_for_buy_sell"].median(),
        "median_matched_size": by_stock["matched_size"].median(),
        "median_depth_pressure": by_stock["depth_pressure"].median(),
    }
    # Release the groupby objects before the memory-heavy training steps.
    del by_stock, bid_size, ask_size, bid_price, ask_price

    # Temporary "<stock_id>_<seconds_in_bucket>" string key; it becomes the
    # index of the per-(stock, second) aggregates stored in the global dict.
    key_col = "stock_id&seconds_in_bucket"
    df_train_feats[key_col] = (
        df_train_feats['stock_id'].astype(str) + '_' + df_train_feats['seconds_in_bucket'].astype(str)
    )
    by_stock_second = df_train_feats.groupby(key_col)
    global_seconds_feats_stock = {
        "median_imbalance_size_for_buy_sell": by_stock_second["imbalance_size_for_buy_sell"].median(),
        "median_matched_size": by_stock_second["matched_size"].median(),
    }
    del by_stock_second

    # Write the refreshed aggregates back onto the training frame.
    for key, value in global_stock_id_feats.items():
        df_train_feats[f"global_{key}"] = df_train_feats["stock_id"].map(value.to_dict())

    for key, value in global_seconds_feats_stock.items():
        df_train_feats[f"global_seconds_{key}_stock"] = df_train_feats[key_col].map(value.to_dict())
    del df_train_feats[key_col]

    # The most recent 45 days (relative to split_day) are held out.
    offline_split = df_train_feats['date_id'] > (split_day - 45)

    df_offline_train = df_train_feats[~offline_split]
    df_offline_valid = df_train_feats[offline_split]
    df_offline_train_target = targets[~offline_split]
    df_offline_valid_target = targets[offline_split]

    # Explicit copy: columns are added to `pre` below, and assigning into a
    # slice of another frame is the chained-assignment pattern pandas warns
    # about.
    pre = df_offline_valid[["date_id", "seconds_in_bucket"]].copy()

    lgb_params = {
        "objective": "mae",
        "n_estimators": 5000,
        "num_leaves": 465,
        "subsample": 0.65791,
        "colsample_bytree": 0.7,
        "learning_rate": 0.00877,
        "n_jobs": 4,
        "device": "gpu",
        "verbosity": -1,
        "importance_type": "gain",
        "max_depth": 14,  # Maximum depth of the tree
        "min_child_samples": 132,  # Minimum number of data points in a leaf
        "reg_alpha": 6,  # L1 regularization term
        "reg_lambda": 0.08,  # L2 regularization term
    }

    print(f"Feature length = {len(feature_name_lgb)}")
    print("Valid Model Training.")

    # Validation fit: early stopping on the held-out days picks the tree count.
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        df_offline_train[feature_name_lgb].values.astype(np.float32),
        df_offline_train_target.values.astype(np.float32),
        eval_set=[(
            df_offline_valid[feature_name_lgb].values.astype(np.float32),
            df_offline_valid_target.values.astype(np.float32),
        )],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=200),
            lgb.callback.log_evaluation(period=100),
        ],
        feature_name=feature_name_lgb,
    )

    best_iteration_ = lgb_model.best_iteration_

    # Validation predictions, shifted so every (date_id, seconds_in_bucket)
    # group has mean equal to the overall target mean.
    pre["target_pre"] = lgb_model.predict(df_offline_valid[feature_name_lgb])
    pre["target_pre_pro"] = pre.groupby(["date_id", "seconds_in_bucket"])["target_pre"].transform(
        lambda x: x - x.mean() + target_mean
    )
    pre_lgb = pre["target_pre_pro"].values

    # Free up memory before the full refit.
    del df_offline_train, df_offline_valid, df_offline_train_target, df_offline_valid_target, offline_split, lgb_model, pre
    gc.collect()

    print("Infer Model Training.")

    # Refit on all rows with 20% more trees than the early-stopped model and
    # no eval set.
    infer_params = lgb_params.copy()
    infer_params["n_estimators"] = int(1.2 * best_iteration_)
    infer_lgb_model = lgb.LGBMRegressor(**infer_params)
    infer_lgb_model.fit(
        df_train_feats[feature_name_lgb].values.astype(np.float32),
        targets.values.astype(np.float32),
        feature_name=feature_name_lgb,
    )
    print("LightGBM training completed.")
    return infer_lgb_model, pre_lgb

In [None]:
def train_ctb(df_train_feats, targets, split_day):
    """Train CatBoost: a validation fit with overfitting detection, then a full refit.

    A validation model is fit on rows with ``date_id <= split_day - 45`` and
    evaluated on the remaining (most recent) rows; the final model is then
    refit on all rows with ``1.2 * best_iteration`` iterations.

    Args:
        df_train_feats: Feature frame. Must contain ``date_id``,
            ``seconds_in_bucket`` and every column named in the module-level
            ``feature_name_ctb``.
        targets: pandas Series of targets sharing the index of
            ``df_train_feats`` (it is filtered with a boolean mask built
            from the frame).
        split_day: Reference day; rows with ``date_id > split_day - 45`` form
            the validation set.

    Returns:
        tuple: ``(infer_ctb_model, pre_ctb, target_mean, valid_target)`` where
        ``infer_ctb_model`` is the model refit on all rows, ``pre_ctb`` is a
        NumPy array of validation predictions re-centred per
        ``(date_id, seconds_in_bucket)`` group to ``target_mean``,
        ``target_mean`` is the mean of ``targets`` and ``valid_target`` is the
        Series of true validation targets.
    """
    target_mean = targets.values.mean()

    # The most recent 45 days (relative to split_day) are held out.
    offline_split = df_train_feats['date_id'] > (split_day - 45)

    df_offline_train = df_train_feats[~offline_split]
    df_offline_valid = df_train_feats[offline_split]
    df_offline_train_target = targets[~offline_split]
    df_offline_valid_target = targets[offline_split]

    # Explicit copy: columns are added to `pre` below, and assigning into a
    # slice of another frame is the chained-assignment pattern pandas warns
    # about.
    pre = df_offline_valid[["date_id", "seconds_in_bucket"]].copy()

    ctb_params = dict(
        iterations=2000,
        learning_rate=1.0,
        depth=9,
        l2_leaf_reg=30,
        bootstrap_type='Bernoulli',
        subsample=0.66,
        loss_function='MAE',
        eval_metric='MAE',
        metric_period=100,
        od_type='Iter',
        od_wait=200,
        task_type='GPU',
        allow_writing_files=False,
        random_strength=4.428571428571429,
    )

    print(f"Feature length = {len(feature_name_ctb)}")
    print("Valid Model Training.")

    # Validation fit of the CatBoost model; `use_best_model` keeps the
    # iteration that scored best on the held-out days.
    ctb_model = ctb.CatBoostRegressor(**ctb_params)
    ctb_model.fit(
        df_offline_train[feature_name_ctb], df_offline_train_target,
        eval_set=[(df_offline_valid[feature_name_ctb], df_offline_valid_target)],
        use_best_model=True,
    )

    best_iteration_ = ctb_model.best_iteration_

    # Validation predictions, shifted so every (date_id, seconds_in_bucket)
    # group has mean equal to the overall target mean.
    pre["target_pre"] = ctb_model.predict(df_offline_valid[feature_name_ctb])
    pre["target_pre_pro"] = pre.groupby(["date_id", "seconds_in_bucket"])["target_pre"].transform(
        lambda x: x - x.mean() + target_mean
    )
    pre_ctb = pre["target_pre_pro"].values

    # Free up memory before the full refit. The validation target is kept
    # because it is returned to the caller.
    del df_offline_train, df_offline_valid, df_offline_train_target, offline_split, ctb_model, pre
    gc.collect()

    print("Infer Model Training.")

    # Refit on all rows with 20% more iterations than the best validation
    # iteration. The floor of 1 guards against a best iteration of 0, which
    # would otherwise request a model with zero iterations.
    infer_params = ctb_params.copy()
    infer_params["iterations"] = max(1, int(1.2 * best_iteration_))
    infer_ctb_model = ctb.CatBoostRegressor(**infer_params)
    infer_ctb_model.fit(df_train_feats[feature_name_ctb], targets)
    print("CatBoost training completed.")

    return infer_ctb_model, pre_ctb, target_mean, df_offline_valid_target

In [None]:
def get_model_weight(pre_lgb, pre_ctb, target_true):
    """Search [0, 1] for the LightGBM blend weight using the blend's MAE.

    The blend is ``pre_lgb * w + pre_ctb * (1 - w)``; its mean absolute error
    against ``target_true`` is the objective handed to
    ``golden_section_search`` with bounds 0 and 1 and tolerance 1e-6.

    Args:
        pre_lgb: LightGBM validation predictions.
        pre_ctb: CatBoost validation predictions.
        target_true: True validation targets.

    Returns:
        Whatever ``golden_section_search`` returns for this objective.
    """
    def blend_mae(weight):
        blended = pre_lgb * weight + pre_ctb * (1 - weight)
        return mean_absolute_error(blended, target_true)

    lower, upper, tolerance = 0, 1, 1e-6
    return golden_section_search(blend_mae, lower, upper, tolerance)

In [None]:
def train_model(df_train_feats, targets, split_day):
    """Train both models and compute their blend weight.

    Rows whose target or ``wap`` is missing are dropped from
    ``df_train_feats`` in place before training.

    Args:
        df_train_feats: Feature frame (modified in place).
        targets: Targets, assigned onto the frame as a temporary column.
        split_day: Reference day forwarded to ``train_lgb`` / ``train_ctb``.

    Returns:
        tuple: ``(infer_lgb_model, infer_ctb_model, alpha, target_mean)``,
        where ``alpha`` is the LightGBM weight from ``get_model_weight``.
    """
    # Attach the target temporarily so one dropna filters features and
    # targets together, then detach it again.
    df_train_feats['target'] = targets
    df_train_feats.dropna(subset=["target", "wap"], inplace=True)
    targets = df_train_feats.pop('target')
    gc.collect()

    lgb_result = train_lgb(df_train_feats, targets, split_day)
    infer_lgb_model, pre_lgb = lgb_result
    gc.collect()

    ctb_result = train_ctb(df_train_feats, targets, split_day)
    infer_ctb_model, pre_ctb, target_mean, target_true = ctb_result
    gc.collect()

    alpha = get_model_weight(pre_lgb, pre_ctb, target_true)
    return infer_lgb_model, infer_ctb_model, alpha, target_mean

In [None]:
# Online inference cell: iterates the competition API, maintains a per-day
# cache of raw rows, optionally retrains on configured days and submits a
# blended LightGBM/CatBoost prediction for every scored batch.
import optiver2023, pickle, time
env = optiver2023.make_env()
iter_test = env.iter_test()
counter = 0  # not referenced again in this cell
y_min, y_max = -64, 64  # clipping bounds applied to the final predictions
qps, predictions = [], []  # not referenced again in this cell
cache = pd.DataFrame()  # raw test rows accumulated for the current day
# train_cache = pd.DataFrame()
print("Start initialization...")

# Process online
# targets = df_train["target"].astype(np.float32)
# data = reduce_mem_usage(generate_all_features(df_train))

# Load processed data
# NOTE(review): pickle.load can execute arbitrary code; these files are
# assumed to be trusted artefacts produced by this project.
with open("/kaggle/input/optiver-final-dateset/df_feats.pkl", "rb") as file:
    data = pickle.load(file)
with open("/kaggle/input/optiver-training-data-features/df_target.pkl", "rb") as file:
    targets = pickle.load(file)
print("Loaded temporary data.")
del df_train

## Stress test
# split = int(len(data) * 0.375)  # 660 / 480 - 1 = 0.375
# data_dummy = data[:split].copy()
# data_dummy["date_id"] += 481
# data = pd.concat([data, data_dummy], ignore_index=True, axis=0)
# targets = pd.concat([targets, targets[:split]], ignore_index=True, axis=0)
# del data_dummy
##

gc.collect()
print("Initialization done.")
start_date_id = 481  # 481
# Batches from the day before `start_date_id` are already cached (see the
# `date_id < start_pro_date_id` check in the loop).
start_pro_date_id = start_date_id - 1
# train_date_id = set([start_date_id, start_date_id + 20, start_date_id + 40, start_date_id + 60, start_date_id + 80, start_date_id + 100])  # set([start_date_id + 20])
# Days on which the models are retrained online.
train_date_id = set([start_date_id + 37])  # set([start_date_id + 20])
# feature_name = list(data.columns)
# feature_name.remove("date_id")
# feature_name_ctb = [feat for feat in feature_name if feat not in eliminated_features_names]
feat_last = None  # features of the previous day
feat = None  # features of the current day (None until first computed)
target_last = None  # revealed targets aligned to `feat_last`
target_mean = targets.values.mean()
# Pre-trained models used until (and unless) an online retrain replaces them.
infer_lgb_model = joblib.load("/kaggle/input/optiver-480-model/lgbm.model")  #None
infer_ctb_model = joblib.load("/kaggle/input/optiver-480-model/ctb.model")   #None
alpha = 0.52  # LightGBM weight in the blend; CatBoost gets (1 - alpha)
test_float_columns = ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap',]
test_int_columns = ['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag']
revealed_int_columns = ['stock_id', 'seconds_in_bucket']
last_day_id = 477  # date_id of the previously processed batch
train_cnt = 0  # number of online retrains performed so far
print("Start prediction...")
for (test, revealed_targets, sample_prediction) in iter_test:
    
    # Normalise dtypes of the incoming batch.
    test[test_float_columns] = test[test_float_columns].astype(float)
    test[test_int_columns] = test[test_int_columns].astype(int)
    
    currently_scored = test.iloc[0]["currently_scored"]
    date_id = test["date_id"].max()

    # currently_scored = date_id >= start_date_id
#     seconds_in_bucket = test.iloc[0]["seconds_in_bucket"]
#     if seconds_in_bucket == 0:
#         print(date_id)
        
    # Days before `start_pro_date_id` are neither cached nor predicted:
    # submit zeros and move on.
    if date_id < start_pro_date_id:
        last_day_id = date_id
        sample_prediction['target'] = 0
        env.predict(sample_prediction)
        continue
    del test['currently_scored']
    
    # Update cache and data for training
#     if date_id >= start_date_id and len(revealed_targets) > 1:
    # First batch of a new day (the date changed since the previous batch).
    if date_id >= start_date_id and last_day_id != date_id:
        # Update `feat_last` and `target_last`
        # If no features were computed for the previous day (no scored batch
        # reached the feature step), build them now from the cached rows.
        if feat is None:
            feat = generate_all_features(cache, feat_last, target_last)
        feat_last = feat
        feat = None
#         target_last = revealed_targets["revealed_target"].values.astype(np.float32) 
        # drop duplicates
        feat_last.drop_duplicates(['stock_id', 'seconds_in_bucket'], inplace=True)
        revealed_targets.drop_duplicates(['stock_id', 'seconds_in_bucket'], inplace=True)
        revealed_targets[revealed_int_columns] = revealed_targets[revealed_int_columns].astype(int)

        # Left-merge so `target_last` has one entry per row of `feat_last`,
        # in the same order; rows without a revealed target become NaN.
        target_last = feat_last.merge(revealed_targets[['stock_id', 'seconds_in_bucket', 'revealed_target']], on=['stock_id', 'seconds_in_bucket'], how='left')['revealed_target'].values.astype(np.float32) 
        # Start a fresh cache for the new day.
        cache = pd.DataFrame()
        # If lastday's date_id is greater than start_date_id, update `data` and `targets` 
        if train_cnt < len(train_date_id) and date_id - 1 >= start_date_id:
            data = pd.concat([data, reduce_mem_usage(feat_last)], ignore_index=True, axis=0)
            targets = pd.concat([targets, pd.Series(target_last)], ignore_index=True, axis=0)
        # Online train
        if date_id in train_date_id:
            # Drop the old models first so their memory can be reclaimed
            # before training allocates new ones.
            del infer_lgb_model, infer_ctb_model
            infer_lgb_model, infer_ctb_model, alpha, target_mean = train_model(data, targets, date_id - 1)
            train_cnt += 1
            # Clear `data` and `targets`, since no need to train again
            if train_cnt >= len(train_date_id):
                del data, targets
    
    last_day_id = date_id
    # Generate features
    cache = pd.concat([cache, test], ignore_index=True, axis=0)
    
    # Unscored batches are still cached above but get a zero prediction.
    if not currently_scored:
        sample_prediction['target'] = 0
        env.predict(sample_prediction)
        continue
    
    feat = generate_all_features(cache, feat_last, target_last)
    # Take the last len(test) rows as the current batch — assumes
    # generate_all_features preserves the row order of `cache`; TODO confirm.
    feat_cur = feat[-len(test):]
    lgb_prediction = infer_lgb_model.predict(feat_cur[feature_name_lgb])
    ctb_prediction = infer_ctb_model.predict(feat_cur[feature_name_ctb])
    prediction = lgb_prediction * alpha + ctb_prediction * (1 - alpha)
#     prediction = infer_ctb_model.predict(feat_cur[feature_name_ctb])
    # Re-centre the batch so its mean equals the training target mean, then
    # clip to [y_min, y_max].
    prediction = prediction - prediction.mean() + target_mean
    clipped_predictions = np.clip(prediction, y_min, y_max)
    sample_prediction['target'] = clipped_predictions
    env.predict(sample_prediction)
print("OK")