---

## 0. requirements

In [1]:
# !pip install --target=/home/<user_name>/<venv_name>/lib/python3.10/site-packages <package_name>

#### 0.2. kaggle api 설치법

In [2]:
# !pip install --target=/home/<user_name>/<venv_name>/lib/python3.10/site-packages kaggle

In [3]:
# !mkdir ~/.kaggle

In [4]:
# !mv ~/kaggle.json ~/.kaggle/

In [5]:
# !chmod 600 ~/.kaggle/kaggle.json

## 1. config 설정

#### 1.1. init config

In [1]:
# Run mode for the whole notebook; the mode-dispatch cell below accepts
# "train", "inference" or "both" and raises ValueError for anything else.
MODE = "train"  # train, inference, both
# Kaggle dataset name; used to build model_directory when MODE == "inference".
KAGGLE_DATASET_NAME = "model-lgbm-version-yongmin-9"

In [2]:
import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter
import functools
import time
from numba import njit, prange
import pyarrow.parquet as pq

import joblib
import lightgbm as lgb
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from functools import partial
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit

warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

#### 1.2. train / inference config

In [3]:
# Display the installed LightGBM / XGBoost versions in the cell output.
lgb.__version__, xgb.__version__

('3.3.2', '2.0.1')

In [4]:
# Small constant added to denominators in the feature code (triplet imbalance,
# eval-based ratios) so that divisions stay away from exactly zero.
EPS = 1e-10

In [5]:
# Resolve directories and the train / inference switches from MODE.
# Unknown modes are rejected before anything is printed or assigned.
if MODE not in ("train", "inference", "both"):
    raise ValueError("Invalid mode")

print(f"You are in {MODE} mode")

# "both" trains and then infers in the same Kaggle session.
train_mode = MODE in ("train", "both")
infer_mode = MODE in ("inference", "both")

if MODE == "train":
    # Local run: models go into a fresh directory named after the current time
    # shifted by nine hours.
    model_directory = "./models/" + time.strftime("%Y%m%d_%H:%M:%S", time.localtime(time.time() + 9 * 60 * 60))
    data_directory = "./data"
else:
    # Kaggle run: data always comes from the competition input directory.
    data_directory = "/kaggle/input/optiver-trading-at-the-close"
    if MODE == "inference":
        model_directory = f'/kaggle/input/{KAGGLE_DATASET_NAME}'
    else:
        model_directory = '/kaggle/working/'

You are in train mode


#### 1.3. model config

In [6]:
config = {
    # --- locations ---------------------------------------------------------
    "data_dir": data_directory,
    "model_dir": model_directory,

    # --- run switches ------------------------------------------------------
    "train_mode": train_mode,    # True: run training
    "infer_mode": infer_mode,    # True: run inference
    "model_name": ["lgb"],       # keys of models_config to use
    "final_mode": False,         # True: use the final model
    "best_iterate_ratio": 1.2,   # best-iteration ratio
    'target': 'target',          # name of the target column

    # --- validation split --------------------------------------------------
    'split_method': 'rolling',   # time_series, rolling, blocking, holdout
    'n_splits': 3,               # number of splits
    'correct': True,             # correct the fold boundaries
    'gap': 0.05,                 # gap between train and test (0.05 = 5% of train size)

    'initial_fold_size_ratio': 0.8,  # initial fold size ratio
    'train_test_ratio': 0.9,         # train / test ratio

    # --- hyper-parameter search -------------------------------------------
    'optuna_random_state': 42,
}

# One model name means a single model; more than one means stacking.
config["model_mode"] = "stacking" if len(config["model_name"]) != 1 else "single"

# A single model outside final_mode has several folds but no other selection
# criterion, so it is evaluated by MAE.
config["mae_mode"] = config["model_mode"] == "single" and not config["final_mode"]

# In final_mode / mae_mode inference uses one split per model name;
# otherwise it uses n_splits.
if config["final_mode"] or config["mae_mode"]:
    config["inference_n_splits"] = len(config['model_name'])
else:
    config["inference_n_splits"] = config["n_splits"]

#### 1.4. model hyperparameter config

In [7]:
# Per-model registry: each entry maps a model name (as listed in
# config["model_name"]) to its estimator class and constructor parameters.
models_config = {
    "lgb": {
        "model": lgb.LGBMRegressor,
        "params": {
                # L1 objective.
                "objective": "mae",
                "n_estimators": 6800,
                "num_leaves": 512,
                # NOTE(review): LightGBM documents that row subsampling only takes
                # effect when subsample_freq / bagging_freq is > 0, which is not set
                # here — confirm whether bagging is intended.
                "subsample": 0.34,
                "colsample_bytree": 0.3,
                "learning_rate": 0.01,
                'max_depth': 15,
                "n_jobs": 4,
                "device": "gpu",
                "verbosity": -1,
                "importance_type": "gain",
                "reg_alpha": 3.23,
                "reg_lambda": 0.015
            }
    },

    "xgb": {
        "model": xgb.XGBRegressor,
        "params": {
            'booster': 'dart',
            # NOTE(review): the version cell above reports xgboost 2.0.1; "reg:linear"
            # and tree_method="gpu_hist" are presumably deprecated spellings there
            # (reg:squarederror / device="cuda" + tree_method="hist") — confirm.
            "objective": "reg:linear",
            "n_estimators": 6800,
            "max_depth": 14,
            "eta": 0.0073356282482453065,
            "subsample": 0.9,
            "colsample_bytree": 0.30000000000000004,
            "colsample_bylevel": 0.9,
            "min_child_weight": 0.4824060812428942,
            "reg_lambda": 182.50819193990537,
            "reg_alpha": 0.03171419713574529,
            "gamma": 0.9162634503670075,
            "tree_method": "gpu_hist",
            "n_jobs": 4,
            "verbosity": 0,
        },
    },
}

In [8]:
if MODE == "train":
    if not os.path.exists(config["model_dir"]):
        os.makedirs(config["model_dir"])
    if not os.path.exists(config["data_dir"]):
        os.makedirs(config["data_dir"])
    !kaggle competitions download optiver-trading-at-the-close -p {config["data_dir"]} --force
    !unzip -o {config["data_dir"]}/optiver-trading-at-the-close.zip -d {config["data_dir"]}
    !rm {config["data_dir"]}/optiver-trading-at-the-close.zip

Downloading optiver-trading-at-the-close.zip to ./data
 99%|███████████████████████████████████████▋| 199M/201M [00:08<00:00, 25.2MB/s]
100%|████████████████████████████████████████| 201M/201M [00:08<00:00, 23.8MB/s]
Archive:  ./data/optiver-trading-at-the-close.zip
  inflating: ./data/example_test_files/revealed_targets.csv  
  inflating: ./data/example_test_files/sample_submission.csv  
  inflating: ./data/example_test_files/test.csv  
  inflating: ./data/optiver2023/__init__.py  
  inflating: ./data/optiver2023/competition.cpython-310-x86_64-linux-gnu.so  
  inflating: ./data/public_timeseries_testing_util.py  
  inflating: ./data/train.csv        


# ## Global Method

In [9]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of a DataFrame in place to reduce memory usage.

    Signed integer columns are cast to the smallest of int8 / int16 / int32 /
    int64 whose range contains the column's minimum and maximum (bounds are
    inclusive, so a column spanning exactly -128..127 becomes int8). Float
    columns are cast to float32; float16 is deliberately not used. Every other
    dtype (object, bool, unsigned integers, ...) is left untouched.

    :param df: DataFrame to shrink; its columns are replaced in place.
    :param verbose: when truthy, print the memory usage before and after.
    :return: the same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2

    # Ordered from narrowest to widest so the first match is the smallest fit.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        dtype_name = str(df[col].dtype)

        if dtype_name[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, every comparison is
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif dtype_name[:5] == "float":
            df[col] = df[col].astype(np.float32)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024 ** 2
        print(f"Memory usage: {start_mem:.2f} MB -> {end_mem:.2f} MB")

    return df

In [10]:
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """
    Compute, for every row and every column triplet, the ratio
    (max - mid) / (mid - min + EPS) of the three selected values.

    NOTE: an identical definition of this function appears again later in this
    file; after a top-to-bottom run the later one is the one in effect.

    :param df_values: 2-D array indexed as [row, column].
    :param comb_indices: sequence of 3-tuples of column positions into df_values.
    :return: array of shape (number of rows, number of triplets); an entry is
        NaN when the middle value equals the minimum.
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    # Triplets are distributed over threads with prange; rows are handled sequentially.
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # The middle value is the sum of the three minus the two extremes.
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            if mid_val == min_val:  # Prevent division by zero
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val + EPS)

    return imbalance_features


def calculate_triplet_imbalance_numba(price, df):
    """
    Build one triplet-imbalance column for every 3-combination of `price`.

    :param price: list of column names of `df`; combinations are taken in list order.
    :param df: DataFrame containing the columns named in `price`.
    :return: new DataFrame (default integer index) with one column per triplet,
        named "<a>_<b>_<c>_imb2".
    """
    triplets = list(combinations(price, 3))

    # The numba kernel works on positions inside the extracted array, not on names.
    position_triplets = [
        (price.index(first), price.index(second), price.index(third))
        for first, second, third in triplets
    ]
    value_matrix = df[price].values

    imbalance_matrix = compute_triplet_imbalance(value_matrix, position_triplets)

    column_names = [f"{first}_{second}_{third}_imb2" for first, second, third in triplets]
    return pd.DataFrame(imbalance_matrix, columns=column_names)

In [11]:
def print_log(message_format):
    """
    Decorator factory that prints a timing banner after the wrapped call.

    :param message_format: format string; it is formatted with the keyword
        arguments `func_name` and `elapsed_time` (seconds).
    :return: a decorator. The wrapped function's return value is passed through
        unchanged. When the first positional argument has a truthy `infer`
        attribute the call is made silently, with no timing and no output.
        Otherwise a banner is printed; if the result is not None, its `shape`
        (or 'No shape attribute') is appended to the message.
    """
    rule = '-' * 100

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Stay quiet for objects running in inference mode.
            owner = args[0] if args else None
            if args and hasattr(owner, 'infer') and owner.infer:
                return func(*args, **kwargs)

            started = time.time()
            result = func(*args, **kwargs)
            elapsed = time.time() - started

            suffix = ""
            if result is not None:
                suffix = f", shape({getattr(result, 'shape', 'No shape attribute')})"

            print(f"\n{rule}")
            print(message_format.format(func_name=func.__name__, elapsed_time=elapsed) + suffix)
            print(f"{rule}\n")

            return result

        return wrapper

    return decorator


In [12]:
def zero_sum(prices, volumes):
    """
    Subtract from `prices` a share of their total that is proportional to
    sqrt(volumes), so that the adjusted values sum to zero.

    :param prices: array-like of values to adjust.
    :param volumes: array-like of the same shape; its square root is the weight.
    :return: prices - sqrt(volumes) * (sum(prices) / sum(sqrt(volumes))).
    """
    root_volume = np.sqrt(volumes)
    return prices - root_volume * (np.sum(prices) / np.sum(root_volume))

#### 각 클래스의 method는 각자 필요에 따라 추가 해서 사용하면 됩니다. 이때 class의 주석에 method를 추가하고, method의 주석에는 method의 역할을 간단하게 적어주세요.

# ## Pre Code

## Data Preprocessing Class

In [13]:
class DataPreprocessor:
    """
    Data preprocessing class.

    Attributes
    ----------
    data : pandas.DataFrame
        Data to preprocess.
    infer : bool
        Inference flag. When truthy, rows are never dropped and the logging
        decorator stays silent.

    Methods
    -------
    handle_missing_data()
        Missing-value handling.
    handle_outliers()
        Outlier handling (currently a pass-through).
    normalize()
        Normalisation (currently a pass-through).
    custom_preprocessing()
        User-defined preprocessing (currently a pass-through).
    transform()
        Run the preprocessing pipeline.
    """

    def __init__(self, data, infer=False):
        # reduce_mem_usage(data) is intentionally not applied here: it costs precision.
        self.data = data
        self.infer = infer

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def handle_missing_data(self):
        """Drop rows without a target and renumber the index (training only)."""
        # Truthiness test, consistent with how the rest of the file reads `infer`.
        if not self.infer:
            self.data = self.data.dropna(subset=["target"]).reset_index(drop=True)
        return self.data

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def handle_outliers(self):
        """Placeholder for outlier handling; returns the data unchanged."""
        return self.data

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def normalize(self):
        """Placeholder for normalisation; returns the data unchanged."""
        return self.data

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def custom_preprocessing(self):
        """Placeholder for user-defined preprocessing; returns the data unchanged."""
        return self.data

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def transform(self):
        """Run the enabled preprocessing steps and return the resulting frame."""
        self.handle_missing_data()
        # self.handle_outliers()
        # self.normalize()
        # self.custom_preprocessing()
        return self.data

## Feature Engineering Class

In [14]:
# Module-level store for per-stock aggregate statistics; filled under the key
# "version_0" by FeatureEngineer.generate_global_features.
global_features = {}

In [15]:
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """
    Compute, for every row and every column triplet, the ratio
    (max - mid) / (mid - min + EPS) of the three selected values.

    NOTE: this repeats a definition that already appears earlier in this file;
    being the later one, it is the definition in effect after a top-to-bottom run.

    :param df_values: 2-D array indexed as [row, column].
    :param comb_indices: sequence of 3-tuples of column positions into df_values.
    :return: array of shape (number of rows, number of triplets); an entry is
        NaN when the middle value equals the minimum.
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    # Triplets are distributed over threads with prange; rows are handled sequentially.
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # The middle value is the sum of the three minus the two extremes.
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            if mid_val == min_val:  # Prevent division by zero
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val + EPS)

    return imbalance_features

def calculate_triplet_imbalance_numba(price, df):
    """
    Build one triplet-imbalance column for every 3-combination of `price`.

    :param price: list of column names of `df`; combinations are taken in list order.
    :param df: DataFrame containing the columns named in `price`.
    :return: new DataFrame (default integer index) with one column per triplet,
        named "<a>_<b>_<c>_imb2".
    """
    triplets = list(combinations(price, 3))

    # The numba kernel works on positions inside the extracted array, not on names.
    position_triplets = [
        (price.index(first), price.index(second), price.index(third))
        for first, second, third in triplets
    ]
    value_matrix = df[price].values

    imbalance_matrix = compute_triplet_imbalance(value_matrix, position_triplets)

    column_names = [f"{first}_{second}_{third}_imb2" for first, second, third in triplets]
    return pd.DataFrame(imbalance_matrix, columns=column_names)

In [16]:
from tqdm import tqdm
import glob

# Historical per-file price frames: {key: (frame, number_of_rows)}.
# The key is the file name without its last 15 characters (presumably the
# ticker — TODO confirm against the file naming of the dataset).
all_stock_data = {}

for s in tqdm(glob.glob("./data/alpha/*.csv") if MODE == "train" else glob.glob(
        "/kaggle/input/nasdaq-stocks-historical-data/alpha/*.csv"), desc="Processing files"):
    # Keep only the rows inside the fixed date window.
    stock_df = pd.read_csv(s, dtype={"ticker": str}).query(
        "Date >= '2021-08-05' and Date <= '2023-07-06'")
    if len(stock_df) > 180:
        # Key on the file name only. The previous fixed offset s[13:-15] stripped
        # the directory correctly just for the 13-character "./data/alpha/"
        # prefix and produced directory-polluted keys under the Kaggle path.
        all_stock_data[os.path.basename(s)[:-15]] = (stock_df, len(stock_df))

# Ticker symbols listed by position: get_stock_info pairs the position of each
# entry (via enumerate) with the rows whose stock_id equals that position.
# NOTE(review): 'LNT' appears twice (positions 30 and 148), so at least one of
# those positions is presumably mislabelled — verify against the source of this list.
reversed_stock_list = [
        'MNST', 'WING', 'AXON', 'HON', 'MAR', 'OKTA', 'POOL', 'LRCX', 'YOTA', 'PFG',
        'NDAQ', 'COIN', 'AMGN', 'TER', 'ADBE', 'ABNB', 'ZBRA', 'KLAC', 'ZI', 'ALNY',
        'ULTA', 'SSNC', 'ON', 'SWKS', 'AKAM', 'ASML', 'PPBI', 'QRVO', 'FANG', 'ORLY',
        'LNT', 'AGRX', 'NTAP', 'CROX', 'REGN', 'ROST', 'DLTR', 'ADP', 'EMCG', 'CTAS',
        'CZR', 'NVDA', 'SAIA', 'JKHY', 'FOSLL', 'MSFT', 'TECH', 'TXRH', 'WDAY', 'FITB',
        'MTCH', 'ROKU', 'CINF', 'EBAY', 'SNPS', 'FAST', 'ETSY', 'IDXX', 'INTU', 'ZG',
        'CRWD', 'LYFT', 'RGEN', 'LKQ', 'MKTX', 'EXC', 'LBRDK', 'MRNA', 'PAYX', 'SOFI',
        'BYND', 'EQIX', 'ADI', 'GEN', 'ALGN', 'CDNS', 'HAS', 'VRTX', 'HOOD', 'WBD',
        'TXG', 'SGEN', 'OPEN', 'INTC', 'GOOG', 'CAR', 'UPST', 'LSCC', 'NFLX', 'ENTG',
        'FFIV', 'DOCU', 'MSTR', 'ZION', 'PCTY', 'AMD', 'MRVL', 'NBIX', 'JBLU', 'PARA',
        'MQ', 'FCNCA', 'TEAM', 'ZS', 'WBA', 'MDLZ', 'TRMB', 'PODD', 'SEDG', 'CSX',
        'TMUS', 'SPWR', 'AAPL', 'LULU', 'LPLA', 'ILMN', 'CDW', 'GDS', 'MELI', 'MASI',
        'FOXA', 'KDP', 'AAL', 'GILD', 'ASO', 'UTHR', 'MU', 'MDB', 'WDC', 'CFLT',
        'SBUX', 'INCY', 'TSCO', 'ISRG', 'VTRS', 'DKNG', 'LITE', 'TTWO', 'SMCI', 'EXPE',
        'VRTS', 'AMAT', 'AVGO', 'TLRY', 'PCAR', 'CG', 'MIDD', 'APA', 'LNT', 'VRSK',
        'PANW', 'CSCO', 'SBAC', 'HTZ', 'DBX', 'CHKEW', 'LCID', 'ADSK', 'APLS', 'STLD',
        'PEP', 'PTON', 'ENPH', 'COST', 'CPRT', 'HST', 'KHC', 'CHRW', 'AMZN', 'ANSS',
        'HOLX', 'TROW', 'APP', 'FIVE', 'AFRM', 'GOOGL', 'FTNT', 'SWAV', 'ZM', 'META',
        'GH', 'JBHT', 'UAL', 'MCHP', 'DDOG', 'ODFL', 'CTSH', 'EA', 'RUN', 'CSGP',
        'DXCM', 'TSLA', 'PTC', 'PYPL', 'PENN', 'XEL', 'XRAY', 'SPLK', 'CMCSA', 'BKR'
]

# Screener table used by get_stock_info (its "Symbol" column plus the requested
# attribute column). The file is read from the local data directory in train
# mode and from the Kaggle input directory otherwise.
stock_list_df = pd.read_csv(
    './data/nasdaq-screener/nasdaq_screener_1701158836955.csv'
    if MODE == "train"
    else '/kaggle/input/nasdaq-screener/nasdaq_screener_1701158836955.csv'
)

Processing files: 100%|██████████| 3131/3131 [00:05<00:00, 534.96it/s]


In [17]:
from sklearn.preprocessing import LabelEncoder

def get_stock_info(df, data, column_name):  # column_name = "Market Cap", "Sector", "Industry"
    """
    Add a column to `df` holding, per row, the screener value of `column_name`
    for that row's stock.

    Side effect: for every column other than "Market Cap" the global
    `stock_list_df[column_name]` is overwritten in place with label-encoded codes.

    :param df: frame that receives the new column; rows are addressed by the
        index labels of `data`.
    :param data: frame with a `stock_id` column; stock_id k is paired with
        `reversed_stock_list[k]`.
    :param column_name: screener column to look up.
    :return: `df`, with the new column set to -1 wherever the ticker is not
        present in the screener table.
    """
    if column_name != "Market Cap":
        stock_list_df[column_name] = LabelEncoder().fit_transform(stock_list_df[column_name])

    df[f'{column_name}'] = -1

    known_symbols = stock_list_df["Symbol"].values
    for stock_idx, symbol in enumerate(reversed_stock_list):
        if symbol not in known_symbols:
            continue
        # The first screener row for the symbol wins.
        symbol_value = stock_list_df.loc[stock_list_df["Symbol"] == symbol, column_name].iloc[0]
        row_labels = data[data['stock_id'] == stock_idx].index
        df.loc[row_labels, f'{column_name}'] = symbol_value

    return df

In [18]:
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
_weights = {int(k):v for k,v in enumerate(weights)}

In [26]:
class FeatureEngineer:
    """
    Feature engineering class.

    Each ``feature_*`` method returns a new DataFrame indexed like ``self.data``.
    Several of them also add intermediate columns to ``self.data`` itself, and
    later versions read those columns (``volume`` from
    ``feature_version_yongmin_0``; ``weighted_wap`` and ``price_spread`` from
    ``feature_version_yongmin_1``), so the order of ``feature_versions`` matters.

    Attributes
    ----------
    data : pandas.DataFrame
        Input frame; mutated by the feature methods.
    infer : bool
        Inference flag; when truthy the logging decorator stays silent and
        ``transform`` does not add ``date_id_copy``.
    feature_versions : list of str
        Names of the feature methods to run, in order.
    dependencies : dict
        Maps a version name to the version names whose results are passed to it
        as positional arguments.
    base_directory : str
        Directory for cached feature frames; created if missing.

    Methods
    -------
    generate_global_features(data)
        Fill ``global_features["version_0"]`` with per-stock aggregates.
    feature_selection(data, exclude_columns)
        Return ``data`` without the excluded columns.
    execute_feature_versions(save, load)
        Compute (or load) every configured feature version.
    transform(save, load)
        Run all versions and concatenate them with ``self.data``.
    """

    def __init__(self, data, infer=False, feature_versions=None, dependencies=None,
                 base_directory="./data/fe_versions"):
        self.data = data
        self.infer = infer
        self.feature_versions = feature_versions or []
        self.dependencies = dependencies or {}  # dependencies between feature versions
        self.base_directory = base_directory
        if not os.path.exists(self.base_directory):
            os.makedirs(self.base_directory)

    @staticmethod
    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def generate_global_features(data):
        """Store per-stock size / price aggregates in the module-level ``global_features``."""
        by_stock = data.groupby("stock_id")
        global_features["version_0"] = {
            "median_size": by_stock["bid_size"].median() + by_stock["ask_size"].median(),
            "std_size": by_stock["bid_size"].std() + by_stock["ask_size"].std(),
            "ptp_size": by_stock["bid_size"].max() - by_stock["bid_size"].min(),
            "median_price": by_stock["bid_price"].median() + by_stock["ask_price"].median(),
            "std_price": by_stock["bid_price"].std() + by_stock["ask_price"].std(),
            # NOTE(review): mixes bid max with ask min, unlike ptp_size — confirm intent.
            "ptp_price": by_stock["bid_price"].max() - by_stock["ask_price"].min(),
        }

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_selection(self, data, exclude_columns):
        """Return a frame made of every column of ``data`` not listed in ``exclude_columns``."""
        selected_columns = [c for c in data.columns if c not in exclude_columns]
        data = data[selected_columns]
        return data

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_version_yongmin_0(self, *args, version_name="feature_version_yongmin_0"):
        """Calendar / clock features; also adds ``volume`` to ``self.data``."""
        df = pd.DataFrame(index=self.data.index)

        df['dow'] = self.data["date_id"] % 5
        df['seconds'] = self.data['seconds_in_bucket'] % 60
        df['minute'] = self.data['seconds_in_bucket'] // 60
        df['time_to_market_close'] = 540 - self.data['seconds_in_bucket']

        self.data["volume"] = self.data.eval("ask_size + bid_size")

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_version_yongmin_1(self, *args, version_name="feature_version_yongmin_1"):
        """
        Weighted-wap, momentum, imbalance and pressure features.

        Reads ``self.data['volume']``, so ``feature_version_yongmin_0`` must run first.
        """
        df = pd.DataFrame(index=self.data.index)
        self.data["stock_weights"] = self.data["stock_id"].map(_weights)
        self.data["weighted_wap"] = self.data["stock_weights"] * self.data["wap"]

        self.data['cum_wap'] = self.data.groupby(['stock_id'])['wap'].cumprod()
        self.data['cum_weighted_wap'] = self.data.groupby(['stock_id'])['weighted_wap'].cumprod()

        for i in [1, 3, 6, 12]:
            # transform() returns values in the original row order. The previous
            # groupby().rolling().mean().values was ordered by stock_id and was
            # therefore assigned to the wrong rows.
            df[f"cum_wap_ma_{i}"] = self.data.groupby(['stock_id'])['cum_wap'].transform(
                lambda s: s.rolling(i).mean())
            df[f'wap_momentum_{i}'] = self.data.groupby('stock_id')['weighted_wap'].pct_change(periods=i)

        self.data["imbalance_momentum"] = self.data.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / self.data['matched_size']
        self.data["imbalance_momentum_r"] = self.data.groupby(['stock_id'])['matched_size'].diff(periods=1) / self.data['imbalance_size']
        self.data["anb_imb_momentum"] = self.data.groupby(['stock_id'])['ask_size'].diff(periods=1) / self.data['bid_size']
        self.data["anb_imb_momentum_r"] = self.data.groupby(['stock_id'])['bid_size'].diff(periods=1) / self.data['ask_size']
        self.data["imb_momentum_spread"] = self.data["imbalance_momentum"] - self.data["anb_imb_momentum"]
        self.data["price_spread"] = self.data["ask_price"] - self.data["bid_price"]

        self.data["mid_price"] = self.data.eval("(ask_price + bid_price) / 2")
        self.data["liquidity_imbalance"] = self.data.eval(f"(bid_size-ask_size)/(bid_size+ask_size+{EPS})")
        self.data["matched_imbalance"] = self.data.eval(f"(imbalance_size-matched_size)/(matched_size+imbalance_size+{EPS})")
        # EPS belongs to the denominator; without the parentheses it was added to the ratio.
        self.data["size_imbalance"] = self.data.eval(f"bid_size / (ask_size+{EPS})")
        self.data["size_gini"] = self.data.eval(f"ask_size / (bid_size+ask_size+{EPS})")
        self.data["size_gini_2"] = self.data.eval(f"matched_size / (matched_size+imbalance_size+{EPS})")

        df["size_spread_intensity"] = self.data.groupby(['stock_id'])['imb_momentum_spread'].diff()
        df["spread_intensity"] = self.data.groupby(['stock_id'])['price_spread'].diff()
        df['price_pressure'] = self.data['imbalance_size'] * (self.data['ask_price'] - self.data['bid_price'])

        self.data['market_urgency_like'] = self.data['imb_momentum_spread'] * self.data['liquidity_imbalance']
        self.data['market_urgency'] = self.data['price_spread'] * self.data['liquidity_imbalance']
        df['depth_pressure'] = (self.data['ask_size'] - self.data['bid_size']) * (self.data['far_price'] - self.data['near_price'])

        df['spread_depth_ratio'] = (self.data['ask_price'] - self.data['bid_price']) / (self.data['bid_size'] + self.data['ask_size'])
        # Direction of the 5-step mid-price change, computed within each stock
        # (the ungrouped diff compared rows belonging to different stocks).
        # Missing differences map to 0, as before.
        mid_price_change = self.data.groupby('stock_id')['mid_price'].diff(periods=5)
        self.data['mid_price_movement'] = np.sign(mid_price_change).fillna(0).astype(int)

        df['micro_price'] = ((self.data['bid_price'] * self.data['ask_size']) + (self.data['ask_price'] * self.data['bid_size'])) / (self.data['bid_size'] + self.data['ask_size'])
        df['relative_spread'] = (self.data['ask_price'] - self.data['bid_price']) / self.data['wap']

        df['mid_price*volume'] = self.data['mid_price_movement'] * self.data['volume']
        df['harmonic_imbalance'] = 2 / ((1 / self.data["bid_size"]) + (1 / self.data["ask_size"]))

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_version_yongmin_2(self, *args, version_name="feature_version_yongmin_2"):
        """
        Row-wise statistics, lags, returns and differences.

        All columns are written to ``self.data``; the returned frame has no
        columns. Reads ``weighted_wap`` and ``price_spread``, so
        ``feature_version_yongmin_1`` must run first.
        """
        df = pd.DataFrame(index=self.data.index)

        prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
        sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

        for func in ["mean", "std", "skew", "kurt"]:
            self.data[f"all_prices_{func}"] = self.data[prices].agg(func, axis=1)
            self.data[f"all_sizes_{func}"] = self.data[sizes].agg(func, axis=1)

        for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
            for window in [1, 3, 6, 9, 12]:
                self.data[f"{col}_shift_{window}"] = self.data.groupby('stock_id')[col].shift(window)
                self.data[f"{col}_ret_{window}"] = self.data.groupby('stock_id')[col].pct_change(window)

        # Per-stock differences for selected columns.
        for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap', 'price_spread']:
            for window in [1, 3, 6, 9, 12]:
                self.data[f"{col}_diff_{window}"] = self.data.groupby("stock_id")[col].diff(window)

        for window in [1, 3, 6, 9, 12]:
            self.data[f'price_change_diff_{window}'] = self.data[f'bid_price_diff_{window}'] - self.data[f'ask_price_diff_{window}']
            self.data[f'size_change_diff_{window}'] = self.data[f'bid_size_diff_{window}'] - self.data[f'ask_size_diff_{window}']

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_version_yongmin_3(self, *args, version_name="feature_version_yongmin_3"):
        """Per-date dispersion features for a few columns."""
        df = pd.DataFrame(index=self.data.index)

        for i in ['wap', 'ask_size', 'bid_size', 'weighted_wap', 'price_spread']:
            _stock_wise_std = self.data.groupby(['date_id'])[i].std()
            _stock_wise_pct_change = self.data.groupby(['date_id'])[i].pct_change()

            df[f'stock_wise_{i}_std'] = self.data['date_id'].map(_stock_wise_std)
            # NOTE(review): _stock_wise_pct_change is indexed by row label, not by
            # date_id, so this map looks up the row whose label equals the date_id.
            # Left unchanged because the intended feature is unclear — confirm.
            df[f'stock_wise_{i}_pct'] = self.data['date_id'].map(_stock_wise_pct_change)

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_version_yongmin_4(self, *args, version_name="feature_version_yongmin_4"):
        """Cross features between sizes and max-normalised opposite-side prices."""
        df = pd.DataFrame(index=self.data.index)

        bid_price_norm = self.data["bid_price"] / self.data["bid_price"].max()
        ask_price_norm = self.data["ask_price"] / self.data["ask_price"].max()
        df['ask_bid_norm'] = self.data["ask_size"] * bid_price_norm
        df['bid_ask_norm'] = self.data["bid_size"] * ask_price_norm

        df['cross_spread'] = df['bid_ask_norm'] - df['ask_bid_norm']
        df['cross_imbalance'] = df.eval("(bid_ask_norm-ask_bid_norm)/(bid_ask_norm+ask_bid_norm)")
        df['cross_imbalance_ratio'] = df['bid_ask_norm'] / df['ask_bid_norm']
        df['cross_urgency'] = df['cross_spread'] * df['cross_imbalance']

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_version_alvin_1(self, *args, version_name="feature_version_alvin_1"):
        """Pairwise price imbalances plus triplet imbalances for prices and sizes."""
        df = pd.DataFrame(index=self.data.index)
        prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
        sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

        for c in combinations(prices, 2):
            df[f"{c[0]}_{c[1]}_imb"] = self.data.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]} + {EPS})")

        for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
            triplet_feature = calculate_triplet_imbalance_numba(c, self.data)
            # The helper returns a freshly indexed frame, so assign positionally.
            df[triplet_feature.columns] = triplet_feature.values

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_market_cap(self, *args, version_name="feature_market_cap"):
        """Screener "Market Cap" per row (-1 when the ticker is unknown)."""
        df = pd.DataFrame(index=self.data.index)

        df = get_stock_info(df, self.data, "Market Cap")

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_sector(self, *args, version_name="feature_sector"):
        """Label-encoded screener "Sector" per row (-1 when the ticker is unknown)."""
        df = pd.DataFrame(index=self.data.index)

        df = get_stock_info(df, self.data, "Sector")

        return df

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def feature_industry(self, *args, version_name="feature_industry"):
        """Label-encoded screener "Industry" per row (-1 when the ticker is unknown)."""
        df = pd.DataFrame(index=self.data.index)

        df = get_stock_info(df, self.data, "Industry")

        return df

    # you can add more feature engineering version like above

    def _version_path(self, version):
        """Path of the cached parquet file for a feature version."""
        return os.path.join(self.base_directory, f"{version}.parquet")

    def _save_to_parquet(self, df, version):
        """Write a feature frame to ``<base_directory>/<version>.parquet``."""
        df.to_parquet(self._version_path(version))

    def _load_from_parquet(self, version):
        """Read a feature frame previously written by ``_save_to_parquet``."""
        return pd.read_parquet(self._version_path(version))

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def execute_feature_versions(self, save=False, load=False):
        """
        Compute (or load) every version listed in ``self.feature_versions``.

        :param save: when True, write each computed frame to the cache directory.
        :param load: when True, read each frame from the cache instead of computing it.
        :return: dict mapping version name to its feature frame.
        :raises AttributeError: if a version name does not resolve to a callable
            method (previously this reused the preceding version's frame, or
            failed with an unbound local on the first version).
        """
        results = {}

        for version in self.feature_versions:
            if load:
                df = self._load_from_parquet(version)
            else:
                method = getattr(self, version, None)
                if not callable(method):
                    raise AttributeError(f"Unknown feature version: {version!r}")

                args = []
                for dep in self.dependencies.get(version, []):
                    dep_result = results.get(dep)
                    if isinstance(dep_result, pd.DataFrame):
                        args.append(dep_result)
                    elif dep_result is None and hasattr(self, dep):
                        # Dependency not computed yet: run it now and remember the result.
                        dep_method = getattr(self, dep)
                        dep_result = dep_method()
                        results[dep] = dep_result
                        args.append(dep_result)
                    else:
                        args.append(None)
                df = method(*args)
                if save:
                    self._save_to_parquet(df, version)
            results[version] = df

        # Only return the versions that were requested, not bare dependencies.
        return {k: v for k, v in results.items() if k in self.feature_versions}

    @print_log("Executed {func_name}, Elapsed time: {elapsed_time:.2f} seconds")
    def transform(self, save=False, load=False):
        """
        Run all feature versions and concatenate them column-wise with ``self.data``.

        In training mode a ``date_id_copy`` column is added first.
        """
        feature_versions_results = self.execute_feature_versions(save=save, load=load)
        if not self.infer:
            self.data["date_id_copy"] = self.data["date_id"]
        concat_df = pd.concat([self.data] + list(feature_versions_results.values()), axis=1)

        exclude_columns = ["row_id", "time_id", "date_id"]
        final_data = self.feature_selection(concat_df, exclude_columns)
        # NOTE(review): the selection result is immediately replaced by the full
        # frame, so exclude_columns currently has no effect on the returned data.
        # Kept as-is because downstream code may rely on those columns — confirm
        # and drop one of the two assignments.
        final_data = concat_df
        return final_data

## Split Data Class

In [27]:
class Splitter:
    """
    Chronological train/validation splitter.

    Attributes
    ----------
    method : str
        Split strategy: "time_series", "rolling", "blocking" or "holdout".
    n_splits : int
        Number of folds to generate.
    correct : bool
        Whether fold boundaries are moved so that they coincide with a change of
        ``date_id_copy``.
    initial_fold_size_ratio : float
        Fraction of all rows covered by the first fold.
    train_test_ratio : float
        Fraction of a fold used for training.
    gap : float
        Fraction of a fold left unused between train and test (rolling / blocking).
    overlap : bool
        Stored on the instance; no method of this class reads it.
    train_start, train_end, valid_start, valid_end : int
        Inclusive ``date_id_copy`` ranges used by the holdout method.
    boundaries : list of tuple
        ``(train_start, train_end, test_start, test_end)`` date ids of each fold
        yielded by the most recent ``split`` call.

    Methods
    -------
    split(data)
        Generate ``(X_train, y_train, X_test, y_test)`` for every fold.
    visualize_splits()
        Plot the recorded fold boundaries.
    """

    def __init__(self, method, n_splits, correct, initial_fold_size_ratio=0.6, train_test_ratio=0.8, gap=0,
                 overlap=True, train_start=0,
                 train_end=390, valid_start=391, valid_end=480):
        self.method = method
        self.n_splits = n_splits
        self.correct = correct
        self.initial_fold_size_ratio = initial_fold_size_ratio
        self.train_test_ratio = train_test_ratio

        self.gap = gap
        self.overlap = overlap

        # only for holdout method
        self.train_start = train_start
        self.train_end = train_end
        self.valid_start = valid_start
        self.valid_end = valid_end

        self.target = config["target"]

        self.boundaries = []

    def split(self, data):
        """
        Return a generator of ``(X_train, y_train, X_test, y_test)`` for ``data``.

        ``data`` must contain the target column and ``date_id_copy``; both are removed
        from the returned feature frames.

        Raises
        ------
        ValueError
            If ``method`` is unknown or ``n_splits`` does not fit the chosen method.
        """
        self.data = data
        self.all_dates = self.data['date_id_copy'].unique()
        # Start from a clean slate so repeated split() calls do not pile up folds.
        self.boundaries = []

        if self.method == "time_series":
            if self.n_splits <= 1:
                raise ValueError("Time series split method only works with n_splits > 1")
            return self._time_series_split(data)
        elif self.method == "rolling":
            if self.n_splits <= 1:
                raise ValueError("Rolling split method only works with n_splits > 1")
            return self._rolling_split(data)
        elif self.method == "blocking":
            if self.n_splits <= 1:
                raise ValueError("Blocking split method only works with n_splits > 1")
            # Blocking is a rolling split whose fold size is an equal share of the data.
            self.initial_fold_size_ratio = 1.0 / self.n_splits
            return self._rolling_split(data)
        elif self.method == "holdout":
            if self.n_splits != 1:
                raise ValueError("Holdout method only works with n_splits=1")
            return self._holdout_split(data)
        else:
            raise ValueError("Invalid method")

    def _correct_boundary(self, data, idx, direction="forward"):
        """
        Move positional index ``idx`` to the nearest change of ``date_id_copy``.

        "forward" returns the first position after ``idx`` whose date differs;
        "backward" returns the first position of the run of equal dates containing
        ``idx``. Positions 0 and ``len(data) - 1`` are returned unchanged.
        """
        original_idx = idx
        if idx == 0 or idx == len(data) - 1:
            return idx

        # Read the date column once as an array instead of materialising a full row
        # via data.iloc[...] on every step.
        dates = data['date_id_copy'].to_numpy()
        n_rows = len(dates)

        if direction == "forward":
            while idx < n_rows and dates[idx] == dates[original_idx]:
                idx += 1
        elif direction == "backward":
            while idx > 0 and dates[idx] == dates[original_idx]:
                idx -= 1
            idx += 1  # adjust to include the boundary
        return idx

    def _time_series_split(self, data):
        """Expanding-window split: the train part always starts at row 0 and grows per fold."""
        n = len(data)
        initial_fold_size = int(n * self.initial_fold_size_ratio)
        initial_test_size = int(initial_fold_size * (1 - self.train_test_ratio))
        increment = (1.0 - self.initial_fold_size_ratio) / (self.n_splits - 1)

        for i in range(self.n_splits):
            fold_size = int(n * (self.initial_fold_size_ratio + i * increment))
            train_size = fold_size - initial_test_size

            if self.correct:
                train_size = self._correct_boundary(data, train_size, "forward")
                end_of_test = self._correct_boundary(data, train_size + initial_test_size, "forward")
            else:
                end_of_test = train_size + initial_test_size

            train_slice = data.iloc[:train_size]
            test_slice = data.iloc[train_size:end_of_test]
            if test_slice.shape[0] == 0:
                raise ValueError("Try setting correct=False or Try reducing the train_test_ratio")

            X_train = train_slice.drop(columns=[self.target, 'date_id_copy'])
            y_train = train_slice[self.target]
            X_test = test_slice.drop(columns=[self.target, 'date_id_copy'])
            y_test = test_slice[self.target]

            # Record all four dates, as the other split methods do; visualize_splits
            # unpacks 4-tuples.
            self.boundaries.append((
                train_slice['date_id_copy'].iloc[0],
                train_slice['date_id_copy'].iloc[-1],
                test_slice['date_id_copy'].iloc[0],
                test_slice['date_id_copy'].iloc[-1]
            ))
            yield X_train, y_train, X_test, y_test

    def _rolling_split(self, data):
        """Fixed-size window split that starts at the end of the data and moves backwards."""
        n = len(data)
        total_fold_size = int(n * self.initial_fold_size_ratio)
        test_size = int(total_fold_size * (1 - self.train_test_ratio))
        gap_size = int(total_fold_size * self.gap)
        train_size = total_fold_size - test_size
        rolling_increment = (n - total_fold_size) // (self.n_splits - 1)

        # The first fold is anchored at the end of the data; slices are end-exclusive,
        # so position n - 1 itself is not part of the first test slice.
        end_of_test = n - 1
        start_of_test = end_of_test - test_size
        end_of_train = start_of_test - gap_size
        start_of_train = end_of_train - train_size

        for _ in range(self.n_splits):
            if self.correct:
                start_of_train = self._correct_boundary(data, start_of_train, direction="forward")
                end_of_train = self._correct_boundary(data, end_of_train, direction="backward")
                start_of_test = self._correct_boundary(data, start_of_test, direction="forward")
                end_of_test = self._correct_boundary(data, end_of_test, direction="forward")

            train_slice = data[start_of_train:end_of_train]
            test_slice = data[start_of_test:end_of_test]
            if test_slice.shape[0] == 0:
                raise ValueError("Try setting correct=False or Try reducing the train_test_ratio")

            X_train = train_slice.drop(columns=[self.target, 'date_id_copy'])
            y_train = train_slice[self.target]
            X_test = test_slice.drop(columns=[self.target, 'date_id_copy'])
            y_test = test_slice[self.target]

            self.boundaries.append((
                train_slice['date_id_copy'].iloc[0],
                train_slice['date_id_copy'].iloc[-1],
                test_slice['date_id_copy'].iloc[0],
                test_slice['date_id_copy'].iloc[-1]
            ))
            yield X_train, y_train, X_test, y_test

            # Shift the whole window towards the start of the data for the next fold.
            start_of_train = max(start_of_train - rolling_increment, 0)
            end_of_train -= rolling_increment
            start_of_test -= rolling_increment
            end_of_test -= rolling_increment

    def _holdout_split(self, data):
        """
        Single split by date range.

        Rows with train_start <= date_id_copy <= train_end form the training set and
        rows with valid_start <= date_id_copy <= valid_end form the validation set.
        """
        train_mask = (data['date_id_copy'] >= self.train_start) & (data['date_id_copy'] <= self.train_end)
        valid_mask = (data['date_id_copy'] >= self.valid_start) & (data['date_id_copy'] <= self.valid_end)

        train_slice = data[train_mask]
        valid_slice = data[valid_mask]

        X_train = train_slice.drop(columns=[self.target, 'date_id_copy'])
        y_train = train_slice[self.target]
        X_valid = valid_slice.drop(columns=[self.target, 'date_id_copy'])
        y_valid = valid_slice[self.target]

        self.boundaries.append((
            train_slice['date_id_copy'].iloc[0],
            train_slice['date_id_copy'].iloc[-1],
            valid_slice['date_id_copy'].iloc[0],
            valid_slice['date_id_copy'].iloc[-1]
        ))
        yield X_train, y_train, X_valid, y_valid

    def visualize_splits(self):
        """Draw one horizontal bar pair (train in blue, test in red) per recorded fold."""
        print("Visualizing Train/Test Split Boundaries")

        plt.figure(figsize=(15, 6))

        for idx, (train_start, train_end, test_start, test_end) in enumerate(self.boundaries):
            train_width = train_end - train_start + 1
            plt.barh(y=idx, width=train_width, left=train_start, color='blue', edgecolor='black')
            plt.text(train_start + train_width / 2, idx - 0.15, f'{train_start}-{train_end}', ha='center', va='center',
                     color='black', fontsize=8)

            test_width = test_end - test_start + 1
            plt.barh(y=idx, width=test_width, left=test_start, color='red', edgecolor='black')
            if test_width > 0:
                plt.text(test_start + test_width / 2, idx + 0.15, f'{test_start}-{test_end}', ha='center', va='center',
                         color='black', fontsize=8)

        plt.yticks(range(len(self.boundaries)), [f"split {i + 1}" for i in range(len(self.boundaries))])
        # A slice step of 0 is invalid, which happened with fewer than 10 distinct dates.
        tick_step = max(1, len(self.all_dates) // 10)
        plt.xticks(self.all_dates[::tick_step])
        plt.xlabel("date_id_copy")
        plt.title("Train/Test Split Boundaries")
        plt.grid(axis='x')

        plt.tight_layout()
        plt.show()

## Model Class

In [28]:
from sklearn.metrics import mean_absolute_error

class OptunaWeights:
    """
    Learn non-negative blending weights for several models' predictions.

    An Optuna study searches one weight per model; the objective is the mean
    absolute error of the weighted average of the predictions.

    Attributes
    ----------
    study : optuna.study.Study or None
        The study created by ``fit``.
    weights : list of float or None
        Effective weights, one per model; ``None`` until ``fit`` has run or the
        attribute is assigned directly.
    random_state : int
        Seed passed to the sampler.
    n_trials : int
        Number of trials run by ``fit``.
    """

    def __init__(self, random_state, n_trials=5000):
        self.study = None
        self.weights = None
        self.random_state = random_state
        self.n_trials = n_trials

    @staticmethod
    def _effective_weights(raw_weights):
        """Clip negative weights to 0; fall back to uniform weights if all become 0."""
        clipped = [max(0, w) for w in raw_weights]
        if sum(clipped) == 0:
            num_models = len(clipped)
            return [1 / num_models] * num_models
        return clipped

    def _objective(self, trial, y_true, y_preds):
        """Return the MAE of the weighted average for the weights suggested by ``trial``."""
        raw_weights = [trial.suggest_float(f"weight{n}", -2, 3) for n in range(len(y_preds))]
        weights = self._effective_weights(raw_weights)
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=weights)
        return mean_absolute_error(y_true, weighted_pred)

    def fit(self, y_true, y_preds):
        """
        Search blending weights for ``y_preds`` (one prediction array per model).

        The best trial's parameters are stored in ``self.weights`` after the same
        clipping / fallback that the objective applied, so ``predict`` reproduces the
        blend that was actually scored.
        """
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        pruner = optuna.pruners.HyperbandPruner()
        # MAE is an error metric: lower is better, so the study must minimise it.
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights",
                                         direction='minimize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        best_params = self.study.best_params
        self.weights = self._effective_weights([best_params[f"weight{n}"] for n in range(len(y_preds))])

    def predict(self, y_preds):
        """Return the weighted average of ``y_preds`` using ``self.weights``."""
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        """Fit the weights on ``y_preds`` and return the blended prediction."""
        self.fit(y_true, y_preds)
        return self.predict(y_preds)

    # The former `weights()` accessor was removed: the instance attribute of the same
    # name (set in __init__) shadowed it, so it could never be called on an instance.
    # Read `self.weights` directly.

In [29]:
# V2 ModlePipeline
class ModelPipeline:
    """
    Train, score, blend, persist and reload the models named in ``config["model_name"]``.

    The class reads the notebook-level globals ``config``, ``models_config`` and
    ``MODE``; it is not self-contained.
    """

    def __init__(self):
        self.model_mode = config["model_mode"]  # "single" or "stacking"
        self.final_mode = config["final_mode"]  # False
        self.train_mode = config["train_mode"]  # True
        self.mae_mode = config["mae_mode"]  # False
        self.model_names = config["model_name"]  # ["lgb", "xgb", "pytorch_cnn"]
        self.n_splits = config["n_splits"]  # 5
        self.model_dir = config["model_dir"]  # "./model"
        self.optuna_random_state = config["optuna_random_state"]  # 42

        self.best_iterate_ratio = config["best_iterate_ratio"]  # 1.2

        self.models = []  # models trained for the current fold
        self.models_list = []  # one list of models per fold
        self.single_final_model = None

        self.predictions = []  # per-model predictions for the current fold
        self.single_model_mae = []  # MAE scores collected when mae_mode is on

        self.optuna_weights = []  # blending weights, one entry per stacking call

        self.inference_models = {}
        self.inference_prediction = None

    def train(self, idx, X_train, y_train, X_valid=None, y_valid=None, use_early_stopping=True, best_iteration=None):
        """
        Fit one model per entry of ``self.model_names`` for fold ``idx``.

        Parameters
        ----------
        idx : int
            Fold index (used for log messages only).
        X_train, y_train
            Training features and target.
        X_valid, y_valid : optional
            Validation data; passed as ``eval_set`` when both are given.
        use_early_stopping : bool
            Whether to request early stopping (100 rounds) from the library.
        best_iteration : dict, optional
            Maps model name to the ``n_estimators`` value to train with.

        Raises
        ------
        ValueError
            If a model name is neither an lgb/xgb variant nor "pytorch_cnn".
        """
        self.models = []
        has_valid = X_valid is not None and y_valid is not None
        for model_name in self.model_names:
            model_cls = models_config[model_name]["model"]
            # Copy so that overriding n_estimators does not leak into the shared models_config.
            params = dict(models_config[model_name]["params"])
            if best_iteration is not None:
                params["n_estimators"] = best_iteration[model_name]
            model = model_cls(**params)
            print(f"\n\n================== Training {model_name} ({idx}/{config['n_splits']})==================")
            if "lgb" in model_name:
                fit_params = {
                    "callbacks": [lgb.callback.log_evaluation(period=100)]
                }
                if use_early_stopping:
                    fit_params["callbacks"].append(lgb.callback.early_stopping(stopping_rounds=100))
                if has_valid:
                    fit_params["eval_set"] = [(X_valid, y_valid)]

                model.fit(X_train, y_train, **fit_params)

            elif "xgb" in model_name:
                fit_params = {
                    "eval_metric": "mae",
                    "verbose": 100
                }
                # Only hand over an eval_set when validation data exists; the previous
                # unconditional [(None, None)] broke training without validation data.
                if has_valid:
                    fit_params["eval_set"] = [(X_valid, y_valid)]
                if use_early_stopping:
                    fit_params["early_stopping_rounds"] = 100

                model.fit(X_train, y_train, **fit_params)

            elif model_name == "pytorch_cnn":
                pass
            else:
                raise ValueError("Invalid model name")
            print(f"Successfully trained {model_name} ({idx}/{config['n_splits']})")
            self.models.append(model)
        self.models_list.append(self.models)

    def predict(self, idx, X_test, infer=False, y_test=None):
        """
        Predict ``X_test`` with the models of fold ``idx``.

        Parameters
        ----------
        idx : int
            Fold index.
        X_test
            Features to predict.
        infer : bool
            When True, use the loaded / retained inference models and store the result
            in ``self.inference_prediction``; otherwise use ``self.models``.
        y_test : optional
            Ground truth used to score each model when ``mae_mode`` is on and
            ``infer`` is False. If omitted, a notebook-level variable named
            ``y_test`` is used, as earlier versions of this method did.

        Raises
        ------
        ValueError
            If scoring is requested but no ``y_test`` can be found.
        """
        self.predictions = []
        if infer:
            if not self.mae_mode:
                for model_name in self.model_names:
                    model = self.inference_models[f"{self.model_dir}/{idx}_{model_name}.pkl"] if MODE != "both" else \
                        self.models_list[idx][self.model_names.index(model_name)]
                    self.predictions.append(model.predict(X_test))
                    self.inference_prediction = self.predictions[0]  # for single model (non stacking)
            else:
                # single model mae
                model = self.single_final_model[0]
                self.predictions.append(model.predict(X_test))
                self.inference_prediction = self.predictions[0]  # for single model (mae)

        else:
            if self.mae_mode and y_test is None:
                # Backward compatibility with callers that do not pass y_test.
                y_test = globals().get("y_test")
                if y_test is None:
                    raise ValueError("y_test is required to score predictions when mae_mode is enabled")

            for model_name, model in zip(self.model_names, self.models):
                print(
                    f"\n\n================== Predict each model {model_name} ({idx}/{config['n_splits']})==================")
                self.predictions.append(model.predict(X_test))
                print(f"Successfully predicted {model_name} ({idx}/{config['n_splits']})")
                if self.mae_mode:
                    # Score the prediction that was just produced, not always the first one.
                    score = mean_absolute_error(y_test, self.predictions[-1])
                    print(f"Score for single model {model_name} ({idx}/{config['n_splits']}): {score}")
                    self.single_model_mae.append(score)

    def stacking(self, idx, y_test=None, infer=False):
        """
        Blend ``self.predictions`` with OptunaWeights.

        With ``infer=True`` the stored weights of fold ``idx`` are applied and the
        result is written to ``self.inference_prediction``. Otherwise new weights are
        fitted against ``y_test`` and appended to ``self.optuna_weights``.
        """
        # Named `blender` so the `optuna` module is not shadowed inside this method.
        blender = OptunaWeights(random_state=self.optuna_random_state)
        if infer:
            self.inference_prediction = None
            blender.weights = self.optuna_weights[idx]
            self.inference_prediction = blender.predict(self.predictions)
            print(f"Successfully inference stacking ({idx}/{config['n_splits']})")
        else:
            print(f"\n\n================== Stacking ({idx}/{config['n_splits']})==================")
            y_test_pred = blender.fit_predict(y_test.values, self.predictions)
            score = mean_absolute_error(y_test, y_test_pred)
            print(f"Score for stacking ({idx}/{config['n_splits']}): {score}")
            self.optuna_weights.append(blender.weights)
            print(f"Successfully stacking ({idx}/{config['n_splits']})")

    def final_train(self, data):
        """
        Retrain on ``data`` using the average best iteration of the fold models.

        Expects ``self.models_list`` to hold the models of all ``n_splits`` folds. The
        per-model average best iteration is scaled by ``best_iterate_ratio`` and used
        as ``n_estimators``. Previously trained fold models are discarded.
        """
        best_iterations = {name: [] for name in self.model_names}

        for n in range(self.n_splits):
            for model_name in self.model_names:
                model = self.models_list[n][self.model_names.index(model_name)]
                if "lgb" in model_name:
                    best_iterations[model_name].append(model.best_iteration_)
                elif "xgb" in model_name:
                    best_iterations[model_name].append(model.get_booster().best_iteration)

        average_best_iterations = {name: int(int(np.mean(iterations)) * self.best_iterate_ratio) for name, iterations in
                                   best_iterations.items()}
        for model_name, average in average_best_iterations.items():
            print(f"Average best iteration for {model_name.upper()} models: {average}")

        # The final model replaces the fold models, so reset the previous state.
        self.models_list = []
        self.models = []
        self.predictions = []
        if self.model_mode == "stacking":
            splitter = Splitter(method="holdout", n_splits=1, correct=True)
            for idx, (X_final_train, y_final_train, X_final_test, y_final_test) in enumerate(splitter.split(data)):
                print(X_final_train.shape, y_final_train.shape, X_final_test.shape, y_final_test.shape)
                self.train(idx, X_final_train, y_final_train, X_final_test, y_final_test,
                           use_early_stopping=False,
                           best_iteration=average_best_iterations)
                # Score against this holdout's own target rather than a stale global.
                self.predict(idx, X_final_test, y_test=y_final_test)
                self.stacking(idx, y_final_test)
        else:
            X_final_train = data.drop(columns=[config["target"], 'date_id_copy'])
            y_final_train = data[config["target"]]
            print(X_final_train.shape, y_final_train.shape)
            self.train(0, X_final_train, y_final_train,
                       use_early_stopping=False,
                       best_iteration=average_best_iterations)

    def save_models(self):
        """Dump the first ``inference_n_splits`` folds' models (train mode only)."""
        if MODE == "train":
            for idx in range(config["inference_n_splits"]):
                for n_model, model_name in enumerate(self.model_names):
                    model = self.models_list[idx][n_model]
                    joblib.dump(model, f"{self.model_dir}/{idx}_{model_name}.pkl")
                    print(f"Successfully saved model ({self.model_dir}/{idx}_{model_name}.pkl)")
            if self.mae_mode:
                # Keep the fold whose recorded MAE is lowest.
                best_idx = np.argmin(self.single_model_mae)
                self.single_final_model = self.models_list[best_idx]
                print(f"The best model is {self.model_names[0]} ({best_idx}/{config['n_splits']})")
                joblib.dump(self.single_final_model, f"{self.model_dir}/single_final_model.pkl")
                print(f"Successfully saved single model mae ({self.model_dir}/single_final_model.pkl)")

    def save_optuna_weights(self):
        """Dump the blending weights (train mode with stacking only)."""
        if MODE == "train":
            if self.model_mode == "stacking":
                joblib.dump(self.optuna_weights, f"{self.model_dir}/optuna_weights.pkl")
                print(f"Successfully saved optuna weights ({self.model_dir}/optuna_weights.pkl)")

    def load_models(self):
        """
        Load the persisted models into ``self.inference_models`` (inference mode only).

        NOTE: joblib.load unpickles the files; only load model files you trust.
        """
        if MODE == "inference":
            for idx in range(config["inference_n_splits"]):
                for model_name in self.model_names:
                    model = joblib.load(f"{self.model_dir}/{idx}_{model_name}.pkl")
                    print(f"Successfully loaded model ({self.model_dir}/{idx}_{model_name}.pkl)")
                    self.inference_models[f"{self.model_dir}/{idx}_{model_name}.pkl"] = model
            if self.mae_mode:
                self.single_final_model = joblib.load(f"{self.model_dir}/single_final_model.pkl")
                print(f"Successfully loaded single model mae ({self.model_dir}/single_final_model.pkl)")

    def load_optuna_weights(self):
        """Load the persisted blending weights (inference mode with stacking only)."""
        if MODE == "inference":
            if self.model_mode == "stacking":
                self.optuna_weights = joblib.load(f"{self.model_dir}/optuna_weights.pkl")
                print(f"Successfully loaded optuna weights ({self.model_dir}/optuna_weights.pkl)")


# ## Main
## import data

In [30]:
# Run a garbage-collection pass; the trailing semicolon hides the returned count.
gc.collect();

In [31]:
# If a feature-engineering function takes arguments, list the feature versions that
# supply them here, keyed by the function name (see the commented example).
dependencies = {
    # "feature_version_alvin_2_1": ["feature_version_alvin_1", "feature_version_alvin_2_0"],
}

In [None]:
model_pipeline = ModelPipeline()

if config["train_mode"]:
    # Load the raw training data
    df = pd.read_csv(f"{config['data_dir']}/train.csv")

    df = df.drop(['row_id', 'time_id'], axis=1)

    # Preprocessing
    data_processor = DataPreprocessor(df)
    df = data_processor.transform()

    # Choose which feature-engineering functions to run
    feature_engineer = FeatureEngineer(df, feature_versions=['feature_version_yongmin_0', 'feature_version_yongmin_1',
                                                             'feature_version_yongmin_2', 'feature_version_yongmin_3', 'feature_version_yongmin_4',
                                                             'feature_version_alvin_1', 'feature_market_cap', 'feature_industry'],

                                       dependencies=dependencies)

    feature_engineer.generate_global_features(df)

    # Run once with transform(save=True); afterwards transform(load=True) can be used instead
    df = feature_engineer.transform()

    splitter = Splitter(method=config["split_method"], n_splits=config["n_splits"], correct=config["correct"],
                        initial_fold_size_ratio=config["initial_fold_size_ratio"],
                        train_test_ratio=config["train_test_ratio"], gap=config["gap"])

    for idx, (X_train, y_train, X_test, y_test) in enumerate(splitter.split(df)):
        print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
        if "date_id_copy" in X_train.columns:
            X_train = X_train.drop(['date_id_copy'], axis=1)
            X_test = X_test.drop(['date_id_copy'], axis=1)

        model_pipeline.train(idx, X_train, y_train, X_test, y_test)
        model_pipeline.predict(idx, X_test)

    # final_train reads the models of all n_splits folds and then discards them, so it
    # must run once after the fold loop has finished, not inside it.
    if config["final_mode"]:
        model_pipeline.final_train(df)


----------------------------------------------------------------------------------------------------
Executed handle_missing_data, Elapsed time: 0.31 seconds, shape((5237892, 15))
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
Executed transform, Elapsed time: 0.31 seconds, shape((5237892, 15))
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
Executed generate_global_features, Elapsed time: 0.64 seconds
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
Executed feature_version_yongmin_0, Elapsed time: 0.07 seconds, shape((52378

In [None]:
# After training: persist the models and blending weights, then plot the fold layout.
# Uses `model_pipeline` and `splitter` created by the training cell.
if config["train_mode"]:
    model_pipeline.save_models()
    model_pipeline.save_optuna_weights()
    splitter.visualize_splits()

### upload kaggle dataset

#### dataset init
! /home/username/.local/bin/kaggle datasets init -p {config['model_dir']}
#### dataset create 
! /home/username/.local/bin/kaggle datasets create -p {config['model_dir']}

In [None]:
# Publish the model directory as a Kaggle dataset (train mode only).
if MODE == "train":
    # `datasets init` is expected to write dataset-metadata.json into the model directory;
    # the lines below fill in its title and id placeholders.
    ! /usr/local/bin/kaggle datasets init -p {config['model_dir']}
    import json

    with open(f"{config['model_dir']}/dataset-metadata.json", "r") as file:
        data = json.load(file)

    # Replace the template placeholders with the configured dataset name.
    data["title"] = data["title"].replace("INSERT_TITLE_HERE", f"{KAGGLE_DATASET_NAME}")
    data["id"] = data["id"].replace("INSERT_SLUG_HERE", f"{KAGGLE_DATASET_NAME}")

    with open(f"{config['model_dir']}/dataset-metadata.json", "w") as file:
        json.dump(data, file, indent=2)

    ! /usr/local/bin/kaggle datasets create -p {config['model_dir']}

    # !/usr/local/bin/kaggle datasets version -p {config['model_dir']} -m 'Updated data'

In [34]:
# Dependency map handed to FeatureEngineer in the inference cell below.
# NOTE(review): this repeats the empty mapping already defined before the training cell.
dependencies = {
    # "feature_version_alvin_2_1": ["feature_version_alvin_1", "feature_version_alvin_2_0"],
}

In [33]:
# Inference loop: consume the competition iterator batch by batch, rebuild features
# from a rolling cache, predict with every stored fold model and submit the result.
if config["infer_mode"]:
    import optiver2023

    env = optiver2023.make_env()
    iter_test = env.iter_test()

    # Predictions are clipped to [y_min, y_max] before submission.
    y_min, y_max = -64, 64
    qps = []  # wall-clock seconds spent on each batch
    counter = 0
    cache = pd.DataFrame()

    model_pipeline.load_models()
    model_pipeline.load_optuna_weights()
    
    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        
        # Append the new batch; after the first batch keep only the latest 21 rows per stock.
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            cache = cache.groupby(['stock_id']).tail(21).sort_values(
                by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
            
        # preprocessing
        data_processor = DataPreprocessor(cache, infer=True)
        cache_df = data_processor.transform()
        
        # feature engineering
        # NOTE(review): 'feature_sector' is listed here but not in the training cell's
        # feature_versions — confirm that the loaded models were trained with it.
        feature_engineer = FeatureEngineer(cache_df, infer=True, 
                                           feature_versions=['feature_version_yongmin_0', 'feature_version_yongmin_1', 
                                                             'feature_version_yongmin_2', 'feature_version_yongmin_3', 'feature_version_yongmin_4',
                                                             'feature_version_alvin_1', 'feature_market_cap', 'feature_sector', 'feature_industry'],
                                           dependencies=dependencies)
        
        cache_df = feature_engineer.transform()
        # Only the rows of the current batch (the last len(test) rows) are predicted.
        feat = cache_df[-len(test):]
        
        feat = feat.drop(['row_id', 'currently_scored'], axis=1)
        
        test_predss = np.zeros(feat.shape[0])
        
        # prediction: average the per-fold predictions with equal weight
        for i in range(config["inference_n_splits"]):
            model_pipeline.predict(i, feat, infer=True)
            
            if config["model_mode"] == "stacking":
                model_pipeline.stacking(i, infer=True)
                
            test_predss += model_pipeline.inference_prediction / config["inference_n_splits"]
            
        # zero_sum is defined elsewhere in the notebook; it receives the predictions and
        # bid_size + ask_size — presumably as volume weights, TODO confirm.
        test_predss = zero_sum(test_predss, test['bid_size'] + test['ask_size'])
        clipped_predictions = np.clip(test_predss, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    # NOTE(review): 1.146 is an unexplained scaling constant that turns the mean
    # seconds-per-batch into an hours estimate — confirm its derivation.
    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")

Successfully loaded model (/kaggle/input/model-lgbm-version-yongmin-5/0_lgb.pkl)
Successfully loaded single model mae (/kaggle/input/model-lgbm-version-yongmin-5/single_final_model.pkl)
This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
10 qps: 2.0898277759552
20 qps: 1.941500461101532
30 qps: 1.8946048021316528
40 qps: 1.8768567323684693
50 qps: 1.859838604927063
60 qps: 1.8536233941713969
70 qps: 1.8488979646137782
80 qps: 1.8477144569158555
90 qps: 1.8426805708143446
100 qps: 1.840173692703247
110 qps: 1.8381189996545966
120 qps: 1.8351737995942434
130 qps: 1.8339970368605394
140 qps: 1.8303761260850089
150 qps: 1.8282303428649902
160 qps: 1.8279522195458413
The code will take approximately 2.0939 hours to reason about


In [34]:
# single 1fold final / final no
# single 1fold final / final
# single 5fold final / final no
# single 5fold final / final
# stacking 1fold final / final no
# stacking 1fold final / final
# stacking 5fold final / final no
# stacking 5fold final / final