In [1]:
import sgml, sgutil, sgpp, dproc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import root_mean_squared_error

from proc_v2 import p
from ml_v1 import X_cat, X_num, X_all, target, kf, ss, config, scheduler
from ml_v1 import xgb_adapter, lgb_adapter, cb_adapter, lr_adapter, nn_adapter

# Shared cache handle used by every later cell to read/write CV results.
# NOTE(review): the three arguments look like image/result/model locations — confirm in sgutil.
sc = sgutil.SGCache('img', 'result', 'model')

# Fit the preprocessing pipeline `p` on the training file, then append the
# log-transformed target column log(Calories + 1).
df_train = p.fit_transform(['data/train.csv']).assign(
    **{'Calories_Log': lambda frame: np.log(frame['Calories'] + 1)}
)

# Run the same pipeline object (fitted just above) over the test file.
df_test = p.transform(['data/test.csv'])

2025-05-20 01:15:47.169542: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-20 01:15:47.179407: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747671347.189543   51045 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747671347.192828   51045 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747671347.201071   51045 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

# 검증 방법의 조정 검토

Duration이 target에 영향도가 높다는 것은 공감이 됩니다. 

과거 경험 중 검증 방법이 부적합하여 순위가 낮게 나온 적이 있어, 검증 방법의 조정을 검토해 봅니다.

Kaggler의 분석 중에는 Duration을 10분위 구간으로 나누어 계층적 분리(Stratified split)를 적용한 사례가 보입니다.

Public Leaderboard 점수가 검증 점수보다 좋게 나오므로, 더 좋은 결과와 더 적은 기복을 보여 주는 검증법이 더 적합하다고 생각되어, 조정 검토를 위해

제시된 검증법의 결과를 살펴봅니다.

In [2]:
from sklearn.model_selection import StratifiedKFold, KFold
# Both 5-fold splitters share the same shuffle/seed settings so that the only
# difference between them is whether the split is stratified.
_split_opts = dict(n_splits = 5, shuffle = True, random_state = 123)
skf = StratifiedKFold(**_split_opts)
kf5 = KFold(**_split_opts)

In [23]:
import pandas as pd
# Decile index (labels 0..9) of Duration; passed later as the 'sp_y' stratification column.
df_train['duration_bin'] = pd.qcut(
    x = df_train['Duration'],
    q = 10,
    labels = np.arange(10),
)

In [4]:
# Load the previously cached CV result for 'lgb1' as the baseline,
# then display (mean, std) of its per-fold validation scores.
result_lgb1_kf4 = sc.read_cv('lgb1')
tuple(stat(result_lgb1_kf4['valid_scores']) for stat in (np.mean, np.std))

(0.0602333072030953, 0.0007964254636918083)

In [None]:
# Re-run the cached lgb1 hyper-parameters/adapter with the 5-fold stratified
# splitter, using 'duration_bin' as the 'sp_y' column on top of the base config.
result = sgml.cv(
    df_train,
    skf,
    result_lgb1_kf4['hparams'],
    dict(config, sp_y = 'duration_bin'),
    result_lgb1_kf4['adapter'],
)
# (mean, std) of the per-fold validation scores.
tuple(stat(result['valid_scores']) for stat in (np.mean, np.std))

In [None]:
# Same lgb1 hyper-parameters/adapter, but with the plain (non-stratified)
# 5-fold splitter and the unmodified config, for comparison.
result = sgml.cv(
    df_train,
    kf5,
    result_lgb1_kf4['hparams'],
    config,
    result_lgb1_kf4['adapter'],
)
# (mean, std) of the per-fold validation scores.
tuple(stat(result['valid_scores']) for stat in (np.mean, np.std))

- Kaggler들의 결과와 비교하기 위해 5-Fold를 사용하고, 기복이 더 적으며 Public Score와 유사한 duration_bin 기준 계층적 분리를 선택합니다.

## 재학습

In [10]:
# For every previously cached model, reuse its stored hyper-parameters and adapter
# and run it again under the stratified 5-fold split, storing it as '<name>_skf5'.
for model_name in ('lgb2', 'lgb1', 'xgb1', 'xgb2', 'cb1', 'cb2', 'cb3', 'cb4', 'nn1', 'nn2'):
    cached_cv = sc.read_cv(model_name)
    sc.cv_result(
        f'{model_name}_skf5',
        df_train,
        skf,
        cached_cv['hparams'],
        dict(config, sp_y = 'duration_bin'),
        cached_cv['adapter'],
    )

In [25]:
# Stacking frame: predictions of the '<name>_skf5' runs read from the cache
# (indexed like df_train), plus the target column and the Duration-decile
# stratification key.
df_stk = sc.read_prds(
    [f'{name}_skf5' for name in ('lgb2', 'lgb1', 'xgb1', 'xgb2', 'cb1', 'cb2', 'cb3', 'cb4', 'nn1', 'nn2')],
    index = df_train.index,
).assign(**{
    'Calories_Log': df_train[target],
    'duration_bin': pd.qcut(df_train['Duration'], q = 10, labels = np.arange(0, 10)),
})

In [26]:
# Stacking features are the '<name>_skf5' prediction columns.
X_stk = [
    f'{name}_skf5'
    for name in ('lgb2', 'lgb1', 'xgb1', 'xgb2', 'cb1', 'cb2', 'cb3', 'cb4', 'nn1', 'nn2')
]
hparams = dict(X_num = X_stk)

# Cross-validate the lr_adapter stacker under the same stratified 5-fold split
# and display the mean validation score.
result = sgml.cv(df_stk, skf, hparams, dict(config, sp_y = 'duration_bin'), lr_adapter)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.05905013560776077

In [2]:
from itertools import product

# Extra feature columns appended to X_num by the XGB3 / LGB3 / CB5 cells.
# NOTE(review): the names suggest pairwise products produced by the preprocessing pipeline — confirm.
X_o2 = [
    'Body_Temp_mul_Duration',
    'Duration_mul_Heart_Rate',
]

## XGB3

In [4]:
# XGB3: xgb_adapter on the base numeric features plus the X_o2 columns.
hparams = dict(
    model_params = dict(
        n_estimators = 1000,
        colsample_bytree = 0.5,
        learning_rate = 0.02,
        subsample = 0.9,
        max_depth = 8,
    ),
    X_num = [
        'Sex', 'Age', 'Height', 'Weight',
        'Duration_log', 'Heart_Rate', 'Body_Temp',
        'Heart_Rate_div_Weight_sqrt',
    ] + X_o2,
    # 'validation_fraction': 0.1,   (left disabled)
)
# NOTE(review): rerun = 0 presumably reuses a cached 'xgb3' result when one exists — confirm in sgutil.
result = sc.cv_result('xgb3', df_train, kf, hparams, config, xgb_adapter, rerun = 0, use_gpu = True)
# Mean validation score followed by the per-fold scores.
np.mean(result['valid_scores']), result['valid_scores']

(0.059901101514697075,
 [0.0597374327480793,
  0.06098819151520729,
  0.058511871844530106,
  0.0603669099509716])

## LGB3

In [4]:
# LGB3: lgb_adapter on the base numeric features plus the X_o2 columns.
hparams = dict(
    model_params = dict(
        n_estimators = 2300,
        colsample_bytree = 0.5,
        num_leaves = 63,
        learning_rate = 0.03,
    ),
    X_num = [
        'Sex', 'Age', 'Height', 'Weight',
        'Duration_log', 'Heart_Rate', 'Body_Temp',
        'Heart_Rate_div_Weight_sqrt',
    ] + X_o2,
    # 'validation_fraction': 0.1,   (left disabled)
)
# NOTE(review): rerun = 0 presumably reuses a cached 'lgb3' result when one exists — confirm in sgutil.
result = sc.cv_result('lgb3', df_train, kf, hparams, config, lgb_adapter, rerun = 0)
# Mean validation score followed by the per-fold scores.
np.mean(result['valid_scores']), result['valid_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

(0.06015547295126207,
 [0.060088512149218336,
  0.06094721375015833,
  0.058943984142731706,
  0.06064218176293988])

## CB5

In [17]:
# CB5: cb_adapter on the base numeric features plus the X_o2 columns.
hparams = dict(
    model_params = dict(
        max_depth = 6,
        n_estimators = 2000,
        learning_rate = 0.07,
        colsample_bylevel = 0.75,
    ),
    X_num = [
        'Sex', 'Age', 'Height', 'Weight',
        'Duration_log', 'Heart_Rate', 'Body_Temp',
        'Heart_Rate_div_Weight_sqrt',
    ] + X_o2,
)

# NOTE(review): rerun = 0 presumably reuses a cached 'cb5' result when one exists — confirm in sgutil.
result = sc.cv_result('cb5', df_train, kf, hparams, config, cb_adapter, use_gpu = 0, rerun = 0)
# Mean validation score followed by the per-fold scores.
np.mean(result['valid_scores']), result['valid_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

(0.059683692528551976,
 [0.05939315987476944,
  0.0605820802424876,
  0.05846471024896851,
  0.060294819747982356])

# Ensemble

In [18]:
# Stacking frame for the `kf` runs: cached predictions of all thirteen base
# models (indexed like df_train) plus the target column.
df_stk = sc.read_prds(
    [
        'lgb2', 'lgb1', 'lgb3',
        'xgb1', 'xgb2', 'xgb3',
        'cb1', 'cb2', 'cb3', 'cb4', 'cb5',
        'nn1', 'nn2',
    ],
    index = df_train.index,
).assign(**{'Calories_Log': df_train[target]})

In [19]:
# Feature columns for the stacker: one prediction column per base model.
X_stk = [
    'lgb2', 'lgb1', 'lgb3',
    'xgb1', 'xgb2', 'xgb3',
    'cb1', 'cb2', 'cb3', 'cb4', 'cb5',
    'nn1', 'nn2',
]
hparams = dict(X_num = X_stk)

# Cross-validate the lr_adapter stacker on all base-model predictions with `kf`
# and display the mean validation score.
result = sgml.cv(df_stk, kf, hparams, config, lr_adapter)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

0.05931652800963023

In [10]:
import sgfs

In [12]:
# Step-wise feature selection over the stacking columns, scored with RMSE.
# The recorded output is a 3-tuple: selected columns, a set, and a score trace
# that starts at inf.
# NOTE(review): the meaning of the `[]`, `set()` and `[np.inf]` seeds is defined in sgfs — confirm there.
sgfs.step_fs_fast(
    df_stk.assign(const = 1),
    X_stk,
    target,
    [],
    set(),
    [np.inf],
    root_mean_squared_error,
)

(['cb1', 'xgb3', 'nn1', 'xgb2'],
 set(),
 [inf,
  0.05943455473528065,
  0.05934302580607246,
  0.05927327462942142,
  0.05926873676502093])

In [13]:
# Cross-validate the stacker restricted to the four columns reported by the
# feature-selection step, then display the mean and the per-fold scores.
result = sgml.cv(
    df_stk,
    kf,
    dict(X_num = ['cb1', 'xgb3', 'nn1', 'xgb2']),
    config,
    lr_adapter,
)
np.mean(result['valid_scores']), result['valid_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

(0.05935754555891216,
 [0.05911224449724054,
  0.060344899193953125,
  0.0580770611722538,
  0.059895977372201155])