In [1]:
import sgml, sgutil, sgpp, dproc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import root_mean_squared_error

from proc_v2 import p
from ml_v1 import X_cat, X_num, X_all, target, kf, ss, config, scheduler
from ml_v1 import xgb_adapter, lgb_adapter, cb_adapter, lr_adapter, nn_adapter

sc = sgutil.SGCache('img', 'result', 'model')
df_train = p.fit_transform(['data/train.csv']).assign(
    Calories_Log = lambda x: np.log(x['Calories'] + 1)
)
df_test = p.transform(['data/test.csv'])

2025-05-20 23:50:55.980985: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-20 23:50:56.117775: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747752656.168640    8987 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747752656.182616    8987 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747752656.281133    8987 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

# 검증 방법의 조정 검토

Duration이 target에 영향도가 높다는 것은 공감이 됩니다. 

기존의 경험중에서 검증 방법의 부적합함으로 순위가 낮게 나온적이 있어 조정 검토를 해봅니다.

Kaggler의 분석 중에서 Duration의 10 분위구간으로 계층적 분리 사례가 보입니다. 

Public Leader가 검증보다 좋게 나오니 더 좋은 결과와 기복이 적은 검증을 보여 주는 게 더 적합 검증법이라 생각이 되어 조정을 검토를 위해

제시된 검증법의 결과를 살펴봅니다.

In [2]:
from sklearn.model_selection import StratifiedKFold, KFold
skf = StratifiedKFold(5, random_state = 123, shuffle = True)
kf5 = KFold(5, random_state = 123, shuffle = True)

In [3]:
import pandas as pd
df_train['duration_bin'] = pd.qcut(df_train['Duration'], q = 10, labels = np.arange(0, 10))

In [35]:
result_lgb1_kf4 = sc.read_cv('lgb1')
np.mean(result_lgb1_kf4['valid_scores']), np.std(result_lgb1_kf4['valid_scores'])

(np.float64(0.06024029483999867), np.float64(0.0007878547727643804))

In [34]:
result = sgml.cv(df_train, skf, result_lgb1_kf4['hparams'], 
                 {**config, 'sp_y': 'duration_bin'}, result_lgb1_kf4['adapter'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(np.float64(0.0598861565764261), np.float64(0.00034079646911263864))

In [None]:
result = sgml.cv(df_train, kf5, result_lgb1_kf4['hparams'], config, result_lgb1_kf4['adapter'])
np.mean(result['valid_scores']), np.std(result['valid_scores'])

- Kaggler들의 결과와 비교를 위해 5Fold를 기복이 더 적고, Public Score와 유사한 Duration_bin에 대한 계층적 분리를 선택합니다.

- 그리고 동일한 입력은 Target의 최소값으로 바꿔어주고 있습니다. 일종의 노이즈 제거 작업으로 보이는데, 효과성을 살펴봅니다.

In [40]:
def train_data_proc(x, agg_target_min = True):
    if agg_target_min:
        return x.loc[x[target] == x.groupby(X_all)[target].transform('min')]
    else:
        return x

In [41]:
result = sgml.cv(
    df_train, skf, {**result_lgb1_kf4['hparams'], 'train_data_proc_param': {'agg_target_min': True}}, 
    {**config, 'sp_y': 'duration_bin', 'train_data_proc': train_data_proc}, result_lgb1_kf4['adapter']
)
np.mean(result['valid_scores']), np.std(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(np.float64(0.059900043608170486), np.float64(0.0003507132513929142))

- 효과성이 없습니다.

## 재학습

In [4]:
for i in ['lgb2', 'lgb1', 'xgb1', 'xgb2', 'cb1', 'cb2', 'cb3', 'cb4', 'nn1', 'nn2']:
    result_kf4= sc.read_cv(i)
    sc.cv_result(i + '_skf5', df_train, skf, result_kf4['hparams'], {**config, 'sp_y': 'duration_bin'}, result_kf4['adapter'])

In [5]:
df_stk = sc.read_prds(
    [i + '_skf5' for i in ['lgb2', 'lgb1', 'xgb1', 'xgb2', 'cb1', 'cb2', 'cb3', 'cb4', 'nn1', 'nn2']], index = df_train.index
).assign(
    Calories_Log = df_train[target],
    duration_bin = pd.qcut(df_train['Duration'], q = 10, labels = np.arange(0, 10))
)

In [7]:
X_stk = [i + '_skf5' for i in ['lgb2', 'lgb1', 'xgb1', 'xgb2', 'cb1', 'cb2', 'cb3', 'cb4', 'nn1', 'nn2']]

In [26]:
hparams = {
    'X_num': X_stk
}
result = sgml.cv(df_stk, skf, hparams, {**config, 'sp_y': 'duration_bin'}, lr_adapter)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.05905013560776077

In [8]:
for i in X_stk:
    sc.train_cv(i, df_train, {**config, 'sp_y': 'duration_bin'})

I0000 00:00:1747752896.632140    8987 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5520 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4070 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Step:   0%|          | 0/5860 [00:00<?, ?it/s]

I0000 00:00:1747752898.429997   11129 service.cc:152] XLA service 0x7f6f3800c170 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1747752898.430030   11129 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2025-05-20 23:54:58.458656: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1747752898.582552   11129 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1747752898.852701   11129 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

Step:   0%|          | 0/5860 [00:00<?, ?it/s]

In [25]:
{k: np.mean(v['valid_scores']) for k, v in sc.read_cvs(X_stk).items()}

{'lgb2_skf5': 0.05984920378587222,
 'lgb1_skf5': 0.05979122414285846,
 'xgb1_skf5': 0.05993200391530991,
 'xgb2_skf5': 0.05994187369942665,
 'cb1_skf5': 0.05937918878156999,
 'cb2_skf5': 0.059341334646981034,
 'cb3_skf5': 0.05926860505198582,
 'cb4_skf5': 0.059208080202538114,
 'nn1_skf5': 0.05969605818390846,
 'nn2_skf5': 0.05947723761200905}

In [9]:
from sklearn.linear_model import LinearRegression
reg_meta_lr = LinearRegression()
reg_meta_lr.fit(df_stk[X_stk], df_train[target])

In [10]:
df_stk_test = pd.concat([
    sc.get_predictor_cv(i, config)(df_test).rename(i) for i in X_stk
], axis = 1)
df_stk_test.head()

Unnamed: 0_level_0,lgb2_skf5,lgb1_skf5,xgb1_skf5,xgb2_skf5,cb1_skf5,cb2_skf5,cb3_skf5,cb4_skf5,nn1_skf5,nn2_skf5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
750000,3.34222,3.343804,3.336766,3.330249,3.339252,3.345108,3.339329,3.339992,3.348824,3.343099
750001,4.694324,4.696249,4.704313,4.694716,4.691551,4.690045,4.69097,4.688648,4.690432,4.693828
750002,4.471508,4.471234,4.473776,4.476552,4.471377,4.47086,4.481466,4.474276,4.480945,4.477394
750003,4.837693,4.838142,4.835355,4.834024,4.840599,4.844425,4.841232,4.840612,4.840408,4.839946
750004,4.335305,4.346876,4.339695,4.335484,4.345177,4.347627,4.345495,4.343181,4.346485,4.347324


In [11]:
pd.Series(
    reg_meta_lr.predict(df_stk_test), index = df_stk_test.index, name = 'Calories'
).pipe(lambda x: (np.exp(x) - 1).clip(lower = 1)).to_csv('result/submission6.csv')

In [12]:
!head result/submission6.csv

id,Calories
750000,27.266669649617235
750001,108.14976560402356
750002,86.8025970042829
750003,125.50467465330111
750004,76.20201423908433
750005,21.77193804975593
750006,48.7332687305813
750007,6.764882996939128
750008,10.05843900339979


In [13]:
# !kaggle competitions submit -c playground-series-s5e5 -f result/submission6.csv -m "Ensemble6"

100%|██████████████████████████████████████| 6.06M/6.06M [00:02<00:00, 2.60MB/s]
Successfully submitted to Predict Calorie Expenditure

In [14]:
from itertools import product

X_o2 = ['Body_Temp_mul_Duration', 'Duration_mul_Heart_Rate']

## XGB3

In [16]:
hparams = {
    'model_params': {'n_estimators': 1000, 'colsample_bytree': 0.5, 'learning_rate': 0.02, 'subsample': 0.9, 'max_depth': 8},
    'X_num': ['Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp', 'Heart_Rate_div_Weight_sqrt'] + X_o2, 
    #'validation_fraction': 0.1, 
}
result = sc.cv_result('xgb3_skf5', df_train, skf, hparams, {**config, 'sp_y': 'duration_bin'}, xgb_adapter, rerun = 0, use_gpu = True)
np.mean(result['valid_scores']), result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


(0.05958021804690361,
 [0.06003261357545853,
  0.059574373066425323,
  0.06001051515340805,
  0.05936082452535629,
  0.05892276391386986])

## LGB3

In [26]:
hparams = {
    'model_params': {'n_estimators': 2300, 'colsample_bytree': 0.5, 'num_leaves': 63, 'learning_rate': 0.03},
    'X_num': ['Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp', 'Heart_Rate_div_Weight_sqrt'] + X_o2, 
    #'validation_fraction': 0.1, 
}
result = sc.cv_result('lgb3_skf5', df_train, kf, hparams, {**config, 'sp_y': 'duration_bin'}, lgb_adapter, rerun = 0)
np.mean(result['valid_scores']), result['valid_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

(0.06014408457242311,
 [0.06010141524223946,
  0.06096232101400503,
  0.058979871403125356,
  0.06053273063032259])

## CB5

In [27]:
hparams = {
    'model_params': {'max_depth': 6, 'n_estimators': 2000, 'learning_rate': 0.07, 'colsample_bylevel': 0.75}, 
    'X_num': ['Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp', 'Heart_Rate_div_Weight_sqrt'] + X_o2, 
}

result = sc.cv_result('cb5_skf5', df_train, kf, hparams, {**config, 'sp_y': 'duration_bin'}, cb_adapter, use_gpu = 0, rerun = 0)
np.mean(result['valid_scores']), result['valid_scores']

Fold:   0%|          | 0/4 [00:00<?, ?it/s]

(0.05967991375567919,
 [0.05944756078517626,
  0.060449583625179854,
  0.058462711458710875,
  0.06035979915364975])

# Ensemble

In [36]:
X_model = [i + '_skf5' for i in ['lgb2', 'lgb1', 'lgb3', 'xgb1', 'xgb2', 'xgb3', 'cb1', 'cb2', 'cb3', 'cb4', 'cb5', 'nn1', 'nn2']]
df_stk = sc.read_prds(X_model, index = df_train.index).assign(
    Calories_Log = df_train[target],
    duration_bin = pd.qcut(df_train['Duration'], q = 10, labels = np.arange(0, 10))
)

In [38]:
X_stk = [i + '_skf5' for i in ['lgb2', 'lgb1', 'xgb1', 'xgb3', 'cb1', 'cb2', 'cb3', 'cb4', 'cb5', 'nn1', 'nn2']]
hparams = {
    'X_num': X_stk
}
result = sgml.cv(df_stk, skf, hparams,  {**config, 'sp_y': 'duration_bin'}, lr_adapter)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.05899515019213728

In [39]:
import sgfs

In [41]:
X_sel, _, result = sgfs.step_fs_fast(df_stk.assign(const = 1), X_stk, target, [], set(), [np.inf], root_mean_squared_error)
result

[inf,
 0.05945302738509161,
 0.05920054755556721,
 0.05911803675291648,
 0.05910277345841237,
 0.0590939597991154,
 0.05909040268995622,
 0.05908477292433439,
 0.059084709912654264]

In [44]:
result = sgml.cv(df_stk, skf, {'X_num': X_sel}, {**config, 'sp_y': 'duration_bin'}, lr_adapter, result_proc = [sgml.lr_learning_result])
np.mean(result['valid_scores']), result['valid_scores']

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.05898731902269151,
 [0.0593321366232454,
  0.05899660190394943,
  0.05939152257119936,
  0.058707127659103776,
  0.058509206355959575])

In [47]:
pd.concat([pd.Series(i['coef']) for i in result['model_result']], axis = 1).mean(axis = 1)

xgb3_skf5    0.247522
xgb1_skf5   -0.021158
lgb1_skf5    0.152489
nn2_skf5     0.319530
cb5_skf5     0.070504
cb1_skf5     0.131216
lgb2_skf5   -0.047926
cb2_skf5     0.147886
dtype: float64

In [48]:
for i in X_stk:
    sc.train_cv(i, df_train, {**config, 'sp_y': 'duration_bin'})

In [49]:
df_stk_test = pd.concat([
    sc.get_predictor_cv(i, config)(df_test).rename(i) for i in X_stk
], axis = 1)
df_stk_test.head()

Unnamed: 0_level_0,lgb2_skf5,lgb1_skf5,xgb1_skf5,xgb3_skf5,cb1_skf5,cb2_skf5,cb3_skf5,cb4_skf5,cb5_skf5,nn1_skf5,nn2_skf5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
750000,3.34222,3.343804,3.336766,3.334744,3.339252,3.345108,3.339329,3.339992,3.335049,3.348824,3.343099
750001,4.694324,4.696249,4.704313,4.698164,4.691551,4.690045,4.69097,4.688648,4.685364,4.690432,4.693828
750002,4.471508,4.471234,4.473776,4.470401,4.471377,4.47086,4.481466,4.474276,4.484206,4.480945,4.477394
750003,4.837693,4.838142,4.835355,4.844798,4.840599,4.844425,4.841232,4.840612,4.838311,4.840408,4.839946
750004,4.335305,4.346876,4.339695,4.341092,4.345177,4.347627,4.345495,4.343181,4.33355,4.346485,4.347324


In [50]:
from sklearn.linear_model import LinearRegression
reg_meta_lr = LinearRegression()
reg_meta_lr.fit(df_stk[X_sel], df_train[target])

In [51]:
pd.Series(
    reg_meta_lr.predict(df_stk_test[X_sel]), index = df_stk_test.index, name = 'Calories'
).pipe(lambda x: (np.exp(x) - 1).clip(lower = 1)).to_csv('result/submission7.csv')

In [52]:
# !kaggle competitions submit -c playground-series-s5e5 -f result/submission7.csv -m "Ensemble7"

100%|██████████████████████████████████████| 6.06M/6.06M [00:02<00:00, 2.63MB/s]
Successfully submitted to Predict Calorie Expenditure