In [1]:
import sgml, sgutil, sgpp, dproc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import root_mean_squared_error

from proc_v2 import p
from ml_v2 import X_cat, X_num, X_all, target, kf, skf, ss, config, scheduler
from ml_v2 import xgb_adapter, lgb_adapter, cb_adapter, lr_adapter, nn_adapter

from functools import partial

# Project cache object used by every later cell (cv_result / read_prd / read_cv).
# NOTE(review): the three arguments look like directory names — confirm in sgutil.SGCache.
sc = sgutil.SGCache('img', 'result', 'model')

# Fit the imported pipeline `p` on the training file, then add the log-scale
# column log(Calories + 1); the submission cells invert it with exp(x) - 1.
df_train = p.fit_transform(['data/train.csv'])
df_train = df_train.assign(Calories_Log=np.log(df_train['Calories'] + 1))

# The test file only goes through `transform`.
df_test = p.transform(['data/test.csv'])

2025-05-24 23:22:52.241870: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-24 23:22:52.250884: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748096572.261718   56733 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748096572.264839   56733 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748096572.272525   56733 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

# Duration-Aware CV-5

In [2]:
from itertools import product
# Names "<a>_mul_<b>" for every pair of distinct numeric features, with the
# lexicographically smaller name first so each pair appears exactly once.
X_o2 = [
    f'{left}_mul_{right}'
    for left, right in product(X_num, repeat=2)
    if left < right
]
X_o2

['Age_mul_Height',
 'Age_mul_Weight',
 'Age_mul_Duration',
 'Age_mul_Heart_Rate',
 'Age_mul_Body_Temp',
 'Height_mul_Weight',
 'Duration_mul_Height',
 'Duration_mul_Weight',
 'Duration_mul_Heart_Rate',
 'Heart_Rate_mul_Height',
 'Heart_Rate_mul_Weight',
 'Body_Temp_mul_Height',
 'Body_Temp_mul_Weight',
 'Body_Temp_mul_Duration',
 'Body_Temp_mul_Heart_Rate']

## CB_da_skf5

In [13]:
# Hyper-parameters for the CatBoost run stored as 'cb5_da_skf5'.
hparams = {
    'model_params': {
        'max_depth': 7,
        'n_estimators': 4000,
        'learning_rate': 0.07,
        'colsample_bylevel': 0.8,
        'early_stopping_rounds': 300,
    },
    # Feature columns passed under the 'X_num' key.
    'X_num': ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp'],
    # NOTE(review): presumably validates/early-stops on the out-of-fold rows — confirm in sgml.
    'validation_fraction': 'oof',
}

# Cross-validate the CatBoost adapter under the cache key 'cb5_da_skf5'
# (the same key is read back later via sc.read_prd).
# NOTE(review): rerun=0 presumably reuses a cached result — confirm in sgutil.
result_cb = sc.cv_result(
    'cb5_da_skf5', df_train, skf, hparams, config, cb_adapter,
    use_gpu=0,
    rerun=0,
    result_proc=partial(sgml.predict_learning_result, df=df_test),
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_cb['valid_scores']), result_cb['valid_scores'])

(0.05921423685913637,
 [0.05958546768671888,
  0.05915621229557451,
  0.059710016930662135,
  0.05886812440666825,
  0.05875136297605805])

## XGB_da_skf5

In [11]:
# Only two of the pairwise product features are kept for this model.
X_o2 = ['Body_Temp_mul_Duration', 'Duration_mul_Heart_Rate']

# Hyper-parameters for the XGBoost run stored as 'xgb_da_skf5'.
hparams = {
    'model_params': {
        'n_estimators': 5000,
        'colsample_bytree': 0.5,
        'learning_rate': 0.007,
        'subsample': 0.9,
        'max_depth': 8,
        'early_stopping_rounds': 300,
    },
    'X_num': [
        'Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp',
        'Heart_Rate_div_Weight_sqrt',
        *X_o2,
    ],
    # NOTE(review): presumably validates/early-stops on the out-of-fold rows — confirm in sgml.
    'validation_fraction': 'oof',
}

# Cross-validate the XGBoost adapter under the cache key 'xgb_da_skf5'
# (read back later via sc.read_prd and sc.read_cv).
# NOTE(review): rerun=0 presumably reuses a cached result — confirm in sgutil.
result_xgb = sc.cv_result(
    'xgb_da_skf5', df_train, skf, hparams, config, xgb_adapter,
    use_gpu=0,
    rerun=0,
    result_proc=partial(sgml.predict_learning_result, df=df_test),
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_xgb['valid_scores']), result_xgb['valid_scores'])

(0.05945324972271919,
 [0.05985007807612419,
  0.05950526148080826,
  0.05980324372649193,
  0.0592055581510067,
  0.058902107179164886])

## LGB_da_skf5

In [11]:
# Same two product features as the XGBoost run.
X_o2 = ['Body_Temp_mul_Duration', 'Duration_mul_Heart_Rate']

# Hyper-parameters for the LightGBM run stored as 'lgb_da_skf5'.
hparams = {
    'model_params': {
        'n_estimators': 5000,
        'colsample_bytree': 0.5,
        'num_leaves': 31,
        'learning_rate': 0.03,
        'early_stopping_rounds': 100,
    },
    'X_num': [
        'Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp',
        'Heart_Rate_div_Weight_sqrt',
        *X_o2,
    ],
    # NOTE(review): presumably validates/early-stops on the out-of-fold rows — confirm in sgml.
    'validation_fraction': 'oof',
}
# Cross-validate the LightGBM adapter under the cache key 'lgb_da_skf5'.
# NOTE(review): the output saved under this cell is identical, digit for digit,
# to the CatBoost cell's output — it is probably stale; re-run to confirm.
result_lgb = sc.cv_result(
    'lgb_da_skf5', df_train, skf, hparams, config, lgb_adapter,
    rerun=0,
    result_proc=partial(sgml.predict_learning_result, df=df_test),
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result_lgb['valid_scores']), result_lgb['valid_scores'])

(0.05921423685913637,
 [0.05958546768671888,
  0.05915621229557451,
  0.059710016930662135,
  0.05886812440666825,
  0.05875136297605805])

In [29]:
# RMSE on the training target of a fixed-weight blend of the three stored
# prediction vectors: 0.6 * CatBoost + 0.4 * XGBoost + 0.0 * LightGBM.
root_mean_squared_error(
    df_train[target],
    pd.concat(
        [
            pd.Series(sc.read_prd(name), index=df_train.index, name=name)
            for name in ('cb5_da_skf5', 'xgb_da_skf5', 'lgb_da_skf5')
        ],
        axis=1,
    ).dot([0.6, 0.4, 0.0]),
)

0.05906179415200209

# Ensemble

In [15]:
# Base models whose stored predictions feed the stacker.
models = [
    'lgb2', 'lgb1', 'xgb1', 'xgb3',
    'cb1', 'cb2', 'cb3', 'cb4', 'cb5',
    'nn1', 'nn2',
    'cb5_da', 'xgb_da', 'lgb_da',
]
# Every run was cached with the '_skf5' suffix.
X_model = [f'{m}_skf5' for m in models]

# One column of stored predictions per model, plus the target and a decile bin
# of Duration (labels 0..9) used as the stratification column in the CV below.
df_stk = sc.read_prds(X_model, index=df_train.index).assign(
    Calories_Log=df_train[target],
    duration_bin=pd.qcut(df_train['Duration'], q=10, labels=np.arange(10)),
)

In [16]:
# Stacking features: the cached prediction column of every base model.
X_stk = [f'{m}_skf5' for m in models]
hparams = {'X_num': X_stk}

# CV of the linear-regression adapter on the stacked predictions; the config is
# copied with 'sp_y' overridden to the Duration decile column.
result = sgml.cv(
    df_stk, skf, hparams,
    {**config, 'sp_y': 'duration_bin'},
    lr_adapter,
)
np.mean(result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

0.05898965194788795

In [17]:
import sgfs

In [18]:
# Feature selection over the stacked prediction columns, scored with RMSE.
# NOTE(review): the positional arguments of step_fs_fast are not documented in
# this notebook; this looks like a step-wise selection starting from an empty
# set with an initial score of inf — confirm in sgfs.
X_sel, _, result = sgfs.step_fs_fast(
    df_stk.assign(const=1),
    X_stk,
    target,
    [],
    set(),
    [np.inf],
    root_mean_squared_error,
)
# Score trace returned by the selection.
result

[inf,
 0.05945302738509161,
 0.05920054755556721,
 0.05909124381958336,
 0.059079547039009174,
 0.0590741690266993,
 0.05907110946365807,
 0.05906606547834305,
 0.05906563183990104]

In [19]:
# Repeat the stacking CV with only the selected columns; result_proc collects
# sgml.lr_learning_result per fold (its 'coef' entry is read in the next cell).
result = sgml.cv(
    df_stk, skf, {'X_num': X_sel},
    {**config, 'sp_y': 'duration_bin'},
    lr_adapter,
    result_proc=[sgml.lr_learning_result],
)
# Show the mean validation score followed by the per-fold scores.
(np.mean(result['valid_scores']), result['valid_scores'])

Fold:   0%|          | 0/5 [00:00<?, ?it/s]

(0.05897226347194764,
 [0.059301902285167137,
  0.058994556857618466,
  0.05935555121882213,
  0.05869540527783597,
  0.058513901720294484])

In [20]:
# One column of coefficients per fold, then the across-fold mean for each model.
pd.concat(
    [pd.Series(fold['coef']) for fold in result['model_result']],
    axis=1,
).mean(axis=1)

cb2_skf5       0.136420
cb1_skf5       0.119979
lgb2_skf5     -0.044986
nn2_skf5       0.320127
lgb1_skf5      0.136059
cb5_skf5       0.064125
xgb_da_skf5    0.295727
xgb1_skf5     -0.027381
dtype: float64

In [24]:
# Test-set predictions of the selected base models, one column per model.
# 'xgb_da_skf5' is not in this list; it is added in the next cell from the
# cached CV result instead.
df_stk_test = pd.concat(
    [
        sc.get_predictor_cv(name, config)(df_test).rename(name)
        for name in (
            'cb2_skf5', 'cb1_skf5', 'lgb2_skf5', 'nn2_skf5',
            'lgb1_skf5', 'cb5_skf5', 'xgb1_skf5',
        )
    ],
    axis=1,
)
df_stk_test.head()

Unnamed: 0_level_0,cb2_skf5,cb1_skf5,lgb2_skf5,nn2_skf5,lgb1_skf5,cb5_skf5,xgb1_skf5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
750000,3.345108,3.339252,3.34222,3.343099,3.343804,3.335049,3.336766
750001,4.690045,4.691551,4.694324,4.693828,4.696249,4.685364,4.704313
750002,4.47086,4.471377,4.471508,4.477394,4.471234,4.484206,4.473776
750003,4.844425,4.840599,4.837693,4.839946,4.838142,4.838311,4.835355
750004,4.347627,4.345177,4.335305,4.347324,4.346876,4.33355,4.339695


In [25]:
# Add the 'xgb_da_skf5' column: the entries of the cached CV result's
# 'model_result' are stacked as rows and averaged element-wise.
# `assign` is used instead of `join` so the cell can be re-run safely: joining
# a second time raises because the column already exists, whereas assigning
# simply overwrites it with the same values (index-aligned, like the join).
df_stk_test = df_stk_test.assign(
    xgb_da_skf5=pd.Series(
        np.vstack(sc.read_cv('xgb_da_skf5')['model_result']).mean(axis=0),
        index=df_test.index,
    )
)
df_stk_test.head()

Unnamed: 0_level_0,cb2_skf5,cb1_skf5,lgb2_skf5,nn2_skf5,lgb1_skf5,cb5_skf5,xgb1_skf5,xgb_da_skf5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
750000,3.345108,3.339252,3.34222,3.343099,3.343804,3.335049,3.336766,3.342004
750001,4.690045,4.691551,4.694324,4.693828,4.696249,4.685364,4.704313,4.696612
750002,4.47086,4.471377,4.471508,4.477394,4.471234,4.484206,4.473776,4.480614
750003,4.844425,4.840599,4.837693,4.839946,4.838142,4.838311,4.835355,4.844573
750004,4.347627,4.345177,4.335305,4.347324,4.346876,4.33355,4.339695,4.339159


In [26]:
from sklearn.linear_model import LinearRegression
# Final meta-model: ordinary least squares on the selected prediction columns,
# fitted on all training rows (fit() returns the estimator itself).
reg_meta_lr = LinearRegression().fit(df_stk[X_sel], df_train[target])
reg_meta_lr

In [27]:
# Predict with the meta-model, undo the log(Calories + 1) transform, floor the
# result at 1 and write the submission file.
pd.Series(
    reg_meta_lr.predict(df_stk_test[X_sel]),
    index=df_stk_test.index,
    name='Calories',
).pipe(
    lambda pred_log: (np.exp(pred_log) - 1).clip(lower=1)
).to_csv('result/submission8.csv')

In [13]:
# Show the first lines of the written submission file as a quick sanity check.
!head result/submission8.csv

id,Calories
750000,27.284962484460088
750001,108.22294014572229
750002,86.9465197836893
750003,125.7152886388522
750004,76.05542728645645
750005,21.73949000813518
750006,48.59378494124187
750007,6.788324241664323
750008,10.008999888355769


In [28]:
# !kaggle competitions submit -c playground-series-s5e5 -f result/submission8.csv -m "Ensemble8"

100%|██████████████████████████████████████| 6.06M/6.06M [00:02<00:00, 2.83MB/s]
Successfully submitted to Predict Calorie Expenditure

# Duration Aware + Interaction Feature

In [2]:
# Sex-gated copies of every numeric feature: '<col>_1' keeps the value where
# Sex == 1 (0 elsewhere) and '<col>_0' keeps it where Sex == 0.
# The columns are added with `assign` rather than by concatenating onto
# df_train itself, so re-running the cell overwrites them in place instead of
# appending a second set of columns with duplicate labels.
sex_flag = df_train['Sex'].astype('float32')
sex_interactions = {
    **{f'{col}_1': df_train[col] * sex_flag for col in X_num},
    **{f'{col}_0': df_train[col] * (1 - sex_flag) for col in X_num},
}
df_train = df_train.assign(**sex_interactions)

In [3]:
# Names of the Sex-gated interaction columns: all '<col>_1' first, then all '<col>_0'.
X_if = [col + suffix for suffix in ('_1', '_0') for col in X_num]

- CatBoost와 XGBoost 모두 성능 개선이 나타나지 않았습니다.

# High Fold

- 분할 수를 크게 늘려 교차 검증을 수행하고 각 분할 모델의 예측을 평균하여, Random Forest와 비슷한 배깅 효과를 시도해 봅니다.

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
# Repeated stratified 80% train splits with a fixed seed: 50 and 25 repetitions.
ssf50 = StratifiedShuffleSplit(n_splits=50, train_size=0.8, random_state=123)
ssf25 = StratifiedShuffleSplit(n_splits=25, train_size=0.8, random_state=123)

In [6]:
# Shuffled 75% / 25% hold-out split of the training frame (fixed seed).
df_train_ho, df_valid_ho = train_test_split(
    df_train,
    train_size=0.75,
    shuffle=True,
    random_state=123,
)

In [None]:
# Two product features kept for the XGBoost model.
X_o2 = ['Body_Temp_mul_Duration', 'Duration_mul_Heart_Rate']

# XGBoost hyper-parameters for the 50-split hold-out experiment.
hparams = {
    'model_params': {
        'n_estimators': 5000,
        'colsample_bytree': 0.5,
        'learning_rate': 0.01,
        'subsample': 0.9,
        'max_depth': 8,
        'early_stopping_rounds': 300,
    },
    'X_num': [
        'Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp',
        'Heart_Rate_div_Weight_sqrt',
        *X_o2,
    ],
    # NOTE(review): presumably validates/early-stops on the out-of-fold rows — confirm in sgml.
    'validation_fraction': 'oof',
}
# Same splitter as defined at the top of this section (re-created here).
ssf50 = StratifiedShuffleSplit(n_splits=50, train_size=0.8, random_state=123)

# 50-split CV on the 75% hold-out training part; result_proc is bound to the
# held-out 25% frame (df_valid_ho).
result_xgb = sc.cv_result(
    'xgb_da_ssf50', df_train_ho, ssf50, hparams, config, xgb_adapter,
    use_gpu=1,
    rerun=0,
    result_proc=partial(sgml.predict_learning_result, df=df_valid_ho),
)
# Show the mean validation score followed by the per-split scores.
(np.mean(result_xgb['valid_scores']), result_xgb['valid_scores'])

In [14]:
# Hold-out RMSE of the averaged prediction: the entries of 'model_result' are
# stacked as rows and averaged element-wise before scoring.
root_mean_squared_error(
    df_valid_ho[target],
    np.mean(np.vstack(result_xgb['model_result']), axis=0),
)

0.059616874903440475

In [17]:
# Two product features kept for the XGBoost model.
X_o2 = ['Body_Temp_mul_Duration', 'Duration_mul_Heart_Rate']

# XGBoost hyper-parameters for the 50-split run on the full training frame.
hparams = {
    'model_params': {
        'n_estimators': 5000,
        'colsample_bytree': 0.5,
        'learning_rate': 0.01,
        'subsample': 0.9,
        'max_depth': 8,
        'early_stopping_rounds': 300,
    },
    'X_num': [
        'Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp',
        'Heart_Rate_div_Weight_sqrt',
        *X_o2,
    ],
    # NOTE(review): presumably validates/early-stops on the out-of-fold rows — confirm in sgml.
    'validation_fraction': 'oof',
}
# 50-split CV of the XGBoost adapter on the full training frame, cached as
# 'xgb_da_ssf50_sb'; result_proc is bound to df_test for the submission blend.
result_xgb = sc.cv_result(
    'xgb_da_ssf50_sb', df_train, ssf50, hparams, config, xgb_adapter,
    use_gpu=1,
    rerun=0,
    result_proc=partial(sgml.predict_learning_result, df=df_test),
)
# Show the mean validation score followed by the per-split scores.
(np.mean(result_xgb['valid_scores']), result_xgb['valid_scores'])

(0.059622136130928996,
 [0.05970972403883934,
  0.05945446714758873,
  0.06025796756148338,
  0.06007489562034607,
  0.05851852893829346,
  0.059629883617162704,
  0.058500271290540695,
  0.05993784964084625,
  0.05857853218913078,
  0.05895112454891205,
  0.05956907570362091,
  0.059883009642362595,
  0.05960804224014282,
  0.059217240661382675,
  0.06039721518754959,
  0.061017993837594986,
  0.06017300859093666,
  0.060396213084459305,
  0.059514369815588,
  0.06055770069360733,
  0.0602448396384716,
  0.05943973362445831,
  0.05957671254873276,
  0.05968595668673515,
  0.05939721316099167,
  0.05865190923213959,
  0.058121904730796814,
  0.060373254120349884,
  0.059239331632852554,
  0.06001932919025421,
  0.05931759253144264,
  0.059035733342170715,
  0.06129910796880722,
  0.05987044423818588,
  0.05972221866250038,
  0.060376498848199844,
  0.05982634797692299,
  0.05947359278798103,
  0.060355935245752335,
  0.05860772356390953,
  0.06001404672861099,
  0.05905250459909439,
  

In [18]:
# CatBoost hyper-parameters for the 50-split run (no early stopping and no
# 'validation_fraction' entry here, unlike the 5-fold CatBoost run).
hparams = {
    'model_params': {
        'max_depth': 7,
        'n_estimators': 2000,
        'learning_rate': 0.07,
    },
    'X_num': ['Sex', 'Age', 'Height', 'Weight', 'Duration_log', 'Heart_Rate', 'Body_Temp'],
}

# 50-split CV of the CatBoost adapter on the full training frame, cached as
# 'cb_da_ssf50_sb'; result_proc is bound to df_test for the submission blend.
result_cb = sc.cv_result(
    'cb_da_ssf50_sb', df_train, ssf50, hparams, config, cb_adapter,
    use_gpu=1,
    rerun=0,
    result_proc=partial(sgml.predict_learning_result, df=df_test),
)
# Show the mean validation score followed by the per-split scores.
(np.mean(result_cb['valid_scores']), result_cb['valid_scores'])

(0.059555191296287883,
 [0.05964243618140191,
  0.059253224061895114,
  0.06044266164369892,
  0.06015859833080649,
  0.05848991754252254,
  0.05925666937307361,
  0.05879254680077867,
  0.05964691961484383,
  0.058386468877192774,
  0.05873897646178013,
  0.05937971668715145,
  0.06031550770266292,
  0.05957939140615279,
  0.05911447333781476,
  0.0603202807775528,
  0.06107799587580963,
  0.0598996822413904,
  0.06004917770643736,
  0.05952826704725772,
  0.06065419024188999,
  0.060240519196051306,
  0.059478379675975836,
  0.05948225202551691,
  0.05965061879272189,
  0.05953380029997595,
  0.058349545446830464,
  0.05808496488714645,
  0.059906855657577314,
  0.05915870470859298,
  0.060173429609749536,
  0.059244339189880264,
  0.05909465976872529,
  0.06119477496996145,
  0.0598973204870157,
  0.059811490463440725,
  0.05973680066335052,
  0.05956823848601056,
  0.059963268334269425,
  0.060154037647002286,
  0.05869838840217912,
  0.0601062296143608,
  0.05896553949098611,
  0.

# Ensemble

In [15]:
# Blend of the averaged test predictions (log scale): 0.4 * XGBoost + 0.6 * CatBoost.
# DataFrame.dot with a plain list returns an unnamed Series, which to_csv would
# write under the header "0"; name it 'Calories' so the file has the same
# "id,Calories" header as submission8.
pd.concat([
    pd.Series(np.vstack(result_xgb['model_result']).mean(axis=0), index=df_test.index, name='xgb'),
    pd.Series(np.vstack(result_cb['model_result']).mean(axis=0), index=df_test.index, name='cb'),
], axis=1).dot([0.4, 0.6]).pipe(
    # Undo the log(Calories + 1) transform and floor the result at 1.
    lambda pred_log: (np.exp(pred_log) - 1).clip(lower=1)
).rename('Calories').to_csv('result/submission9.csv')

In [16]:
# !kaggle competitions submit -c playground-series-s5e5 -f result/submission9.csv -m "Ensemble9"

100%|██████████████████████████████████████| 6.06M/6.06M [00:02<00:00, 2.51MB/s]
Successfully submitted to Predict Calorie Expenditure

In [5]:
# Four hidden-layer specs with widths 64-128-128-64, all using swish and no
# batch normalisation.
nn_params = {
    'config': [
        {'unit': width, 'activation': 'swish', 'batch_norm': False}
        for width in (64, 128, 128, 64)
    ]
}

# Training settings for the neural-network adapter.
hparams = {
    'model_params': {
        'model_params': nn_params,
        'epochs': 30,
        'optimizer': ('Adam', {'learning_rate': 0.0002}),
        'batch_size': 128,
        'shuffle_size': 204800,
        # Early stopping and plateau LR reduction are switched off; the
        # imported `scheduler` is supplied as the learning-rate schedule.
        'early_stopping': None,
        'reduce_lr_on_plateau': None,
        'lr_scheduler': {'schedule': scheduler},
    },
    # NOTE(review): 'X_std' columns are presumably standardised by the adapter — confirm in ml_v2.
    'X_std': [
        'Age', 'Height', 'Weight', 'Duration', 'Duration_log',
        'Heart_Rate', 'Heart_Rate_sqrt_d', 'Body_Temp',
    ],
    'X_num': ['Sex'],
}

# 25-split CV of the neural-network adapter on the full training frame, cached
# as 'nn_da_ssf25_sb'; result_proc is bound to df_test for the submission blend.
result_nn = sc.cv_result(
    'nn_da_ssf25_sb', df_train, ssf25, hparams, config, nn_adapter,
    rerun=0,
    result_proc=partial(sgml.predict_learning_result, df=df_test),
)
# Show the mean validation score followed by the per-split scores.
(np.mean(result_nn['valid_scores']), result_nn['valid_scores'])

(0.05997607246041298,
 [0.059879906475543976,
  0.05963442847132683,
  0.060638800263404846,
  0.060151517391204834,
  0.058605559170246124,
  0.059453703463077545,
  0.059070851653814316,
  0.06016174703836441,
  0.058642398566007614,
  0.058789581060409546,
  0.05931291729211807,
  0.06166021525859833,
  0.05997556447982788,
  0.05918755382299423,
  0.06036893278360367,
  0.06235136091709137,
  0.06029874086380005,
  0.06019004061818123,
  0.06007293611764908,
  0.06064626947045326,
  0.06091777980327606,
  0.059519823640584946,
  0.059815093874931335,
  0.05968823656439781,
  0.060367852449417114])

In [19]:
# Blend of the averaged test predictions (log scale):
# 0.3 * XGBoost + 0.4 * CatBoost + 0.3 * neural network.
# DataFrame.dot with a plain list returns an unnamed Series, which to_csv would
# write under the header "0"; name it 'Calories' so the file has the same
# "id,Calories" header as submission8.
pd.concat([
    pd.Series(np.vstack(result_xgb['model_result']).mean(axis=0), index=df_test.index, name='xgb'),
    pd.Series(np.vstack(result_cb['model_result']).mean(axis=0), index=df_test.index, name='cb'),
    pd.Series(np.vstack(result_nn['model_result']).mean(axis=0), index=df_test.index, name='nn'),
], axis=1).dot([0.3, 0.4, 0.3]).pipe(
    # Undo the log(Calories + 1) transform and floor the result at 1.
    lambda pred_log: (np.exp(pred_log) - 1).clip(lower=1)
).rename('Calories').to_csv('result/submission10.csv')

In [20]:
#!kaggle competitions submit -c playground-series-s5e5 -f result/submission10.csv -m "Ensemble10"

100%|██████████████████████████████████████| 6.06M/6.06M [00:02<00:00, 2.58MB/s]
Successfully submitted to Predict Calorie Expenditure