# Library

In [7]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

import gc

import lightgbm as lgb

import optuna
from functools import partial
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedShuffleSplit
from tqdm import tqdm


from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn import set_config

# Data

In [8]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

sample_submission = pd.read_csv('../data/sample_submission.csv')

obsdata = pd.read_csv('../data/ObesityDataSet.csv', index_col=0)

In [9]:
train.shape, test.shape, obsdata.shape

((20758, 18), (13840, 17), (2111, 16))

# FeatureEngineering

- ビニング

In [10]:
def feature_engineering(df):
    df['Age_Cat'] = pd.cut(df['Age'], bins=[0, 20, 30, 40,50,60, float('inf')],labels=[0,1,2,3,4,5])
    df['FCVC_Cat'] = pd.cut(df['FCVC'], bins=[1,2,3,4,5, float('inf')],labels=[1,2,3,4,5])
    df['NCP_Cat'] = pd.cut(df['NCP'], bins=[1,2,3,4,5, float('inf')],labels=[1,2,3,4,5])
    df['CH2O_Cat'] = pd.cut(df['CH2O'], bins=[0, 1, 2, 3,4, float('inf')],labels=[0,1,2,3,4])
    df['FAF_Cat'] = pd.cut(df['FAF'], bins=[0, 0.5, 1.0, 1.5, 2.5, 3.5, float('inf')],labels=[0,1,2,3,4,5])
    df['TUE_Cat'] = pd.cut(df['TUE'], bins=[0, 0.5, 1.0, 1.5, 2, 3, float('inf')],labels=[0,1,2,3,4,5])
    return df

In [11]:
train = feature_engineering(train)

- ラベルエンコード

In [12]:
le = LabelEncoder()
enc = le.fit_transform(train.NObeyesdad)
train = train.assign(target=enc)
train.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,...,CALC,MTRANS,NObeyesdad,Age_Cat,FCVC_Cat,NCP_Cat,CH2O_Cat,FAF_Cat,TUE_Cat,target
0,0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,...,Sometimes,Public_Transportation,Overweight_Level_II,1,1,2,2,,1,6
1,1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,...,no,Automobile,Normal_Weight,0,1,2,1,1.0,1,1
2,2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,...,no,Public_Transportation,Insufficient_Weight,0,1,1,1,1.0,3,0
3,3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,...,Sometimes,Public_Transportation,Obesity_Type_III,1,2,2,1,2.0,1,4
4,4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,...,Sometimes,Public_Transportation,Overweight_Level_II,2,2,1,1,3.0,1,6


# model_Data

- データ

In [13]:
numeric_features = ['Height','Weight']
categorical_features = ['Gender','family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS']
ordinal_features = ['Age_Cat','FCVC_Cat','NCP_Cat','CH2O_Cat','FAF_Cat','TUE_Cat']
all_cols = numeric_features + categorical_features + ordinal_features

- pipeline

In [14]:
def get_pipeline():
    numerical_pipe = Pipeline([('std_scaler',StandardScaler())])
    categorical_pipe = Pipeline([('one_hot',OneHotEncoder())])
    ordinal_pipe = Pipeline([('ordinal',OrdinalEncoder())])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    ordinal_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))])    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
    return pipeline

def build_model(model):
    numerical_pipe = Pipeline([('std_scaler',StandardScaler())])
    categorical_pipe = Pipeline([('one_hot',OneHotEncoder())])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    ordinal_transformer = Pipeline(steps=[('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))])        
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
            ('ordinal', ordinal_transformer, ordinal_features)
        ])
    mdl = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])   
    set_config(display='diagram')
    return mdl

- Data

In [15]:
X = train[all_cols]
y = train['target']

- oprtuna

In [20]:
def callback(study, trial):
    if study.best_trial.number == trial.number:
        study.set_user_attr(key="best_model", value=trial.user_attrs["best_model"])

def optimize(trial, X, y, n_splits):
 
    n_estimators = trial.suggest_int("n_estimators",500,1500)
    max_depth = trial.suggest_int("max_depth",15,30)
    learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
    gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
    subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
    min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
    reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
    reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
    colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)

    params = dict(use_label_encoder=False,
                  eval_metric='logloss',
                  objective='multi:softmax',
                  n_estimators = n_estimators,
                  max_depth = max_depth,
                  learning_rate = learning_rate,
                  gamma = gamma,
                  subsample = subsample,
                  min_child_weight = min_child_weight,
                  reg_lambda = reg_lambda,
                  reg_alpha = reg_alpha,                  
                  colsample_bytree = colsample_bytree,
                  random_state=42)

    # gpu_params = dict(tree_method='gpu_hist',gpu_id=0)
    # params.update(gpu_params)

    model = build_model(XGBClassifier(**params))
    
    strat_split = StratifiedShuffleSplit(n_splits=n_splits, 
                                         test_size = 0.2, 
                                         random_state=42)
    lg_loss = []
    for fold, (train_idx, test_idx) in tqdm(enumerate(strat_split.split(X=X, y=y))):
        X_train = X.loc[train_idx]
        y_train = y.loc[train_idx]      
        X_val = X.loc[test_idx]
        y_val = y.loc[test_idx]

        #model.fit(X_train, y_train,eval_set=[(X_val,y_val)], early_stopping_rounds=100)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        fold_lgloss = log_loss(y_val,preds)
        lg_loss.append(fold_lgloss)

    print(f"y_Mean log_loss : {np.mean(lg_loss)}rs_")
    trial.set_user_attr(key="best_model", value=model)
    return np.mean(lg_loss)

- optuna_trial

In [21]:
num_trails = 15
study = optuna.create_study(direction='minimize', study_name='s4e2-feb2024-xgboost-optuna')
optimization_function = partial(optimize, X=X, y=y, n_splits=5)
study.optimize(optimization_function,n_trials=num_trails, callbacks=[callback])

[I 2024-02-05 14:40:54,993] A new study created in memory with name: s4e2-feb2024-xgboost-optuna
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:33,  6.65s/it]


y_Mean log_loss : 0.29210391620579734rs_


[I 2024-02-05 14:41:28,932] Trial 0 finished with value: 0.29210391620579734 and parameters: {'n_estimators': 1407, 'max_depth': 18, 'learning_rate': 0.4252736429093766, 'gamma': 0.27755955873599913, 'subsample': 0.7645651978326519, 'min_child_weight': 2.443404815782571, 'reg_lambda': 2.1425021623249503, 'reg_alpha': 1.7606827578361264, 'colsample_bytree': 0.3536488183618155}. Best is trial 0 with value: 0.29210391620579734.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:26,  5.37s/it]


y_Mean log_loss : 0.27991561129853026rs_


[I 2024-02-05 14:41:56,601] Trial 1 finished with value: 0.27991561129853026 and parameters: {'n_estimators': 1015, 'max_depth': 23, 'learning_rate': 0.2713288616329084, 'gamma': 0.7867694743095488, 'subsample': 0.7913378162475828, 'min_child_weight': 2.183867536743872, 'reg_lambda': 2.061745543209171, 'reg_alpha': 1.753080478750923, 'colsample_bytree': 0.6308579778246476}. Best is trial 1 with value: 0.27991561129853026.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:34,  6.82s/it]


y_Mean log_loss : 0.2843252387516517rs_


[I 2024-02-05 14:42:31,541] Trial 2 finished with value: 0.2843252387516517 and parameters: {'n_estimators': 1216, 'max_depth': 29, 'learning_rate': 0.3340206207374731, 'gamma': 0.619466218596777, 'subsample': 0.625037047758365, 'min_child_weight': 2.5788726224198832, 'reg_lambda': 1.6667894926455364, 'reg_alpha': 1.9114776096243031, 'colsample_bytree': 0.3836730174494707}. Best is trial 1 with value: 0.27991561129853026.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:39,  7.90s/it]


y_Mean log_loss : 0.29183610530112103rs_


[I 2024-02-05 14:43:11,885] Trial 3 finished with value: 0.29183610530112103 and parameters: {'n_estimators': 1048, 'max_depth': 25, 'learning_rate': 0.21726730986302428, 'gamma': 0.42450543731043583, 'subsample': 0.5857635246313191, 'min_child_weight': 2.363558797366117, 'reg_lambda': 1.3183281386052215, 'reg_alpha': 2.0268857251521872, 'colsample_bytree': 0.6931379285085737}. Best is trial 1 with value: 0.27991561129853026.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:29,  5.82s/it]


y_Mean log_loss : 0.30684629813140807rs_


[I 2024-02-05 14:43:41,895] Trial 4 finished with value: 0.30684629813140807 and parameters: {'n_estimators': 627, 'max_depth': 23, 'learning_rate': 0.438633549076499, 'gamma': 0.37716572563027123, 'subsample': 0.6228451103068842, 'min_child_weight': 0.8450468365741421, 'reg_lambda': 1.819464052367378, 'reg_alpha': 1.6986564144019243, 'colsample_bytree': 0.7531236906918923}. Best is trial 1 with value: 0.27991561129853026.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:42,  8.46s/it]


y_Mean log_loss : 0.281410575677988rs_


[I 2024-02-05 14:44:25,146] Trial 5 finished with value: 0.281410575677988 and parameters: {'n_estimators': 929, 'max_depth': 16, 'learning_rate': 0.1509417696559817, 'gamma': 0.39256732019916724, 'subsample': 0.5936968890870178, 'min_child_weight': 2.4119580011522475, 'reg_lambda': 2.0516831144430165, 'reg_alpha': 2.0968517332238537, 'colsample_bytree': 0.4783665006473903}. Best is trial 1 with value: 0.27991561129853026.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:45,  9.11s/it]


y_Mean log_loss : 0.28492994077741574rs_


[I 2024-02-05 14:45:11,626] Trial 6 finished with value: 0.28492994077741574 and parameters: {'n_estimators': 1081, 'max_depth': 22, 'learning_rate': 0.09973425741239252, 'gamma': 0.5066088113460966, 'subsample': 0.5888739354408621, 'min_child_weight': 2.969621403928047, 'reg_lambda': 1.8635353719995345, 'reg_alpha': 1.583709738281168, 'colsample_bytree': 0.7562284231569402}. Best is trial 1 with value: 0.27991561129853026.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:28,  5.68s/it]


y_Mean log_loss : 0.2792738399644056rs_


[I 2024-02-05 14:45:40,713] Trial 7 finished with value: 0.2792738399644056 and parameters: {'n_estimators': 597, 'max_depth': 17, 'learning_rate': 0.07637092741890077, 'gamma': 0.23297098396051247, 'subsample': 0.77776612934952, 'min_child_weight': 2.141518326737641, 'reg_lambda': 2.268302471866175, 'reg_alpha': 1.8605318656331928, 'colsample_bytree': 0.5810635567218015}. Best is trial 7 with value: 0.2792738399644056.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:28,  5.67s/it]


y_Mean log_loss : 0.2837111898362281rs_


[I 2024-02-05 14:46:09,681] Trial 8 finished with value: 0.2837111898362281 and parameters: {'n_estimators': 682, 'max_depth': 30, 'learning_rate': 0.14117099679458264, 'gamma': 0.2089625503211836, 'subsample': 0.6824974017184366, 'min_child_weight': 1.7063041907868142, 'reg_lambda': 2.260343840375932, 'reg_alpha': 1.5128200270494123, 'colsample_bytree': 0.40739518071854275}. Best is trial 7 with value: 0.2792738399644056.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:41,  8.28s/it]


y_Mean log_loss : 0.28103946157866544rs_


[I 2024-02-05 14:46:51,805] Trial 9 finished with value: 0.28103946157866544 and parameters: {'n_estimators': 1123, 'max_depth': 19, 'learning_rate': 0.12611068694815095, 'gamma': 0.2656969541957156, 'subsample': 0.6673346016497722, 'min_child_weight': 1.637637609021337, 'reg_lambda': 1.5885047666789829, 'reg_alpha': 1.995813111004263, 'colsample_bytree': 0.4187842208427162}. Best is trial 7 with value: 0.2792738399644056.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:54, 10.97s/it]


y_Mean log_loss : 0.3780887134178295rs_


[I 2024-02-05 14:47:47,409] Trial 10 finished with value: 0.3780887134178295 and parameters: {'n_estimators': 798, 'max_depth': 15, 'learning_rate': 0.01036903466167223, 'gamma': 0.06358561962665954, 'subsample': 0.5003057127236731, 'min_child_weight': 1.2832026354878787, 'reg_lambda': 2.286982506163495, 'reg_alpha': 2.1793164493325583, 'colsample_bytree': 0.26840593517529776}. Best is trial 7 with value: 0.2792738399644056.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:20,  4.01s/it]


y_Mean log_loss : 0.2820683596140782rs_


[I 2024-02-05 14:48:08,220] Trial 11 finished with value: 0.2820683596140782 and parameters: {'n_estimators': 503, 'max_depth': 26, 'learning_rate': 0.27855286494652837, 'gamma': 0.7878668117703136, 'subsample': 0.7990495124268577, 'min_child_weight': 2.0606872417970683, 'reg_lambda': 2.034671448828993, 'reg_alpha': 1.8125614087060964, 'colsample_bytree': 0.6007874379890042}. Best is trial 7 with value: 0.2792738399644056.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:36,  7.36s/it]


y_Mean log_loss : 0.27981841952791814rs_


[I 2024-02-05 14:48:45,849] Trial 12 finished with value: 0.27981841952791814 and parameters: {'n_estimators': 860, 'max_depth': 20, 'learning_rate': 0.2573641274381523, 'gamma': 0.7658016968644082, 'subsample': 0.7510361333409219, 'min_child_weight': 1.9873729815313326, 'reg_lambda': 2.139130573976062, 'reg_alpha': 1.8645746743898726, 'colsample_bytree': 0.5811044227114709}. Best is trial 7 with value: 0.2792738399644056.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:52, 10.42s/it]


y_Mean log_loss : 0.2816449552381469rs_


[I 2024-02-05 14:49:38,725] Trial 13 finished with value: 0.2816449552381469 and parameters: {'n_estimators': 820, 'max_depth': 19, 'learning_rate': 0.01802843135691596, 'gamma': 0.62599090580666, 'subsample': 0.7393106241877193, 'min_child_weight': 1.9637469557976264, 'reg_lambda': 2.291477130884288, 'reg_alpha': 1.8804710689011104, 'colsample_bytree': 0.5533830528236161}. Best is trial 7 with value: 0.2792738399644056.
  learning_rate = trial.suggest_uniform("learning_rate", 0.01, 0.5)
  gamma = trial.suggest_uniform("gamma", 0.05, 0.8)
  subsample = trial.suggest_uniform("subsample", 0.5, 0.8)
  min_child_weight = trial.suggest_uniform("min_child_weight", 0.5, 3)
  reg_lambda = trial.suggest_uniform("reg_lambda", 1.3, 2.3)
  reg_alpha = trial.suggest_uniform("reg_alpha", 1.5, 2.2)
  colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.25, 0.8)
5it [00:19,  3.94s/it]


y_Mean log_loss : 0.2796702019868234rs_


[I 2024-02-05 14:49:59,129] Trial 14 finished with value: 0.2796702019868234 and parameters: {'n_estimators': 532, 'max_depth': 21, 'learning_rate': 0.21314425012837843, 'gamma': 0.5506498587177985, 'subsample': 0.7321813535662509, 'min_child_weight': 1.5221615891273836, 'reg_lambda': 2.1283419362309797, 'reg_alpha': 1.9270904351848996, 'colsample_bytree': 0.5372544377228453}. Best is trial 7 with value: 0.2792738399644056.


- optuna_check

In [22]:
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

Best trial: score 0.2792738399644056, params {'n_estimators': 597, 'max_depth': 17, 'learning_rate': 0.07637092741890077, 'gamma': 0.23297098396051247, 'subsample': 0.77776612934952, 'min_child_weight': 2.141518326737641, 'reg_lambda': 2.268302471866175, 'reg_alpha': 1.8605318656331928, 'colsample_bytree': 0.5810635567218015}


In [23]:
study.best_trial.params

{'n_estimators': 597,
 'max_depth': 17,
 'learning_rate': 0.07637092741890077,
 'gamma': 0.23297098396051247,
 'subsample': 0.77776612934952,
 'min_child_weight': 2.141518326737641,
 'reg_lambda': 2.268302471866175,
 'reg_alpha': 1.8605318656331928,
 'colsample_bytree': 0.5810635567218015}

- xgboost_optuna

In [26]:
params = dict(use_label_encoder=False,
                  eval_metric='logloss',
                  objective='multi:softmax',
              verbosity=1,
              random_state=42)
params.update(study.best_trial.params)
# gpu_params = dict(tree_method='gpu_hist',gpu_id=0)
# params.update(gpu_params)
params

{'use_label_encoder': False,
 'eval_metric': 'logloss',
 'objective': 'multi:softmax',
 'verbosity': 1,
 'random_state': 42,
 'n_estimators': 597,
 'max_depth': 17,
 'learning_rate': 0.07637092741890077,
 'gamma': 0.23297098396051247,
 'subsample': 0.77776612934952,
 'min_child_weight': 2.141518326737641,
 'reg_lambda': 2.268302471866175,
 'reg_alpha': 1.8605318656331928,
 'colsample_bytree': 0.5810635567218015}

In [27]:
best_model = build_model(XGBClassifier(**params))
best_model.fit(X,y)

# val予測

In [9]:
y_val_pred = base_model.predict(X_val)

In [10]:
accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9090558766859345


# test予測

In [28]:
test = feature_engineering(test)
predictions = best_model.predict(test[all_cols])

- 結合

In [29]:
submit = pd.DataFrame(predictions, columns = ['target'])
submit['id'] = sample_submission['id']
submit = submit[['id','target']]

In [30]:
le_target_map = dict(zip(le.transform(le.classes_), le.classes_))
le_target_map

{0: 'Insufficient_Weight',
 1: 'Normal_Weight',
 2: 'Obesity_Type_I',
 3: 'Obesity_Type_II',
 4: 'Obesity_Type_III',
 5: 'Overweight_Level_I',
 6: 'Overweight_Level_II'}

In [31]:
submit['NObeyesdad'] = submit['target'].apply(lambda x: le_target_map[x])
submit = submit[['id','NObeyesdad']]

In [36]:
submit.to_csv(f'../output/submit_val.csv', index=False)

- submit確認

In [37]:
submit.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III
