### ライブラリのインポート

In [5]:
import pandas as pd
import optuna
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

### 前処理

In [4]:
# Load the feature-engineered training table.
df = pd.read_excel('feature_engineer_train.xlsx')

# Features that cannot be used for dose prediction are removed.
drop_columns = [
    'accession', 'scan protocol', 'scan series', 'scanning length',
    'kV', 'mean mA', 'max mA', 'pitch factor', 'nomial total collimation width',
    'exposure time per rotation', 'exposure time', 'study_date', 'id', 'DLP', 'target region',
]
df = df.drop(columns=drop_columns)

# Mark the two derived bucket features as pandas categoricals.
df = df.astype({
    'bmi_body_surface_area_category': 'category',
    'age_weight_kg_category': 'category',
})

# Encoding plan: categories that look related to dose are target-encoded,
# the remaining categorical columns are ordinal-encoded.
# NOTE: the original plan listed 'CTDIw phantom type' for target encoding,
# but the code below ordinal-encodes it.
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder

# Name of the regression target column.
target = 'Mean CTDIvol'

te_columns = ['scan_area', 'category_bmi']
oe_columns = ['CTDIw phantom type', 'gender', 'scan_method',
              'bmi_body_surface_area_category', 'age_weight_kg_category']

# Ordinal encoding, configured to return a DataFrame so it can be assigned back by column.
oe = OrdinalEncoder().set_output(transform='pandas')
df[oe_columns] = oe.fit_transform(df[oe_columns])

# Target encoding against the dose column.
# NOTE(review): the encoder is fitted on every row here, i.e. before the
# K-fold split performed inside `objective`, so validation folds are encoded
# with statistics that include their own targets - confirm this is acceptable.
te = TargetEncoder()
te.fit(df[te_columns], df[target])
df[te_columns] = te.transform(df[te_columns])

# Feature matrix and target vector used by the tuning cells.
X = df.drop(columns=[target])
y = df[target]


In [7]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAE of a LightGBM model.

    Reads the module-level feature matrix ``X`` and target ``y``.

    Args:
        trial: Optuna trial used to sample the LightGBM hyperparameters.

    Returns:
        The mean absolute error averaged over the validation folds
        (lower is better).
    """
    params = {
        'objective': 'mae',  # objective function
        'verbosity': -1,

        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.5),
        'num_leaves': trial.suggest_int('num_leaves', 2, 500),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 50),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-8, 10.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 10),
        # Fixed typo: `suggest_ing` does not exist on a trial and raised AttributeError.
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'path_smooth': trial.suggest_int('path_smooth', 0, 10),
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for tr_idx, va_idx in kf.split(X):
        X_tr = X.iloc[tr_idx]
        X_va = X.iloc[va_idx]
        y_tr = y.iloc[tr_idx]
        y_va = y.iloc[va_idx]

        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_eval = lgb.Dataset(X_va, y_va, reference=lgb_train)

        model = lgb.train(params,
                          lgb_train,
                          num_boost_round=1000,
                          valid_sets=[lgb_train, lgb_eval],
                          valid_names=['train', 'valid'],
                          callbacks=[lgb.early_stopping(100),
                                     lgb.log_evaluation(50)])

        # Score every fold inside the loop; previously only the model from the
        # last fold was evaluated, so the other four fits were discarded.
        y_va_pred = model.predict(X_va, num_iteration=model.best_iteration)
        fold_scores.append(mean_absolute_error(y_va, y_va_pred))

    print('')  # blank line separating the logs of consecutive trials

    return sum(fold_scores) / len(fold_scores)

In [8]:
# Random-search study persisted in a local SQLite database.
# `load_if_exists=True` lets this cell be re-run: without it, a second run
# against the same database raises DuplicatedStudyError because the study name
# is already taken. On re-run the existing study is resumed instead.
study = optuna.create_study(
    study_name='lightgbm-hyperparameter-tuning-ramdomsampler-v1',
    storage='sqlite:///optuna.db',
    direction='minimize',
    sampler=optuna.samplers.RandomSampler(),
    load_if_exists=True,
)

[32m[I 2023-07-23 07:38:44,242][0m A new study created in RDB with name: lightgbm-hyperparameter-tuning-v1[0m


In [None]:
# Run the search: evaluate `objective` for 100 trials on the study created above.
study.optimize(objective, n_trials=100)

### 重要度の可視化

In [None]:
# Reload the random-search study from the SQLite storage and plot the
# hyperparameter importances.
# The study name must match the one passed to `create_study`; the previous
# name ('lightgbm-hyperparameter-tuning-v1') is not created anywhere in this
# notebook, so loading it fails on a fresh database.
study = optuna.load_study(
    study_name='lightgbm-hyperparameter-tuning-ramdomsampler-v1',
    storage='sqlite:///optuna.db'
)
optuna.visualization.plot_param_importances(study).show()

In [None]:
# Slice plot of the objective value against two of the tuned parameters.
slice_params = ["extra_trees", "feature_fraction"]
optuna.visualization.plot_slice(study, params=slice_params).show()

### 探索範囲を絞ってハイパーパラメータチューニングを実行する

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated MAE of a LightGBM model.

    Reads the module-level feature matrix ``X`` and target ``y``. This
    definition replaces any earlier ``objective`` defined in the notebook.

    NOTE(review): the section heading says the search range is narrowed, but
    the ranges below are the same wide ranges as in the first search - narrow
    them here if that was the intent.

    Args:
        trial: Optuna trial used to sample the LightGBM hyperparameters.

    Returns:
        The mean absolute error averaged over the validation folds
        (lower is better).
    """
    params = {
        'objective': 'mae',  # objective function
        'verbosity': -1,

        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.5),
        'num_leaves': trial.suggest_int('num_leaves', 2, 500),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 2, 50),
        'min_sum_hessian_in_leaf': trial.suggest_float('min_sum_hessian_in_leaf', 1e-8, 10.0, log=True),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 10),
        # Fixed typo: `suggest_ing` does not exist on a trial and raised AttributeError.
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'extra_trees': trial.suggest_categorical('extra_trees', [True, False]),
        'path_smooth': trial.suggest_int('path_smooth', 0, 10),
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for tr_idx, va_idx in kf.split(X):
        X_tr = X.iloc[tr_idx]
        X_va = X.iloc[va_idx]
        y_tr = y.iloc[tr_idx]
        y_va = y.iloc[va_idx]

        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_eval = lgb.Dataset(X_va, y_va, reference=lgb_train)

        model = lgb.train(params,
                          lgb_train,
                          num_boost_round=1000,
                          valid_sets=[lgb_train, lgb_eval],
                          valid_names=['train', 'valid'],
                          callbacks=[lgb.early_stopping(100),
                                     lgb.log_evaluation(50)])

        # Score every fold inside the loop; previously only the model from the
        # last fold was evaluated, so the other four fits were discarded.
        y_va_pred = model.predict(X_va, num_iteration=model.best_iteration)
        fold_scores.append(mean_absolute_error(y_va, y_va_pred))

    print('')  # blank line separating the logs of consecutive trials

    return sum(fold_scores) / len(fold_scores)


In [None]:
# TPE-sampler study persisted in the same SQLite database.
# `load_if_exists=True` lets this cell be re-run without DuplicatedStudyError;
# on re-run the existing study is resumed and further trials are appended.
study = optuna.create_study(
    study_name='lightgbm-hyperparameter-tuning-tpe-v1',
    storage='sqlite:///optuna.db',
    direction='minimize',
    sampler=optuna.samplers.TPESampler(),
    load_if_exists=True,
)

# Run the search. Without this call the study has no completed trials, and the
# importance plot and `study.best_trial` in the following cells raise ValueError.
study.optimize(objective, n_trials=100)

In [None]:
# Plot the hyperparameter importances for the current `study`.
optuna.visualization.plot_param_importances(study).show()

### 最適化の結果確認

In [None]:
# Report the best trial of the current study: its number, its MAE, and the
# hyperparameters that produced it.
trial = study.best_trial
print(f'trial {trial.number}', f'MAE best: {trial.value}', sep='\n')
display(trial.params)