# HPO問題のELA特徴量

In [1]:
import pandas as pd
import numpy as np
import math

from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

from pflacco.classical_ela_features import *
from pflacco.sampling import create_initial_sample

from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# まず、データセットを読み込みます
data = pd.read_csv('data/wilt.csv')
X = data.drop('Class', axis=1)
X

Unnamed: 0,id,V1,V2,V3,V4,V5
0,1,120.362774,205.500000,119.395349,416.581395,20.676318
1,2,124.739583,202.800000,115.333333,354.333333,16.707151
2,3,134.691964,199.285714,116.857143,477.857143,22.496712
3,4,127.946309,178.368421,92.368421,278.473684,14.977453
4,5,135.431548,197.000000,112.690476,532.952381,17.604193
...,...,...,...,...,...,...
4834,4835,123.554348,202.826087,106.391304,364.565217,17.314068
4835,4836,121.549028,276.220000,175.593333,402.620000,13.394574
4836,4837,119.076687,247.951220,113.365854,808.024390,24.830059
4837,4838,107.944444,197.000000,90.000000,451.000000,8.214887


In [3]:
y = data['Class']
y

0       2
1       2
2       2
3       2
4       2
       ..
4834    1
4835    1
4836    1
4837    1
4838    1
Name: Class, Length: 4839, dtype: int64

In [4]:
# ハイパーパラメータの探索範囲
param_bounds = {
    'nrounds': [3, 2000],
    'eta': [math.exp(-7), math.exp(0)],
    'lambda': [math.exp(-7), math.exp(7)],
    'gamma': [math.exp(-10), math.exp(2)],
    'alpha': [math.exp(-7), math.exp(7)] 
}

In [5]:
# ハイパーパラメータ空間をサンプリング
n_samples = 1000  # 必要に応じて変更
samples = {param: np.random.uniform(low=low, high=high, size=n_samples) 
           for param, (low, high) in param_bounds.items()}
samples = pd.DataFrame(samples)
samples

Unnamed: 0,nrounds,eta,lambda,gamma,alpha
0,77.585598,0.120485,759.144615,4.385620,821.253993
1,765.314926,0.598032,283.596135,6.611226,845.964876
2,1347.754438,0.858478,1064.606092,5.779855,1042.312149
3,1686.261405,0.628434,593.894417,3.378508,830.702679
4,1791.474608,0.248884,134.144826,4.608480,967.391422
...,...,...,...,...,...
995,330.895692,0.610652,120.465984,1.479987,881.397468
996,241.790114,0.125349,191.101235,5.531424,997.695691
997,133.011916,0.440063,158.049244,2.744691,156.196380
998,865.353189,0.420549,868.977085,4.881291,127.405880


In [6]:
# 丸め込みが必要な場合
samples["nrounds"] = samples["nrounds"].apply(lambda x: int(x))
samples

Unnamed: 0,nrounds,eta,lambda,gamma,alpha
0,77,0.120485,759.144615,4.385620,821.253993
1,765,0.598032,283.596135,6.611226,845.964876
2,1347,0.858478,1064.606092,5.779855,1042.312149
3,1686,0.628434,593.894417,3.378508,830.702679
4,1791,0.248884,134.144826,4.608480,967.391422
...,...,...,...,...,...
995,330,0.610652,120.465984,1.479987,881.397468
996,241,0.125349,191.101235,5.531424,997.695691
997,133,0.440063,158.049244,2.744691,156.196380
998,865,0.420549,868.977085,4.881291,127.405880


In [7]:
samples.dtypes

nrounds      int64
eta        float64
lambda     float64
gamma      float64
alpha      float64
dtype: object

In [8]:
# 各ハイパーパラメータ設定に対する目的関数を計算
def objective(params):
    # print(params)
    model = LGBMClassifier(n_estimators=int(params['nrounds']),
                           learning_rate=params['eta'],
                           reg_lambda=params['lambda'],
                           reg_alpha=params['alpha'],
                           verbose=-1)

    # 5-fold CVスコア
    score = cross_val_score(model, X, y, cv=5, scoring='neg_log_loss').mean()
    # print(score)
    return score

# 目的関数の値を計算
samples_y = samples.progress_apply(objective, axis=1)

100%|██████████| 1000/1000 [04:51<00:00,  3.44it/s]


In [12]:
samples_y

0     -0.209948
1     -0.209948
2     -0.209948
3     -0.209948
4     -0.209948
         ...   
995   -0.209948
996   -0.209948
997   -0.209948
998   -0.206966
999   -0.209948
Length: 1000, dtype: float64

In [13]:
# ELA特徴量を計算します
ela_meta = calculate_ela_meta(samples, samples_y)
ela_distr = calculate_ela_distribution(samples, samples_y)
ela_level = calculate_ela_level(samples, samples_y)
nbc = calculate_nbc(samples, samples_y)
disp = calculate_dispersion(samples, samples_y)
ic = calculate_information_content(samples, samples_y, seed=100)

In [14]:
# Store results in pandas dataframe
features_hpo = pd.DataFrame({**ic, **ela_meta, **ela_distr, **nbc, **disp}, index = [0])
features_hpo

Unnamed: 0,ic.h_max,ic.eps_s,ic.eps_max,ic.eps_ratio,ic.m0,ic.costs_runtime,ela_meta.lin_simple.adj_r2,ela_meta.lin_simple.intercept,ela_meta.lin_simple.coef.min,ela_meta.lin_simple.coef.max,...,disp.ratio_median_25,disp.diff_mean_02,disp.diff_mean_05,disp.diff_mean_10,disp.diff_mean_25,disp.diff_median_02,disp.diff_median_05,disp.diff_median_10,disp.diff_median_25,disp.costs_runtime
0,0.278882,-3.038038,6.6e-05,-3.618619,0.103206,0.406,0.026251,-0.204009,2e-06,0.002162,...,0.965645,-25.983464,-25.983464,-25.983464,-25.983464,-30.985969,-30.985969,-30.985969,-30.985969,0.11


# BBOB問題のELA特徴量

In [17]:
from ioh import get_problem, ProblemClass

features = []
# Get all 24 single-objective noiseless BBOB function in dimension 2 and 3 for the first five instances.
for fid in tqdm(range(1,25)):
    for dim in [2, 3]:
        for iid in range(1, 6):
            # Get optimization problem
            problem = get_problem(fid, iid, dim, ProblemClass.BBOB)

            # Create sample
            X = create_initial_sample(dim, lower_bound = -5, upper_bound = 5)
            y = X.apply(lambda x: problem(x), axis = 1)

            # Calculate ELA features
            ela_meta = calculate_ela_meta(X, y)
            ela_distr = calculate_ela_distribution(X, y)
            ela_level = calculate_ela_level(X, y)
            nbc = calculate_nbc(X, y)
            disp = calculate_dispersion(X, y)
            ic = calculate_information_content(X, y, seed = 100)

            # Store results in pandas dataframe
            data = pd.DataFrame({**ic, **ela_meta, **ela_distr, **nbc, **disp, **{'fid': fid}, **{'dim': dim}, **{'iid': iid}}, index = [0])
            features.append(data)

  quad_simple_cond = quad_model_con_max/quad_model_con_min
  lda_qda = np.array([lda_mmce[i]/qda_mmce[i] for i in range(len(ela_level_quantiles))])
  quad_simple_cond = quad_model_con_max/quad_model_con_min
  quad_simple_cond = quad_model_con_max/quad_model_con_min
  quad_simple_cond = quad_model_con_max/quad_model_con_min
100%|██████████| 24/24 [01:35<00:00,  4.00s/it]


In [18]:
features_bbob = pd.concat(features).reset_index(drop = True)
features_bbob

Unnamed: 0,ic.h_max,ic.eps_s,ic.eps_max,ic.eps_ratio,ic.m0,ic.costs_runtime,ela_meta.lin_simple.adj_r2,ela_meta.lin_simple.intercept,ela_meta.lin_simple.coef.min,ela_meta.lin_simple.coef.max,...,disp.diff_mean_10,disp.diff_mean_25,disp.diff_median_02,disp.diff_median_05,disp.diff_median_10,disp.diff_median_25,disp.costs_runtime,fid,dim,iid
0,0.763966,1.126126,2.663333,0.605606,0.428571,0.125,0.189539,97.551634,0.494009,2.021001,...,-3.498546,-2.709291,-4.654005,-3.946753,-3.422293,-2.579918,0.000,1,2,1
1,0.644577,1.326326,2.920556,0.825826,0.377551,0.157,0.894248,434.733695,5.687876,7.703178,...,-3.495223,-2.674072,-3.812702,-3.837386,-3.484730,-2.391569,0.000,1,2,2
2,0.704600,1.266266,4.849374,0.885886,0.316327,0.157,0.819249,-215.939107,0.499324,7.503178,...,-3.366637,-2.396171,-4.785490,-3.496943,-3.166415,-2.454170,0.000,1,2,3
3,0.735492,1.126126,3.058338,0.585586,0.397959,0.141,0.405030,-133.239068,0.440778,2.940924,...,-3.203519,-2.333507,-4.420605,-4.102743,-2.960040,-2.341476,0.016,1,2,4
4,0.689552,1.226226,4.322659,0.845846,0.357143,0.157,0.825852,4.919127,1.363324,7.322422,...,-3.763408,-2.657949,-4.087822,-3.895932,-3.593211,-2.651589,0.000,1,2,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.874820,1.686687,5.968856,0.985986,0.614865,0.172,0.222604,174.239124,2.054202,3.240836,...,-2.107950,-1.647262,-0.135781,-2.140374,-2.148556,-1.538556,0.000,24,3,1
236,0.865810,1.726727,7.342870,1.006006,0.662162,0.171,0.298557,167.111563,2.928470,4.179833,...,-2.213537,-1.474474,-1.644860,-1.633762,-2.130875,-1.539990,0.000,24,3,2
237,0.878892,1.766767,8.829700,1.046046,0.716216,0.172,0.331138,83.812737,2.625399,3.178839,...,-1.176162,-1.398975,1.167921,-1.496113,-1.450596,-1.493168,0.016,24,3,3
238,0.872910,1.806807,7.342870,1.026026,0.621622,0.172,0.318526,263.444211,2.409726,3.485210,...,-1.882423,-1.859769,-1.934914,-1.527123,-1.934914,-1.961077,0.000,24,3,4
