### 메타 모델 학습

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import poisson, loguniform
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.neural_network import MLPRegressor as MLPR
from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
import pickle
import warnings
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import Matern
from scipy.stats import norm
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [2]:
def extract_meta_features(X, y):
    """Summarise a classification dataset as nine numeric meta-features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (``.shape``, ``.dtypes``, ``.max()`` and ``.min()`` are used).
    y : pandas.Series
        Label column (``.value_counts()`` is used).

    Returns
    -------
    list
        ``[m1, ..., m9]`` in the order described by the inline comments below.
    """
    m1, m2 = X.shape  # number of samples, number of features
    y_vc = y.value_counts()  # label counts, most frequent label first
    m3 = y_vc.iloc[0] / y_vc.iloc[-1]  # imbalance ratio: largest / smallest class count
    m4 = m2 / m1  # features per sample
    m5 = sum(X.dtypes == float) / m2  # fraction of float-typed features
    # NOTE(review): `== int` matches only the platform default integer dtype.
    m6 = sum(X.dtypes == int) / m2  # fraction of int-typed features
    feature_range = X.max() - X.min()  # per-feature value range, computed once
    m7 = feature_range.max()  # widest per-feature range
    m8 = feature_range.min()  # narrowest per-feature range
    m9 = sum(X.min() > 0) / m2  # fraction of features whose minimum is strictly positive

    return [m1, m2, m3, m4, m5, m6, m7, m8, m9]

In [83]:
def sampling():
    """Draw one random MLP configuration.

    Returns a list ``[h1, h2, h3, h4, max_iter, learning_rate_init, s1, s2, s3]``:
    four hidden-layer widths (a width is forced to 0 once the previous one is 0),
    the iteration budget, the initial learning rate, and three 0/1 switches.
    ``s3`` is only drawn when ``s2`` is 0, so the two are never both 1.
    """
    # Hidden-layer widths: deeper layers are only sampled while the chain is alive.
    width_1 = poisson(15).rvs()
    width_2 = poisson(3).rvs()
    width_3 = 0
    if width_2 > 0:
        width_3 = poisson(2).rvs()
    width_4 = 0
    if width_3 > 0:
        width_4 = poisson(1).rvs()

    # Optimiser settings, both drawn on a log scale.
    n_iterations = int(loguniform(100, 10000).rvs())
    step_size = loguniform(0.0001, 0.1).rvs()

    # Preprocessing switches.
    switch_1 = np.random.choice([0, 1])
    switch_2 = np.random.choice([0, 1])
    switch_3 = 0
    if switch_2 == 0:
        switch_3 = np.random.choice([0, 1])

    return [
        width_1,
        width_2,
        width_3,
        width_4,
        n_iterations,
        step_size,
        switch_1,
        switch_2,
        switch_3,
    ]

In [4]:
def model_test(model, X, y, s1, s2, s3):
    """Return the 5-fold average F1 score of ``model`` on ``(X, y)``.

    ``s1 == 1`` min-max scales each fold (scaler fitted on the training part),
    ``s2 == 1`` oversamples the training part with SMOTE, and otherwise
    ``s3 == 1`` undersamples it with NearMiss. The test part is never resampled.
    """
    # Positional fold indexing below needs plain arrays rather than pandas objects.
    X = X.values if isinstance(X, pd.DataFrame) else X
    y = y.values if isinstance(y, pd.Series) else y

    splitter = KFold(n_splits=5, shuffle=True, random_state=2022)

    score = 0
    for fit_idx, eval_idx in splitter.split(X):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_eval, y_eval = X[eval_idx], y[eval_idx]

        if s1 == 1:
            scaler = MinMaxScaler().fit(X_fit)
            X_fit, X_eval = scaler.transform(X_fit), scaler.transform(X_eval)

        # Pick at most one resampler; SMOTE takes precedence over NearMiss.
        if s2 == 1:
            resampler = SMOTE(k_neighbors=3, random_state=2022)
        elif s3 == 1:
            resampler = NearMiss()
        else:
            resampler = None
        if resampler is not None:
            X_fit, y_fit = resampler.fit_resample(X_fit, y_fit)

        model.fit(X_fit, y_fit)
        # Each fold contributes one fifth of the final score.
        score += f1_score(y_eval, model.predict(X_eval)) / 5

    return score

In [5]:
# Base names (without the ".csv" extension) of the datasets used to build
# the meta-model training data.
meta_file_name_list = [
    "shuttle-c2-vs-c4",
    "iris0",
    "glass-0-1-6_vs_5",
    "glass-0-1-6_vs_2",
    "sonar",
    "glass0",
    "glass-0-1-2-3_vs_4-5-6",
    "glass1",
    "glass2",
    "glass5",
    "glass6",
    "new-thyroid1",
    "ecoli-0_vs_1",
    "spectfheart",
    "heart",
    "haberman",
    "bupa",
    "ionosphere",
    "monk-2",
    "page-blocks-1-3_vs_4",
    "wdbc",
    "vehicle0",
    "vehicle2",
    "vehicle3",
    "yeast-1-2-8-9_vs_7",
    "vowel0"
]

In [6]:
# Load every listed dataset once and cache (features, labels, meta-features).
meta_data_list = []
for file_name in meta_file_name_list:
    df = pd.read_csv(f"../../data/classification/{file_name}.csv")
    y = df['y']
    X = df.drop(columns='y')
    meta_features = extract_meta_features(X, y)
    meta_data_list.append((X, y, meta_features))

#### 메타 모델 학습 데이터 생성

In [7]:
# Each row of meta_X is a sampled configuration followed by the meta-features of
# the dataset it was evaluated on; meta_y holds the matching cross-validated score.
meta_X = []
meta_y = []

for _ in tqdm(range(10000)):
    # Pick a random dataset and a random configuration.
    idx = np.random.choice(range(len(meta_file_name_list)))
    X, y, meta_features = meta_data_list[idx]
    record = sampling()
    h1, h2, h3, h4, max_iter, learning_rate_init, s1, s2, s3 = record

    # Keep hidden layers only up to (not including) the first zero width.
    hidden_layer_sizes = (
        (h1,) if h2 == 0
        else (h1, h2) if h3 == 0
        else (h1, h2, h3) if h4 == 0
        else (h1, h2, h3, h4)
    )

    model = MLPC(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        learning_rate_init=learning_rate_init,
        random_state=2022,
    )

    score = model_test(model, X, y, s1, s2, s3)
    record = record + meta_features

    meta_X.append(record)
    meta_y.append(score)

In [8]:
# Column layout: hidden-layer widths, optimiser settings, preprocessing switches,
# then the nine dataset meta-features.
meta_X_cols = (
    [f"h{i}" for i in range(1, 5)]
    + ["max_iter", "learning_rate_init"]
    + [f"s{i}" for i in range(1, 4)]
    + [f"m{i}" for i in range(1, 10)]
)
meta_X = pd.DataFrame(meta_X, columns=meta_X_cols)
meta_y = pd.Series(meta_y, name="y")

# Persist features and target side by side so the meta-dataset can be reused.
meta_df = pd.concat([meta_X, meta_y], axis=1)
meta_df.to_csv("MyAutoML3_메타모델_학습데이터.csv", index=False)

#### 메타 모델 학습 및 저장

In [87]:
# Random search over MLP regressor configurations for the meta-model; the
# candidate with the lowest 5-fold mean absolute error is kept and refitted
# on the full meta-dataset.
best_score = np.inf
best_model = None  # stays None if no candidate yields a usable (non-NaN) score
for _ in range(1000):
    h1, h2, h3, h4, max_iter, learning_rate_init, s1, s2, s3 = sampling()

    # Keep hidden layers only up to (not including) the first zero width.
    if h2 == 0:
        hidden_layer_sizes = (h1,)
    elif h3 == 0:
        hidden_layer_sizes = (h1, h2)
    elif h4 == 0:
        hidden_layer_sizes = (h1, h2, h3)
    else:
        hidden_layer_sizes = (h1, h2, h3, h4)
    model = MLPR(
        hidden_layer_sizes=hidden_layer_sizes,
        max_iter=max_iter,
        learning_rate_init=learning_rate_init,
        random_state=2022,
    )

    # The scorer returns negated MAE, so flip the sign to get an error to minimise.
    score = -cross_val_score(
        model, meta_X, meta_y, cv=5, scoring="neg_mean_absolute_error"
    ).mean()
    # A NaN score compares False here, so failed candidates are skipped.
    if score < best_score:
        best_score = score
        best_model = model

if best_model is None:
    # Without this guard the final fit would fail with an opaque NameError.
    raise RuntimeError("유효한 교차 검증 점수를 가진 메타 모델 후보가 없습니다.")
best_model.fit(meta_X, meta_y)

MLPRegressor(hidden_layer_sizes=(18, 5, 2, 1),
             learning_rate_init=0.0007337677668986755, max_iter=250,
             random_state=2022)

In [88]:
# Serialise the fitted meta-model to disk; MyAutoML3.__init__ reads this same file name.
with open("MyAutoML3_meta_model.pckl", "wb") as f:
    pickle.dump(best_model, f)

### 시스템 구현 및 활용

In [130]:
class MyAutoML3:
    """Tune an MLP classifier with a meta-model warm start and Bayesian optimisation.

    ``fit`` samples random configurations, ranks them with a pre-trained
    meta-model, evaluates the most promising ones by cross-validation, and then
    refines the search with a Gaussian-process surrogate and an expected
    improvement acquisition. The best configuration found is refitted on all data.

    Parameters
    ----------
    seed : int or None
        When given, ``fit`` seeds NumPy's global random state so that the
        sampled configurations are reproducible.
    cv : int
        Number of cross-validation folds used to score a configuration (>= 2).
    scoring : {"accuracy", "precision", "recall", "f1"}
        Metric computed on every fold.
    summarize_scoring : {"mean", "max", "min"}
        How the per-fold metrics are reduced to a single score.
    num_iter : int
        Total number of optimisation rounds (the first round is the warm start).
    num_candidate : int
        Number of configurations sampled per round.
    num_init : int
        Number of warm-start configurations evaluated in the first round.
    num_sample : int
        Number of configurations evaluated per later round (<= num_candidate).

    Raises
    ------
    ValueError
        If any argument has the wrong type or is out of range.
    """

    # Order of the values in one configuration, as returned by _sampling.
    _SOLUTION_COLUMNS = [
        "h1",
        "h2",
        "h3",
        "h4",
        "max_iter",
        "learning_rate_init",
        "s1",
        "s2",
        "s3",
    ]

    ## Constructor
    def __init__(
        self,
        seed=None,
        cv=5,
        scoring="f1",
        summarize_scoring="mean",
        num_iter=1000,
        num_candidate=100,
        num_init=10,
        num_sample=1,
    ):

        # self.seed
        if seed is not None and not self._is_int(seed):
            raise ValueError("seed는 int형 혹은 None이어야 합니다.")
        self.seed = seed

        # self.cv
        if not self._is_int(cv):
            raise ValueError("cv는 int형이어야 합니다.")
        if cv < 2:
            raise ValueError("cv는 2 이상이어야 합니다.")
        self.cv = cv

        # self.scoring
        scoring_dict = {
            "accuracy": accuracy_score,
            "precision": precision_score,
            "recall": recall_score,
            "f1": f1_score,
        }

        if scoring not in scoring_dict:
            msg = "scoring은 {}중 하나여야 합니다.".format(scoring_dict.keys())
            raise ValueError(msg)
        self.scoring = scoring_dict[scoring]

        # self.summarize_scoring
        summarize_scoring_dict = {"mean": np.mean, "max": np.max, "min": np.min}

        if summarize_scoring not in ("mean", "max", "min"):
            msg = "summarize_scoring는 {'mean', 'max', 'min'}중 하나여야 합니다."
            raise ValueError(msg)
        self.summarize_scoring = summarize_scoring_dict[summarize_scoring]

        # self.num_iter
        self._check_positive_int(
            num_iter,
            "num_iter는 int 자료형이어야 합니다.",
            "num_iter는 0보다 커야 합니다.",
        )
        self.num_iter = num_iter

        # self.num_candidate
        self._check_positive_int(
            num_candidate,
            "num_candidate는 int 자료형이어야 합니다.",
            "num_candidate는 0보다 커야 합니다.",
        )
        self.num_candidate = num_candidate

        # self.num_init
        self._check_positive_int(
            num_init,
            "num_init은 int 자료형이어야 합니다.",
            "num_init은 0보다 커야 합니다.",
        )
        self.num_init = num_init

        # self.num_sample
        self._check_positive_int(
            num_sample,
            "num_sample은 int 자료형이어야 합니다.",
            "num_sample은 0보다 커야 합니다.",
        )
        if num_sample > num_candidate:
            raise ValueError("num_sample은 num_candidate보다 클 수 없습니다.")
        self.num_sample = num_sample

        # Scaler fitted together with the final model; set by fit().
        self.scaler = None

        # self.meta_model
        # NOTE: pickle.load executes arbitrary code; only load a file you created.
        with open("MyAutoML3_meta_model.pckl", "rb") as f:
            self.meta_model = pickle.load(f)

    @staticmethod
    def _is_int(value):
        """Return True for plain integers; bool is rejected on purpose."""
        return isinstance(value, int) and not isinstance(value, bool)

    @classmethod
    def _check_positive_int(cls, value, type_msg, range_msg):
        """Raise ValueError unless ``value`` is an int greater than zero."""
        if not cls._is_int(value):
            raise ValueError(type_msg)
        if value <= 0:
            raise ValueError(range_msg)

    @staticmethod
    def _to_array(data):
        """Convert pandas objects, lists and tuples to a NumPy array."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        if isinstance(data, (list, tuple)):
            return np.array(data)
        return data

    ## _extract_meta_features method
    def _extract_meta_features(self, X, y):
        """Return the nine dataset meta-features ``[m1, ..., m9]``.

        ``X`` must be a pandas DataFrame and ``y`` a pandas Series.
        """
        m1, m2 = X.shape  # number of samples, number of features
        y_vc = y.value_counts()  # label counts, most frequent label first
        m3 = y_vc.iloc[0] / y_vc.iloc[-1]  # imbalance ratio: largest / smallest class
        m4 = m2 / m1  # features per sample
        m5 = sum(X.dtypes == float) / m2  # fraction of float-typed features
        # NOTE(review): `== int` matches only the platform default integer dtype.
        m6 = sum(X.dtypes == int) / m2  # fraction of int-typed features
        feature_range = X.max() - X.min()  # per-feature value range, computed once
        m7 = feature_range.max()  # widest per-feature range
        m8 = feature_range.min()  # narrowest per-feature range
        m9 = sum(X.min() > 0) / m2  # fraction of features with a strictly positive minimum

        return [m1, m2, m3, m4, m5, m6, m7, m8, m9]

    ## _sampling method
    def _sampling(self):
        """Draw one random configuration in ``_SOLUTION_COLUMNS`` order.

        A hidden-layer width is forced to 0 once the previous width is 0, and
        ``s3`` is only drawn when ``s2`` is 0.
        """
        h1 = poisson(15).rvs()
        h2 = poisson(3).rvs()
        h3 = poisson(2).rvs() if h2 > 0 else 0
        h4 = poisson(1).rvs() if h3 > 0 else 0
        max_iter = int(loguniform(100, 10000).rvs())
        learning_rate_init = loguniform(0.0001, 0.1).rvs()
        s1 = np.random.choice([0, 1])
        s2 = np.random.choice([0, 1])
        s3 = np.random.choice([0, 1]) if s2 == 0 else 0

        return [h1, h2, h3, h4, max_iter, learning_rate_init, s1, s2, s3]

    @staticmethod
    def _build_model(solution):
        """Build an unfitted MLP classifier from one configuration.

        Returns ``(model, s1, s2, s3)`` where the three switches are passed
        through unchanged for the caller's preprocessing decisions.
        """
        h1, h2, h3, h4, max_iter, learning_rate_init, s1, s2, s3 = solution
        # Configurations coming back from a NumPy array are floats.
        h1, h2, h3, h4, max_iter = (int(v) for v in (h1, h2, h3, h4, max_iter))

        # Keep hidden layers only up to (not including) the first zero width.
        if h2 == 0:
            hidden_layer_sizes = (h1,)
        elif h3 == 0:
            hidden_layer_sizes = (h1, h2)
        elif h4 == 0:
            hidden_layer_sizes = (h1, h2, h3)
        else:
            hidden_layer_sizes = (h1, h2, h3, h4)

        model = MLPC(
            hidden_layer_sizes=hidden_layer_sizes,
            max_iter=max_iter,
            learning_rate_init=learning_rate_init,
            random_state=2022,
        )
        return model, s1, s2, s3

    @staticmethod
    def _resample(X, y, s2, s3):
        """Apply SMOTE when ``s2 == 1``, else NearMiss when ``s3 == 1``."""
        if s2 == 1:
            return SMOTE(k_neighbors=3, random_state=2022).fit_resample(X, y)
        if s3 == 1:
            return NearMiss().fit_resample(X, y)
        return X, y

    ## _solution_evaluate method
    def _solution_evaluate(self, solution, X, y):
        """Cross-validate one configuration and return its summarised score.

        ``X`` and ``y`` must be NumPy arrays. Scaling and resampling are fitted
        on the training part of each fold only.
        """
        model, s1, s2, s3 = self._build_model(solution)

        fold_score_list = []
        kf = KFold(n_splits=self.cv, shuffle=True, random_state=2022)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            if s1 == 1:
                scaler = MinMaxScaler().fit(X_train)
                X_train = scaler.transform(X_train)
                X_test = scaler.transform(X_test)
            X_train, y_train = self._resample(X_train, y_train, s2, s3)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            fold_score_list.append(self.scoring(y_test, y_pred))
        return self.summarize_scoring(fold_score_list)

    ## _EI method
    def _EI(self, X_new, surrogate_model, best_mu, e=0.01):
        """Expected improvement of each row of ``X_new`` over ``best_mu``.

        ``surrogate_model.predict(X_new, return_std=True)`` must return the
        predictive mean and standard deviation. ``e`` is subtracted from the
        improvement before it is evaluated.
        """
        mu, sigma = surrogate_model.predict(X_new, return_std=True)
        mu = np.asarray(mu, dtype=float)
        sigma = np.asarray(sigma, dtype=float)

        improvement = mu - best_mu - e
        uncertain = sigma > 0

        # Divide only where sigma is positive to avoid division by zero.
        z = np.zeros(len(X_new))
        z[uncertain] = improvement[uncertain] / sigma[uncertain]
        ei = improvement * norm.cdf(z) + sigma * norm.pdf(z)

        # With no predictive uncertainty the expectation is the improvement
        # itself, floored at zero.
        ei[~uncertain] = np.maximum(improvement[~uncertain], 0.0)
        return ei

    ## fit method
    def fit(self, X, y):
        """Search for the best configuration and fit the final model on ``(X, y)``.

        Sets ``self.leaderboard`` (every evaluated configuration with its
        score), ``self.scaler`` (the fitted MinMaxScaler, or None) and
        ``self.model`` (the fitted classifier).

        When ``self.seed`` is not None this reseeds NumPy's global random state.
        """
        if self.seed is not None:
            np.random.seed(self.seed)

        # Meta-features are computed from the input as given, before .values
        # can upcast mixed-dtype columns to one common dtype.
        X_frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
        y_series = y if isinstance(y, pd.Series) else pd.Series(y)
        meta_features = self._extract_meta_features(X_frame, y_series)

        # Fold indexing in _solution_evaluate needs plain arrays.
        X = self._to_array(X)
        y = self._to_array(y)

        # Warm start: rank random candidates with the meta-model. The input is
        # laid out as configuration values followed by the meta-features.
        candidate_list = [self._sampling() for _ in range(self.num_candidate)]
        meta_input = [candidate + meta_features for candidate in candidate_list]
        candidate_score_list = self.meta_model.predict(meta_input)
        top_num_init_idx_list = (-candidate_score_list).argsort()[: self.num_init]

        GP_X = [candidate_list[idx] for idx in top_num_init_idx_list]
        GP_y = [self._solution_evaluate(gp_x, X, y) for gp_x in GP_X]
        surrogate_model = GPR(kernel=Matern(), random_state=2022).fit(GP_X, GP_y)

        # Bayesian optimisation rounds.
        best_mu = max(surrogate_model.predict(GP_X))
        for _ in range(self.num_iter - 1):
            candidate_list = np.array(
                [self._sampling() for _ in range(self.num_candidate)]
            )
            candidate_score_list = self._EI(candidate_list, surrogate_model, best_mu)

            new_GP_X = list(
                candidate_list[(-candidate_score_list).argsort()[: self.num_sample]]
            )
            new_GP_y = [
                self._solution_evaluate(new_gp_x, X, y) for new_gp_x in new_GP_X
            ]

            GP_X += new_GP_X
            GP_y += new_GP_y

            # NOTE(review): best_mu is updated from the surrogate fitted before
            # the new scores were observed; confirm this is intended rather
            # than using the observed scores.
            current_best_mu = max(surrogate_model.predict(new_GP_X))
            if current_best_mu > best_mu:
                best_mu = current_best_mu
            surrogate_model = GPR(kernel=Matern(), random_state=2022).fit(GP_X, GP_y)

        self.leaderboard = pd.DataFrame(GP_X, columns=list(self._SOLUTION_COLUMNS))
        self.leaderboard["점수"] = GP_y

        # Select the best configuration and train the final model on all data.
        best_solution = GP_X[np.array(GP_y).argmax()]
        best_model, s1, s2, s3 = self._build_model(best_solution)

        # Keep the scaler so predict() can apply the same transformation.
        self.scaler = None
        if s1 == 1:
            self.scaler = MinMaxScaler().fit(X)
            X = self.scaler.transform(X)
        X, y = self._resample(X, y, s2, s3)

        self.model = best_model.fit(X, y)

    ## predict method
    def predict(self, X):
        """Predict labels for ``X`` with the final model.

        If the selected configuration used min-max scaling, the scaler fitted
        in ``fit`` is applied first.
        """
        X = self._to_array(X)
        scaler = getattr(self, "scaler", None)
        if scaler is not None:
            X = scaler.transform(X)
        return self.model.predict(X)

    ## show_leaderboard method
    def show_leaderboard(self):
        """Return the DataFrame of evaluated configurations and their scores."""
        return self.leaderboard

### 적용

In [131]:
# Load the glass4 dataset and split it into the label column 'y' and the features.
df = pd.read_csv("../../data/classification/glass4.csv")
y = df['y']
X = df.drop(columns='y')

In [132]:
# Run the search with accuracy as the fold metric, then list every evaluated
# configuration from best to worst score.
aml = MyAutoML3(scoring = "accuracy")
aml.fit(X, y)
result = aml.show_leaderboard()
display(result.sort_values(by = "점수", ascending = False))

Unnamed: 0,h1,h2,h3,h4,max_iter,learning_rate_init,s1,s2,s3,점수
830,11.0,2.0,0.0,0.0,1422.0,0.005899,1.0,1.0,0.0,0.976523
852,12.0,8.0,3.0,0.0,5107.0,0.001567,0.0,1.0,0.0,0.976523
802,18.0,3.0,7.0,1.0,8217.0,0.000789,1.0,1.0,0.0,0.971982
558,18.0,2.0,4.0,0.0,4396.0,0.001914,1.0,1.0,0.0,0.971872
862,12.0,6.0,0.0,0.0,681.0,0.041453,1.0,1.0,0.0,0.971872
...,...,...,...,...,...,...,...,...,...,...
885,7.0,5.0,1.0,1.0,7877.0,0.049869,0.0,1.0,0.0,0.060797
404,20.0,6.0,2.0,1.0,5194.0,0.000389,0.0,1.0,0.0,0.060797
401,29.0,3.0,3.0,1.0,1614.0,0.013089,1.0,1.0,0.0,0.060797
19,13.0,1.0,0.0,0.0,8088.0,0.052823,0.0,0.0,1.0,0.060797


In [133]:
# Load the vehicle1 dataset and split it into the label column 'y' and the features.
df = pd.read_csv("../../data/classification/vehicle1.csv")
y = df['y']
X = df.drop(columns='y')

In [134]:
# Run a shorter search (100 rounds) with F1 as the fold metric, then list every
# evaluated configuration from best to worst score.
aml = MyAutoML3(scoring = "f1", num_iter = 100)
aml.fit(X, y)
result = aml.show_leaderboard()
display(result.sort_values(by = "점수", ascending = False))

Unnamed: 0,h1,h2,h3,h4,max_iter,learning_rate_init,s1,s2,s3,점수
24,14.0,3.0,2.0,0.0,3667.0,0.006209,1.0,1.0,0.0,0.705368
12,13.0,0.0,0.0,0.0,5240.0,0.005206,1.0,1.0,0.0,0.701981
76,8.0,4.0,1.0,0.0,3604.0,0.007573,1.0,1.0,0.0,0.682786
80,19.0,4.0,3.0,0.0,862.0,0.000868,1.0,1.0,0.0,0.679591
39,12.0,4.0,4.0,0.0,9268.0,0.007311,1.0,0.0,0.0,0.675799
...,...,...,...,...,...,...,...,...,...,...
57,18.0,4.0,2.0,2.0,1253.0,0.004606,1.0,1.0,0.0,0.000000
84,14.0,4.0,5.0,0.0,4209.0,0.001301,0.0,0.0,0.0,0.000000
16,16.0,5.0,1.0,1.0,4474.0,0.004674,0.0,0.0,0.0,0.000000
42,18.0,5.0,3.0,1.0,7592.0,0.000338,0.0,0.0,0.0,0.000000
