### 실험을 통한 하이퍼 파라미터 범위 설정

In [1]:
import pandas as pd
from sklearn.model_selection import ParameterGrid, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBR
import numpy as np
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.metrics import *
from sklearn.ensemble import StackingRegressor
from scipy.stats import randint, uniform
from tqdm import tqdm

#### 샘플링

In [3]:
num_trees_rv = randint(10, 500)
learning_rate_rv = uniform(0.01, 0.3)
max_depth_rv = randint(2, 10)
num_leaves_rv = randint(8, 256)
row_ratio_rv = uniform(0.1, 0.85)
col_ratio_rv = uniform(0.3, 0.6)

In [3]:
def sampling():
    num_trees = num_trees_rv.rvs()
    learning_rate = learning_rate_rv.rvs()
    max_depth = max_depth_rv.rvs()
    num_leaves = num_leaves_rv.rvs()
    row_ratio = row_ratio_rv.rvs()
    col_ratio = col_ratio_rv.rvs()
    return num_trees, learning_rate, max_depth, num_leaves, row_ratio, col_ratio

#### 데이터 준비

In [4]:
experiment_data_list = []
for file_name in ["ele-1", "ele-2", "friedman",  "puma32h",  "wizmir"]:
    df = pd.read_csv("../../data/regression/{}.csv".format(file_name))
    X = df.drop('y', axis = 1)
    y = df['y']
    experiment_data_list.append((X, y))

#### 실험 수행

In [5]:
RFR_result = []
XGB_result = []
LGB_result = []
for _ in tqdm(range(10000)):
    data_idx = np.random.choice(range(len(experiment_data_list)))
    X, y = experiment_data_list[data_idx]
    num_trees, learning_rate, max_depth, num_leaves, row_ratio, col_ratio = sampling()

    RFR_model = RFR(
        n_estimators=num_trees,
        max_depth=max_depth,
        max_leaf_nodes=num_leaves,
        max_samples=row_ratio,
        max_features=col_ratio,
    )

    XGB_model = XGBR(
        n_estimators=num_trees,
        max_depth=max_depth,
        max_leaves=num_leaves,
        subsample=row_ratio,
        colsample_bytree=col_ratio,
        eta=learning_rate,
    )

    LGB_model = LGBR(
        n_estimators=num_trees,
        max_depth=max_depth,
        num_leaves=num_leaves,
        subsample=row_ratio,
        colsample_bytree=col_ratio,
        learning_rate=learning_rate,
    )

    RFR_score_list = cross_val_score(
        RFR_model, X, y, cv=5, scoring="neg_mean_absolute_error"
    )
    RFR_score = (-RFR_score_list).mean()
    XGB_score_list = cross_val_score(
        XGB_model, X, y, cv=5, scoring="neg_mean_absolute_error"
    )
    XGB_score = (-XGB_score_list).mean()
    LGB_score_list = cross_val_score(
        LGB_model, X, y, cv=5, scoring="neg_mean_absolute_error"
    )
    LGB_score = (-LGB_score_list).mean()

    params = [num_trees, learning_rate, max_depth, num_leaves, row_ratio, col_ratio]

    RFR_record = params + [RFR_score]
    XGB_record = params + [XGB_score]
    LGB_record = params + [LGB_score]

    RFR_result.append(RFR_record)
    XGB_result.append(XGB_record)
    LGB_result.append(LGB_record)
cols = [
    "num_trees",
    "learning_rate",
    "max_depth",
    "num_leaves",
    "row_ratio",
    "col_ratio",
    "score",
]

RFR_result = pd.DataFrame(RFR_result, columns=cols).drop('learning_rate', axis = 1)
XGB_result = pd.DataFrame(XGB_result, columns=cols)
LGB_result = pd.DataFrame(LGB_result, columns=cols)

In [6]:
RFR_result.to_csv("MyAutoML2_RFR_실험결과.csv", index = False)
XGB_result.to_csv("MyAutoML2_XGB_실험결과.csv", index = False)
LGB_result.to_csv("MyAutoML2_LGB_실험결과.csv", index = False)

#### 모델별 MAE 분포 확인

In [44]:
RFR_MAE_dist = RFR_result["score"].describe()
XGB_MAE_dist = XGB_result["score"].describe()
LGB_MAE_dist = LGB_result["score"].describe()
MAE_dist = pd.concat([RFR_MAE_dist, XGB_MAE_dist, LGB_MAE_dist], axis = 1)
MAE_dist.columns = ["RFR", "XGB", "LGB"]
display(MAE_dist)

Unnamed: 0,RFR,XGB,LGB
count,10000.0,10000.0,10000.0
mean,125.287492,128.788634,112.990791
std,177.680552,223.506338,187.934021
min,0.006287,0.006035,0.005951
25%,0.998655,0.962118,0.947524
50%,2.084521,1.497201,1.246464
75%,236.530071,85.982339,91.620124
max,542.99483,1373.506491,1133.407707


#### 모델별 결정 나무 학습 및 해석

In [15]:
def text_to_rule_list(r):
    node_list = []
    leaf_node_list = []

    for i, node in enumerate(r.split("\n")[:-1]):
        rule = node.split('- ')[1]
        indent = node.count(' ' * 3)
        if 'value' in rule:
            leaf_node_list.append([i, rule, indent])
        node_list.append([i, rule, indent])

    prediction_rule_list = []
    for leaf_node in leaf_node_list:
        prediction_rule = []
        idx, decision, indent = leaf_node
        for indent_level in range(indent-1, -1, -1):
            for node_idx in range(idx, -1, -1):
                node = node_list[node_idx]
                rule = node[1]
                if node[2] == indent_level and "value" not in node[1]:
                    prediction_rule.append(rule)
                    break
        prediction_rule_list.append([prediction_rule, decision])

    return prediction_rule_list

In [16]:
def extract_float(output):
    output = output.split('[')[1]
    output = float(output[:-1])
    return output

In [38]:
pd.set_option('max_colwidth', None)

In [68]:
RFR_X = RFR_result.drop('score', axis = 1)
RFR_y = RFR_result['score']
model = DecisionTreeRegressor(max_depth = 5).fit(RFR_X, RFR_y)
result = text_to_rule_list(export_text(model, feature_names = list(RFR_X.columns)))
result = pd.DataFrame(result, columns = ["condition", "output"])

result['condition'] = result['condition'].apply(' & '.join)
result['output'] = result['output'].apply(extract_float)
result.sort_values(by = "output", inplace = True)

display(result.head(10))

Unnamed: 0,condition,output
5,num_trees > 495.50 & col_ratio <= 0.82 & num_trees > 494.50 & max_depth <= 2.50 & max_depth <= 3.50,0.74
16,col_ratio <= 0.30 & col_ratio <= 0.30 & num_leaves <= 36.50 & max_depth > 3.50,2.07
4,num_trees <= 495.50 & col_ratio <= 0.82 & num_trees > 494.50 & max_depth <= 2.50 & max_depth <= 3.50,2.62
28,num_trees > 493.50 & max_depth <= 8.50 & col_ratio > 0.50 & num_leaves > 36.50 & max_depth > 3.50,54.4
15,col_ratio > 0.53 & num_leaves > 206.50 & col_ratio > 0.44 & max_depth > 2.50 & max_depth <= 3.50,81.71
12,row_ratio <= 0.22 & num_leaves <= 206.50 & col_ratio > 0.44 & max_depth > 2.50 & max_depth <= 3.50,87.36
30,num_leaves > 51.50 & max_depth > 8.50 & col_ratio > 0.50 & num_leaves > 36.50 & max_depth > 3.50,87.45
27,num_trees <= 493.50 & max_depth <= 8.50 & col_ratio > 0.50 & num_leaves > 36.50 & max_depth > 3.50,112.44
9,row_ratio > 0.73 & col_ratio <= 0.41 & col_ratio <= 0.44 & max_depth > 2.50 & max_depth <= 3.50,115.26
24,row_ratio > 0.10 & col_ratio <= 0.50 & col_ratio <= 0.50 & num_leaves > 36.50 & max_depth > 3.50,118.39


In [74]:
XGB_X = XGB_result.drop('score', axis = 1)
XGB_y = XGB_result['score']
model = DecisionTreeRegressor(max_depth = 5, random_state = 2022).fit(XGB_X, XGB_y)
result = text_to_rule_list(export_text(model, feature_names = list(XGB_X.columns)))
result = pd.DataFrame(result, columns = ["condition", "output"])

result['condition'] = result['condition'].apply(' & '.join)
result['output'] = result['output'].apply(extract_float)
result.sort_values(by = "output", inplace = True)

display(result.head(10))

Unnamed: 0,condition,output
3,col_ratio > 0.38 & col_ratio > 0.36 & col_ratio <= 0.57 & learning_rate <= 0.04 & num_trees <= 23.50,8.13
29,max_depth > 5.50 & row_ratio <= 0.16 & row_ratio > 0.16 & col_ratio > 0.50 & num_trees > 23.50,26.85
15,learning_rate > 0.17 & row_ratio > 0.71 & learning_rate > 0.13 & learning_rate > 0.04 & num_trees <= 23.50,29.39
2,col_ratio <= 0.38 & col_ratio > 0.36 & col_ratio <= 0.57 & learning_rate <= 0.04 & num_trees <= 23.50,31.33
23,col_ratio > 0.50 & row_ratio > 0.73 & col_ratio > 0.50 & col_ratio <= 0.50 & num_trees > 23.50,35.87
27,learning_rate > 0.31 & learning_rate > 0.31 & row_ratio <= 0.16 & col_ratio > 0.50 & num_trees > 23.50,42.16
30,col_ratio <= 0.50 & row_ratio > 0.16 & row_ratio > 0.16 & col_ratio > 0.50 & num_trees > 23.50,72.85
9,num_leaves > 25.00 & row_ratio <= 0.32 & learning_rate <= 0.13 & learning_rate > 0.04 & num_trees <= 23.50,75.59
25,num_trees > 44.50 & learning_rate <= 0.31 & row_ratio <= 0.16 & col_ratio > 0.50 & num_trees > 23.50,90.46
12,row_ratio <= 0.54 & row_ratio <= 0.71 & learning_rate > 0.13 & learning_rate > 0.04 & num_trees <= 23.50,102.09


In [91]:
LGB_X = LGB_result.drop('score', axis = 1)
LGB_y = LGB_result['score']
model = DecisionTreeRegressor(max_depth = 5, random_state = 2022).fit(LGB_X, LGB_y)
result = text_to_rule_list(export_text(model, feature_names = list(LGB_X.columns)))
result = pd.DataFrame(result, columns = ["condition", "output"])

result['condition'] = result['condition'].apply(' & '.join)
result['output'] = result['output'].apply(extract_float)
result.sort_values(by = "output", inplace = True)

display(result.head(10))

Unnamed: 0,condition,output
0,max_depth <= 3.50 & num_leaves <= 172.50 & col_ratio <= 0.57 & learning_rate <= 0.04 & num_trees <= 27.50,0.02
22,learning_rate <= 0.16 & row_ratio <= 0.10 & row_ratio <= 0.16 & col_ratio > 0.39 & num_trees > 27.50,0.46
10,col_ratio > 0.78 & num_leaves <= 14.00 & learning_rate > 0.04 & num_trees <= 27.50,1.82
6,row_ratio <= 0.55 & col_ratio > 0.70 & col_ratio > 0.57 & learning_rate <= 0.04 & num_trees <= 27.50,3.09
1,max_depth > 3.50 & num_leaves <= 172.50 & col_ratio <= 0.57 & learning_rate <= 0.04 & num_trees <= 27.50,4.53
27,max_depth > 5.50 & row_ratio <= 0.16 & row_ratio > 0.16 & col_ratio > 0.39 & num_trees > 27.50,27.05
18,learning_rate > 0.22 & col_ratio > 0.32 & num_leaves <= 10.50 & col_ratio <= 0.39 & num_trees > 27.50,32.07
14,row_ratio > 0.68 & learning_rate > 0.16 & num_leaves > 14.00 & learning_rate > 0.04 & num_trees <= 27.50,52.96
25,num_trees > 44.50 & row_ratio > 0.10 & row_ratio <= 0.16 & col_ratio > 0.39 & num_trees > 27.50,81.59
11,row_ratio <= 0.35 & learning_rate <= 0.16 & num_leaves > 14.00 & learning_rate > 0.04 & num_trees <= 27.50,89.98


### 시스템 구현 및 활용

In [38]:
class MyAutoML2:
    ## 생성자
    def __init__(
        self,
        seed=None,
        cv=5,
        scoring="mean_absolute_error",
        summarize_scoring="mean",
        num_iter=1000,
        num_base_models=100,
        fit_stacking = True
        ):

        # self.seed 정의
        if (type(seed) != int) and (seed is not None):
            raise ValueError("seed는 int형 혹은 None이어야 합니다.")
        self.seed = seed

        # self.cv 정의
        if type(cv) != int:
            raise ValueError("cv는 int형이어야 합니다.")
        if cv < 2:
            raise ValueError("cv는 2보다는 커야 합니다.")
        self.cv = cv

        # self.scoring 정의
        scoring_dict = {
            "mean_absolute_error": mean_absolute_error,
            "mean_squared_error": mean_squared_error,
            "r2": r2_score
        }

        if scoring not in scoring_dict.keys():
            msg = "scoring은 {}중 하나여야 합니다.".format(scoring_dict.keys())
            raise ValueError(msg)
        self.scoring = scoring_dict[scoring]

        # self.summarize_scoring 정의
        summarize_scoring_dict = {"mean": np.mean, "max": np.max, "min": np.min}

        if summarize_scoring not in ["mean", "max", "min"]:
            msg = "summarize_scoring는 {'mean', 'max', 'min'}중 하나여야 합니다."
            raise ValueError(msg)
        self.summarize_scoring = summarize_scoring_dict[summarize_scoring]
        
        # self.num_iter 정의
        if type(num_iter) != int:
            raise ValueError("num_iter는 int 자료형이어야 합니다.")
        elif num_iter <= 0:
            raise ValueError("num_iter는 0보다 커야 합니다.")
        self.num_iter = num_iter
        
        # self.num_base_models 정의
        if type(num_base_models) != int:
            raise ValueError("num_base_models는 int 자료형이어야 합니다.")
        elif num_base_models <= 0:
            raise ValueError("num_base_models는 0보다 커야 합니다.")
        elif num_base_models > num_iter:
            raise ValueError("num_base_models는 num_iter보다 작거나 같아야 합니다.")
        self.num_base_models = num_base_models
        
        # self.fit_stacking 정의
        if type(fit_stacking) != bool:
            raise ValueError("fit_stacking는 bool 자료형이어야 합니다.")
        self.fit_stacking = fit_stacking
    
    ## 샘플링 정의
    def sampling(self):
        model_choice = np.random.multinomial(1, [0.2, 0.3, 0.5]).argmax()
        if model_choice == 0: # 랜덤 포레스트
            max_depth = np.random.choice(range(3, 9))
            col_ratio = round(uniform(0.44, 0.82-0.44).rvs(), 3)
            model = RFR(max_depth=max_depth,
                        max_features=col_ratio)

        elif model_choice == 1: # XGBoost
            num_trees = np.random.choice(range(10, 130, 10))
            row_ratio = round(uniform(0.15, 0.60-0.15).rvs(), 3)
            col_ratio = round(uniform(0.4, 0.60-0.4).rvs(), 3)
            learning_rate = round(uniform(0.01, 0.31-0.01).rvs(), 3)
            model = XGBR(n_estimators=num_trees,
                         subsample=row_ratio,
                         colsample_bytree=col_ratio,
                         eta=learning_rate)

        elif model_choice == 2: # LightGBM
            num_trees = np.random.choice(range(10, 130, 10))
            col_ratio = round(uniform(0.3, 0.60-0.3).rvs(), 3)
            learning_rate = round(uniform(0.01, 0.2-0.01).rvs(), 3)
            row_ratio = round(uniform(0.10, 0.55-0.10).rvs(), 3)
            num_leaves = np.random.choice(range(10, 180, 10))

            model = LGBR(n_estimators=num_trees,
                         num_leaves=num_leaves,
                         subsample=row_ratio,
                         colsample_bytree=col_ratio,
                         learning_rate=learning_rate)
        return model
    
    ## fit 메서드
    def fit(self, X, y):
        # X, y 포맷 변경
        if isinstance(X, pd.DataFrame):
            X = X.values
        elif isinstance(X, list) or isinstance(X, tuple):
            X = np.array(X)
        if isinstance(y, pd.Series):
            y = y.values
        elif isinstance(y, list) or isinstance(y, tuple):
            y = np.array(y)

        # 랜덤 서치 시작
        best_score = np.inf
        self.leaderboard = []
        self.model_list = []
        for _ in range(self.num_iter):
            while True:
                model = self.sampling()
                if model not in self.model_list:
                    break
                
            self.model_list.append(model)
            kf = KFold(n_splits=self.cv, shuffle=True, random_state=self.seed)
            fold_score_list = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                fold_score = self.scoring(y_test, y_pred)
                fold_score_list.append(fold_score)
                # 현재까지 찾은 최고의 해 및 리더보드 업데이트
            score = self.summarize_scoring(fold_score_list)
            if best_score > score:
                best_score = score
                best_model = model
            self.leaderboard.append([str(model).replace('\n', ''), score])

        self.leaderboard = pd.DataFrame(self.leaderboard,
                                        columns=["모델", "점수"])
        
        

        if self.fit_stacking:
            top_model_idx_list = self.leaderboard.sort_values(by = "점수").index[:self.num_base_models]
            top_model_list = [self.model_list[idx] for idx in top_model_idx_list]
            top_model_tuple_list = [("model_{}".format(idx+1, model), model) for idx, model in enumerate(top_model_list)]
            
            stacking_model = StackingRegressor(top_model_tuple_list)
            
            # 스태킹 모델 평가
            fold_score_list = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                stacking_model.fit(X_train, y_train)
                y_pred = stacking_model.predict(X_test)
                fold_score = self.scoring(y_test, y_pred)
                fold_score_list.append(fold_score)
                
            score = self.summarize_scoring(fold_score_list)
            new_row = [str(stacking_model).replace('\n', ''), score]
            new_row = pd.Series(new_row, index = self.leaderboard.columns)
            self.leaderboard = self.leaderboard.append(new_row, ignore_index=True)
            
            if best_score > score:
                best_model = stacking_model
        
        self.model = best_model.fit(X, y)
        
    ## predict 메서드
    def predict(self, X):
        return self.model.predict(X)

    ## show_leaderboard 메서드
    def show_leaderboard(self):
        return self.leaderboard

#### 적용

In [39]:
# 데이터 불러오기
df = pd.read_csv("../../data/regression/laser.csv")
X = df.drop('y', axis = 1)
y = df['y']

In [40]:
aml = MyAutoML2()
aml.fit(X, y)
result = aml.show_leaderboard()
display(result.sort_values(by = "점수", ascending = True))

Unnamed: 0,모델,점수
607,"RandomForestRegressor(max_depth=8, max_feature...",3.545854
0,"RandomForestRegressor(max_depth=8, max_feature...",3.597390
723,"RandomForestRegressor(max_depth=8, max_feature...",3.601368
171,"RandomForestRegressor(max_depth=8, max_feature...",3.627731
634,"RandomForestRegressor(max_depth=8, max_feature...",3.649754
...,...,...
405,"LGBMRegressor(colsample_bytree=0.375, learning...",34.252825
160,"XGBRegressor(base_score=0.5, booster='gbtree',...",38.764955
161,"XGBRegressor(base_score=0.5, booster='gbtree',...",39.856808
779,"XGBRegressor(base_score=0.5, booster='gbtree',...",42.470008


In [41]:
display(result.iloc[-1])

모델    StackingRegressor(estimators=[('model_1',     ...
점수                                             4.907067
Name: 1000, dtype: object

In [42]:
# 데이터 불러오기
df = pd.read_csv("../../data/regression/wankara.csv")
X = df.drop('y', axis = 1)
y = df['y']

In [43]:
aml = MyAutoML2(scoring = "r2", summarize_scoring = "max")
aml.fit(X, y)
result = aml.show_leaderboard()
display(result.sort_values(by = "점수", ascending = False))

Unnamed: 0,모델,점수
491,"RandomForestRegressor(max_depth=8, max_feature...",0.994054
189,"RandomForestRegressor(max_depth=8, max_feature...",0.993654
922,"RandomForestRegressor(max_depth=8, max_feature...",0.993652
457,"RandomForestRegressor(max_depth=8, max_feature...",0.993563
453,"RandomForestRegressor(max_depth=7, max_feature...",0.993561
...,...,...
280,"XGBRegressor(base_score=0.5, booster='gbtree',...",-3.573971
539,"XGBRegressor(base_score=0.5, booster='gbtree',...",-3.834678
437,"XGBRegressor(base_score=0.5, booster='gbtree',...",-4.632584
55,"XGBRegressor(base_score=0.5, booster='gbtree',...",-5.447646
