In [35]:
import pandas as pd
from sklearn.model_selection import ParameterGrid, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import *
from functools import partial
import numpy as np

In [88]:
class MyAutoML1:
    """Small AutoML helper that grid-searches four scikit-learn classifiers.

    Candidates come from fixed hyper-parameter grids for k-nearest neighbours
    ("KNN"), decision tree ("DT"), random forest ("RF") and multi-layer
    perceptron ("MLP"). Every candidate is scored with shuffled K-fold
    cross-validation, the per-fold scores are summarized into one number, and
    the best candidate is refit on the full data for use by ``predict``.

    Parameters
    ----------
    exclude_models : iterable of str, optional
        Short names of model families to skip; any of "KNN", "DT", "RF",
        "MLP". Excluding all four is an error.
    seed : int or None
        Passed as ``random_state`` to ``KFold`` and to every estimator except
        the k-nearest-neighbours classifier.
    cv : int
        Number of folds; must be at least 2.
    scoring : str
        Name of the metric (see the keys of ``scoring_dict`` in ``__init__``).
    summarize_scoring : {"mean", "max", "min"}
        How the per-fold scores are reduced to a single candidate score.
    early_stopping : bool
        If True, stop evaluating a candidate as soon as one fold scores below
        ``best_score * (1 - early_stopping_criteria)``. The candidate is then
        summarized over the folds evaluated so far.
    early_stopping_criteria : float
        Relative tolerance for early stopping, strictly between 0 and 1.

    Raises
    ------
    ValueError
        If any argument fails the checks described above.
    """

    def __init__(
        self,
        exclude_models=None,
        seed=None,
        cv=5,
        scoring="accuracy",
        summarize_scoring="mean",
        early_stopping=False,
        early_stopping_criteria=0.1,
    ):
        # exclude_models: None replaces the former mutable ``[]`` default, and
        # the value is copied so instances never share one list.
        exclude_models = [] if exclude_models is None else list(exclude_models)
        model_set = {"KNN", "DT", "RF", "MLP"}
        if set(exclude_models) == model_set:
            raise ValueError("모든 모델을 제외할 수 없습니다.")
        improper_models = set(exclude_models) - model_set
        if len(improper_models) >= 1:
            raise ValueError(
                "{}는 exclude_models에 포함할 수 없습니다".format(improper_models)
            )
        self.exclude_models = exclude_models

        # seed: exact int (bool and NumPy ints are rejected) or None.
        if (type(seed) != int) and (seed is not None):
            raise ValueError("seed는 int형 혹은 None이어야 합니다.")
        self.seed = seed

        # cv: exact int, at least 2 (the message now matches the check).
        if type(cv) != int:
            raise ValueError("cv는 int형이어야 합니다.")
        if cv < 2:
            raise ValueError("cv는 2 이상이어야 합니다.")
        self.cv = cv

        # scoring: map the metric name to a callable ``f(y_true, y_pred)``.
        scoring_dict = {
            "accuracy": accuracy_score,
            "precision": precision_score,
            "weighted-precision": partial(precision_score, average="weighted"),
            "macro-precision": partial(precision_score, average="macro"),
            "recall": recall_score,
            "weighted-recall": partial(recall_score, average="weighted"),
            "macro-recall": partial(recall_score, average="macro"),
            "f1": f1_score,
            "weighted-f1": partial(f1_score, average="weighted"),
            "macro-f1": partial(f1_score, average="macro"),
        }
        if scoring not in scoring_dict.keys():
            msg = "scoring은 {}중 하나여야 합니다.".format(scoring_dict.keys())
            raise ValueError(msg)
        self.scoring = scoring_dict[scoring]

        # summarize_scoring: reducer applied to the list of fold scores.
        summarize_scoring_dict = {"mean": np.mean, "max": np.max, "min": np.min}
        if summarize_scoring not in ["mean", "max", "min"]:
            msg = "summarize_scoring는 {'mean', 'max', 'min'}중 하나여야 합니다."
            raise ValueError(msg)
        self.summarize_scoring = summarize_scoring_dict[summarize_scoring]

        # early_stopping: strict bool.
        if type(early_stopping) is not bool:
            raise ValueError("early_stopping은 True 혹은 False여야 합니다.")
        self.early_stopping = early_stopping

        # early_stopping_criteria: strict float in the open interval (0, 1).
        if type(early_stopping_criteria) is not float:
            raise ValueError("early_stopping_criteria 자료형은 float이어야 합니다.")
        if early_stopping_criteria <= 0 or early_stopping_criteria >= 1:
            raise ValueError("early_stopping_criteria는 0과 1사이의 값이어야 합니다.")
        self.early_stopping_criteria = early_stopping_criteria

    @staticmethod
    def _to_array(data):
        """Convert pandas objects, lists and tuples to NumPy arrays; pass anything else through."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        if isinstance(data, (list, tuple)):
            return np.array(data)
        return data

    @staticmethod
    def _search_space():
        """Return ``{short name: (estimator class, ParameterGrid)}`` in search order."""
        return {
            "KNN": (
                KNeighborsClassifier,
                ParameterGrid(
                    {
                        "n_neighbors": [3, 5, 7, 9, 11],
                        "metric": ["euclidean", "manhattan"],
                    }
                ),
            ),
            "DT": (
                DecisionTreeClassifier,
                ParameterGrid(
                    {"max_depth": [3, 5, 7, 9], "min_samples_split": [2, 5, 10]}
                ),
            ),
            "RF": (
                RandomForestClassifier,
                ParameterGrid(
                    {
                        "n_estimators": [50, 100, 200],
                        "max_depth": [2, 3, 4],
                        "max_features": [0.2, 0.4, 0.6, 0.8, 1.0],
                    }
                ),
            ),
            "MLP": (
                MLPClassifier,
                ParameterGrid(
                    {
                        "hidden_layer_sizes": [
                            (10,),
                            (10, 10),
                            (20, 20),
                            (15, 15, 15),
                            (20, 20, 20),
                            (15, 15, 15, 15),
                            (30, 30, 30, 30),
                        ],
                        "max_iter": [2000],
                    }
                ),
            ),
        }

    def _cross_validate(self, model_func, params, X, y, best_score):
        """Return the per-fold scores of one candidate, possibly cut short by early stopping."""
        kf = KFold(n_splits=self.cv, shuffle=True, random_state=self.seed)
        # A stopping threshold only exists once some candidate has set a best score.
        stop_below = None
        if self.early_stopping and best_score is not None:
            stop_below = best_score * (1 - self.early_stopping_criteria)

        fold_scores = []
        for train_index, test_index in kf.split(X):
            model = model_func(**params).fit(X[train_index], y[train_index])
            fold_score = self.scoring(y[test_index], model.predict(X[test_index]))
            fold_scores.append(fold_score)
            if stop_below is not None and fold_score < stop_below:
                break
        return fold_scores

    def fit(self, X, y):
        """Run the grid search, build the leaderboard and refit the best candidate.

        Parameters
        ----------
        X : DataFrame, list, tuple or array
            Feature matrix; pandas objects and sequences are converted to arrays.
        y : Series, list, tuple or array
            Target labels, converted the same way.

        Returns
        -------
        MyAutoML1
            ``self``, with ``self.model`` (best estimator refit on all data)
            and ``self.leaderboard`` (one row per evaluated candidate) set.

        Raises
        ------
        ValueError
            If no candidate produced a comparable score (for example every
            score was NaN), so no best model can be chosen.
        """
        X = self._to_array(X)
        y = self._to_array(y)

        # None means "no candidate yet". Starting from 0 with a strict ``>``
        # left the best model undefined when every candidate scored 0.
        best_score = None
        best_model_func = None
        best_params = None
        leaderboard = []

        for name, (model_func, param_grid) in self._search_space().items():
            # exclude_models holds short names, so compare names rather than classes.
            if name in self.exclude_models:
                continue
            for grid_point in param_grid:
                params = dict(grid_point)  # private copy; random_state is added below
                if model_func is not KNeighborsClassifier:
                    # The k-NN classifier takes no random_state argument.
                    params["random_state"] = self.seed
                fold_scores = self._cross_validate(
                    model_func, params, X, y, best_score
                )
                score = self.summarize_scoring(fold_scores)
                # Strict ``>`` keeps the earliest candidate on ties.
                if best_score is None or score > best_score:
                    best_score = score
                    best_model_func = model_func
                    best_params = params
                leaderboard.append([model_func, params, score])

        if best_model_func is None:
            raise ValueError("no candidate model could be selected")

        self.model = best_model_func(**best_params).fit(X, y)
        self.leaderboard = pd.DataFrame(
            leaderboard, columns=["모델", "파라미터", "점수"]
        )
        return self

    def predict(self, X):
        """Predict labels for ``X`` with the best model refit by ``fit``."""
        return self.model.predict(X)

    def show_leaderboard(self):
        """Return the DataFrame of (model class, parameters, score) built by ``fit``."""
        return self.leaderboard


#### 적용

In [94]:
# Load the red-wine quality data: column "y" is the target, every other column is a feature.
df = pd.read_csv("../../data/classification/winequality-red.csv")
y = df["y"]
X = df.drop(columns="y")

In [96]:
# Search with the default settings, then show every candidate ranked best-first.
aml = MyAutoML1()
aml.fit(X, y)
result = aml.show_leaderboard()
display(result.sort_values("점수", ascending=False))

Unnamed: 0,모델,파라미터,점수
57,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 4, 'max_features': 0.4, 'n_estim...",0.605390
54,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 4, 'max_features': 0.2, 'n_estim...",0.604771
62,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 4, 'max_features': 0.8, 'n_estim...",0.604136
65,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 4, 'max_features': 1.0, 'n_estim...",0.603501
66,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 4, 'max_features': 1.0, 'n_estim...",0.602878
...,...,...,...
2,<class 'sklearn.neighbors._classification.KNei...,"{'metric': 'euclidean', 'n_neighbors': 7}",0.507819
4,<class 'sklearn.neighbors._classification.KNei...,"{'metric': 'euclidean', 'n_neighbors': 11}",0.507185
3,<class 'sklearn.neighbors._classification.KNei...,"{'metric': 'euclidean', 'n_neighbors': 9}",0.496526
1,<class 'sklearn.neighbors._classification.KNei...,"{'metric': 'euclidean', 'n_neighbors': 5}",0.494075


In [97]:
# Load the bupa data: column "y" is the target, every other column is a feature.
df = pd.read_csv("../../data/classification/bupa.csv")
y = df["y"]
X = df.drop(columns="y")

In [98]:
# Search again scored by F1, abandoning a candidate once a fold falls 5% below the best score so far.
aml = MyAutoML1(
    scoring="f1",
    early_stopping=True,
    early_stopping_criteria=0.05,
)
aml.fit(X, y)
result = aml.show_leaderboard()
display(result.sort_values("점수", ascending=False))

Unnamed: 0,모델,파라미터,점수
61,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 4, 'max_features': 0.8, 'n_estim...",0.803052
66,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 4, 'max_features': 1.0, 'n_estim...",0.801491
40,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 3, 'max_features': 0.4, 'n_estim...",0.799017
41,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 3, 'max_features': 0.4, 'n_estim...",0.792135
44,<class 'sklearn.ensemble._forest.RandomForestC...,"{'max_depth': 3, 'max_features': 0.6, 'n_estim...",0.788412
...,...,...,...
5,<class 'sklearn.neighbors._classification.KNei...,"{'metric': 'manhattan', 'n_neighbors': 3}",0.631579
13,<class 'sklearn.tree._classes.DecisionTreeClas...,"{'max_depth': 5, 'min_samples_split': 2, 'rand...",0.621622
2,<class 'sklearn.neighbors._classification.KNei...,"{'metric': 'euclidean', 'n_neighbors': 7}",0.615385
21,<class 'sklearn.tree._classes.DecisionTreeClas...,"{'max_depth': 9, 'min_samples_split': 10, 'ran...",0.613333


In [102]:
# Preview the first five labels predicted by the refit best model on the same X it was fit on.
display(aml.predict(X)[:5])

array([1, 1, 1, 1, 1], dtype=int64)