# 機械学習モデルのハイパーパラメータ最適化

### グリッドサーチとランダムサーチ

#### グリッドサーチ
- 各ハイパーパラメータに複数の候補値を設定して、全ての組合せを試行する
- 組合せが多くなると計算負荷が大きくなるため、多数のパラメータの最適化には不向き

#### ランダムサーチ
- ハイパーパラメータの範囲として連続もしくは離散確率分布を指定し、ランダムにサンプリングした組合せを試行する
- 多数のパラメータの最適化にも有効だが、網羅的では無いため「当たりをつける」目途で活用

**注意：ライブラリの仕様変更**<br>
例：LogisticRegression
- scikit-learn 0.22から、パラメータ'solver'のデフォルト値が'liblinear’から‘lbfgs’に変更：<a href='https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression'>sklearn.org User 
API</a>
- solverによって、パラメータ'penalty'や'multi_class'の設定値（サポート対象）が異なる：<a href='https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression'>sklearn.org User Guide</a>


In [41]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
import scipy.stats
import time

# 処理時の警告を表示しない
# import warnings
# warnings.filterwarnings('ignore')

# データ取得（分類：Classfication）
#data = datasets.load_digits()
#data = datasets.load_wine()
data = datasets.load_iris()

# 学習用とテスト用にデータを分割
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, random_state=0)

# 標準化：学習データを基準に標準化
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


# 分類器のハイパーパラメータリスト
# LogisticRegression: C, penalty, multi_class, rondom_state
# LinearSVC: C, penalty, multi_class, random_state
# SVC: C, kernel, decision_function_shape, random_state
# DecisionTreeClassifier: max_depth, random_state
# RandomForestClassifier: n_estimators, max_depth, random_state
# KNeighborsClassifier: n_neighbors

# ハイパーパラメータの候補値を設定（グリッドサーチ用）
model_params_grid = {
    LogisticRegression(): {
        "C": [10 ** i for i in range(-5, 5)],
        "penalty": ["l1", "l2"],
        "random_state": [0],
        "solver": ["liblinear"],  # The default solver changed from ‘liblinear’ to ‘lbfgs’ in version 0.22.
        # "multi_class": ["ovr", "multinomial"],  
        #         ValueError: Solver liblinear does not support a multinomial back
    },
    LinearSVC(): {
        "C": [10 ** i for i in range(-5, 5)],
        "multi_class": ["ovr", "crammer_singer"],
        "random_state": [0],
        # "penalty": ["l1", "l2"],  
        #         ValueError: Unsupported set of arguments: 
        #                 The combination of penalty='l1' and loss='squared_hinge' are not supported 
        #                 when dual=True,
        #                 Parameters: penalty='l1', loss='squared_hinge', dual=True
    },
    SVC(): {
        "C": [10**i for i in range(-5, 5)],
        "kernel": ["linear", "rbf", "poly", "sigmoid"],
        "decision_function_shape": ["ovr", "ovo"],
        "random_state": [0]
    },
    DecisionTreeClassifier(): {
        "max_depth": [i for i in range(1, 21)],
        "random_state": [0]
    },
    RandomForestClassifier(): {
        "n_estimators": [i for i in range(1, 21)],
        "max_depth": [i for i in range(1, 11)], # DexisionTreeClassifierより少ない値
        "random_state": [0]
    },
    KNeighborsClassifier(): {
        "n_neighbors": [i for i in range(1, 11)]
    }
}

# ハイパーパラメータの候補値を設定（ランダムリサーチ用）
model_params_random = {
    LogisticRegression(): {
        "C": scipy.stats.uniform(0.00001, 1000),
        "penalty": ["l1", "l2"],
        "random_state": scipy.stats.randint(0, 100),
        "solver": ["liblinear"],
    },
    LinearSVC(): {
        "C": scipy.stats.uniform(0.00001, 10000),
        "multi_class": ["ovr", "crammer_singer"],
        "random_state": scipy.stats.randint(0, 100),
    },
    SVC(): {
        "C": scipy.stats.uniform(0.00001, 1000),
        "kernel": ["linear", "rbf", "poly", "sigmoid"],
        "decision_function_shape": ["ovr", "ovo"],
        "random_state": scipy.stats.randint(0, 100)
    },
    DecisionTreeClassifier(): {
        "max_depth": scipy.stats.randint(1, 21),
        "random_state": scipy.stats.randint(0, 100)
    },
    RandomForestClassifier(): {
        "n_estimators": scipy.stats.randint(1, 21),
        "max_depth": scipy.stats.randint(1, 11), # DexisionTreeClassifierより少ない値
        "random_state": scipy.stats.randint(0, 100)
    },
    KNeighborsClassifier(): {
        "n_neighbors": scipy.stats.randint(1, 11)
    }
}

# パラメータ探索結果格納用の変数を初期化
score_max_grid = 0
model_best_grid = None
params_best_grid = None
score_max_random = 0
model_best_random = None
params_best_random = None

# パラメータ探索（グリッドサーチ）
start = time.time()
for model, params in model_params_grid.items():
    clf = GridSearchCV(model, params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = f1_score(y_test, y_pred, average="micro")
    if score_max_grid < score:
        score_max_grid = score
        model_best_grid = model.__class__.__name__
        params_best_grid = clf.best_params_
print("Grid Search (time): ", time.time() - start)
        
# パラメータ探索（ランダムサーチ）
start = time.time()
for model, params in model_params_random.items():
    clf = RandomizedSearchCV(model, params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = f1_score(y_test, y_pred, average="micro")
    if score_max_random < score:
        score_max_random = score
        model_best_random = model.__class__.__name__
        params_best_random = clf.best_params_
print("Random Search (time): ", time.time() - start)

print()
print("--- パラメータ探索結果(Grid Search) ---")
print("最適なモデル: {}".format(model_best_grid))
print("最適なパラメータ:{}".format(params_best_grid))
print("ベストスコア：", score_max_grid)
print()
print("--- パラメータ探索結果(Random Search) ---")
print("最適なモデル: {}".format(model_best_random))
print("最適なパラメータ:{}".format(params_best_random))
print("ベストスコア：", score_max_random)

Grid Search (time):  14.17858099937439
Random Search (time):  4.590849161148071

--- パラメータ探索結果(Grid Search) ---
最適なモデル: LogisticRegression
最適なパラメータ:{'C': 10, 'penalty': 'l1', 'random_state': 0, 'solver': 'liblinear'}
ベストスコア： 0.9736842105263158

--- パラメータ探索結果(Random Search) ---
最適なモデル: SVC
最適なパラメータ:{'C': 685.0787559981214, 'decision_function_shape': 'ovr', 'kernel': 'poly', 'random_state': 36}
ベストスコア： 0.9736842105263158
