## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [15]:
import numpy as np
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV

In [2]:
# 讀取資料集
wine = datasets.load_wine()

In [3]:
X = wine.data
Y = wine.target
Y_Name = wine.target_names
Feature_Name = wine.feature_names
print("Data Shape = ", X.shape)
print("Data Type = ", X.dtype)
print("Feature Shape = ", len(Feature_Name))
print("Kind of Targer = ", len(np.unique(Y)))

Data Shape =  (178, 13)
Data Type =  float64
Feature Shape =  13
Kind of Targer =  3


In [4]:
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=4)

In [13]:
# 建立模型 (使用 5 顆樹，每棵樹的最大深度為 2)
clf = RandomForestClassifier(n_estimators=5, max_depth=2)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
print("Feature importance: ", clf.feature_importances_)

Accuracy:  0.9333333333333333
Feature importance:  [0.         0.         0.04392024 0.         0.09254301 0.11903802
 0.         0.         0.         0.22299793 0.05259198 0.29497563
 0.17393319]


In [14]:
clf.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 5,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [19]:
# 設定要訓練的超參數組合
n_estimators = [int(x) for x in np.linspace(10, 2000, 20)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

param_grid = dict(n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, bootstrap=bootstrap)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = RandomizedSearchCV(clf, param_grid, n_jobs=-1, verbose=1, cv = 10)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   19.0s finished


In [20]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 0.984962 using {'n_estimators': 1685, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 40, 'bootstrap': True}


In [21]:
# 使用最佳參數重新建立模型
clf_bestparam = RandomForestClassifier(n_estimators     =grid_result.best_params_['n_estimators'],
                                       min_samples_split=grid_result.best_params_['min_samples_split'],
                                       min_samples_leaf =grid_result.best_params_['min_samples_leaf'],
                                       max_features     =grid_result.best_params_['max_features'],
                                       max_depth        =grid_result.best_params_['max_depth'],
                                       bootstrap        =grid_result.best_params_['bootstrap'])
                                       
# 訓練模型
clf_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred = clf_bestparam.predict(x_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
print("Feature importance: ", clf_bestparam.feature_importances_)

Accuracy:  0.9777777777777777
Feature importance:  [0.13876026 0.02538488 0.00909557 0.03276995 0.02479493 0.05648724
 0.1328421  0.0081472  0.01758689 0.16964821 0.07703689 0.12388652
 0.18355935]
