# GridSearchCV

下面來舉一個簡單的使用範例


In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC


# Load the digits data set as a feature matrix X and a label vector y.
X, y = datasets.load_digits(return_X_y=True)
n_samples = X.shape[0]
# One row per sample, all remaining dimensions flattened into features.
X = X.reshape(n_samples, -1)

# Hold out half of the samples; they are only used for the final report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=87
)

# Search space: one sub-grid per SVC kernel, each listing only the
# hyper-parameters it varies for that kernel.
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4, 'scale'],
        'C': [1, 10, 100, 1000],
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    },
    {
        'kernel': ['poly'],
        'gamma': ['scale', 'auto'],
        'degree': [2, 3, 4, 5, 6],
        'C': [1, 10, 100, 1000],
    },
]

# Scoring metrics to optimise; a separate grid search is run for each one.
scores = [
    'precision_macro',
    'recall_micro',
    'f1_weighted',
    'accuracy',
    'balanced_accuracy',
]

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    # Exhaustive search over every combination in `tuned_parameters`.
    classifier = GridSearchCV(SVC(), tuned_parameters, scoring=score)
    classifier.fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")

    # Per-candidate mean score and twice its standard deviation across folds.
    results = classifier.cv_results_
    means, stds = results['mean_test_score'], results['std_test_score']
    for mean, std, params in zip(means, stds, results['params']):
        spread = 2 * std
        print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))

    # Evaluate the best candidate found by the search on the held-out half.
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report)
    

# Tuning hyper-parameters for precision_macro 

Best parameters set found on development set: 

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

Grid scores on development set:

0.988 (+/-0.008) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.967 (+/-0.007) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.010) for {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
0.990 (+/-0.014) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.983 (+/-0.017) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.992 (+/-0.011) for {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
0.990 (+/-0.014) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.984 (+/-0.016) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.992 (+/-0.011) for {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
0.990 (+/-0.014) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.984 (+/-0.016) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.992 (+/-0.011) for {'C': 1000, 'gamma': 'scale', 'kernel': 'rbf'}
0.972 (+/-0.017) for {'C': 1, 'kernel':

## 可以查詢有哪些是可以使用的 scorer 


In [12]:
from sklearn import metrics

# Evaluated as the cell's last expression so the notebook displays the list of
# scorer-name strings that can be passed as `scoring=`.
metrics.get_scorer_names()


['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_

# XGBoost with GridSearchCV

* n_estimators：樹的數量
* eta [default = 0.3, 別名 : learning_rate]： 學習步長
* max_depth [default = 6]：樹的最大深度，越大越容易 overfitting
* min_child_weight [default = 1]：葉子節點所需的最小樣本權重和。值越大越能避免學到局部樣本（降低 overfitting），但如果太大會發生 underfitting
* lambda [default = 1, 別名 : reg_lambda]： L2 正則化權重，越大越能避免 overfitting
* alpha [default = 0, 別名 : reg_alpha]： L1 正則化權重，越大越能避免 overfitting
* seed：隨機數種子

更多參數請[參考官網](https://xgboost.readthedocs.io/en/latest/parameter.html#general-parameters)

下面給一個 XGBoost + Pipeline + GridSearchCV 的使用範例，
有其他需求可以拿他來改。


In [2]:
from xgboost import XGBClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Iris feature matrix and labels; 20% of the samples are held out for the
# final classification report.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=87
)

# Search space. The `classifier__` prefix routes each parameter to the
# pipeline step named 'classifier'.
tuned_parameters = [
    {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [1, 2, 3],
        'classifier__min_child_weight': [1, 2, 3],
    },
]

# Pipeline: PCA -> standardisation -> XGBoost classifier.
# NOTE(review): the scaler runs after PCA, so it standardises the principal
# components rather than the raw features -- confirm this order is intended.
estimators = [
    ('reduce_dim', PCA()),
    ('scaler', StandardScaler()),
    ('classifier', XGBClassifier()),
]
pipeline = Pipeline(steps=estimators)

# Scoring metrics to optimise; one grid search is run per entry.
scores = ['accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    # 3-fold cross-validated exhaustive search over the whole pipeline.
    classifier = GridSearchCV(pipeline, tuned_parameters, scoring=score, cv=3)
    classifier.fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")

    # Per-candidate mean score and twice its standard deviation across folds.
    results = classifier.cv_results_
    means, stds = results['mean_test_score'], results['std_test_score']
    for mean, std, params in zip(means, stds, results['params']):
        spread = 2 * std
        print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))

    # Evaluate the best pipeline found by the search on the held-out split.
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report)



# Tuning hyper-parameters for accuracy 

Best parameters set found on development set: 

{'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 100}

Grid scores on development set:

0.958 (+/-0.062) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 100}
0.958 (+/-0.062) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200}
0.958 (+/-0.062) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 300}
0.950 (+/-0.071) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 100}
0.950 (+/-0.071) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 200}
0.942 (+/-0.047) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 300}
0.942 (+/-0.047) for {'classifier__max_depth': 1, 'classifier__min_child

# LightGBM with GridSearchCV

更多參數請[參考官網](https://lightgbm.readthedocs.io/en/v3.3.2/Parameters.html)

下面也是給個使用範例



In [3]:
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Iris feature matrix and labels; 20% of the samples are held out for the
# final classification report.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=87
)

# Search space. The `classifier__` prefix routes each parameter to the
# pipeline step named 'classifier'.
tuned_parameters = [
    {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [1, 2, 3],
        'classifier__min_child_weight': [1, 2, 3],
    },
]

# Pipeline: PCA -> standardisation -> LightGBM classifier.
# NOTE(review): the scaler runs after PCA, so it standardises the principal
# components rather than the raw features -- confirm this order is intended.
estimators = [
    ('reduce_dim', PCA()),
    ('scaler', StandardScaler()),
    ('classifier', LGBMClassifier()),
]
pipeline = Pipeline(steps=estimators)

# Scoring metrics to optimise; one grid search is run per entry.
scores = ['accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    # 3-fold cross-validated exhaustive search over the whole pipeline.
    classifier = GridSearchCV(pipeline, tuned_parameters, scoring=score, cv=3)
    classifier.fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")

    # Per-candidate mean score and twice its standard deviation across folds.
    results = classifier.cv_results_
    means, stds = results['mean_test_score'], results['std_test_score']
    for mean, std, params in zip(means, stds, results['params']):
        spread = 2 * std
        print("%0.3f (+/-%0.03f) for %r" % (mean, spread, params))

    # Evaluate the best pipeline found by the search on the held-out split.
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(report)
    

# Tuning hyper-parameters for accuracy 

Best parameters set found on development set: 

{'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 100}

Grid scores on development set:

0.950 (+/-0.041) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 100}
0.950 (+/-0.041) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200}
0.950 (+/-0.041) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 300}
0.950 (+/-0.041) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 100}
0.950 (+/-0.041) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 200}
0.950 (+/-0.041) for {'classifier__max_depth': 1, 'classifier__min_child_weight': 2, 'classifier__n_estimators': 300}
0.942 (+/-0.047) for {'classifier__max_depth': 1, 'classifier__min_child

# RandomizedSearchCV

在很多時候我們並不知道哪些參數好，使用 RandomizedSearchCV，
可以讓搜尋空間是某個 distribution 。

下面是一個使用範例


In [4]:
import numpy as np
from scipy.stats import uniform
from sklearn import datasets
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC

# Prefer SciPy's own `loguniform`; fall back to the scikit-learn
# compatibility shim only when the installed SciPy does not provide it.
try:
    from scipy.stats import loguniform
except ImportError:
    from sklearn.utils.fixes import loguniform


# Load the digits data set as a feature matrix X and a label vector y.
X, y = datasets.load_digits(return_X_y=True)
n_samples = len(X)
# One row per sample, all remaining dimensions flattened into features.
X = X.reshape((n_samples, -1))

# Hold out half of the samples; they are only used for the final report.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=87)

# Search space: one entry per SVC kernel. Lists are sampled as discrete
# choices; scipy.stats distributions are sampled as continuous values.
tuned_parameters = [
    {'kernel': ['rbf'],
     'gamma': [1e-3, 1e-4, 'scale'],
     'C': uniform(loc=1, scale=999),                  # uniform distribution
     },
    {'kernel': ['linear'],
     'C': np.logspace(0, 3, base=10, num = 100),      # 100 log-spaced values, base^0 ~ base^3
     },
    {'kernel': ['poly'],
     'gamma': ['scale', 'auto'],
     'degree': [2, 3, 4, 5, 6],
     'C': loguniform(1e0, 1e3),                        # log-uniform distribution as well
     },
]

# Scoring metrics to optimise; a separate randomized search is run for each.
scores = ['precision_macro', 'recall_micro', 'f1_weighted',
          'accuracy', 'balanced_accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s \n" % score)

    # Fixed seed (the same one used for the train/test split) so the sampled
    # candidates, and therefore the reported best parameters, are identical on
    # every run. Without it each execution draws different values from the
    # distributions above and the notebook is not reproducible.
    classifier = RandomizedSearchCV(
        SVC(),
        tuned_parameters,
        scoring=score,
        random_state=87,
    ).fit(X_train, y_train)

    print("Best parameters set found on development set: \n")
    print(classifier.best_params_)
    print("\nGrid scores on development set:\n")

    # Per-candidate mean score and twice its standard deviation across folds.
    means = classifier.cv_results_['mean_test_score']
    stds = classifier.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, classifier.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    # Evaluate the best candidate found by the search on the held-out half.
    print("\nDetailed classification report:\n")
    y_pred = classifier.predict(X_test)
    print(classification_report(y_test, y_pred))

# Tuning hyper-parameters for precision_macro 

Best parameters set found on development set: 

{'C': 266.0116703856221, 'gamma': 'scale', 'kernel': 'rbf'}

Grid scores on development set:

0.972 (+/-0.017) for {'C': 2.6560877829466865, 'kernel': 'linear'}
0.984 (+/-0.016) for {'C': 378.95545570861424, 'gamma': 0.0001, 'kernel': 'rbf'}
0.984 (+/-0.016) for {'C': 215.07593482209631, 'gamma': 0.0001, 'kernel': 'rbf'}
0.992 (+/-0.011) for {'C': 266.0116703856221, 'gamma': 'scale', 'kernel': 'rbf'}
0.972 (+/-0.017) for {'C': 1.1497569953977358, 'kernel': 'linear'}
0.992 (+/-0.011) for {'C': 536.0789790408055, 'gamma': 'scale', 'kernel': 'rbf'}
0.985 (+/-0.011) for {'C': 91.85505459364764, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}
0.972 (+/-0.017) for {'C': 6.1359072734131725, 'kernel': 'linear'}
0.972 (+/-0.017) for {'C': 35.11191734215131, 'kernel': 'linear'}
0.977 (+/-0.007) for {'C': 1.7593630522585126, 'gamma': 0.0001, 'kernel': 'rbf'}

Detailed classification report:

         