In [1]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import clone

In [6]:
'''
載入資料集區分訓練和測試集
建立 pipeline
'''
cancer_df = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(cancer_df.data, cancer_df.target, random_state=123)
clf = make_pipeline(StandardScaler(), SVC(C=100))
print('steps:', clf.steps, sep='\n')
print('named_steps:', clf.named_steps, sep='\n')
model = clone(clf).fit(x_train, y_train)
print('score:', model.score(x_test, y_test))

steps:
[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]
named_steps:
{'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'svc': SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)}
score: 0.986013986013986


In [8]:
'''
配合 GridSearchCV 建立 pipeline
根據 steps 印出的模型加雙底線加參數名稱以設定參數範圍
但為何 GridSearchCV 結果反而比較差，有待查證 #------------ unsure
以及 best_score_ 和 score 的差別 #------------
'''
clf2 = make_pipeline(StandardScaler(), SVC()) # 不能事先設定參數
print(clf2.steps)
params = {'svc__C':[0.001,0.01,0.1,1,10,100,1000], 'svc__gamma':[0.001,0.01,0.1,1,10,100,1000]}
gridsearch_cv = GridSearchCV(clone(clf2), param_grid=params, cv=5, n_jobs=-1, scoring = 'accuracy') # n_jobs=-1:平行運算
gridsearch_cv.fit(x_train, y_train)
print('最佳參數組合：', gridsearch_cv.best_estimator_, sep='\n')
print('最佳參數組合分數：', gridsearch_cv.best_score_, sep='\n')
print(gridsearch_cv.best_estimator_['svc'].score(x_test, y_test))
print(gridsearch_cv.best_estimator_['svc'].predict(x_test))
print(gridsearch_cv.best_estimator_['svc'].decision_function(x_test))

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))]
最佳參數組合：
Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=10, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)
最佳參數組合分數：
0.9717920656634748
0.3776223776223776
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0