In [1]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target

In [3]:
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=1, 
                  train_size=0.8, 
                  test_size=0.2, 
                  random_state=0)

train_index, test_index = next(ss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [4]:
from sklearn.decomposition import PCA

pca = PCA(whiten=True)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [11]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.956140350877193

In [12]:
clf.fit(X_train_pca, y_train)
clf.score(X_test_pca, y_test)

0.9649122807017544

In [13]:
# pipelineで一括処理
from sklearn.pipeline import Pipeline

estimators = [('pca', PCA(whiten=True)),
                   ('clf', LogisticRegression())]
pipe = Pipeline(estimators)

In [14]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.9649122807017544

In [15]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

estimators = [('mms', MinMaxScaler()),
                     ('clf', SVC(kernel='rbf', C=1e10))]
pipe = Pipeline(estimators)

In [16]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('mms', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', SVC(C=10000000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [17]:
pipe.score(X_test, y_test)

0.9824561403508771

# GridSearchとPipeLineの合わせ技

In [18]:
estimators = [('pca', PCA(whiten=True)), 
              ('clf', LogisticRegression())]
pipe = Pipeline(estimators)

In [21]:
from sklearn.model_selection import GridSearchCV

# paramの書き方が異なる。以下の形式にしないと、GridSearchCVに渡せない

# 形式
# estimatorsで指定した名前__指定したいパラメータ(clf__C)：パラメータ郡（リスト）[1e-5, 1e-3....]

param = {'clf__C':[1e-5, 1e-3, 1e-2, 1, 1e2, 1e5, 1e10]} # clf.C

gs = GridSearchCV(pipe, param) # GridSearchCVに直接pipeオブジェクトを渡せる！
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'clf__C': [1e-05, 0.001, 0.01, 1, 100.0, 100000.0, 10000000000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'clf__C': 1}, 0.9560439560439561, Pipeline(memory=None,
      steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))]))

In [24]:
from sklearn.svm import SVC

C_range = [1e-3, 1e-2, 1, 1e2, 1e3]

param = {
    'clf__C': C_range,
    'clf__kernel': ['linear', 'rbf'],
    'pca__whiten':[True, False],
    'pca__n_components': [30, 20, 10]
}

estimators = [('pca', PCA()), ('clf', SVC())]

pipe = Pipeline(estimators)


from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(pipe, param, n_jobs=2, verbose=2)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=2)]: Done  30 out of  30 | elapsed:  2.1min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
          fit_params=None, iid=True, n_iter=10, n_jobs=2,
          param_distributions={'clf__C': [0.001, 0.01, 1, 100.0, 1000.0], 'clf__kernel': ['linear', 'rbf'], 'pca__whiten': [True, False], 'pca__n_components': [30, 20, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [25]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'pca__whiten': True,
  'pca__n_components': 20,
  'clf__kernel': 'linear',
  'clf__C': 1},
 0.9626373626373627,
 Pipeline(memory=None,
      steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
   svd_solver='auto', tol=0.0, whiten=True)), ('clf', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False))]))

In [26]:
gs.score(X_test, y_test)

0.9649122807017544

In [27]:
gs.cv_results_



{'mean_fit_time': array([5.11577924e-02, 5.00655174e-03, 5.79767954e+01, 1.60410404e-02,
        4.67220942e-03, 1.57070160e-02, 4.00876999e-03, 5.68302472e-03,
        6.01228078e-03, 1.04143405e+00]),
 'std_fit_time': array([1.71552383e-02, 8.18899923e-04, 4.49236487e+01, 8.18773487e-04,
        4.69843493e-04, 1.25025104e-03, 1.18411894e-06, 4.73965595e-04,
        8.18876584e-04, 2.87414157e-01]),
 'mean_score_time': array([0.00952737, 0.00133681, 0.00100406, 0.00534749, 0.00066868,
        0.00601633, 0.00100358, 0.00133491, 0.0010035 , 0.00033458]),
 'std_score_time': array([1.20533805e-02, 4.73056388e-04, 6.25769923e-07, 4.73393763e-04,
        4.72831444e-04, 4.49566384e-07, 1.12391596e-07, 4.69178728e-04,
        0.00000000e+00, 4.73168619e-04]),
 'param_pca__whiten': masked_array(data=[True, True, False, False, False, False, True, True,
                    False, False],
              mask=[False, False, False, False, False, False, False, False,
                    False, Fal