In [1]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
X = data.data
y = data.target

In [3]:
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits=1,
                  train_size=0.8,
                  test_size=0.2,
                  random_state=0)

train_index, test_index = next(ss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [4]:
from sklearn.decomposition import PCA

pca = PCA(whiten=True)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [5]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [6]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)



0.956140350877193

In [7]:
clf.fit(X_train_pca, y_train)
clf.score(X_test_pca, y_test)



0.9649122807017544

In [8]:
from sklearn.pipeline import Pipeline

estimators = [('pca', PCA(whiten=True)),
              ('clf', LogisticRegression())]
pipe = Pipeline(estimators)

In [9]:
pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))])

In [10]:
pipe.score(X_test, y_test)

0.9649122807017544

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

estimators = [('mms', MinMaxScaler()),
              ('clf', SVC(kernel='rbf', C=1e10))]

pipe = Pipeline(estimators)

In [12]:
pipe.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('mms', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', SVC(C=10000000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))])

In [13]:
pipe.score(X_test, y_test)

0.9824561403508771

In [14]:
estimators = [('pca', PCA(whiten=True)),
              ('clf', LogisticRegression())]
pipe = Pipeline(estimators)

In [15]:
from sklearn.model_selection import GridSearchCV

param = {'clf__C':[1e-5, 1e-3, 1e-2, 1, 1e2, 1e5, 1e10]}

gs = GridSearchCV(pipe, param)
gs.fit(X_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'clf__C': [1e-05, 0.001, 0.01, 1, 100.0, 100000.0, 10000000000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'clf__C': 1}, 0.9560439560439561, Pipeline(memory=None,
      steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='warn',
           tol=0.0001, verbose=0, warm_start=False))]))

In [17]:
gs.score(X_test, y_test)

0.9649122807017544

In [19]:
from sklearn.svm import SVC

C_range = [1e-3, 1e-2, 1, 1e2, 1e3]

param = {'clf__C':C_range,
         'clf__kernel':['linear', 'rbf'],
         'pca__whiten':[True, False],
         'pca__n_components':[30, 20, 10]}

estimators = [('pca', PCA()), ('clf', SVC())]

pipe = Pipeline(estimators)

from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(pipe, param, n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    2.0s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    5.3s finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))]),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'clf__C': [0.001, 0.01, 1, 100.0, 1000.0], 'clf__kernel': ['linear', 'rbf'], 'pca__whiten': [True, False], 'pca__n_components': [30, 20, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [20]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'pca__whiten': False,
  'pca__n_components': 10,
  'clf__kernel': 'linear',
  'clf__C': 0.01},
 0.9494505494505494,
 Pipeline(memory=None,
      steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=0.01, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
   kernel='linear', max_iter=-1, probability=False, random_state=None,
   shrinking=True, tol=0.001, verbose=False))]))

In [21]:
gs.score(X_test, y_test)

0.9298245614035088

In [22]:
gs.cv_results_



{'mean_fit_time': array([0.07829054, 0.0369854 , 0.02398205, 0.00732644, 0.0099895 ,
        0.01098951, 0.02264333, 0.01565067, 1.93223047, 0.01964776]),
 'std_fit_time': array([0.00590225, 0.02478886, 0.00388693, 0.0012466 , 0.00373744,
        0.00282648, 0.00800689, 0.00367884, 1.22204356, 0.00329674]),
 'mean_score_time': array([0.00266314, 0.00565489, 0.01198896, 0.00299788, 0.00366322,
        0.00632707, 0.00632803, 0.00532858, 0.00066606, 0.00099985]),
 'std_score_time': array([4.69684480e-04, 9.44721645e-04, 3.26341360e-03, 1.08386530e-06,
        3.08919978e-03, 5.43096734e-03, 4.70921149e-04, 4.70752210e-04,
        4.70976993e-04, 1.07214749e-06]),
 'param_pca__whiten': masked_array(data=[True, False, False, True, True, True, False, False,
                    False, False],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_pca__n_components': masked_arra