In [1]:
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = data.data
y = data.target

In [3]:
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=1, 
                  train_size=0.8, 
                  test_size=0.2, 
                  random_state=0)

train_index, test_index = next(ss.split(X, y))

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

In [4]:
from sklearn.decomposition import PCA

pca = PCA(whiten=True)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca  = pca.transform(X_test)

In [5]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [6]:
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.95614035087719296

In [7]:
clf.fit(X_train_pca, y_train)
clf.score(X_test_pca, y_test)

0.96491228070175439

In [8]:
from sklearn.pipeline import Pipeline

estimators = [('pca', PCA(whiten=True)), 
              ('clf', LogisticRegression())]
pipe = Pipeline(estimators)

In [9]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [10]:
pipe.score(X_test, y_test)

0.96491228070175439

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

estimators = [('mms', MinMaxScaler()), 
              ('clf', SVC(kernel='rbf', C=1e10))]
pipe = Pipeline(estimators)

In [12]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('mms', MinMaxScaler(copy=True, feature_range=(0, 1))), ('clf', SVC(C=10000000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [13]:
pipe.score(X_test, y_test)

0.98245614035087714

In [14]:
estimators = [('pca', PCA(whiten=True)), 
              ('clf', LogisticRegression())]
pipe = Pipeline(estimators)

In [15]:
from sklearn.model_selection import GridSearchCV

param = {'clf__C':[1e-5, 1e-3, 1e-2, 1, 1e2, 1e5, 1e10]} # clf.C

gs = GridSearchCV(pipe, param)
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'clf__C': [1e-05, 0.001, 0.01, 1, 100.0, 100000.0, 10000000000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [16]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'clf__C': 1},
 0.95604395604395609,
 Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=True)), ('clf', LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))]))

In [17]:
gs.score(X_test, y_test)

0.96491228070175439

In [18]:
from sklearn.svm import SVC

C_range = [1e-3, 1e-2, 1, 1e2, 1e3]

param = {'clf__C': C_range, 
         'clf__kernel': ['linear', 'rbf'], 
         'pca__whiten': [True, False], 
         'pca__n_components': [30, 20, 10]}

estimators = [('pca', PCA()),
              ('clf', SVC())]

pipe = Pipeline(estimators)


from sklearn.model_selection import RandomizedSearchCV

gs = RandomizedSearchCV(pipe, param, n_jobs=-1, verbose=2)
gs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] pca__whiten=False, pca__n_components=30, clf__kernel=rbf, clf__C=1000.0 
[CV] pca__whiten=False, pca__n_components=30, clf__kernel=rbf, clf__C=1000.0 
[CV] pca__whiten=False, pca__n_components=30, clf__kernel=rbf, clf__C=1000.0 
[CV] pca__whiten=False, pca__n_components=20, clf__kernel=rbf, clf__C=0.01 
[CV] pca__whiten=False, pca__n_components=20, clf__kernel=rbf, clf__C=0.01 
[CV] pca__whiten=False, pca__n_components=20, clf__kernel=rbf, clf__C=0.01 
[CV] pca__whiten=True, pca__n_components=20, clf__kernel=rbf, clf__C=0.001 
[CV] pca__whiten=True, pca__n_components=20, clf__kernel=rbf, clf__C=0.001 
[CV]  pca__whiten=False, pca__n_components=20, clf__kernel=rbf, clf__C=0.01, total=   0.0s
[CV] pca__whiten=True, pca__n_components=20, clf__kernel=rbf, clf__C=0.001 
[CV]  pca__whiten=False, pca__n_components=20, clf__kernel=rbf, clf__C=0.01, total=   0.0s
[CV]  pca__whiten=False, pca__n_components=30, clf__kernel=rbf, clf

[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:    0.2s remaining:    0.2s


[CV]  pca__whiten=False, pca__n_components=30, clf__kernel=linear, clf__C=1000.0, total=   7.7s
[CV]  pca__whiten=False, pca__n_components=30, clf__kernel=linear, clf__C=1000.0, total=  37.9s
[CV]  pca__whiten=False, pca__n_components=30, clf__kernel=linear, clf__C=1000.0, total= 1.6min


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.6min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'clf__C': [0.001, 0.01, 1, 100.0, 1000.0], 'clf__kernel': ['linear', 'rbf'], 'pca__whiten': [True, False], 'pca__n_components': [30, 20, 10]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=2)

In [19]:
gs.best_params_, gs.best_score_, gs.best_estimator_

({'clf__C': 1000.0,
  'clf__kernel': 'linear',
  'pca__n_components': 30,
  'pca__whiten': False},
 0.9516483516483516,
 Pipeline(steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False)), ('clf', SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False))]))

In [20]:
gs.score(X_test, y_test)

0.97368421052631582

In [21]:
gs.cv_results_

{'mean_fit_time': array([  3.72280280e-02,   2.48262882e-02,   3.00091108e-02,
          2.86669731e-02,   4.67691738e+01,   2.96607812e-02,
          3.36525440e-02,   2.96262900e-02,   1.69839859e-02,
          1.70950890e-02]),
 'mean_score_time': array([ 0.00635759,  0.00355029,  0.00470138,  0.01006961,  0.000772  ,
         0.00669702,  0.00297546,  0.00097466,  0.00160877,  0.00318503]),
 'mean_test_score': array([ 0.63736264,  0.63736264,  0.63736264,  0.63736264,  0.95164835,
         0.63736264,  0.63736264,  0.94945055,  0.93186813,  0.63736264]),
 'mean_train_score': array([ 1.        ,  0.63736176,  0.63736176,  1.        ,  0.98462741,
         0.63736176,  1.        ,  0.95824648,  1.        ,  0.63736176]),
 'param_clf__C': masked_array(data = [1000.0 0.01 0.001 100.0 1000.0 0.01 1 0.01 1000.0 0.001],
              mask = [False False False False False False False False False False],
        fill_value = ?),
 'param_clf__kernel': masked_array(data = ['rbf' 'rbf' 'rbf' '