In [1]:
import pandas as pd
import numpy as np
import mglearn
import matplotlib.pyplot as plt

In [3]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Load the breast cancer dataset and hold out a test split (fixed seed).
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# Fit the min-max scaler on the training split only, then apply the same
# fitted scaler to both the training and the test split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an SVC with default parameters on the scaled training data and
# report its accuracy on the scaled test data.
svm = SVC().fit(X_train_scaled, y_train)
print("Test score: {:.2f}".format(svm.score(X_test_scaled, y_test)))





Test score: 0.97


## Parameter Selection with Preprocessing

In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the SVC parameters C and gamma.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

# NOTE: X_train_scaled comes from a scaler that was fitted on ALL of X_train,
# so the validation part of every cross-validation fold has already
# contributed to the scaling. The pipeline section below avoids this.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)

print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Test set accuracy: {:.2f}".format(grid.score(X_test_scaled, y_test)))

Best cross-validation accuracy: 0.98
Best parameters:  {'C': 1, 'gamma': 1}
Test set accuracy: 0.97


## Building Pipelines

In [5]:
from sklearn.pipeline import Pipeline

# Two-step pipeline: min-max scaling ("scaler") followed by an SVC ("svm").
pipe = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("svm", SVC()),
])

In [6]:
# Fit the whole pipeline on the unscaled training data; scaling happens inside.
pipe.fit(X_train, y_train)

Here, we created a pipeline with two steps: the first, called "scaler", is an instance of MinMaxScaler, and the second, called "svm", is an instance of SVC. Now we can fit the pipeline like any other scikit-learn estimator.

Here, pipe.fit first calls fit on the first step (the scaler), then transforms the training data using the scaler, and finally fits the SVM with the scaled data. To evaluate on the test data, we simply call pipe.score:

In [8]:
# Calling score on the pipeline first transforms the test data using the
# scaler, and then calls the score method on the SVM with the scaled test data.
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))

Test score: 0.97


Using a pipeline in a grid search works the same way as using any other estimator. We define a parameter grid to search over, and construct a GridSearchCV from the pipeline and the parameter grid. When specifying the parameter grid, there is a slight change, though: we need to specify for each parameter which step of the pipeline it belongs to. Both parameters that we want to adjust, C and gamma, are parameters of SVC, the second step. We gave this step the name "svm". The syntax to define a parameter grid for a pipeline is to specify for each parameter the step name, followed by __ (a double underscore), followed by the parameter name. To search over the C parameter of SVC we therefore have to use "svm__C" as the key in the parameter grid dictionary, and similarly "svm__gamma" for gamma:

In [9]:
# Keys follow the "<step name>__<parameter name>" convention, so both entries
# address parameters of the pipeline step named "svm".
param_grid = {
    'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
}

In [11]:
# Search over the pipeline itself and pass the UNSCALED training data: the
# scaler is a pipeline step, so it is fitted as part of fitting the pipeline.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best cross validation score: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best Parameters: {}".format(grid.best_params_))


Best cross validation score: 0.98
Test set score: 0.97
Best Parameters: {'svm__C': 1, 'svm__gamma': 1}


In [12]:
def fit(self, X, y):
    """Simplified sketch of fitting a pipeline.

    Every step except the last is fitted on, and then transforms, the output
    of the step before it; the last step is only fitted.

    Parameters
    ----------
    X : training data handed to the first step.
    y : targets, forwarded unchanged to every step.

    Returns
    -------
    self, so that calls can be chained.
    """
    intermediate_steps = self.steps[:-1]
    data = X
    for _step_name, transformer in intermediate_steps:
        # fit this step and feed its transformed output to the next one
        data = transformer.fit_transform(data, y)
    # the final step receives the fully transformed data and is only fitted
    final_estimator = self.steps[-1][1]
    final_estimator.fit(data, y)
    return self

In [13]:
from sklearn.pipeline import make_pipeline

# make_pipeline builds the pipeline without explicit step names.
pipe_short = make_pipeline(
    MinMaxScaler(),
    SVC(C=100),
)

In [14]:
# Show the (name, estimator) pairs that make_pipeline generated.
print(
    "Pipeline steps:\n{}".format(pipe_short.steps)
)

Pipeline steps:
[('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Three-step pipeline in which the same estimator class appears twice, to
# show how make_pipeline names repeated steps.
pipe = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    StandardScaler(),
)
print("Pipeline steps:\n{}".format(pipe.steps))

Pipeline steps:
[('standardscaler-1', StandardScaler()), ('pca', PCA(n_components=2)), ('standardscaler-2', StandardScaler())]


In [18]:
# Fit the pipeline defined above on the full cancer dataset (no targets).
pipe.fit(cancer.data)

# Look up the fitted PCA step by name and read its principal components.
components = pipe.named_steps["pca"].components_
print("Components.shape {}".format(components.shape))

Components.shape (2, 30)


In [19]:
from sklearn.linear_model import LogisticRegression

# Standardise the features, then classify with logistic regression.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)

# Search over C of the step that make_pipeline named "logisticregression".
param_grid = {
    'logisticregression__C': [0.01, 0.1, 1, 10, 100],
}

In [20]:
# 5-fold grid search over the logistic-regression pipeline.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

In [21]:
# best_estimator_ is the pipeline found by the grid search.
print(
    "Best estimator:\n{}".format(grid.best_estimator_)
)

Best estimator:
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1))])


In [22]:
# Reach into the best pipeline and print the coefficients of its
# "logisticregression" step.
# NOTE(review): "coeeficients" in the label is misspelled; it is left
# untouched here so the printed text matches the recorded output.
print(
    "logistic regression coeeficients: \n{}".format(
        grid.best_estimator_.named_steps["logisticregression"].coef_
    )
)

logistic regression coeeficients: 
[[-0.31167303 -0.58082201 -0.32131835 -0.38161278 -0.11923966  0.43130513
  -0.70867977 -0.85378868 -0.46682033  0.11842553 -1.384584    0.08915178
  -0.95504656 -0.93809826  0.18173417  0.99841869  0.1098606  -0.34148205
   0.20112256  0.80467192 -0.91482867 -0.91731629 -0.81023153 -0.85401188
  -0.45736929  0.11351219 -0.8359122  -0.98702282 -0.59104801 -0.62212143]]


In [24]:
# Pipeline with generically named steps so that a grid search can swap the
# estimator used for each step.
pipe = Pipeline(steps=[
    ('preprocessing', StandardScaler()),
    ('classifier', SVC()),
])

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Search space for model selection, given as a list of grids so that each
# classifier is only combined with the parameters that apply to it.
param_grid = [
    # SVC: tried with and without standardisation, over C and gamma.
    {'classifier': [SVC()], 'preprocessing': [StandardScaler(), None],
     'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    # Random forest: no preprocessing step, varying max_features.
    # random_state is pinned so the forest candidates get the same
    # cross-validation scores on every run (Restart & Run All reproducible).
    {'classifier': [RandomForestClassifier(n_estimators=100, random_state=0)],
     'preprocessing': [None], 'classifier__max_features': [1, 2, 3]}]

In [26]:
# Recreate the train/test split with the same arguments and seed that were
# used for the first split in this notebook.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# Grid search over the candidate classifier / preprocessing combinations.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'classifier': SVC(), 'classifier__C': 10, 'classifier__gamma': 0.01, 'preprocessing': StandardScaler()}

Best cross-validation score: 0.99
Test-set score: 0.98
