<h1><b>Algorithm chains and pipelines</b></h1>

In [2]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Load the breast cancer dataset and split it into training and test sets
# (random_state=0 keeps the split reproducible).
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# Compute the minimum and maximum on the training data only
scaler = MinMaxScaler().fit(X_train)

In [7]:
# Rescale the training data with the scaler fitted on it
X_train_scaled = scaler.transform(X_train)
svm = SVC()
# Learn an SVM on the scaled training data
svm.fit(X_train_scaled, y_train)
# Scale the test data with the same (training-fitted) scaler and score on it
X_test_scaled = scaler.transform(X_test)
print("The test score ", svm.score(X_test_scaled, y_test))

The test score  0.972027972027972


# Parameter Selection with Preprocessing

In [8]:
from sklearn.model_selection import GridSearchCV
# For illustration purposes only -- don't use this code!
# X_train_scaled was produced by a scaler fitted on the whole training set,
# so the held-out part of every cross-validation fold has already influenced
# the scaling (information leakage into the validation folds).
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid.fit(X_train_scaled, y_train)
print("Best cross-validation accuracy: ", grid.best_score_)
print("Best set score: ", grid.score(X_test_scaled, y_test))
print("best parameters: ", grid.best_params_)

Best cross-validation accuracy:  0.9812311901504789
Best set score:  0.972027972027972
best parameters:  {'C': 1, 'gamma': 1}


# Building Pipelines

In [9]:
from sklearn.pipeline import Pipeline
# Chain the scaler and the classifier; each step is a (name, estimator) pair.
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('svm', SVC())
])

In [10]:
# Fit the whole pipeline on the unscaled training data; scaling happens inside it.
pipe.fit(X_train, y_train)

In [11]:
# Score on the raw test data; the pipeline applies the fitted scaler before the SVM.
print("Test score: ", pipe.score(X_test, y_test))

Test score:  0.972027972027972


# Using Pipelines in Grid Searches

In [14]:
# Parameters of a pipeline step are addressed as "<step name>__<parameter>";
# 'svm' is the name given to the SVC step when the pipeline was built.
param_grid = {
    'svm__C':[0.001, 0.01, 0.1, 1, 10, 100],
    'svm__gamma':[0.001, 0.01, 0.1, 1, 10, 100]
}

In [15]:
# Grid-search the pipeline itself on the unscaled training data, so the
# scaler is refitted on the training part of each cross-validation split.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-valiation accuracy: ", grid.best_score_)
print ("Test set score: ", grid.score(X_test, y_test))
print("Best parameters: ", grid.best_params_)

Best cross-valiation accuracy:  0.9812311901504789
Test set score:  0.972027972027972
Best parameters:  {'svm__C': 1, 'svm__gamma': 1}


In [16]:
import numpy as np

In [17]:
# Synthetic data for the leakage demonstration: 100 samples with 10,000
# features of Gaussian noise, and a target drawn independently of X.
rand = np.random.RandomState(seed = 0)
X = rand.normal(size=(100, 10000))
y = rand.normal(size=(100,))

In [18]:
from sklearn.feature_selection import SelectPercentile, f_regression
# Select the top 5% of features using ALL samples, i.e. outside of any
# cross-validation -- this is the leaky step of the demonstration.
select = SelectPercentile(score_func=f_regression, percentile=5).fit(X,y)
# NOTE: despite its name, X_scaled holds the *selected* features (nothing is
# scaled here); the name is kept because a later cell refers to it.
X_scaled = select.transform(X)
print("X_selected.shape: ", X_scaled.shape)

X_selected.shape:  (100, 500)


In [19]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
# Cross-validate Ridge on features that were already selected using all of y.
# Ridge is a regressor, so the reported score is R^2 rather than an accuracy.
print(f"Croos-validation accuracy (cv only on ridge): {np.mean(cross_val_score(Ridge(), X_scaled,y,cv=5)):.2f}")

Croos-validation accuracy (cv only on ridge): 0.91


In [20]:
# Put the feature selection inside the pipeline so that it is refitted on the
# training part of every cross-validation split, and cross-validate on raw X.
pipe = Pipeline([
    ('select', SelectPercentile(score_func=f_regression, percentile=5)),
    ('ridge', Ridge())
])
print(f"cross_val accuracy (pipeline): {np.mean(cross_val_score(pipe, X, y, cv=5))}")

cross_val accuracy (pipeline): -0.24655422384952813


# The General Pipeline Interface

In [21]:
def fit(self, X, y):
    """Simplified sketch of how a pipeline is fitted.

    Every step except the last is fitted on, and then transforms, the data
    produced by the step before it; the last step is only fitted on the
    fully transformed data.

    Parameters
    ----------
    X : input data handed to the first step.
    y : targets, passed unchanged to every step.

    Returns
    -------
    self
    """
    data = X
    # all steps but the final one: fit, then pass the transformed data along
    for _, transformer in self.steps[:-1]:
        data = transformer.fit_transform(data, y)

    # the final step only needs to be fitted
    final_estimator = self.steps[-1][1]
    final_estimator.fit(data, y)
    return self

In [22]:
def predict(self, X):
    """Simplified sketch of how a pipeline predicts.

    Passes X through ``transform`` of every step except the last, then
    returns the last step's ``predict`` on the transformed data.

    Parameters
    ----------
    X : input data handed to the first step's ``transform``.

    Returns
    -------
    Whatever the final step's ``predict`` returns.
    """
    X_transformed = X
    for step in self.steps[:-1]:
        # iterate over all but the final step
        # transform the data (each step is a (name, estimator) pair)
        X_transformed = step[1].transform(X_transformed)
    # predict using the last step
    return self.steps[-1][1].predict(X_transformed)

## Convenient Pipeline Creation with make_pipeline

In [25]:
from sklearn.pipeline import make_pipeline
# standard syntax: every step is named explicitly
pipe_long = Pipeline([
    ('scaler', MinMaxScaler()),
    ('svm', SVC(C=100))
])
# abbreviated syntax: make_pipeline generates the step names itself
pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))

In [26]:
# Show the (name, estimator) pairs that make_pipeline generated
print("Pipeline steps: \n", pipe_short.steps)

Pipeline steps: 
 [('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]


## Accessing Step Attributes

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [29]:
# The same class is used for two steps here; the printed step names show how
# make_pipeline tells them apart.
pipe = make_pipeline(StandardScaler(), PCA(n_components=2), StandardScaler())
print("Pipeline steps: \n", pipe.steps)

Pipeline steps: 
 [('standardscaler-1', StandardScaler()), ('pca', PCA(n_components=2)), ('standardscaler-2', StandardScaler())]


In [30]:
# fit the pipeline defined before to the cancer dataset
pipe.fit(cancer.data)
# extract the first two principal components from the "pca" step
components = pipe.named_steps['pca'].components_
print("Components.shape: ", components.shape)


Components.shape:  (2, 30)


## Accessing Attributes in a Grid-Searched Pipeline

In [31]:
from sklearn.linear_model import LogisticRegression
# Scale the features, then classify with logistic regression
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [32]:
# make_pipeline names each step after its lowercased class name, so the
# LogisticRegression step must be addressed as 'logisticregression' here;
# a key that does not match a step name is rejected by the grid search.
param_grid = {'logisticregression__C': [0.001, 0.1,1,10,100]}

In [None]:
# Fix the seed so the split, and every score derived from it, is reproducible
# across runs; the value matches the split made at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)