# Pipeline


A series of transformers followed by a final estimator. Constructor: List of (name, transform) tuples (implementing fit/transform) that are chained, in the order in which they are chained, with the last object an estimator.

fit -> Fit all the transforms one after the other and transform the data, then fit the transformed data using the final estimator.

fit_transform -> Fit all the transforms one after the other and transform the data, then use fit_transform on transformed data using the final estimator. Fails if the final estimator doesn't have a fit_transform function.

predict -> Applies transforms to the data, and the predict method of the final estimator. Valid only if the final estimator implements predict.

### 1) Transformers
Link: https://scikit-learn.org/stable/data_transforms.html

Implements two functions: fit and fit_transform.

### 2) Estimators

Implements: fit and predict



In [34]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score

rng = np.random.RandomState(0)
dataset = load_boston()
X_full, y_full = dataset.data, dataset.target
n_samples = X_full.shape[0]
n_features = X_full.shape[1]

print(dataset.data.shape)

(506, 13)


In [31]:
# Estimate the score on the entire dataset, with no missing values
estimator = RandomForestRegressor(random_state=0, n_estimators=100)
score = cross_val_score(estimator, X_full, y_full).mean()
print("Score with the entire dataset = %.2f" % score)

Score with the entire dataset = 0.56


In [32]:
# Add missing values in 75% of the lines
missing_rate = 0.75
n_missing_samples = int(n_samples * missing_rate)
missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
                                      dtype=np.bool),
                             np.ones(n_missing_samples,
                                     dtype=np.bool)))
rng.shuffle(missing_samples)
missing_features = rng.randint(0, n_features, n_missing_samples)

# Estimate the score without the lines containing missing values
X_filtered = X_full[~missing_samples, :]
y_filtered = y_full[~missing_samples]
estimator = RandomForestRegressor(random_state=0, n_estimators=100)


score = cross_val_score(estimator, X_filtered, y_filtered).mean()
print("Score without the samples containing missing values = %.2f" % score)

Score without the samples containing missing values = 0.48


In [49]:
imp = Imputer()
imp.fit([[1, 3], [np.nan, 2], [8, 5.5]])

X = [[np.nan, 11], 
     [4,      np.nan], 
     [8,      2],
     [np.nan, 1]]
print(imp.transform(X))

[[  4.5  11. ]
 [  4.    3.5]
 [  8.    2. ]
 [  4.5   1. ]]


In [39]:
# Estimate the score after imputation of the missing values
X_missing = X_full.copy()
X_missing[np.where(missing_samples)[0], missing_features] = 0
y_missing = y_full.copy()


estimator = Pipeline([("imputer", Imputer(missing_values=0,
                                          strategy="mean",
                                          axis=0)),
                      ("forest", RandomForestRegressor(random_state=0,
                                                       n_estimators=100))])


estimator.fit(X_missing, y_missing)
# estimator.fit_transform(X_missing, y_missing)

score = cross_val_score(estimator, X_missing, y_missing).mean()
print("Score after imputation of the missing values = %.2f" % score)

Score after imputation of the missing values = 0.57


### Note
- A pipeline is also an estimator object

# Cross validation

Link: https://scikit-learn.org/stable/modules/cross_validation.html




In [36]:
import numpy as np
from sklearn.model_selection import KFold

some_data = ["a", "b", "c", "d"]
kfold = KFold(n_splits=2)

for train, test in kfold.split(some_data):
    print("%s %s" % (train, test))

[2 3] [0 1]
[0 1] [2 3]


In [50]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]
lasso = linear_model.Lasso()
print(cross_val_score(lasso, X, y, cv=3)) 

[ 0.33150734  0.08022311  0.03531764]


# Extras


### Stratified k-fold

Preserve the ratio of classes across the folds

In [51]:
from sklearn.model_selection import StratifiedKFold

X = np.ones(10)
y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
    print("%s %s" % (train, test))

[2 3 6 7 8 9] [0 1 4 5]
[0 1 3 4 5 8 9] [2 6 7]
[0 1 2 4 5 6 7] [3 8 9]


### GridSearchCV




In [48]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()

parameters = {'kernel':('linear', 'rbf'), 'C':[0.1, 1, 1.5, 2, 5, 10]}
svc = svm.SVC() # Support vector Machine

clf = GridSearchCV(svc, parameters, cv=5) 

clf.fit(iris.data, iris.target)

print(clf.best_params_)

{'kernel': 'rbf', 'C': 1.5}
