<a href="https://colab.research.google.com/github/suumino/DataAlanysis/blob/main/Part_05_Preprocessing_and_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Preprocessing and Pipelines
=============================

In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                    digits.target)

Cross-validated pipelines including scaling, we need to estimate mean and standard deviation separately for each fold.
To do that, we build a pipeline.

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [4]:
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)
X_train_scale = standard_scaler.transform(X_train)
svm = SVC().fit(X_train_scale, y_train)

In [5]:
#pipeline = Pipeline([("scaler", StandardScaler()),
#                     ("svm", SVC())])
# in new versions:
pipeline = make_pipeline(StandardScaler(), SVC())

In [6]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])

In [7]:
pipeline.score(X_test, y_test)

0.9844444444444445

In [8]:
pipeline.predict(X_test)

array([5, 5, 9, 9, 7, 1, 4, 8, 7, 5, 4, 4, 8, 5, 4, 6, 3, 5, 7, 1, 4, 0,
       6, 1, 9, 9, 4, 7, 9, 3, 1, 9, 7, 7, 4, 8, 4, 7, 4, 5, 9, 1, 6, 7,
       3, 6, 6, 6, 3, 1, 8, 8, 8, 6, 8, 4, 4, 4, 1, 5, 9, 1, 8, 6, 0, 9,
       3, 5, 6, 1, 9, 1, 4, 4, 4, 3, 5, 8, 7, 5, 4, 2, 9, 9, 3, 8, 6, 8,
       8, 3, 7, 6, 1, 5, 0, 6, 8, 2, 9, 7, 5, 1, 1, 4, 0, 7, 1, 4, 6, 3,
       8, 6, 3, 1, 1, 2, 1, 1, 4, 9, 2, 7, 0, 4, 4, 2, 1, 9, 8, 6, 4, 4,
       1, 2, 5, 4, 3, 9, 4, 0, 8, 4, 0, 0, 8, 7, 6, 9, 3, 9, 6, 4, 2, 0,
       8, 3, 8, 9, 2, 5, 0, 5, 0, 5, 9, 1, 4, 7, 2, 2, 2, 3, 4, 0, 1, 3,
       5, 3, 0, 7, 8, 3, 1, 4, 6, 7, 5, 0, 5, 8, 7, 4, 8, 1, 2, 4, 1, 7,
       5, 8, 5, 1, 2, 9, 6, 4, 8, 4, 1, 0, 3, 0, 9, 9, 4, 6, 2, 5, 8, 2,
       6, 2, 7, 7, 0, 6, 0, 3, 1, 4, 1, 2, 1, 6, 6, 9, 1, 0, 0, 2, 6, 5,
       5, 4, 2, 6, 5, 2, 1, 0, 0, 0, 5, 6, 9, 5, 6, 2, 4, 3, 2, 1, 2, 8,
       5, 2, 4, 6, 5, 7, 7, 7, 4, 3, 9, 1, 4, 0, 3, 0, 4, 5, 2, 0, 2, 4,
       9, 4, 4, 1, 3, 4, 0, 3, 0, 6, 0, 9, 5, 0, 9,

Cross-validation with a pipeline
---------------------------------

In [9]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipeline, X_train, y_train)

array([0.97037037, 0.99259259, 0.9739777 , 0.98884758, 0.9739777 ])

Grid Search with a pipeline
===========================

In [10]:
import numpy as np
from sklearn.model_selection import GridSearchCV

param_grid = {'svc__C': 10. ** np.arange(-3, 3),
              'svc__gamma' : 10. ** np.arange(-3, 3),
              'standardscalar__with_mean' : [True, False],
             }

grid_pipeline = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=-1) 

In [11]:
grid_pipeline.fit(X_train, y_train)

ValueError: ignored

In [13]:
grid_pipeline.score(X_test, y_test)

TypeError: ignored