# 12. Building a Pipeline
* Encoding categorical variables
* Filling in missing values
* Scaling non-binary variables
* Grid search to find optimal parameters

In [None]:
import pandas as pd
# Show up to 100 columns so wide tables are not truncated in the output.
pd.set_option('display.max_columns', 100)

# Load the data, then split the target off from the features:
# `pop` removes `SalePrice` from `housing` in place and returns it.
housing = pd.read_csv('../data/housing.csv')
price = housing.pop('SalePrice')
housing.head()

## Remove Id column
The `Id` column is merely a label for the rows. Let's drop it.

In [None]:
# `Id` is merely a row label; remove it from the feature table.
housing = housing.drop('Id', axis='columns')
housing.head()

## What is a scikit-learn pipeline?
A scikit-learn pipeline allows us to chain multiple transformations one after another. For instance, a pipeline would be a good idea to fill in missing values and then scale the data.

Let's see how this would work without a pipeline on just numeric data.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Imputer that fills missing values with each column's median.
si = SimpleImputer(strategy='median')
# Scaler with default settings (centers each column and scales it to unit variance).
ss = StandardScaler()

In [None]:
# Restrict the feature table to its numeric columns only.
h_num = housing.select_dtypes(include='number')
h_num.head()

In [None]:
# Plain NumPy arrays for scikit-learn: X holds the numeric features,
# y holds the `SalePrice` target that was popped off `housing` earlier.
X = h_num.values
y = price.values

In [None]:
# Manual two-step preprocessing: impute first, then scale the imputed data.
# NOTE: both transformers are fit on ALL rows here, so when X2 is
# cross-validated below, the held-out folds have already contributed to the
# imputation and scaling statistics.
X1 = si.fit_transform(X)
X2 = ss.fit_transform(X1)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Plain linear regression model used to evaluate the preprocessed features.
lr = LinearRegression()

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation on the already-transformed data; returns one score
# per fold (the estimator's default `score`, which is R^2 for a regressor).
cross_val_score(lr, X2, y, cv=10)

### Making a pipeline
Import the `Pipeline` and instantiate it with a list of 2-item tuples. The first item of each tuple is the **name** of the step and the second is the **transformer**.

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Each step is a (name, transformer) pair; steps are applied in list order.
steps = [
    ('impute', si),
    ('scale', ss),
]
pipe = Pipeline(steps=steps)

The following imputes missing values and then scales the data.

In [None]:
# Fit each step in order on the raw X and return the imputed, scaled array.
pipe.fit_transform(X)

### Adding a machine learning model to the pipeline
It is possible to add a machine learning model to the pipeline as long as it comes last. The final step only needs to implement `fit`, whereas every earlier step must also implement `transform`.

In [None]:
# Same preprocessing as before, now with the model as the final step.
steps = [
    ('impute', si),
    ('scale', ss),
    ('lr', lr),
]
pipe = Pipeline(steps=steps)

# The raw X is passed in: the whole pipeline is refit on each training fold,
# so imputation and scaling statistics come from the training rows only.
cross_val_score(pipe, X, y, cv=10)

### Grid search with a pipeline
You can grid search with a pipeline object as well. You just need to prefix the name of the parameter you would like to tune with the name of the step followed by two underscores. For instance, to tune the `strategy` parameter of the imputer (the step named `impute`), you must use the name `impute__strategy`.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Keys follow the pattern `<step name>__<parameter name>`.
param_grid = {
    'impute__strategy': ['mean', 'median'],
    'scale__with_mean': [True, False],
}
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)

In [None]:
# Evaluate every combination in the grid (2 x 2 = 4) with 10-fold CV each.
gs.fit(X, y)

In [None]:
# The parameter combination with the best mean cross-validated score.
gs.best_params_

## Pipeline within a `ColumnTransformer`

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# --- Numeric columns: fill missing values with the median, then standardize.
si = SimpleImputer(strategy='median')
ss = StandardScaler()

steps = [('impute', si), ('scale', ss)]
numeric_pipe = Pipeline(steps)

# --- Categorical columns: label missing values, then one-hot encode.
# NOTE: `si` and `steps` are rebound below; `numeric_pipe` keeps its own
# references to the objects created above, so it is unaffected.
si = SimpleImputer(strategy='constant', fill_value='UNKNOWN')
# `sparse_output` is the current name of the old `sparse` argument (renamed in
# scikit-learn 1.2; the old name was removed in 1.4 and now raises TypeError).
# `handle_unknown='ignore'` encodes a category that appears only in a held-out
# fold as all zeros instead of raising during cross-validation.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
steps = [('impute', si), ('ohe', ohe)]
cat_pipe = Pipeline(steps)

# Columns routed to each sub-pipeline. `OverallQual` is listed with the
# categorical columns, so it is one-hot encoded rather than scaled.
numeric_cols = ['LotFrontage', 'BsmtFinSF2']
cat_cols = ['OverallQual', 'MSZoning', 'Street', 'Neighborhood']

# Each entry is (name, transformer, columns); columns not listed in any entry
# are dropped by ColumnTransformer's default `remainder='drop'`.
transformers = [('numeric_pipe', numeric_pipe, numeric_cols),
                ('cat_pipe', cat_pipe, cat_cols)]
ct = ColumnTransformer(transformers)

# Full model: column-wise preprocessing followed by linear regression.
lr = LinearRegression()
final_pipe = Pipeline([('ct', ct), ('lr', lr)])

# The DataFrame itself is passed in (not `.values`) because the
# ColumnTransformer selects its columns by name.
cross_val_score(final_pipe, housing, y, cv=10)

## Grid Search on this Pipeline
Lots of underscores

# Exercises