# Imports

In [83]:
import os
import pickle
from shutil import rmtree
from tempfile import mkdtemp

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

from tpot import TPOTRegressor

In [84]:
# Render Pipelines and ColumnTransformers as diagrams when they are displayed
from sklearn import set_config
set_config(display='diagram')

# Pipelines

🎯 We are going to predict the charges of a health insurance contract based on various features using the following dataset.

💾 Download the dataset [here](https://wagon-public-datasets.s3.amazonaws.com/data_workflow.csv)

In [85]:
# Load the dataset from the local data/ folder and preview the first rows
data = pd.read_csv('data/data_workflow.csv')
data.head()

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [86]:
# (number of rows, number of columns) of the raw dataset
data.shape

(1338, 6)

Now let's define our `X` and our `y`, so that we have something to work with. We will also do a quick manual holdout split.

In [87]:
# Separate the features from the target we want to predict
X = data.drop(columns='charges')
y = data['charges']

# Hold out 20% of the rows for testing. The fixed seed keeps the split
# (and every score computed from it) identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1070, 5), (268, 5), (1070,), (268,))

Today we will: <br>
1. Impute missing values
2. Scale numerical features
3. Encode categorical features
4. Fine-tune our model and our preprocessing

All at once, in one single cell!! :D 

## Preprocessing Pipelines

### Pipeline

In [88]:
# Another look at the raw columns before building the preprocessing steps
data.head()

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19.0,27.9,0,True,southwest,16884.924
1,18.0,33.77,1,False,southeast,1725.5523
2,,33.0,3,False,southeast,4449.462
3,33.0,22.705,0,False,northwest,21984.47061
4,32.0,28.88,0,False,northwest,3866.8552


In [89]:
# Pipeline comes from sklearn.pipeline

# Preprocess the "age" column: fill missing values with the median, then standardise
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('standard_scaler', StandardScaler()),
])

# Fit on the training column, then apply the fitted steps to that same column
age_train = X_train[['age']]
pipeline.fit(age_train)
pipeline.transform(age_train)

array([[ 1.63280079],
       [-0.15381602],
       [ 0.41790136],
       ...,
       [ 1.41840678],
       [ 1.77573014],
       [ 1.48987145]])

We can now look at our pipeline, as well as access individual steps of it:

In [90]:
# Display the pipeline object itself
pipeline

In [91]:
# A Pipeline step can be retrieved either by position or by its name
print(pipeline[1]) # by index
print(pipeline['standard_scaler']) # by name

StandardScaler()
StandardScaler()


### Column Transformer ⑂

Column Transformers allow you to apply specific changes to specific columns in parallel

For this one we will:<br>
    1. Impute and scale numerical values<br>
    2. Encode categorical values

In [92]:
# ColumnTransformer comes from sklearn.compose

# Numerical branch: fill gaps with the column mean, then standardise
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="mean")),
    ('standard_scaler', StandardScaler()),
])

# Categorical branch: one-hot encode, ignoring categories unseen during fit
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each group of columns to its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num_transformer', num_transformer, ['age', 'bmi']),
    ('cat_transformer', cat_transformer, ['smoker', 'region']),
])

Let's check out our ColumnTransformer!

In [93]:
# Display the ColumnTransformer and its two branches
preprocessor

That's cool but how do we use it?

In [94]:
# Learn the preprocessing on the training set and apply it in one call
X_train_transformed = preprocessor.fit_transform(X_train)

# display() renders a DataFrame as a bordered table, unlike print()
print("Original training set")
display(X_train.head(3))

print("Preprocessed training set - Notice how we are missing feature names!")
display(pd.DataFrame(X_train_transformed).head(3))

Original training set


Unnamed: 0,age,bmi,children,smoker,region
722,62.0,37.4,0,False,southwest
84,37.0,34.8,2,True,southwest
327,45.0,36.48,2,True,northwest


Preprocessed training set - Notice how we are missing feature names!


Unnamed: 0,0,1,2,3,4,5,6,7
0,1.63292,1.102755,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.153703,0.673722,0.0,1.0,0.0,0.0,0.0,1.0
2,0.418016,0.950943,0.0,1.0,0.0,1.0,0.0,0.0


In [95]:
# Output column names, each prefixed with the name of the branch that produced it
preprocessor.get_feature_names_out()

array(['num_transformer__age', 'num_transformer__bmi',
       'cat_transformer__smoker_False', 'cat_transformer__smoker_True',
       'cat_transformer__region_northeast',
       'cat_transformer__region_northwest',
       'cat_transformer__region_southeast',
       'cat_transformer__region_southwest'], dtype=object)

In [96]:
# Label the transformed array with the names generated by the ColumnTransformer
feature_names = preprocessor.get_feature_names_out()
pd.DataFrame(X_train_transformed, columns=feature_names).head()

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__smoker_False,cat_transformer__smoker_True,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest
0,1.63292,1.102755,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.153703,0.673722,0.0,1.0,0.0,0.0,0.0,1.0
2,0.418016,0.950943,0.0,1.0,0.0,1.0,0.0,0.0
3,1.48999,-0.821294,1.0,0.0,0.0,0.0,1.0,0.0
4,0.346551,-1.463194,1.0,0.0,1.0,0.0,0.0,0.0


What about the `children` column? We can't leave it behind! :(

In [97]:
# Columns that are not listed in a branch (here "children") are forwarded
# untouched thanks to remainder='passthrough'
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, ['age', 'bmi']),
        ('cat_transformer', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)

preprocessor

In [98]:
# Fit on the training set, then show the result with readable column names
X_train_with_remainder = preprocessor.fit_transform(X_train)
pd.DataFrame(
    X_train_with_remainder,
    columns=preprocessor.get_feature_names_out(),
).head(3)

Unnamed: 0,num_transformer__age,num_transformer__bmi,cat_transformer__region_northeast,cat_transformer__region_northwest,cat_transformer__region_southeast,cat_transformer__region_southwest,cat_transformer__smoker_False,cat_transformer__smoker_True,remainder__children
0,1.63292,1.102755,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-0.153703,0.673722,0.0,0.0,0.0,1.0,0.0,1.0,2.0
2,0.418016,0.950943,0.0,1.0,0.0,0.0,0.0,1.0,2.0


### Custom: Function Transformer →

If you'd like to do your own thing, you can use custom transformers. These encapsulate a Python function into a transformer object. As it is a transformer, it still works with Pipelines and other transformers, like the ColumnTransformer we just saw.

In [99]:
# FunctionTransformer comes from sklearn.preprocessing

# Create a transformer that rounds data to 2 decimals (for instance!).
# Extra arguments are forwarded to the function through `kw_args` rather than
# wrapped in a lambda: a lambda cannot be pickled, which would prevent
# exporting any pipeline that contains this transformer.
rounder = FunctionTransformer(np.round, kw_args={'decimals': 2})

In [100]:
# Numerical branch: impute (default strategy), standardise, then round
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('rounder', rounder),
])

# Categorical branch: one-hot encode, dropping one column for binary features
cat_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, ['bmi', 'age']),
        ('cat_transformer', cat_transformer, ['region', 'smoker']),
    ],
    remainder='passthrough',
)
preprocessor

In [101]:
# Fit the new preprocessor and preview its output (numerical columns are rounded by the last step)
pd.DataFrame(preprocessor.fit_transform(X_train)).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.1,1.63,0.0,0.0,0.0,1.0,0.0,0.0
1,0.67,-0.15,0.0,0.0,0.0,1.0,1.0,2.0
2,0.95,0.42,0.0,1.0,0.0,0.0,1.0,2.0


### Feature Union ||

We can take this a step further by applying entire transformers in parallel!<br>
Let's do that and create a new feature as well: `bmi_age_ratio`.

In [102]:
# Raw training features, for reference before building the new feature
X_train.head(3)

Unnamed: 0,age,bmi,children,smoker,region
722,62.0,37.4,0,False,southwest
84,37.0,34.8,2,True,southwest
327,45.0,36.48,2,True,northwest


In [103]:
# FeatureUnion comes from sklearn.pipeline

def bmi_age_ratio(df):
    """Return a one-column DataFrame holding ``bmi / age`` for every row.

    The ratio is an arbitrary feature, built purely as an example.
    A missing ``age`` or ``bmi`` yields NaN in the result, since the division
    is applied directly to the columns it receives.
    """
    return pd.DataFrame(df["bmi"] / df["age"])


# A named function (rather than a lambda) keeps the transformer picklable
bmi_age_ratio_constructor = FunctionTransformer(bmi_age_ratio)

# Run both transformers on the same input and concatenate their outputs
union = FeatureUnion([
    ('preprocess', preprocessor), # columns 0-7
    ('bmi_age_ratio', bmi_age_ratio_constructor) # new column 8
])

union

### Summary with make_*** shortcuts

In [104]:
# from sklearn.pipeline import make_pipeline
# from sklearn.pipeline import make_union
# from sklearn.compose import make_column_transformer

# Explicit form: every step is given a name chosen by us
Pipeline([
    ('my_name_for_the_imputer', SimpleImputer()),
    ('my_name_for_the_scaler', StandardScaler())
])

In [105]:
# Shortcut form: we don't get to name the steps, names are generated automatically
make_pipeline(SimpleImputer(), StandardScaler())

Let's try and redo it all with this new method!

In [106]:
# Build each branch with the make_* helpers (step names are generated for us)
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
cat_transformer = OneHotEncoder()

numerical_branch = (num_transformer, ['age', 'bmi'])
categorical_branch = (cat_transformer, ['smoker', 'region'])
preproc_basic = make_column_transformer(
    numerical_branch,
    categorical_branch,
    remainder='passthrough',
)

# Append the custom bmi/age feature next to the preprocessed columns
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

Cool, right? The upside is that it's much faster to write and implement, but the big downside is that you can't select your own names for the transformers, which may be something that you care about doing.<br>
We can also use `make_column_selector` to select the features to be used based on their `dtype`.

In [107]:
# from sklearn.compose import make_column_selector

# Select columns by dtype instead of listing their names by hand
num_col = make_column_selector(dtype_include=['float64'])
cat_col = make_column_selector(dtype_include=['object','bool'])

# Check the dtypes the selectors will be matched against
X_train.dtypes

age         float64
bmi         float64
children      int64
smoker         bool
region       object
dtype: object

### Complete Preprocessing Pipeline

In [108]:
# make_column_selector comes from sklearn.compose

# Numerical branch: impute then scale every float64 column
num_col = make_column_selector(dtype_include=['float64'])
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())

# Categorical branch: one-hot encode every object / bool column
cat_col = make_column_selector(dtype_include=['object','bool'])
cat_transformer = OneHotEncoder()

# Apply both branches side by side; columns matched by neither selector pass through
preproc_basic = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough',
)

# Add the custom bmi/age feature alongside the preprocessed columns
preproc_full = make_union(preproc_basic, bmi_age_ratio_constructor)
preproc_full

## Including Models in Pipelines

In [109]:
# Ridge comes from sklearn.linear_model

# Column selectors, by dtype
float_columns = make_column_selector(dtype_include=['float64'])
categorical_columns = make_column_selector(dtype_include=['object','bool'])

# Impute then scale numerical values
num_transformer = make_pipeline(SimpleImputer(), StandardScaler())
# Encode categorical values
cat_transformer = OneHotEncoder()

# Use the ColumnTransformer to run both branches in parallel
preproc = make_column_transformer(
    (num_transformer, float_columns),
    (cat_transformer, categorical_columns),
    remainder='passthrough',
)

# Chain the preprocessing with the estimator so both are fitted together
pipeline = make_pipeline(preproc, Ridge())
pipeline

Because a Pipeline exposes the methods of its last step (here the `Ridge` estimator), I can do this:

In [110]:
# Train Pipeline (fits the preprocessing and the model in one call)
pipeline.fit(X_train,y_train)

# Make predictions (not the last expression of the cell, so the result is not displayed)
pipeline.predict(X_test.iloc[0:1])

# Score model on the held-out test set
pipeline.score(X_test,y_test)

0.7581563627208925

### Cross-validate Pipeline

In [111]:
# cross_val_score comes from sklearn.model_selection

# 5-fold cross-validation of the whole pipeline; report the mean R² across folds
fold_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
fold_scores.mean()

0.7398977387417156

### Gridsearch a Pipeline

Believe it.. Grid Search is also possible!

In [112]:
# Which parameters of the pipeline are GridSearch-able? Uncomment the line below to list every parameter name you can search over
# pipeline.get_params()

In [113]:
# GridSearchCV comes from sklearn.model_selection

# We can access ANY component of the Pipeline and ANY of its hyperparameters.
# Names are chained with double underscores (also referred to as "dunder").
param_grid = {
    # step -> pipeline -> transformer -> strategy
    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
    'ridge__alpha': [0.1, 0.5, 1, 5, 10],
}

# Search over the whole pipeline instead of a bare model
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring="r2")

grid_search.fit(X_train, y_train)
grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

💾 Let's save the pipelined model with the best hyperparameters.

In [114]:
# Keep the pipeline that the search selected as the best one
pipeline_tuned = grid_search.best_estimator_
pipeline_tuned

🔮 We can use this "best" model for predictions without re-training it!

In [115]:
# Predict the charges for the first row of the test set
pipeline_tuned.predict(X_test.iloc[0:1])

array([4652.06632974])

An extra tip for you guys:

### Cache to avoid repeated computations

Are your preprocessing steps too long to run?

You can use caching techniques!

In [116]:
# mkdtemp comes from tempfile

# Create a temp folder that the pipeline will use as its cache
cache_dir = mkdtemp()

# Instantiate the Pipeline with the cache parameter
pipeline = make_pipeline(preproc_basic, Ridge(), memory=cache_dir)

# Instantiate a GS on the cached pipeline
grid_search = GridSearchCV(
    pipeline,
    param_grid={
        # Access any component of the Pipeline
        # and ANY available hyperparameter you want to optimize - These are double
        # underscores also referred to as "dunder"
        'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
        'ridge__alpha': [0.1, 0.5, 1, 5, 10]
    },
    cv=5,
    scoring="r2")

# The cache folder is deliberately NOT removed here: the search has not been
# fitted yet, so deleting it now would discard the cache before it is used.
# It is cleared after the search has run.

In [117]:
# Run the search on the pipeline that was given a cache folder, then show the best combination
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'columntransformer__pipeline__simpleimputer__strategy': 'median',
 'ridge__alpha': 1}

In [118]:
# Clear the cache directory now that the cross-validated search is done
rmtree(cache_dir)

###  Debug your pipe

In [119]:
# Access the components of a Pipeline with `named_steps`
# (a mapping from step name to the step object)
pipeline_tuned.named_steps.keys()

dict_keys(['columntransformer', 'ridge'])

In [120]:
# Check intermediate steps
fitted_preprocessor = pipeline_tuned.named_steps["columntransformer"]

print("Before preprocessing, X_train.shape = ")
print(X_train.shape)
print("After preprocessing, X_train_preprocessed.shape = ")
# Use `transform`, not `fit_transform`: the step is already fitted, and
# inspecting it should not re-fit a component of the tuned pipeline
fitted_preprocessor.transform(X_train).shape

Before preprocessing, X_train.shape = 
(1070, 5)
After preprocessing, X_train_preprocessed.shape = 


(1070, 9)

### Exporting models/Pipelines

💾 You can export your final model/pipeline as a pickle file so you can load them into your python file or your notebook so you can use it directly as it is.

👉The file can then be loaded back into a notebook or deployed on a server (see ML Ops module).

In [121]:
# pickle is imported at the top of the notebook

# Export Pipeline as pickle file
with open("pipeline.pkl", "wb") as file:
    pickle.dump(pipeline_tuned, file)

# Load Pipeline from pickle file; the context manager closes the file handle
# NOTE: only unpickle files you trust - pickle.load can execute arbitrary code
with open("pipeline.pkl", "rb") as file:
    my_pipeline = pickle.load(file)

# The reloaded pipeline is ready to use without re-training
my_pipeline.score(X_test, y_test)

0.7582138256450727

# That cool thing I told you about 🎉

**Auto ML**

So here we have our training and testing sets.

In [122]:
# import os
# from tpot import TPOTRegressor

# Fit the preprocessing on the training set only, then apply it to both sets
X_train_preproc = preproc_basic.fit_transform(X_train)
X_test_preproc = preproc_basic.transform(X_test)

In [123]:
# Instantiate TPOTRegressor (this is a regression task)
tpot = TPOTRegressor(
    generations=4,        # number of search iterations (comparable to epochs)
    population_size=20,   # candidate pipelines kept in each generation
    verbosity=2,          # how much progress information is printed
    scoring='r2',
    n_jobs=-1,
    cv=2,
    random_state=42,      # fixed seed so the search can be reproduced
)

# Process autoML with TPOT
tpot.fit(X_train_preproc, y_train)

# Print score
print(tpot.score(X_test_preproc, y_test))


Optimization Progress:   0%|          | 0/100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8428341848382512

Generation 2 - Current best internal CV score: 0.8428341848382512

Generation 3 - Current best internal CV score: 0.844320128749735

Generation 4 - Current best internal CV score: 0.8446595024972126

Best pipeline: AdaBoostRegressor(SGDRegressor(CombineDFs(input_matrix, input_matrix), alpha=0.0, eta0=0.01, fit_intercept=True, l1_ratio=0.5, learning_rate=invscaling, loss=epsilon_insensitive, penalty=elasticnet, power_t=100.0), learning_rate=0.001, loss=linear, n_estimators=100)
0.8704595162969838




In [124]:
# Export TPOT Pipeline to a Python file
tpot.export(os.path.join(os.getcwd(),'tpot_iris_pipeline.py'))

! cat 'tpot_iris_pipeline.py'

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8446595024972126
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    StackingEstimator(estimator=SGDRegressor(alpha=0.0, eta0=0.01, fit_intercept=True,