In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run as written.
data = load_boston()

# Pin the seed so the train/test partition -- and therefore every score
# printed further down -- is reproducible under "Restart & Run All".
# The outputs recorded in this notebook came from an unseeded split, so a
# fresh run will show slightly different numbers.
X_train, X_test, y_train, y_test = train_test_split(
    data['data'], data['target'], random_state=42
)

In [2]:
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge

The pipeline we are going to set up is composed of the following tasks:

1. **Data Normalization**: in this tutorial we have selected three different normalization methods, including the QuantileTransformer ([check out the documentation](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-transformer)).
2. **Dimensionality Reduction**: we selected Principal Component Analysis (PCA) and a univariate feature selection algorithm as possible candidates.
3. **Regression**: we apply a simple regularized linear method, although the method is easily extendable to other learning algorithms.

In [3]:
# Reference only (kept commented out): the equivalent manual workflow without a Pipeline

# scaler = StandardScaler()
# pca = PCA()
# ridge = Ridge()

# X_train = scaler.fit_transform(X_train)
# X_train = pca.fit_transform(X_train)
# ridge.fit(X_train, y_train)

In [4]:
from sklearn.pipeline import Pipeline
# Chain the three stages into a single estimator. The step names
# ('scaler', 'reduce_dim', 'regressor') are the prefixes used later to
# address each stage's hyperparameters in the search grids.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),   # 1. normalise the features
    ('reduce_dim', PCA()),          # 2. reduce dimensionality
    ('regressor', Ridge()),         # 3. regularised linear regression
])

In [5]:
# Fit all pipeline stages on the training split (`fit` returns the same
# pipeline object, so no reassignment is needed), then evaluate on the
# held-out split.
pipe.fit(X_train, y_train)
test_score = pipe.score(X_test, y_test)
print(f'Testing score: {test_score}')

Testing score: 0.7088728909967148


In [6]:
# Inspect the fitted PCA stage: variance captured by each principal component.
# The step is looked up by its name rather than by its position in the pipeline.
fitted_pca = pipe.named_steps['reduce_dim']
print(fitted_pca.explained_variance_)

[6.16377199 1.43921168 1.271392   0.83419792 0.791169   0.6445408
 0.53901425 0.4109692  0.27807481 0.22318429 0.19856896 0.1741073
 0.06618935]


### Pipeline Tuning (Basic)

In [7]:
import numpy as np
# Candidate output sizes for the dimensionality-reduction step: 1, 2, ..., 10
# (the stop value 11 is exclusive).
n_features_to_test = np.arange(start=1, stop=11)

In [8]:
# Candidate Ridge regularisation strengths on a log2 scale:
# 2**-6, 2**-5, ..., 2**5 (twelve values).
alpha_exponents = np.arange(-6, 6)
alpha_to_test = np.power(2.0, alpha_exponents)

In [9]:
# Search grid for the basic tuning run. Each key follows the
# '<step name>__<parameter>' convention, so it targets a parameter of the
# matching pipeline step.
params = dict(
    reduce_dim__n_components=n_features_to_test,
    regressor__alpha=alpha_to_test,
)

In [10]:
from sklearn.model_selection import GridSearchCV
# Exhaustively cross-validate every combination in `params` on the training
# split, then report the score of the fitted search on the held-out split.
gridsearch = GridSearchCV(pipe, param_grid=params, verbose=1)
gridsearch.fit(X_train, y_train)
print(f'Final score is: {gridsearch.score(X_test, y_test)}')

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Final score is: 0.6833318874612437


### Pipeline Tuning (Advanced)

In [11]:
# Candidate transformers for the 'scaler' step; each is created with its
# default settings and will be swapped into the pipeline by the grid search.
scalers_to_test = [
    StandardScaler(),
    RobustScaler(),
    QuantileTransformer(),
]

In [12]:
# Extend the basic grid so the search also chooses the scaler: the bare
# 'scaler' key replaces the whole pipeline step with each candidate in turn,
# while the '<step>__<parameter>' keys tune parameters inside a step.
# NOTE(review): `params` is reassigned in the following cell before the next
# GridSearchCV call shown in this notebook, so this grid is not searched as-is.
params = {
    'scaler': scalers_to_test,
    'reduce_dim__n_components': n_features_to_test,
    'regressor__alpha': alpha_to_test
}

In [13]:
# Separate grids for PCA and SelectKBest: the two reducers expose their size
# under different parameter names (`n_components` vs. `k`), so each needs its
# own dict. GridSearchCV accepts a list of grids and explores each of them.
params = [
    # Grid 1: PCA as the dimensionality-reduction step.
    {'scaler': scalers_to_test,
     'reduce_dim': [PCA()],
     'reduce_dim__n_components': n_features_to_test,
     'regressor__alpha': alpha_to_test},

    # Grid 2: univariate feature selection (f_regression scores) instead.
    {'scaler': scalers_to_test,
     'reduce_dim': [SelectKBest(f_regression)],
     'reduce_dim__k': n_features_to_test,
     'regressor__alpha': alpha_to_test},
]

In [15]:
# Run the search over the current `params` grid(s) on the training split and
# report how the fitted search scores on the held-out split.
gridsearch = GridSearchCV(estimator=pipe, param_grid=params, verbose=1)
gridsearch.fit(X_train, y_train)
final_score = gridsearch.score(X_test, y_test)
print(f'Final score is: {final_score}')

Fitting 5 folds for each of 720 candidates, totalling 3600 fits




Final score is: 0.6918523723342485




In [16]:
gridsearch.best_params_

{'reduce_dim': PCA(n_components=10),
 'reduce_dim__n_components': 10,
 'regressor__alpha': 4.0,
 'scaler': RobustScaler()}