In [2]:
import numpy as np
import os
import datetime
import pandas as pd
import random
from scipy import stats
# from tqdm import tqdm
from matplotlib import pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from sklearn import preprocessing
from sklearn import datasets

In [21]:
from sklearn import model_selection
from sklearn import pipeline
from sklearn import linear_model
from sklearn import dummy
from sklearn import ensemble
from sklearn import metrics
from sklearn import cluster
from sklearn import decomposition

### Grid Search

In [4]:
# Load the iris data set. No explicit train/test split is made here: the
# search objects in the following cells cross-validate on the full data.
iris = datasets.load_iris()
features = iris.data
target = iris.target
# Despite its name, `ols` is a logistic-regression classifier (the name is kept
# because later cells reference it).
# The solver is pinned to 'liblinear' because the searches below try
# penalty='l1', which the newer default solver ('lbfgs') does not support.
# 'liblinear' is what the version that produced the recorded outputs
# (solver='warn') fell back to, so the results stay comparable.
# NOTE(review): recent scikit-learn releases deprecate 'liblinear' for
# multiclass targets -- confirm against the installed version before upgrading.
ols = linear_model.LogisticRegression(solver='liblinear')

In [5]:
# Search grid for the exhaustive search: both penalty types crossed with ten
# values of C spaced logarithmically between 10**0 and 10**4.
C = np.logspace(start=0, stop=4, num=10)
penalty = ['l1', 'l2']
hyperparameters = {'C': C, 'penalty': penalty}

In [7]:
# Exhaustive 5-fold cross-validated search over the `hyperparameters` grid.
gridsearch = model_selection.GridSearchCV(
    estimator=ols,
    param_grid=hyperparameters,
    cv=5,
    verbose=0,
)
# `best_model` holds whatever fit() returns for the search object.
best_model = gridsearch.fit(features, target)











In [8]:
# Read the winning settings off the best estimator found by the search.
chosen_params = best_model.best_estimator_.get_params()
print('Best Penalty: ', chosen_params['penalty'])
print('Best C: ', chosen_params['C'])

Best Penalty:  l1
Best C:  7.742636826811269


### Randomized Search

In [10]:
# Sampling space for the randomized search: C is drawn from a uniform
# distribution on [0, 4]; the penalty is picked from the two listed options.
C = stats.uniform(loc=0, scale=4)
penalty = ['l1', 'l2']
hyperparameters = {'C': C, 'penalty': penalty}

In [15]:
# Randomized search: 100 sampled candidates, 5-fold cross-validation, seeded
# sampling (random_state=1), run on all available cores.
random_search = model_selection.RandomizedSearchCV(
    estimator=ols,
    param_distributions=hyperparameters,
    n_iter=100,
    cv=5,
    random_state=1,
    verbose=0,
    n_jobs=-1,
)
best_model = random_search.fit(features, target)



In [16]:
# Report the penalty and C carried by the best estimator of the search.
winner_params = best_model.best_estimator_.get_params()
for label, key in (('Best Penalty: ', 'penalty'), ('Best C: ', 'C')):
    print(label, winner_params[key])

Best Penalty:  l1
Best C:  1.668088018810296


### Multiple Learning Algorithms

In [18]:
# Single-step pipeline. The estimator placed here is only a placeholder: every
# entry of `search_space` supplies its own 'classifier' candidate.
my_pipeline = pipeline.Pipeline([('classifier', ensemble.RandomForestClassifier())])
# Candidate algorithms, each paired with its own hyperparameter grid.
search_space = [
    {
        # solver='liblinear' is pinned because this grid tries penalty='l1',
        # which the newer default solver ('lbfgs') does not support.
        # NOTE(review): recent scikit-learn releases deprecate 'liblinear' for
        # multiclass targets -- confirm against the installed version.
        'classifier': [linear_model.LogisticRegression(solver='liblinear')],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__C': np.logspace(0, 4, 10),
    },
    {
        # Seed the forest so the selected model is reproducible across runs.
        'classifier': [ensemble.RandomForestClassifier(random_state=0)],
        'classifier__n_estimators': [10, 100, 1000],
        'classifier__max_features': [1, 2, 3],
    },
]

In [19]:
# 5-fold grid search across every algorithm/hyperparameter combination listed
# in `search_space`.
grid_search = model_selection.GridSearchCV(
    estimator=my_pipeline,
    param_grid=search_space,
    cv=5,
    verbose=0,
)
best_model = grid_search.fit(features, target)













In [20]:
# Display the estimator that occupies the pipeline's 'classifier' step in the
# best pipeline found by the search.
winning_classifier = best_model.best_estimator_.named_steps['classifier']
winning_classifier

LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

### Add Preprocessing Step

In [23]:
np.random.seed(0)
# Preprocessing step: a FeatureUnion of a standard scaler and a PCA.
my_pp = pipeline.FeatureUnion([
    ('std', preprocessing.StandardScaler()),
    ('pca', decomposition.PCA()),
])
# solver='liblinear' is pinned because the grid below tries penalty='l1', which
# the newer default solver ('lbfgs') does not support.
# NOTE(review): recent scikit-learn releases deprecate 'liblinear' for
# multiclass targets -- confirm against the installed version.
my_pipe = pipeline.Pipeline([
    ('preprocess', my_pp),
    ('classifier', linear_model.LogisticRegression(solver='liblinear')),
])
# Candidate values: number of PCA components crossed with the classifier's
# penalty type and regularisation parameter C.
search_space = [{
    'preprocess__pca__n_components': [1, 2, 3],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': np.logspace(0, 4, 10),
}]

In [24]:
# 5-fold grid search over preprocessing and classifier settings together,
# using all available cores.
clf = model_selection.GridSearchCV(
    estimator=my_pipe,
    param_grid=search_space,
    cv=5,
    verbose=0,
    n_jobs=-1,
)
best_model = clf.fit(features, target)



In [25]:
# Show every parameter of the best pipeline found by the search.
tuned_pipeline = best_model.best_estimator_
tuned_pipeline.get_params()

{'memory': None, 'steps': [('preprocess', FeatureUnion(n_jobs=None,
          transformer_list=[('std', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
     svd_solver='auto', tol=0.0, whiten=False))],
          transformer_weights=None)),
  ('classifier',
   LogisticRegression(C=7.742636826811269, class_weight=None, dual=False,
             fit_intercept=True, intercept_scaling=1, max_iter=100,
             multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
             solver='warn', tol=0.0001, verbose=0, warm_start=False))], 'preprocess': FeatureUnion(n_jobs=None,
        transformer_list=[('std', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False))],
        transformer_weights=None), 'classifier': LogisticRegression(C=7.742636826811269, class_w

### Parallelization

In [26]:
# solver='liblinear' is pinned because the grid tries penalty='l1', which the
# newer default solver ('lbfgs') does not support.
# NOTE(review): recent scikit-learn releases deprecate 'liblinear' for
# multiclass targets -- confirm against the installed version.
ols = linear_model.LogisticRegression(solver='liblinear')
penalty = ['l1', 'l2']
# 1000 values of C x 2 penalties = 2000 candidates; with cv=5 that is 10000 fits.
C = np.logspace(0, 4, 1000)
hyperparameters = dict(C=C, penalty=penalty)
# n_jobs=-1 spreads the fits over all available cores; verbose=1 prints progress.
grid_search = model_selection.GridSearchCV(ols, hyperparameters, cv=5, verbose=1, n_jobs=-1)
best_model = grid_search.fit(features, target)

Fitting 5 folds for each of 2000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  67 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 3067 tasks      | elapsed:   23.8s
[Parallel(n_jobs=-1)]: Done 7757 tasks      | elapsed:   56.1s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:  1.2min finished


Additionally, some algorithms have dedicated cross-validating estimators that speed up model selection, such as linear_model.LogisticRegressionCV, which identifies the optimal value of the hyperparameter C.

### Evaluate Performance After Model Selection
- Wrap the cross-validation used for model search in another cross-validation (nested cross-validation).
- The inner cross-validation selects the best model, while the outer cross-validation provides an unbiased evaluation of the model's performance.

In [27]:
# Inner search used for nested cross-validation: tune only C, over twenty
# values spaced logarithmically between 10**0 and 10**4, with 5-fold CV.
ols = linear_model.LogisticRegression()
C = np.logspace(start=0, stop=4, num=20)
hyperparameters = {'C': C}
grid_search = model_selection.GridSearchCV(
    estimator=ols,
    param_grid=hyperparameters,
    cv=5,
    verbose=0,
    n_jobs=-1,
)

In [28]:
# Nested cross-validation: the grid search (inner loop) is itself
# cross-validated (outer loop), and the outer scores are averaged.
# cv is pinned to 3 -- the fold count behind the recorded result -- rather than
# relying on the library default, which differs between scikit-learn versions.
model_selection.cross_val_score(grid_search, features, target, cv=3).mean()



0.9534313725490197

In [29]:
# Per-fold scores of the nested cross-validation (outer loop around the grid
# search). cv is pinned to 3 -- matching the three recorded scores -- rather
# than relying on the library default, which differs between scikit-learn
# versions.
model_selection.cross_val_score(grid_search, features, target, cv=3)



array([1.        , 0.90196078, 0.95833333])