In [1]:
# All credit goes to Chris Albon's blog, which presents these techniques in a lucid manner:
# https://chrisalbon.com/#articles

In [2]:
# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Seed NumPy's global random number generator so repeated runs draw the same values
np.random.seed(seed=0)

In [3]:
# Load the iris dataset and unpack its feature matrix (X) and target labels (y)
iris = datasets.load_iris()
X, y = iris.data, iris.target

## Example of Cross Validation with cross_val_score

In [4]:
# Score a default LogisticRegression with 5-fold cross-validation; one score per fold is displayed
cross_val_score(estimator=LogisticRegression(), X=X, y=y, cv=5)

array([ 1.        ,  0.96666667,  0.93333333,  0.9       ,  1.        ])

In [5]:
# The per-fold scores are usually summarised by their average
cross_val_score(LogisticRegression(), X, y, cv=5).mean()

0.96000000000000019

In [6]:
# The scoring metric is 'accuracy'. We could change it to 'roc_auc', but because this problem has multiclass labels, it won't work here.

## Example of hyperparameter tuning with GridSearchCV

In [7]:
# Candidate hyperparameters: the regularisation penalty and the inverse
# regularisation strength C (10 log-spaced values from 1e0 to 1e4).
hyperparameters = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(0, 4, 10)}

# The solver is pinned to 'liblinear' because the grid contains penalty='l1'.
# scikit-learn >= 0.22 defaults to 'lbfgs', which rejects 'l1', so every l1
# candidate would otherwise fail to fit. 'liblinear' accepts both penalties and
# is the solver shown in the recorded output of this notebook.
# NOTE(review): recent scikit-learn releases appear to deprecate 'liblinear' for
# multiclass targets -- confirm against the installed version; 'saga' is the
# other solver that accepts both 'l1' and 'l2'.
clf = GridSearchCV(LogisticRegression(solver='liblinear'), hyperparameters, cv=5, verbose=0)

# Run the exhaustive 5-fold search over every (penalty, C) combination
best_fit = clf.fit(X, y)

# Display the best-scoring hyperparameter combination
best_fit.best_params_

{'C': 7.7426368268112693, 'penalty': 'l1'}

## Example of Pipeline along with GridSearchCV

In [8]:
# Single-step pipeline; the 'classifier' step is a placeholder that the grid
# search swaps out for each candidate estimator listed in search_space.
pipe = Pipeline([('classifier', LogisticRegression())])

# Create space of candidate learning algorithms and their hyperparameters.
# Each dict is an independent sub-grid: the 'classifier' key selects the
# estimator and the 'classifier__<name>' keys set that estimator's parameters.
# The logistic regression candidate pins solver='liblinear' because its grid
# contains penalty='l1'; the default solver in scikit-learn >= 0.22 ('lbfgs')
# rejects 'l1', so those candidates would otherwise fail to fit.
# NOTE(review): recent scikit-learn releases appear to deprecate 'liblinear' for
# multiclass targets -- confirm against the installed version.
search_space = [{'classifier': [LogisticRegression(solver='liblinear')],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1, 2, 3]}]

In [9]:
# Build a 5-fold grid search over the pipeline and its candidate search space
# (this rebinds the name `clf` to the new search object)
clf = GridSearchCV(estimator=pipe, param_grid=search_space, cv=5, verbose=0)

In [10]:
# Run the search across every candidate in search_space and keep the fitted result
best_model = clf.fit(X=X, y=y)

In [12]:
# Inspect the pipeline (classifier plus its settings) chosen by the grid search
best_model.best_estimator_

Pipeline(memory=None,
     steps=[('classifier', LogisticRegression(C=7.7426368268112693, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [11]:
# Show the winning parameter combination, including which classifier was selected
best_model.best_params_

{'classifier': LogisticRegression(C=7.7426368268112693, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 'classifier__C': 7.7426368268112693,
 'classifier__penalty': 'l1'}