In [1]:
%matplotlib inline

# Parameter tuning:

1. GridSearchCV exhaustively considers all parameter combinations
2. RandomizedSearchCV can sample a given number of candidates from a parameter space with a specified distribution

# Pointers for Parameter Search

## 1.Specify an objective metric
By default, parameter search uses the score function of the estimator to evaluate a parameter setting. These are the <b>sklearn.metrics.accuracy_score</b> for classification and <b>sklearn.metrics.r2_score</b> for regression. 
For some applications, other scoring functions are better suited (for example in unbalanced classification, the accuracy score is often uninformative). 
An alternative scoring function can be specified via the `scoring` parameter of both GridSearchCV and RandomizedSearchCV.

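## 2.Specifying multiple metrics for evaluation
GridSearchCV and RandomizedSearchCV also accept several metrics at once: pass `scoring` as a list of predefined scorer names or as a dict mapping a name to a scorer. In that case `refit` must be set to the name of the metric used to select `best_params_`.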

## 3.Model selection: development and evaluation
When evaluating the resulting model it is important to do it on held-out samples that were not seen during the grid search process: it is recommended to split the data into a development set (to be fed to the GridSearchCV instance) and an evaluation set to compute performance metrics.
This can be done by using the train_test_split utility function.
## 4.Parallelism
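Use the `n_jobs` parameter: the candidate fits are independent of each other, so `n_jobs=-1` runs them in parallel on all available CPU cores.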


# Parameter estimation using grid search with cross-validation


This example shows how a classifier is optimized by cross-validation,
which is done using the `sklearn.model_selection.GridSearchCV` object
on a development set that comprises only half of the available labeled data.

The performance of the selected hyper-parameters and trained model is
then measured on a dedicated evaluation set that was not used during
the model selection step.

More details on tools available for model selection can be found in the
sections on `cross_validation` and `grid_search`.




In [2]:
from __future__ import print_function

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

print(__doc__)

# Load the handwritten-digits dataset that ships with scikit-learn.
digits = datasets.load_digits()

# An estimator expects a 2-D (samples, features) matrix, so every image is
# flattened into a single row.
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target

# Development / evaluation split: half of the samples each, with a fixed seed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Candidate hyper-parameters, one sub-grid per kernel ('gamma' is only
# searched for the RBF kernel).
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
]

# Each entry is searched separately, using its macro-averaged scorer.
scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score, end="\n\n")

    # Search on the development half only; cv=5 is passed for the
    # cross-validation splitting.
    clf = GridSearchCV(
        estimator=SVC(),
        param_grid=tuned_parameters,
        scoring='%s_macro' % score,
        cv=5,
    )
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:", end="\n\n")
    print(clf.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")

    # One line per candidate: mean CV score and twice its standard deviation.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()

    print("Detailed classification report:", end="\n\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.", end="\n\n")

    # Predict on the held-out evaluation half, which the search never saw.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred), end="\n\n")

# Note: this problem is too easy. The hyper-parameter plateau is too flat, so
# the selected model is the same for precision and recall, with ties in
# quality.

Automatically created module for IPython interactive environment
# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}

Grid scores on development set:

0.986 (+/-0.016) for {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.959 (+/-0.029) for {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.026) for {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.025) for {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.988 (+/-0.017) for {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.982 (+/-0.025) for {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.975 (+/-0.014) for {'C': 1, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 10, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 100, 'kernel': 'linear'}
0.975 (+/-0.014) for {'C': 1000, 'kernel': 'linear'}

Detailed classification report:

The model 

# KNN Classifier

In [4]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid for the k-nearest-neighbours classifier.
# NOTE(review): 'algorithm' selects the neighbour-search backend; it presumably
# changes speed rather than predictions, so it may only triple the search
# cost -- confirm before keeping it in the grid.
tuned_parameters = [{'n_neighbors': [2, 5, 10, 20],
                     'weights': ['uniform', 'distance'],
                     'algorithm': ['auto', 'kd_tree', 'brute'],
                     'p': [1, 2, 4]}]

# Each entry is searched separately, using its macro-averaged scorer.
scores = ['precision', 'recall']

# X_train / y_train / X_test / y_test come from the train_test_split call in
# the earlier SVC cell; that cell must have been run first.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # GridSearchCV does not accept a `random_state` argument (passing one
    # raises TypeError): the search enumerates every combination in the grid,
    # so there is no sampling to seed. n_jobs=-1 runs the fits in parallel.
    clf = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=5,
                       scoring='%s_macro' % score, n_jobs=-1)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # One line per candidate: mean CV score and twice its standard deviation.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predict on the held-out evaluation half, which the search never saw.
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for precision

Best parameters set found on development set:

{'algorithm': 'auto', 'n_neighbors': 2, 'p': 2, 'weights': 'distance'}

Grid scores on development set:

0.969 (+/-0.016) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}
0.977 (+/-0.023) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 1, 'weights': 'distance'}
0.978 (+/-0.021) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 2, 'weights': 'uniform'}
0.988 (+/-0.021) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 2, 'weights': 'distance'}
0.977 (+/-0.024) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 4, 'weights': 'uniform'}
0.984 (+/-0.017) for {'algorithm': 'auto', 'n_neighbors': 2, 'p': 4, 'weights': 'distance'}
0.974 (+/-0.017) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
0.976 (+/-0.023) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
0.980 (+/-0.022) for {'algorithm': 'auto', 'n_neighbors': 5, 'p': 2, 'weigh

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        89
          1       0.97      1.00      0.98        90
          2       1.00      1.00      1.00        92
          3       0.95      0.99      0.97        93
          4       0.99      1.00      0.99        76
          5       0.98      0.94      0.96       108
          6       0.99      1.00      0.99        89
          7       1.00      1.00      1.00        78
          8       1.00      0.95      0.97        92
          9       0.96      0.96      0.96        92

avg / total       0.98      0.98      0.98       899




# Source:

1. https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html#sphx-glr-auto-examples-model-selection-plot-grid-search-digits-py