In [58]:
from sklearn.datasets import load_iris

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn import svm

import numpy as np

In [71]:
# References
# https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
# http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html
# https://www.kdnuggets.com/2017/12/managing-machine-learning-workflows-scikit-learn-pipelines-part-1.html

In [59]:
# Load the iris dataset. No explicit train/test split is made here: the whole
# feature matrix and label vector are handed to the grid search, which does
# its own cross-validation splitting.
iris = load_iris()
X, y = iris.data, iris.target

In [65]:
# Construct pipelines
def _preprocessing_steps():
    """Return fresh (name, transformer) steps shared by every pipeline.

    A new list with new transformer instances is built on each call so the
    pipelines never share a StandardScaler or PCA object.
    """
    return [
        ('scl', StandardScaler()),
        ('pca', PCA()),
    ]


# Each pipeline is: standardise -> PCA -> classifier. The final step is named
# 'clf' in both so parameter grids can address it as 'clf__<param>'.
pipe_lr = Pipeline(_preprocessing_steps() + [('clf', LogisticRegression())])

pipe_svm = Pipeline(_preprocessing_steps() + [('clf', svm.SVC())])
    

In [61]:
# Parameters to use for grid search.
# Keys follow the '<pipeline step>__<parameter>' convention, so 'pca__...'
# targets the PCA step and 'clf__...' targets the final classifier step.

# Number of principal components to try (same candidates for both models).
_pca_dims = (2, 3, 4)

# Logistic regression: regularisation type and strength (C from 1e-2 to 1e10,
# one value per decade).
# NOTE(review): 'l1' is only accepted by some LogisticRegression solvers;
# confirm the default solver of the installed scikit-learn supports it.
param_lr = {
    'pca__n_components': _pca_dims,
    'clf__penalty': ('l1', 'l2'),
    'clf__C': np.logspace(start=-2, stop=10, num=13),
}

# SVM: C from 1e-2 to 1e10 and gamma from 1e-9 to 1e3, one value per decade.
# Ranges follow:
# https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
# http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html
param_svm = {
    'pca__n_components': _pca_dims,
    'clf__C': np.logspace(start=-2, stop=10, num=13),
    'clf__gamma': np.logspace(start=-9, stop=3, num=13),
}

In [66]:
# Registry of candidate models, keyed by a short identifier. Each entry is a
# (display name, pipeline, parameter grid) triple.
pipe_dict = {}
pipe_dict["lr"] = ('Logistic Regression', pipe_lr, param_lr)
pipe_dict["svm"] = ('Support Vector Machine', pipe_svm, param_svm)

In [67]:
# Keys of pipe_dict to evaluate, in evaluation order.
classifiers = [
    "lr",
    "svm",
]

# Cross-validation splitter passed to the grid search: 5 stratified shuffle
# splits, each holding out 20% of the samples; the seed is fixed so the
# splits are repeatable.
cv = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=42,
)

In [70]:
# Grid-search every candidate pipeline and remember the winner.
#   best_pipe - fitted pipeline of the best-scoring model so far
#   best_clf  - its display name
#   best      - its mean cross-validated score (None until a model is scored)
best_pipe = best_clf = best = None

for clf in classifiers:
    model, pipe, param_grid = pipe_dict[clf]
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv)
    grid.fit(X, y)

    print("{}: The best parameters are {} with a score of {:0.6f}".format(
          model, grid.best_params_, grid.best_score_)
    )
    # `best` is None on the first iteration. Ordering a float against None
    # raises TypeError on Python 3, so accept the first candidate explicitly.
    if best is None or grid.best_score_ > best:
        # GridSearchCV fits clones of `pipe`, so `pipe` itself stays unfitted
        # and keeps its default hyper-parameters. best_estimator_ is the
        # pipeline refitted on all of X, y with the winning parameters, which
        # is the object worth keeping.
        best_pipe = grid.best_estimator_
        best_clf = model
        best = grid.best_score_
    

Logistic Regression: The best parameters are {'clf__penalty': 'l1', 'clf__C': 10000.0, 'pca__n_components': 4} with a score of 0.953333
Support Vector Machine: The best parameters are {'clf__gamma': 0.10000000000000001, 'pca__n_components': 3, 'clf__C': 1.0} with a score of 0.980000


In [74]:
# Persist the pipeline chosen by the model-comparison loop.
# compress=1 asks joblib for light compression of the pickle.
output_path = 'best_pipeline.pkl'
joblib.dump(best_pipe, output_path, compress=1)

print('Saved {} pipeline to file'.format(best_clf))

Saved Support Vector Machine pipeline to file
