# Managing Machine Learning Workflows with Scikit-learn Pipelines

### Source
* Part 1: https://www.kdnuggets.com/2017/12/managing-machine-learning-workflows-scikit-learn-pipelines-part-1.html
* Part 2: https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-2.html
* Part 3: https://www.kdnuggets.com/2018/01/managing-machine-learning-workflows-scikit-learn-pipelines-part-3.html

### 使用 Pipeline 看三個不同的模型的結果

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package; fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree

# Load the iris data and hold out 20% of it as a test split (fixed seed)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Construct some pipelines.
# Each step is a (name, object) tuple, and the steps are listed in the
# order in which they are applied.
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=42))
])

pipe_svm = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', svm.SVC(random_state=42))
])

pipe_dt = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', tree.DecisionTreeClassifier(random_state=42))
])

# List of pipelines for ease of iteration
pipelines = [pipe_lr, pipe_svm, pipe_dt]

# Dictionary mapping a pipeline's index in `pipelines` to a readable name
pipe_dict = {0: 'Logistic Regression',
             1: 'Support Vector Machine',
             2: 'Decision Tree'}

# Fit the pipelines
for pipe in pipelines:
    pipe.fit(X_train, y_train)

# Score every pipeline on the test split exactly once and reuse the numbers below
test_scores = [fitted.score(X_test, y_test) for fitted in pipelines]

# Compare accuracies
for idx, acc in enumerate(test_scores):
    print("%s pipeline test accuracy: %.3f" % (pipe_dict[idx], acc))

# Identify the most accurate model on test data.
# max() returns the first maximal item, so ties go to the earliest pipeline,
# and a fitted pipeline is always selected (never a placeholder value).
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]

print("Classifier with best accuracy: %s" % pipe_dict[best_clf])

# Save pipeline to file
joblib.dump(best_pipe, "best_pipeline.pkl", compress=1)
print("Save %s pipeline to file" % pipe_dict[best_clf])

Logistic Regression pipeline test accuracy: 0.933
Support Vector Machine pipeline test accuracy: 0.900
Decision Tree pipeline test accuracy: 0.867
Classifier with best accuracy: Logistic Regression
Save Logistic Regression pipeline to file




### 使用一個 Pipeline

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn import tree

# Load the iris data and hold out 20% of it as a test split (fixed seed)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Assemble the pipeline: scaler, then 2-component PCA, then a decision tree
pipe = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', tree.DecisionTreeClassifier(random_state=42)),
])

# Fit all steps on the training split
pipe.fit(X_train, y_train)

# Accuracy of the fitted pipeline on the held-out split
test_accuracy = pipe.score(X_test, y_test)
print("Test accuracy: %.3f" % test_accuracy)

# Note: `pipe.steps` is the list of (name, object) tuples given above, so the
# classifier is the third step ([2]) and the estimator object is the second
# item ([1]) of that tuple.
step_name, classifier = pipe.steps[2]
print("\nModel hyperparameters:\n", classifier.get_params())

Test accuracy: 0.867

Model hyperparameters:
 {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': 42, 'splitter': 'best'}


### 把上面的加上 Grid search

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV # 注意這裡
from sklearn import tree

# Load the iris data and hold out 20% of it as a test split (fixed seed)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Construct pipeline
pipe = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', tree.DecisionTreeClassifier(random_state=42))
])

param_range = [1, 2, 3, 4, 5]

# Set grid search params.
# Keys are parameter names and values are the lists of candidates to try.
# Key format: <step name>__<parameter name>
grid_params = [{
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': param_range,
    'clf__max_depth': param_range,
    # Skip the leading 1: the smallest integer min_samples_split is 2
    'clf__min_samples_split': param_range[1:],
}]

# `presort` does not affect accuracy, only run time. It was deprecated in
# scikit-learn 0.22 and removed in 0.24, where putting it in the grid makes
# fit() fail with an invalid-parameter error, so search over it only when the
# installed DecisionTreeClassifier still exposes it.
if 'clf__presort' in pipe.get_params():
    grid_params[0]['clf__presort'] = [True, False]

# Construct grid search (10-fold cross-validation, scored by accuracy)
gs = GridSearchCV(estimator=pipe,
                  param_grid=grid_params,
                  scoring="accuracy",
                  cv=10)

# Fit using grid search
gs.fit(X_train, y_train)

# Best mean cross-validated accuracy
print("Best accuracy: %.3f" % gs.best_score_)

# Best params
print("\nBest params:\n", gs.best_params_)

Best accuracy: 0.925

Best params:
 {'clf__criterion': 'gini', 'clf__max_depth': 2, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__presort': True}




### 用 Pipeline 和 GridSearchCV 跑許多不同的 model

In [4]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package; fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm


def _make_grid_search(pipeline, param_grid, **extra):
    """Return a 10-fold, accuracy-scored GridSearchCV over `pipeline`.

    `extra` is forwarded unchanged to GridSearchCV (used here for `n_jobs`),
    so searches built without it keep the library's own defaults.
    """
    return GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring="accuracy",
        cv=10,
        **extra
    )


# Load the iris data and hold out 20% of it as a test split (fixed seed)
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

# Construct some pipelines: each classifier with and without a 2-component PCA step
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(random_state=42))
])

pipe_lr_pca = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=42))
])

pipe_rf = Pipeline([
    ('scl', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42))
])

pipe_rf_pca = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', RandomForestClassifier(random_state=42))
])

pipe_svm = Pipeline([
    ('scl', StandardScaler()),
    ('clf', svm.SVC(random_state=42))
])

pipe_svm_pca = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', svm.SVC(random_state=42))
])

# Set grid search params
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]

grid_params_lr = [{
    'clf__penalty': ['l1', 'l2'],
    'clf__C': param_range_fl,
    'clf__solver': ['liblinear']
}]

grid_params_rf = [{
    'clf__criterion': ['gini', 'entropy'],
    'clf__min_samples_leaf': param_range,
    'clf__max_depth': param_range,
    # Skip the leading 1: the smallest integer min_samples_split is 2
    'clf__min_samples_split': param_range[1:]
}]

grid_params_svm = [{
    'clf__kernel': ['linear', 'rbf'],
    'clf__C': param_range
}]

# Construct grid searches.
# n_jobs=-1 is passed to the random-forest and SVM searches only; the two
# logistic-regression searches are built without n_jobs, as in the original.
jobs = -1

gs_lr = _make_grid_search(pipe_lr, grid_params_lr)
gs_lr_pca = _make_grid_search(pipe_lr_pca, grid_params_lr)
gs_rf = _make_grid_search(pipe_rf, grid_params_rf, n_jobs=jobs)
gs_rf_pca = _make_grid_search(pipe_rf_pca, grid_params_rf, n_jobs=jobs)
gs_svm = _make_grid_search(pipe_svm, grid_params_svm, n_jobs=jobs)
gs_svm_pca = _make_grid_search(pipe_svm_pca, grid_params_svm, n_jobs=jobs)

# List of grid searches for ease of iteration
grids = [gs_lr, gs_lr_pca,
         gs_rf, gs_rf_pca,
         gs_svm, gs_svm_pca]

# Dictionary mapping a grid search's index in `grids` to a readable name
grid_dict = {
    0: 'Logistic Regression',
    1: 'Logistic Regression w/ PCA',
    2: 'Random Forest',
    3: 'Random Forest w/ PCA',
    4: 'Support Vector Machine',
    5: 'Support Vector Machine w/ PCA'
}

# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = None  # set on the first iteration, so a fitted search is always saved

for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # best_score_ is the mean cross-validated accuracy on the training split
    print('Best train accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params (computed once, reused below)
    test_acc = accuracy_score(y_test, y_pred)
    print('Test set accuracy score for best params: %.3f' % test_acc)
    # Track best (highest test accuracy) model; ties keep the earlier one.
    # NOTE: picking the winner by test accuracy makes that number an
    # optimistic estimate of how the saved model will generalize.
    if best_gs is None or test_acc > best_acc:
        best_acc = test_acc
        best_gs = gs
        best_clf = idx

print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save the best fitted grid search object to file
dump_file = 'best_gs_pipeline.pkl'
joblib.dump(best_gs, dump_file, compress=1)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))



Performing model optimizations...

Estimator: Logistic Regression
Best params: {'clf__C': 1.0, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best train accuracy: 0.917
Test set accuracy score for best params: 0.967

Estimator: Logistic Regression w/ PCA




Best params: {'clf__C': 0.5, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best train accuracy: 0.858
Test set accuracy score for best params: 0.933

Estimator: Random Forest




Best params: {'clf__criterion': 'gini', 'clf__max_depth': 3, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 2}
Best train accuracy: 0.950
Test set accuracy score for best params: 1.000

Estimator: Random Forest w/ PCA




Best params: {'clf__criterion': 'gini', 'clf__max_depth': 5, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 8}
Best train accuracy: 0.908
Test set accuracy score for best params: 0.900

Estimator: Support Vector Machine




Best params: {'clf__C': 3, 'clf__kernel': 'linear'}
Best train accuracy: 0.967
Test set accuracy score for best params: 0.967

Estimator: Support Vector Machine w/ PCA
Best params: {'clf__C': 4, 'clf__kernel': 'rbf'}
Best train accuracy: 0.925
Test set accuracy score for best params: 0.900

Classifier with best test set accuracy: Random Forest

Saved Random Forest grid search pipeline to file: best_gs_pipeline.pkl


