# Apply the Pipeline class and GridSearch with Cross-Validation to decide which Model to use

## Import Packages and Load Data

In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the breast cancer dataset that ships with scikit-learn (no download or file path needed).
cancer = load_breast_cancer()
# Last expression of the cell: display the dimensions of the feature matrix.
cancer.data.shape

(569, 30)

In [3]:
# Feature matrix and label vector, pulled out of the dataset object in one step.
X, y = cancer.data, cancer.target

## Train Test Split (70/30)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,     # hold out 30% of the rows for the final test score
    random_state=101,  # fixed seed so the split is identical on every run
)

## Instantiate the Pipeline, define the Parameter Grid and execute GridSearchCV

In [5]:
# Two named steps. The grid search addresses them by name ('preprocessing',
# 'classifier') and substitutes other objects, so the instances given here are
# only the starting configuration.
pl = Pipeline(
    steps=[
        ('preprocessing', StandardScaler()),
        ('classifier', SVC()),
    ]
)

In [6]:
# Two sub-grids, one per candidate model family. The 'classifier' and
# 'preprocessing' keys replace whole pipeline steps; 'classifier__<name>' keys
# set a hyperparameter on whichever estimator occupies the 'classifier' step.
# Candidate count: 2 * 6 * 6 (SVC) + 3 (random forest) = 75.
param_grid = [
    # SVC: tried with and without feature scaling over a log-spaced C/gamma grid.
    {'classifier': [SVC()], 'preprocessing': [StandardScaler(), None],
     'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    # Random forest: the preprocessing step is set to None (skipped) and only
    # max_depth is tuned. random_state is fixed so the forest's cross-validation
    # scores are the same on every Restart & Run All, matching the seeded
    # train/test split.
    {'classifier': [RandomForestClassifier(n_estimators=500, random_state=101)],
     'preprocessing': [None],
     'classifier__max_depth': [2, 5, 10]}]

In [7]:
# Exhaustive search over param_grid with the pipeline as the base estimator.
# cv and scoring are left at their defaults.
grid_search = GridSearchCV(
    estimator=pl,
    param_grid=param_grid,
    n_jobs=-1,   # run the fits in parallel on all available cores
    verbose=2,   # print progress while fitting
)

In [8]:
# Run the cross-validated search on the training split only; the test split is
# not touched until the final score is computed.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 75 candidates, totalling 375 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 375 out of 375 | elapsed:    6.6s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('classifier',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shri...
                                            

## Best Model is SVC with C=10 and gamma=0.001 (on StandardScaler-scaled features)

In [9]:
# Gather the three figures first, then report them.
best_params = grid_search.best_params_            # winning combination from the grid
cv_score = grid_search.best_score_                # its mean cross-validation score
test_score = grid_search.score(X_test, y_test)    # score on the held-out test split

print('Best Parameters:\n{}\n'.format(best_params))
print('Best Cross-Validation Score: {:.2f}'.format(cv_score))
print('Test Set Score: {:.2f}'.format(test_score))

Best Parameters:
{'classifier': SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False), 'classifier__C': 10, 'classifier__gamma': 0.001, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best Cross-Validation Score: 0.98
Test Set Score: 0.96
