In [23]:
# Model and Parameter Selection
# -----------------------------
# Load the breast-cancer-wisconsin `wdbc.data` file, encode the string label
# as integers and build the stratified train/test split that the following
# cells consume (X_train, X_test, y_train, y_test).

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# load new dataset
# header=None: the file has no header row, so pandas numbers the columns.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)
print(df.head())

# classify label and feature
# Column 1 holds the string label and columns 2.. hold the numeric features.
# Column 0 is deliberately left out (looks like a sample id -- TODO confirm).
X = df.iloc[:, 2:].values
y = df.iloc[:, 1].values

# encode the string label to numerical label
# LabelEncoder assigns integer codes following the sorted order of the
# distinct label strings; `le` is kept so the codes can be mapped back.
le = LabelEncoder()
y = le.fit_transform(y)
print(pd.DataFrame(y).head())

# split dataset into training and testing
# 80/20 split; stratify=y keeps the class proportions equal in both parts and
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

# Mean-impute missing feature values. The column means are computed on the
# training split only and then applied to BOTH splits: previously only
# X_train was filled, so any NaN in X_test would have reached the estimator
# at scoring time. Using the training means for the test split also avoids
# leaking test-set statistics into preprocessing.
train_means = pd.DataFrame(X_train).mean()
X_train = pd.DataFrame(X_train).fillna(train_means).values
X_test = pd.DataFrame(X_test).fillna(train_means).values

         0  1      2      3       4       5        6        7       8   \
0    842302  M  17.99  10.38  122.80  1001.0  0.11840  0.27760  0.3001   
1    842517  M  20.57  17.77  132.90  1326.0  0.08474  0.07864  0.0869   
2  84300903  M  19.69  21.25  130.00  1203.0  0.10960  0.15990  0.1974   
3  84348301  M  11.42  20.38   77.58   386.1  0.14250  0.28390  0.2414   
4  84358402  M  20.29  14.34  135.10  1297.0  0.10030  0.13280  0.1980   

        9   ...     22     23      24      25      26      27      28      29  \
0  0.14710  ...  25.38  17.33  184.60  2019.0  0.1622  0.6656  0.7119  0.2654   
1  0.07017  ...  24.99  23.41  158.80  1956.0  0.1238  0.1866  0.2416  0.1860   
2  0.12790  ...  23.57  25.53  152.50  1709.0  0.1444  0.4245  0.4504  0.2430   
3  0.10520  ...  14.91  26.50   98.87   567.7  0.2098  0.8663  0.6869  0.2575   
4  0.10430  ...  22.54  16.67  152.20  1575.0  0.1374  0.2050  0.4000  0.1625   

       30       31  
0  0.4601  0.11890  
1  0.2750  0.08902  
2  0.

In [24]:
# Hyper-parameter tuning with an exhaustive grid search
# -----------------------------------------------------
# A StandardScaler + SVC pipeline is tuned over two sub-grids: a linear
# kernel (C only) and an RBF kernel (every C x gamma combination). Each
# candidate is ranked by its mean 10-fold cross-validated accuracy on the
# training split.

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# make_pipeline names every step after its lower-cased class name, which is
# where the 'svc__' prefix on the grid keys below comes from.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

# The same candidate values are reused for both C and gamma.
parameter_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
parameter_grid = [
    # gamma is omitted from the linear-kernel grid
    dict(svc__C=parameter_range, svc__kernel=['linear']),
    dict(svc__C=parameter_range, svc__gamma=parameter_range, svc__kernel=['rbf']),
]

# refit=True retrains the winning configuration on the full training split;
# n_jobs=-1 spreads the candidate fits over all available cores.
gs_svc = GridSearchCV(
    estimator=pipe_svc,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=10,
    refit=True,
    n_jobs=-1,
)
gs_svc.fit(X_train, y_train)  # fit() returns the search object itself

# The refitted winner is what gets evaluated on the held-out test split.
gs_est = gs_svc.best_estimator_

print('best score: ', gs_svc.best_score_)
print('best parameter: ', gs_svc.best_params_)
print('test score: ', gs_est.score(X_test, y_test))

best score:  0.9846859903381642
best parameter:  {'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
test score:  0.9736842105263158


In [25]:
# Model comparison via nested cross-validation
# --------------------------------------------
# Inner loop: a 2-fold GridSearchCV tunes each model's hyper-parameters.
# Outer loop: a 5-fold cross_val_score refits that whole search on each
# training fold and scores it on the corresponding held-out fold.

import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# NOTE: this rebinds `gs_svc` to a new, unfitted search object (cv=2); any
# previously fitted search stored under that name is replaced.
gs_svc = GridSearchCV(pipe_svc, parameter_grid, scoring='accuracy', cv=2)

# Decision tree candidate: only the depth is tuned (None = unrestricted).
gs_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
    scoring='accuracy',
    cv=2,
)

# One accuracy value per outer fold for each model family.
scores_svc = cross_val_score(gs_svc, X_train, y_train, scoring='accuracy', cv=5)
scores_dt = cross_val_score(gs_dt, X_train, y_train, scoring='accuracy', cv=5)

# Report mean +/- standard deviation across the outer folds.
print('SVC accuracy: %.3f +/- %.3f' % (scores_svc.mean(), scores_svc.std()))
print('DT accuracy: %.3f +/- %.3f' % (scores_dt.mean(), scores_dt.std()))


SVC accuracy: 0.974 +/- 0.015
DT accuracy: 0.934 +/- 0.016
