## Import Packages and Load Data

In [37]:
from sklearn.datasets import load_breast_cancer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import LinearSVC
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix

In [38]:
# Load scikit-learn's bundled breast-cancer dataset; the returned object
# exposes its fields both as attributes (cancer.data) and as keys (cancer['data']).
cancer = load_breast_cancer()

In [39]:
# Sanity check of the feature matrix dimensions: (n_samples, n_features).
cancer.data.shape

(569, 30)

In [40]:
# Feature matrix and class labels, pulled from the dataset in one step.
X, y = cancer.data, cancer.target

## Train Test Split (70/30)

In [41]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the
# split (and therefore every score below) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

## LinearSVC

In [45]:
# Linear SVM baseline with default hyperparameters; only the seed is pinned.
lsvc = LinearSVC(random_state=0)

In [46]:
# Train on the raw (unscaled) training features.
lsvc.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)

In [47]:
# Mean accuracy of the linear model on the training and held-out splits.
print(
    'Train Set Score: {:.2f}'.format(lsvc.score(X_train, y_train)),
    'Test Set Score: {:.2f}'.format(lsvc.score(X_test, y_test)),
    sep='\n',
)

Train Set Score: 0.92
Test Set Score: 0.92


In [48]:
# Class predictions of the linear model on the held-out set.
pred = lsvc.predict(X_test)

In [49]:
# Rows index the true class, columns the predicted class, so every
# off-diagonal cell is an error: 13 misclassifications in total.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[ 53  13]
 [  0 105]]


## Compare with kernelized SVC (default kernel='rbf')

In [50]:
# Kernel SVM with default hyperparameters (RBF kernel); only the seed is pinned.
svc = SVC(random_state=0)

In [51]:
# Train on the same raw (unscaled) features used for the linear model.
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [52]:
# Mean accuracy of the RBF model (raw features) on both splits.
print(
    'Train Set Score: {:.2f}'.format(svc.score(X_train, y_train)),
    'Test Set Score: {:.2f}'.format(svc.score(X_test, y_test)),
    sep='\n',
)

Train Set Score: 0.92
Test Set Score: 0.92


In [53]:
# Rebinds `pred` to the RBF model's predictions on the raw held-out features.
pred = svc.predict(X_test)

In [54]:
# Still 13 misclassifications (10 + 3 off-diagonal), now spread over both
# classes; rows are true labels, columns are predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[ 56  10]
 [  3 102]]


## Scale features to improve performance

In [55]:
# Min-max scaler with default settings (target feature range is (0, 1)).
scaler = MinMaxScaler()

In [56]:
# Fit the scaler on the training split only, so no information from the
# test set enters the scaling parameters.
scaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [57]:
# Apply the training-fitted scaling to both splits. Test values are mapped
# with the training parameters, so they may fall slightly outside [0, 1].
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [58]:
# Fresh default RBF SVC; rebinding `svc` discards the model fitted on raw features.
svc = SVC(random_state=0)

In [59]:
# Train on the min-max scaled training features this time.
svc.fit(X_train_scaled, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [60]:
# Mean accuracy on the scaled splits; the model must be scored on data
# transformed the same way it was trained.
print(
    'Train Set Score: {:.2f}'.format(svc.score(X_train_scaled, y_train)),
    'Test Set Score: {:.2f}'.format(svc.score(X_test_scaled, y_test)),
    sep='\n',
)

Train Set Score: 0.98
Test Set Score: 0.97


In [61]:
# Predictions from the scaled-feature model; the input must be the scaled test set.
pred = svc.predict(X_test_scaled)

In [62]:
# Much better: only 5 misclassifications (4 + 1 off-diagonal) after scaling.
# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[ 62   4]
 [  1 104]]


## Increase C parameter to fit a more complex model

In [63]:
# Same RBF SVC but with C raised from the default 1.0 to 10, allowing a
# more complex decision boundary; rebinding `svc` replaces the C=1 model.
svc = SVC(C=10, random_state=0)

In [64]:
# Train the C=10 model on the scaled training features.
svc.fit(X_train_scaled, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [65]:
# Mean accuracy of the C=10 model on the scaled training and test splits.
print(
    'Train Set Score: {:.2f}'.format(svc.score(X_train_scaled, y_train)),
    'Test Set Score: {:.2f}'.format(svc.score(X_test_scaled, y_test)),
    sep='\n',
)

Train Set Score: 0.99
Test Set Score: 0.97


In [66]:
# Predictions of the C=10 model on the scaled held-out set.
pred = svc.predict(X_test_scaled)

In [67]:
# Still 5 misclassifications (3 + 2 off-diagonal): the default C=1 was
# already doing a very good job, so the larger C buys nothing on the test set.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[ 63   3]
 [  2 103]]
