In [1]:
## Toy dataset: classifying patients' breast cancer diagnoses.
## This tutorial uses an SVM (Support Vector Machine) classifier.
## tutorial url:
## https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python

In [28]:
import numpy as np
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Read the Wisconsin breast-cancer CSV into `orig`; later cells derive their
# working frames from it.
orig = pd.read_csv(filepath_or_buffer='../Datasets/wisconsin_breast_cancer.csv')

# Report (rows, columns), then preview the first five records.
print(orig.shape)
orig.head(n=5)

(569, 33)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
# Build the label vector and the feature table from the raw frame without
# mutating anything in place, so this cell can be re-run safely.

# Labels: 'M' -> 1 and 'B' -> 0 (presumably malignant / benign).
target = orig['diagnosis'].map({'M': 1, 'B': 0})

# Series.map turns any value missing from the dict into NaN; fail here with a
# clear message instead of later, when a model is fitted on the labels.
if target.isna().any():
    raise ValueError("diagnosis contains values other than 'M' and 'B'")

# Features: everything except the identifier, the label, and the
# 'Unnamed: 32' column (blank in the preview rows).
df = orig.drop(columns=['id', 'diagnosis', 'Unnamed: 32'])

print(df.shape)

(569, 30)


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df, target, test_size=0.2, random_state=1
)

# Shapes of the four pieces, in the order they were unpacked.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(455, 30) (114, 30) (455,) (114,)


## SVM 

In [44]:
# Fit a support-vector classifier with a linear kernel.
# Other kernel options: 'poly', 'rbf', 'sigmoid', 'precomputed'.
clf = svm.SVC(kernel='linear').fit(x_train, y_train)

# Predict the held-out rows and peek at the first five predicted labels.
y_pred = clf.predict(x_test)
y_pred[:5]

array([0, 1, 0, 1, 1], dtype=int64)

In [45]:
# Fraction of held-out rows whose predicted label matches the true label.
print("SVM Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

SVM Accuracy: 0.956140350877193


In [46]:
# Confusion matrix for the SVM: rows are true classes, columns are predicted
# classes (scikit-learn convention), ordered by label value (0 then 1).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[72,  0],
       [ 5, 37]], dtype=int64)

## Logistic Regression for comparison

In [47]:
# Rescale every feature column to the [0, 1] range.
# NOTE: this scaler is fitted on *all* rows, including rows that end up in a
# test split; for evaluation, fit a scaler on the training rows only.
scaler = MinMaxScaler().fit(df)
df_scaled = scaler.transform(df)

# Preview the first two scaled rows.
df_scaled[:2]

array([[0.52103744, 0.0226581 , 0.54598853, 0.36373277, 0.59375282,
        0.7920373 , 0.70313964, 0.73111332, 0.68636364, 0.60551811,
        0.35614702, 0.12046941, 0.3690336 , 0.27381126, 0.15929565,
        0.35139844, 0.13568182, 0.30062512, 0.31164518, 0.18304244,
        0.62077552, 0.14152452, 0.66831017, 0.45069799, 0.60113584,
        0.61929156, 0.56861022, 0.91202749, 0.59846245, 0.41886396],
       [0.64314449, 0.27257355, 0.61578329, 0.50159067, 0.28987993,
        0.18176799, 0.20360825, 0.34875746, 0.37979798, 0.14132266,
        0.15643672, 0.08258929, 0.12444047, 0.12565979, 0.11938675,
        0.08132304, 0.0469697 , 0.25383595, 0.08453875, 0.0911101 ,
        0.60690146, 0.30357143, 0.53981772, 0.43521431, 0.34755332,
        0.15456336, 0.19297125, 0.63917526, 0.23358959, 0.22287813]])

In [48]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Scaling the whole table before splitting would let the test rows
# influence each feature's min/max, leaking test information into preprocessing.
# test_size and random_state match the SVM split, so the same rows are held out.
x_train_raw, x_test_raw, y_train_lr, y_test_lr = train_test_split(
    df, target, test_size=0.2, random_state=1
)

scaler_lr = MinMaxScaler().fit(x_train_raw)

# Apply the training-set scaling to both partitions; test values may fall
# slightly outside [0, 1], which is expected.
x_train_lr = scaler_lr.transform(x_train_raw)
x_test_lr = scaler_lr.transform(x_test_raw)

print(x_train_lr.shape, x_test_lr.shape, y_train_lr.shape, y_test_lr.shape)

(455, 30) (114, 30) (455,) (114,)


In [51]:
# Fit the logistic-regression baseline on the scaled training rows,
# allowing the solver up to 200 iterations.
lr = LogisticRegression(max_iter=200).fit(x_train_lr, y_train_lr)

# Predict the held-out rows and peek at the first five predicted labels.
y_pred_lr = lr.predict(x_test_lr)
y_pred_lr[:5]

array([0, 1, 0, 1, 0], dtype=int64)

In [52]:
# Fraction of held-out rows whose predicted label matches the true label.
print("Logistic Regression  Accuracy:", metrics.accuracy_score(y_true=y_test_lr, y_pred=y_pred_lr))

Logistic Regression  Accuracy: 0.956140350877193


In [53]:
# Confusion matrix for logistic regression: rows are true classes, columns are
# predicted classes (scikit-learn convention), ordered by label value (0 then 1).
metrics.confusion_matrix(y_true=y_test_lr, y_pred=y_pred_lr)

array([[72,  0],
       [ 5, 37]], dtype=int64)

In [54]:
## SVM and Logistic Regression perform the same on this toy dataset
## (same accuracy and the same confusion matrix).
## This is not surprising: the test set is small, and false negatives are expected
## to be more common because there are more benign outcomes to learn from.
## Balancing class_weight led to a minor drop in accuracy (that run is not shown here).