In [1]:
## Toy dataset: classifying patients as diabetic or non-diabetic (Pima Indians diabetes data).
## This tutorial uses an SVM (Support Vector Machine) classifier, with logistic regression for comparison.
## tutorial url:
## https://www.datacamp.com/tutorial/svm-classification-scikit-learn-python

In [2]:
import numpy as np
import pandas as pd
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the diabetes CSV; the path is relative to the notebook's working directory.
orig = pd.read_csv(filepath_or_buffer='../Datasets/pima-indians-diabetes.csv')

# Report (rows, columns), then preview the first five records.
print(orig.shape)
orig.head(n=5)

(768, 9)


Unnamed: 0,Pregnant,Glucose,BloodP,Triceps,Insulin,BMI,DPF,Age,Diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Class balance of the label column (rows per distinct value).
orig['Diabetic'].value_counts()

0    500
1    268
Name: Diabetic, dtype: int64

In [7]:
# Work on a copy so the raw frame `orig` is left untouched.
df = orig.copy()

# pop() removes the label column from the feature frame and returns it as a Series.
target = df.pop('Diabetic')

# Feature matrix shape after the label column has been removed.
print(df.shape)

(768, 8)


In [8]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=1,
)

# Shapes of the four resulting pieces, in the order returned above.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(614, 8) (154, 8) (614,) (154,)


## SVM 

In [40]:
# kernel option: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
# fit() returns the estimator itself, so construction and training can be chained.
clf = svm.SVC(C=1, kernel='linear', class_weight='balanced').fit(x_train, y_train)

# Predict labels for the held-out rows and peek at the first five.
y_pred = clf.predict(x_test)
y_pred[:5]

array([1, 0, 0, 0, 0], dtype=int64)

In [41]:
# Fraction of held-out rows whose predicted label matches the true label.
print("SVM Test Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

SVM Test Accuracy: 0.7922077922077922


In [42]:
# Rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[82, 17],
       [15, 40]], dtype=int64)

## Logistic Regression for comparison

In [43]:
# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows of `df`, i.e. before any
# train/test split of the scaled data.
scaler = MinMaxScaler()
df_scaled = scaler.fit(df).transform(df)

# First two scaled rows as a quick sanity check.
df_scaled[:2]

array([[0.35294118, 0.74371859, 0.59016393, 0.35353535, 0.        ,
        0.50074516, 0.23441503, 0.48333333],
       [0.05882353, 0.42713568, 0.54098361, 0.29292929, 0.        ,
        0.39642325, 0.11656704, 0.16666667]])

In [44]:
# Split the *raw* features first and learn the scaling from the training rows
# only, so test-set min/max values never influence preprocessing (fitting the
# scaler on the full dataset before splitting leaks test information).
# test_size/random_state match the SVM split, so both models are evaluated on
# the same held-out rows.
x_train_lr, x_test_lr, y_train_lr, y_test_lr = train_test_split(df, target,
                                                    test_size=0.2, random_state=1)

scaler_lr = MinMaxScaler().fit(x_train_lr)    # statistics from training data only
x_train_lr = scaler_lr.transform(x_train_lr)
x_test_lr = scaler_lr.transform(x_test_lr)    # test values may fall slightly outside [0, 1]

print(x_train_lr.shape, x_test_lr.shape, y_train_lr.shape, y_test_lr.shape)

(614, 8) (154, 8) (614,) (154,)


In [45]:
# max_iter=200 caps the solver's iterations; fit() returns the estimator,
# so construction and training are chained.
lr = LogisticRegression(max_iter=200).fit(x_train_lr, y_train_lr)

# Predict labels for the held-out rows and peek at the first five.
y_pred_lr = lr.predict(x_test_lr)
y_pred_lr[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [48]:
# Fraction of held-out rows whose predicted label matches the true label.
print("Logistic Regression Test Accuracy:", metrics.accuracy_score(y_true=y_test_lr, y_pred=y_pred_lr))

Logistic Regression Test Accuracy: 0.7727272727272727


In [49]:
# Rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_true=y_test_lr, y_pred=y_pred_lr)

array([[91,  8],
       [27, 28]], dtype=int64)

In [None]:
## Notes:
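## The SVM identifies more of the positive (diabetic) class than logistic regression:
## more true positives and fewer false negatives, at the cost of more false positives,
## i.e. it misclassifies more of the majority (non-diabetic) class.
## Caveat: the SVM uses class_weight='balanced' and the logistic regression does not,
## which may account for part of this difference.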