In [1]:
## toy dataset for classification of patient being diabetic
## dataset has 8 features, which is compressed down using Principal component analysis (PCA)

In [2]:
import numpy as np
import pandas as pd

In [3]:
orig = pd.read_csv('pima-indians-diabetes.csv')

print(orig.shape)
orig.head()

(768, 9)


Unnamed: 0,Pregnant,Glucose,BloodP,Triceps,Insulin,BMI,DPF,Age,Diabetic
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df = orig.copy()

In [5]:
## target not balanced
df.Diabetic.value_counts()

0    500
1    268
Name: Diabetic, dtype: int64

In [6]:
df.describe()

Unnamed: 0,Pregnant,Glucose,BloodP,Triceps,Insulin,BMI,DPF,Age,Diabetic
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Normalise values

In [7]:
from sklearn.preprocessing import MinMaxScaler

In [10]:
target = df.Diabetic

features = df.iloc[:, :-1].values

print(target.shape)
print(features.shape)
print(features[0])

(768,)
(768, 8)
[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]


In [11]:
np.set_printoptions(precision=2, suppress=True)

mms = MinMaxScaler(feature_range=(0, 1))

features_scaled = mms.fit_transform(features)

features_scaled[0]

array([0.35, 0.74, 0.59, 0.35, 0.  , 0.5 , 0.23, 0.48])

## PCA

In [12]:
from sklearn.decomposition import PCA

In [13]:
## can iterate + use test accuracy to identify ideal n for PCA
pca = PCA(4)
projected_scaled = pca.fit_transform(features_scaled)

projected_scaled[0]

array([ 0.3 ,  0.09, -0.02,  0.02])

In [14]:
pca.explained_variance_ratio_

array([0.31, 0.21, 0.12, 0.1 ])

## Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [16]:
lr = LogisticRegression(fit_intercept=True)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(projected_scaled, target)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(576, 4) (192, 4) (576,) (192,)


In [18]:
lr.fit(X_train, y_train)
print (lr.coef_, lr.intercept_)

[[ 2.64  2.36 -3.2   0.47]] [-0.79]




In [19]:
y_pred = lr.predict(X_test)

y_pred[:5]

array([0, 0, 0, 1, 1], dtype=int64)

In [20]:
print ('Score of model in training group: {0:2.2f}'.format(lr.score(X_train, y_train)))
print ('Score of model in test group: {0:2.2f}'.format(lr.score(X_test, y_test)))

Score of model in training group: 0.75
Score of model in test group: 0.70


In [21]:
hits = np.count_nonzero(y_test == y_pred)
misses = np.count_nonzero(y_test != y_pred)

print ("Accuracy is: {:3.2f}".format(hits/(hits+misses)))

Accuracy is: 0.70


In [22]:
lrsgd = SGDClassifier(loss = 'log', fit_intercept=True, max_iter = 5000)

In [23]:
lrsgd.fit(X_train, y_train)
print (lrsgd.coef_, lrsgd.intercept_)



[[ 3.19  3.01 -4.56  0.68]] [-0.85]


In [24]:
lrsgd_y_pred = lrsgd.predict(X_test)

lrsgd_y_pred[:5]

array([0, 0, 0, 1, 1], dtype=int64)

In [25]:
print ('Score of SGD model in training group: {0:2.2f}'.format(lrsgd.score(X_train, y_train)))
print ('Score of SGD model in test group: {0:2.2f}'.format(lrsgd.score(X_test, y_test)))

Score of SGD model in training group: 0.75
Score of SGD model in test group: 0.71


## Confusion Matrix

In [26]:
from sklearn.metrics import classification_report, confusion_matrix  

In [27]:
print(confusion_matrix(y_test, y_pred))  
print(classification_report(y_test, y_pred)) 

[[106  11]
 [ 47  28]]
              precision    recall  f1-score   support

           0       0.69      0.91      0.79       117
           1       0.72      0.37      0.49        75

   micro avg       0.70      0.70      0.70       192
   macro avg       0.71      0.64      0.64       192
weighted avg       0.70      0.70      0.67       192



In [28]:
## Notes:
## need to work on hyperparameter tuning
## need to work on iteration of hyperparameters