In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


In [2]:
diabetes = pd.read_csv('diabetes.csv')

In [3]:
diabetes.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [4]:
diabetes.shape

(768, 9)

In [5]:
diabetes.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [6]:
X = diabetes.drop('Outcome',axis=1).values

In [7]:
X

array([[  6.   , 148.   ,  72.   , ...,  33.6  ,   0.627,  50.   ],
       [  1.   ,  85.   ,  66.   , ...,  26.6  ,   0.351,  31.   ],
       [  8.   , 183.   ,  64.   , ...,  23.3  ,   0.672,  32.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,  26.2  ,   0.245,  30.   ],
       [  1.   , 126.   ,  60.   , ...,  30.1  ,   0.349,  47.   ],
       [  1.   ,  93.   ,  70.   , ...,  30.4  ,   0.315,  23.   ]])

In [8]:
Y = diabetes['Outcome'].values

In [9]:
Y

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,

In [10]:
### Holdout Cross validation method

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.3,random_state=None)

hmodel = LogisticRegression(solver='liblinear')
hmodel.fit(x_train,y_train)
hscore = hmodel.score(x_test,y_test)

print("Holdout model score:", hscore*100)

Holdout model score: 71.86147186147186


In [11]:
### K-fold cross validation Method

from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)
kmodel = LogisticRegression(solver='liblinear')

kscore = cross_val_score(kmodel,X,Y,cv=kfold)

print("K-fold Model score:", kscore*100)
print("K-fold Model Avg. score", np.average(kscore)*100)


K-fold Model score: [76.62337662 71.42857143 75.32467532 82.35294118 75.81699346]
K-fold Model Avg. score 76.30931160342925


In [12]:
### Stratified K-fold cross validation method


from sklearn.model_selection import StratifiedKFold

sfold = StratifiedKFold(n_splits=5,random_state=None)

smodel = LogisticRegression(solver='liblinear')

sscore = cross_val_score(smodel,X,Y,cv=sfold)

print("S-fold Model score:", sscore*100)
print("S-fold Model Avg. score", np.average(sscore)*100)

S-fold Model score: [76.62337662 75.32467532 75.32467532 80.39215686 76.47058824]
S-fold Model Avg. score 76.8270944741533


In [13]:
### Leave one out cross validation method

from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
lmodel = LogisticRegression(solver='liblinear')

lscore = cross_val_score(lmodel,X,Y,cv=loo)

print("LeaveOneOut Model score:", lscore*100)
print("LeaveOneOut Model Avg. score", np.average(lscore)*100)

LeaveOneOut Model score: [100. 100. 100. 100. 100. 100.   0.   0. 100.   0. 100. 100.   0. 100.
 100. 100.   0.   0. 100.   0. 100. 100. 100.   0. 100.   0. 100. 100.
   0. 100. 100. 100. 100. 100. 100. 100.   0.   0.   0.   0.   0.   0.
 100. 100.   0. 100. 100. 100.   0. 100. 100. 100. 100. 100.   0. 100.
 100. 100.   0. 100. 100. 100. 100. 100.   0. 100.   0. 100. 100. 100.
   0. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100.   0. 100. 100. 100. 100. 100. 100.   0. 100.   0. 100. 100.
 100.   0. 100. 100. 100. 100. 100. 100. 100. 100. 100.   0. 100. 100.
 100. 100. 100.   0.   0. 100. 100. 100. 100. 100. 100. 100.   0.   0.
 100. 100.   0.   0. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100.   0.   0. 100. 100. 100.   0. 100. 100. 100. 100.   0.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.   0.   0. 100. 100.
 100. 100.   0. 100. 100. 100. 100. 100. 100. 100.   0. 100. 100. 100.
 100. 100. 100. 100. 100.   0.   0.   0. 100. 100. 1

### leave p-out cross validation method

### this method requires intensive computation

from sklearn.model_selection import LeavePOut

lpo = LeavePOut(p=2)

lpmodel = LogisticRegression(solver='liblinear')

lpscore = cross_val_score(lpmodel,X,Y,cv=lpo)

print("LeaveOneOut Model score:", lpscore*100)
print("LeaveOneOut Model Avg. score", np.average(lpscore)*100)



In [14]:
### ShuffleSplit cross validation -> hybrid of traditional train-test splitting and the k-fold cross-validation



from sklearn.model_selection import ShuffleSplit

ssfold = ShuffleSplit(n_splits=5,test_size=0.3)

ssmodel = LogisticRegression(solver='liblinear')

ssscore = cross_val_score(ssmodel,X,Y,cv=ssfold)

print("Shuffile split Model score:", ssscore*100)
print("Shuffile split Model Avg. score", np.average(ssscore)*100)

Shuffile split Model score: [76.19047619 74.45887446 77.92207792 80.08658009 72.29437229]
Shuffile split Model Avg. score 76.19047619047619
