KNN - Diabetes prediction (Binary output - Yes/No)

In [1]:
#General python tool to handle our data
import numpy as np
import pandas as pd

#Splitting data to train + test our model
from sklearn.model_selection import train_test_split

#To standardize all features and confine their values to [-1,1]
from sklearn.preprocessing import StandardScaler

#Scikit-learn classification algorithm based on KNN technique
from sklearn.neighbors import KNeighborsClassifier

#Testing validity of our model
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [5]:
dataset= pd.read_csv('diabetes.csv')
print("Number of records:", len(dataset))
dataset.head()

Number of records: 768


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
#Replacing zero values with mean values of that column
zero_not_accepted= ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for column in zero_not_accepted:
    dataset[column]= dataset[column].replace(0, np.NaN)
    mean= int(dataset[column].mean(skipna= True))
    dataset[column]= dataset[column].replace(np.NaN, mean)
dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,155.0,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,29.0,155.0,30.1,0.349,47,1


In [38]:
#Splitting dataset- Test/Train
X= dataset.iloc[:, :8] #8th column is skipped as it is "Outcome"
Y= dataset.iloc[:, 8]
X_train, X_test, Y_train, Y_test= train_test_split(X, Y, random_state=0,test_size= 0.2)

In [39]:
#Feature scaling (Standardization of feature values)
sc_X= StandardScaler()
X_train= sc_X.fit_transform(X_train)
X_test= sc_X.fit_transform(X_test)

#Y is 0/1: No standardization for outcomes anyway

In [40]:
#Determining "k" for KNN
import math as m
k= int(m.sqrt(len(Y_test)))
if k%2==0: #Avoid even values to avoid equal 0 and equal 1 neighbors
    k-=1
print("We consider k=", k)

We consider k= 11


In [41]:
#Defining the model
classifier= KNeighborsClassifier(n_neighbors= k, p=2,  metric= 'euclidean')
classifier.fit(X_train, Y_train)

In [42]:
#Predicting test results
Y_pred= classifier.predict(X_test)
Y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [51]:
#Evaluating the model
cm= confusion_matrix(Y_test, Y_pred)
f1= f1_score(Y_test, Y_pred)
accuracy= accuracy_score(Y_test, Y_pred)
print("Correct Predictions: ", cm[0][0]+cm[1][1])
print("Incorrect Predictions: ", cm[0][1]+cm[1][0])
print("f1_score:", f1*100)
print("Accuracy score:", accuracy*100)

Correct Predictions:  124
Incorrect Predictions:  30
f1_score: 65.9090909090909
Accuracy score: 80.51948051948052
