In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_diabetes

In [3]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [4]:
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [5]:
x = df.drop("Outcome",axis =1)
y = df["Outcome"]

x_train, x_test,y_train, y_test = train_test_split(x,y,test_size=0.25,random_state=42,stratify=y)

In [6]:
y_train

751    0
358    0
718    0
536    0
651    0
      ..
676    1
113    0
556    0
152    1
107    0
Name: Outcome, Length: 576, dtype: int64

In [7]:
x_train

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
751,121,78,39,74,39.0,0.261,28
358,88,74,40,54,35.3,0.378,48
718,108,60,46,178,35.5,0.415,24
536,105,90,0,0,29.6,0.197,46
651,117,60,23,106,33.8,0.466,27
...,...,...,...,...,...,...,...
676,156,86,0,0,24.8,0.230,53
113,76,62,0,0,34.0,0.391,25
556,97,70,40,0,38.1,0.218,30
152,156,86,28,155,34.3,1.189,42


# Model Training

In [8]:
knn_clf = KNeighborsClassifier(n_neighbors=5,p=2) # k=5, p=2 (Euclidean dist)
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

# Evaluation

In [9]:
y_pred = knn_clf.predict(x_test)
y_pred[40:45]

array([1, 0, 0, 0, 1])

In [10]:
y_test[40:45]

235    1
418    0
496    0
539    1
484    1
Name: Outcome, dtype: int64

In [11]:
# Testing data evealuation
y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print("confusion_matrix is:\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is :",accuracy)
print("*"*80)
clf_report = classification_report(y_test,y_pred)
print("classification_report:\n",clf_report)

confusion_matrix is:
 [[105  20]
 [ 36  31]]
********************************************************************************
Accuracy is : 0.7083333333333334
********************************************************************************
classification_report:
               precision    recall  f1-score   support

           0       0.74      0.84      0.79       125
           1       0.61      0.46      0.53        67

    accuracy                           0.71       192
   macro avg       0.68      0.65      0.66       192
weighted avg       0.70      0.71      0.70       192



In [12]:
# Training data evaluation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print("confusion_matrix is:\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train,y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)
clf_report = classification_report(y_train,y_pred_train)
print("classification_report:\n",clf_report)

confusion_matrix is:
 [[334  41]
 [ 69 132]]
********************************************************************************
Accuracy is : 0.8090277777777778
********************************************************************************
classification_report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.76      0.66      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.77      0.78       576
weighted avg       0.81      0.81      0.81       576



# Hyperparameter tuning

In [13]:
knn_clf = KNeighborsClassifier()

hyperparameters = {"n_neighbors" :np.arange(3,30),
                  "p":[1,2]}

gscv_knn_clf = GridSearchCV(knn_clf,hyperparameters,cv = 5)
gscv_knn_clf.fit(x_train,y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=29, p=1)

In [14]:
gscv_knn_clf.best_params_

{'n_neighbors': 29, 'p': 1}

In [15]:
# Testing data evealuation
# knn_clf = KNeighborsClassifier(n_neighbors=29, p=1)
knn_clf = gscv_knn_clf.best_estimator_


y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print("confusion_matrix is:\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy is :",accuracy)
print("*"*80)
clf_report = classification_report(y_test,y_pred)
print("classification_report:\n",clf_report)

confusion_matrix is:
 [[108  17]
 [ 36  31]]
********************************************************************************
Accuracy is : 0.7239583333333334
********************************************************************************
classification_report:
               precision    recall  f1-score   support

           0       0.75      0.86      0.80       125
           1       0.65      0.46      0.54        67

    accuracy                           0.72       192
   macro avg       0.70      0.66      0.67       192
weighted avg       0.71      0.72      0.71       192



In [16]:
# Training data evealuation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print("confusion_matrix is:\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train,y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)
clf_report = classification_report(y_train,y_pred_train)
print("classification_report:\n",clf_report)

confusion_matrix is:
 [[345  30]
 [ 93 108]]
********************************************************************************
Accuracy is : 0.7864583333333334
********************************************************************************
classification_report:
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       375
           1       0.78      0.54      0.64       201

    accuracy                           0.79       576
   macro avg       0.79      0.73      0.74       576
weighted avg       0.79      0.79      0.77       576

