# KNN Algorithm

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import confusion_matrix,classification_report, accuracy_score, roc_curve

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_diabetes

In [3]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [4]:
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [5]:
x = df.drop("Outcome",axis = 1)
y = df['Outcome']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42,stratify=y)

In [6]:
x_test

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
635,104,72,0,0,31.2,0.465,38
698,127,88,11,155,34.5,0.598,28
637,94,76,18,66,31.6,0.649,23
402,136,84,41,88,35.0,0.286,35
425,184,78,39,277,37.0,0.264,31
...,...,...,...,...,...,...,...
62,44,62,0,0,25.0,0.587,36
477,114,76,17,110,23.8,0.466,31
311,106,70,37,148,39.4,0.605,22
116,124,74,0,0,34.0,0.220,38


# Model Training

In [31]:
knn_clf = KNeighborsClassifier(n_neighbors=5,p=2) # k = 5, p= 2(Euclidean dist)
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

# Evaluation

In [32]:


y_pred = knn_clf.predict(x_test)
y_pred[40:45]

array([1, 0, 0, 0, 1], dtype=int64)

In [33]:
y_test[40:45]

235    1
418    0
496    0
539    1
484    1
Name: Outcome, dtype: int64

In [34]:
# Testing data Evalutaion
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_test,y_pred)
print("Classification report :\n",clf_report)

Confision marix :
 [[105  20]
 [ 36  31]]
********************************************************************************
Accuracy score is 0.7083333333333334
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.74      0.84      0.79       125
           1       0.61      0.46      0.53        67

    accuracy                           0.71       192
   macro avg       0.68      0.65      0.66       192
weighted avg       0.70      0.71      0.70       192



In [35]:
# Training data Evalutaion
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train,y_pred_train)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_train,y_pred_train)
print("Classification report :\n",clf_report)

Confision marix :
 [[334  41]
 [ 69 132]]
********************************************************************************
Accuracy score is 0.8090277777777778
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.76      0.66      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.77      0.78       576
weighted avg       0.81      0.81      0.81       576



# Hyperparameter Tuning

In [46]:
knn_clf = KNeighborsClassifier()

hyperparameters = {"n_neighbors":np.arange(3,30),
                  "p" : [1,2]}

gscv_knn_clf = GridSearchCV(knn_clf,hyperparameters,cv = 4)
gscv_knn_clf.fit(x_train,y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=26, p=1)

In [43]:
gscv_knn_clf.best_params_

{'n_neighbors': 26, 'p': 1}

In [44]:
# Testing data Evalutaion

#knn_clf = KNeighborsClassifier(n_neighbors=29, p=1)
knn_clf = gscv_knn_clf.best_estimator_

cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_test,y_pred)
print("Classification report :\n",clf_report)

Confision marix :
 [[105  20]
 [ 36  31]]
********************************************************************************
Accuracy score is 0.7083333333333334
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.74      0.84      0.79       125
           1       0.61      0.46      0.53        67

    accuracy                           0.71       192
   macro avg       0.68      0.65      0.66       192
weighted avg       0.70      0.71      0.70       192



In [45]:
# Training data Evalutaion
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train,y_pred_train)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_train,y_pred_train)
print("Classification report :\n",clf_report)

Confision marix :
 [[344  31]
 [103  98]]
********************************************************************************
Accuracy score is 0.7673611111111112
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.77      0.92      0.84       375
           1       0.76      0.49      0.59       201

    accuracy                           0.77       576
   macro avg       0.76      0.70      0.72       576
weighted avg       0.77      0.77      0.75       576

