# KNN Algorithm

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.datasets import load_iris

In [26]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


## Train Test Split

In [53]:
x = df.drop('Outcome',axis = 1)
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=24, stratify=y)

## Feature Engineering

### 1. Normalization

In [40]:
normal_scaler = MinMaxScaler()
array = normal_scaler.fit_transform(x_train)
x_train = pd.DataFrame(array,columns=x_train.columns)
x_train.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.373737,0.42623,0.10101,0.042553,0.468013,0.079195,0.016667
1,0.777778,0.639344,0.30303,0.118203,0.520202,0.034247,0.4
2,0.60101,0.721311,0.414141,0.200946,0.762626,0.181079,0.083333
3,0.681818,0.442623,0.0,0.0,0.449495,0.258134,0.683333
4,0.686869,0.57377,0.0,0.0,0.525253,0.470034,0.016667


## 2. Standardization

In [54]:
std_scaler = StandardScaler()
array = std_scaler.fit_transform(x_train) # Storing mean and std of all columns
x_train = pd.DataFrame(array,columns=x_train.columns)
x_train.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,-1.488136,-0.867845,-0.661204,-0.390921,-0.523694,-0.620371,-0.964641
1,1.087678,0.470794,0.602201,0.155464,-0.125097,-0.931891,1.026565
2,-0.039241,0.985656,1.297075,0.753073,1.726449,0.08574,-0.618344
3,0.475922,-0.764873,-1.292907,-0.698263,-0.665132,0.619774,2.498326
4,0.50812,0.058905,-1.292907,-0.698263,-0.086523,2.088366,-0.964641


# Model Training

In [55]:
knn_clf = KNeighborsClassifier() # Euclidean Distance, k = 5
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

## Evaluation

In [59]:
# x_test

In [56]:
# Testing Data Evaluation
x_test_new = pd.DataFrame(std_scaler.transform(x_test))
x_test_new

Unnamed: 0,0,1,2,3,4,5,6
0,-0.039241,-0.764873,-0.471693,-0.271399,-1.230882,-0.810250,-0.791492
1,0.347131,-0.147039,1.233904,-0.698263,0.312073,-0.836951,-0.964641
2,-2.035497,-0.455956,-1.292907,-0.698263,-1.308030,0.762182,2.931197
3,-0.586601,-0.867845,-1.292907,-0.698263,-0.870859,-1.187040,-1.051215
4,0.830096,0.367822,0.918053,0.957968,0.813534,-0.442360,-0.358622
...,...,...,...,...,...,...,...
187,-3.870764,-1.073790,-0.029501,-0.698263,-0.922291,-1.003095,-0.964641
188,-0.940776,-0.764873,0.286350,0.155464,-0.858001,-0.724211,-0.878067
189,-0.811985,0.676739,0.286350,0.838446,0.402079,-0.727178,0.853417
190,0.089550,0.161877,1.549756,1.265309,0.222068,0.756249,0.074249


In [57]:
y_pred = knn_clf.predict(x_test_new)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_test, y_pred)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[102  23]
 [ 30  37]]
********************************************************************************
Accuracy is : 0.7239583333333334
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.77      0.82      0.79       125
           1       0.62      0.55      0.58        67

    accuracy                           0.72       192
   macro avg       0.69      0.68      0.69       192
weighted avg       0.72      0.72      0.72       192



In [22]:
# Training Data Evaluation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_train, y_pred_train)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[332  43]
 [ 63 138]]
********************************************************************************
Accuracy is : 0.8159722222222222
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.84      0.89      0.86       375
           1       0.76      0.69      0.72       201

    accuracy                           0.82       576
   macro avg       0.80      0.79      0.79       576
weighted avg       0.81      0.82      0.81       576



## Hyperparameter Tuning

In [98]:
knn_clf = KNeighborsClassifier() 

hyperparameters = {"n_neighbors" : np.arange(3,30),
                  'p' :[1,2]}

gscv_knn_clf = GridSearchCV(knn_clf,hyperparameters,cv=5)
gscv_knn_clf.fit(x_train, y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=23, p=1)

In [99]:
# Testing Data Evaluation
knn_clf = gscv_knn_clf.best_estimator_

y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_test, y_pred)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[108  17]
 [ 35  32]]
********************************************************************************
Accuracy is : 0.7291666666666666
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.76      0.86      0.81       125
           1       0.65      0.48      0.55        67

    accuracy                           0.73       192
   macro avg       0.70      0.67      0.68       192
weighted avg       0.72      0.73      0.72       192



In [101]:
# Training Data Evaluation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_train, y_pred_train)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[347  28]
 [ 89 112]]
********************************************************************************
Accuracy is : 0.796875
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.80      0.93      0.86       375
           1       0.80      0.56      0.66       201

    accuracy                           0.80       576
   macro avg       0.80      0.74      0.76       576
weighted avg       0.80      0.80      0.79       576

