# KNN Algorithm

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve

import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.datasets import load_iris

In [2]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [3]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [4]:
x = df.drop('Outcome',axis = 1)
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=24, stratify=y)

In [5]:
x_test

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
385,119,54,13,50,22.3,0.205,24
397,131,66,40,0,34.3,0.196,22
537,57,60,0,0,21.7,0.735,67
268,102,52,0,0,25.1,0.078,21
244,146,76,35,194,38.2,0.329,29
...,...,...,...,...,...,...,...
75,0,48,20,0,24.7,0.140,22
650,91,54,25,100,25.2,0.234,23
429,95,82,25,180,35.0,0.233,43
594,123,72,45,230,33.6,0.733,34


In [None]:
Model training 

In [12]:
knn_clf = KNeighborsClassifier() # K=5, Euclidean distance
knn_clf.fit(x_train,y_train) # saveing the data, storing data points

KNeighborsClassifier()

In [None]:
Evaluation :
    

In [13]:
#Testing data Evaluation
y_pred = knn_clf.predict(x_test)
y_pred[30:35]

array([1, 0, 0, 0, 1], dtype=int64)

In [14]:
y_test[30:35]

653    0
52     0
706    1
204    0
568    0
Name: Outcome, dtype: int64

In [15]:
#Testing Data Evaluation
cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_test, y_pred)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[96 29]
 [32 35]]
********************************************************************************
Accuracy is : 0.6822916666666666
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.75      0.77      0.76       125
           1       0.55      0.52      0.53        67

    accuracy                           0.68       192
   macro avg       0.65      0.65      0.65       192
weighted avg       0.68      0.68      0.68       192



In [16]:
# Training Data Evaluation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_train, y_pred_train)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[334  41]
 [ 67 134]]
********************************************************************************
Accuracy is : 0.8125
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.77      0.67      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.78      0.79       576
weighted avg       0.81      0.81      0.81       576



In [None]:
change values of k 6, 11, and run code again
and check values and value of p also

In [None]:
Hyperparameter Tuning

In [17]:
knn_clf = KNeighborsClassifier() 

hyperparameters = {"n_neighbors" : np.arange(3,30),
                  'p' :[1,2]}

gscv_knn_clf = GridSearchCV(knn_clf,hyperparameters,cv=5)
gscv_knn_clf.fit(x_train, y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=7)

In [18]:
gscv_knn_clf.be st_params_

{'n_neighbors': 7, 'p': 2}

In [19]:
#Testing Data Evaluation

#knn_clf = KNeighborsClassifier()  
knn_clf = gscv_knn_clf.best_estimator_
 
cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_test, y_pred)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[96 29]
 [32 35]]
********************************************************************************
Accuracy is : 0.6822916666666666
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.75      0.77      0.76       125
           1       0.55      0.52      0.53        67

    accuracy                           0.68       192
   macro avg       0.65      0.65      0.65       192
weighted avg       0.68      0.68      0.68       192



In [20]:
# Training Data Evaluation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_train, y_pred_train)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[333  42]
 [ 68 133]]
********************************************************************************
Accuracy is : 0.8090277777777778
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.83      0.89      0.86       375
           1       0.76      0.66      0.71       201

    accuracy                           0.81       576
   macro avg       0.80      0.77      0.78       576
weighted avg       0.81      0.81      0.81       576



# Prepocessing technique


## Feature Engineering

### 1. Normalization

In [84]:
x_df = df.drop('Outcome',axis = 1)
# x_df.describe()
x_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33


In [88]:
x_df.describe()


Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,121.117188,69.076823,20.536458,79.799479,31.992578,0.471876,33.24349
std,31.805091,19.367794,15.952218,115.244002,7.88416,0.331329,11.758182
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,142.0,80.0,32.0,127.25,36.6,0.62625,41.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [90]:
X_148 = (148-121.117188)/31.805091
X_148

0.8452361290209797

In [91]:
X_148 = (85-121.117188)/31.805091
X_148

-1.1355788291880693

In [None]:
fit           >> train
fit_transform >> train
transform     >> train, test

In [83]:
normal_scaler = MinMaxScaler()
array = normal_scaler.fit_transform(x_df)
x_normal_df = pd.DataFrame(array,columns=x_df.columns)
# x_normal_df

In [86]:
std_scaler = StandardScaler()
array = std_scaler.fit_transform(x_df)
x_std_df = pd.DataFrame(array,columns=x_df.columns)
x_std_df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.845787,-0.985618,0.90727,-0.692891,0.204013,0.468492,1.426022
1,-1.136319,-0.158966,0.530902,-0.692891,-0.684422,-0.365061,-0.190927
2,1.946957,-0.262298,-1.288212,-0.692891,-1.103255,0.604397,1.596227
3,0.908711,-0.158966,0.154533,0.123302,-0.494043,-0.920763,-1.041953
4,0.908711,-1.502276,0.90727,0.765836,1.409746,5.484909,-0.020722


# Train Test Split

In [11]:
x = df.drop('Outcome',axis = 1)
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=24, stratify=y)

In [75]:
# Normalization
x = x_normal_df.copy()
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=24, stratify=y)
# x_test

In [94]:
# Standardization
x = x_std_df.copy()
y = df['Outcome']

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25, random_state=24, stratify=y)
# x_test

# Model Training

In [95]:
knn_clf = KNeighborsClassifier() # Euclidean Distance, k = 5
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

## Evaluation

In [96]:
# Testing Data Evaluation
y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_test, y_pred)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[102  23]
 [ 30  37]]
********************************************************************************
Accuracy is : 0.7239583333333334
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.77      0.82      0.79       125
           1       0.62      0.55      0.58        67

    accuracy                           0.72       192
   macro avg       0.69      0.68      0.69       192
weighted avg       0.72      0.72      0.72       192



In [97]:
# Training Data Evaluation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_train, y_pred_train)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[334  41]
 [ 63 138]]
********************************************************************************
Accuracy is : 0.8194444444444444
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.84      0.89      0.87       375
           1       0.77      0.69      0.73       201

    accuracy                           0.82       576
   macro avg       0.81      0.79      0.80       576
weighted avg       0.82      0.82      0.82       576



## Hyperparameter Tuning

In [98]:
knn_clf = KNeighborsClassifier() 

hyperparameters = {"n_neighbors" : np.arange(3,30),
                  'p' :[1,2]}

gscv_knn_clf = GridSearchCV(knn_clf,hyperparameters,cv=5)
gscv_knn_clf.fit(x_train, y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=23, p=1)

In [99]:
# Testing Data Evaluation
knn_clf = gscv_knn_clf.best_estimator_

y_pred = knn_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_test, y_pred)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[108  17]
 [ 35  32]]
********************************************************************************
Accuracy is : 0.7291666666666666
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.76      0.86      0.81       125
           1       0.65      0.48      0.55        67

    accuracy                           0.73       192
   macro avg       0.70      0.67      0.68       192
weighted avg       0.72      0.73      0.72       192



In [101]:
# Training Data Evaluation
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train, y_pred_train)
print("Confusion Matrix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train, y_pred_train)
print("Accuracy is :",accuracy)
print("*"*80)

clf_report = classification_report(y_train, y_pred_train)
print("Classification Report :\n",clf_report)

Confusion Matrix :
 [[347  28]
 [ 89 112]]
********************************************************************************
Accuracy is : 0.796875
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

           0       0.80      0.93      0.86       375
           1       0.80      0.56      0.66       201

    accuracy                           0.80       576
   macro avg       0.80      0.74      0.76       576
weighted avg       0.80      0.80      0.79       576

