In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import confusion_matrix,classification_report, accuracy_score, roc_curve

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_diabetes

In [2]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1


In [3]:
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# Preprocessing Steps

### Feature engineering

### 1.Normalization

In [4]:
x_df = df.drop("Outcome",axis = 1)
x_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,148,50,35,0,33.6,0.627,50
1,85,66,29,0,26.6,0.351,31
2,183,64,0,0,23.3,0.672,52
3,150,66,23,94,28.1,0.167,21
4,150,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63
764,122,70,27,0,36.8,0.340,27
765,121,72,23,112,26.2,0.245,30
766,126,60,0,0,30.1,0.349,47


In [5]:
x_df.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,121.117188,69.076823,20.536458,79.799479,31.992578,0.471876,33.24349
std,31.805091,19.367794,15.952218,115.244002,7.88416,0.331329,11.758182
min,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,142.0,80.0,32.0,127.25,36.6,0.62625,41.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [None]:
fit            >> train
fit_transform  >> train
transform      >> train, test

In [11]:
normal_scalar= MinMaxScaler()
array = normal_scalar.fit_transform(x_df)
array

array([[0.74371859, 0.40983607, 0.35353535, ..., 0.50074516, 0.23441503,
        0.48333333],
       [0.42713568, 0.54098361, 0.29292929, ..., 0.39642325, 0.11656704,
        0.16666667],
       [0.91959799, 0.52459016, 0.        , ..., 0.34724292, 0.25362938,
        0.51666667],
       ...,
       [0.6080402 , 0.59016393, 0.23232323, ..., 0.390462  , 0.07130658,
        0.15      ],
       [0.63316583, 0.49180328, 0.        , ..., 0.4485842 , 0.11571307,
        0.43333333],
       [0.46733668, 0.57377049, 0.31313131, ..., 0.45305514, 0.10119556,
        0.03333333]])

In [14]:
x_normal_df = pd.DataFrame(array,columns = x_df.columns)
x_normal_df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.743719,0.409836,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.516667
3,0.753769,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.753769,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...
763,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


In [15]:
#x_148 = (Xi - Xmin) / (Xmax - Xmin)
x_148 = (148 - 0) / (199 - 0)
x_148

0.7437185929648241

# Train test Split

In [20]:
x = x_normal_df.copy()
y = df["Outcome"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42,stratify=y)

In [21]:
x_test

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
635,0.522613,0.590164,0.000000,0.000000,0.464978,0.165243,0.283333
698,0.638191,0.721311,0.111111,0.183215,0.514158,0.222032,0.116667
637,0.472362,0.622951,0.181818,0.078014,0.470939,0.243809,0.033333
402,0.683417,0.688525,0.414141,0.104019,0.521610,0.088813,0.233333
425,0.924623,0.639344,0.393939,0.327423,0.551416,0.079419,0.166667
...,...,...,...,...,...,...,...
62,0.221106,0.508197,0.000000,0.000000,0.372578,0.217336,0.250000
477,0.572864,0.622951,0.171717,0.130024,0.354694,0.165670,0.166667
311,0.532663,0.573770,0.373737,0.174941,0.587183,0.225021,0.016667
116,0.623116,0.606557,0.000000,0.000000,0.506706,0.060632,0.283333


# Model Training

In [22]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train,y_train)

KNeighborsClassifier()

# Evaluation

In [23]:
y_pred = knn_clf.predict(x_test)
y_pred

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0], dtype=int64)

In [24]:
# Testing data Evalutaion
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_test,y_pred)
print("Classification report :\n",clf_report)

Confision marix :
 [[104  21]
 [ 32  35]]
********************************************************************************
Accuracy score is 0.7239583333333334
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.76      0.83      0.80       125
           1       0.62      0.52      0.57        67

    accuracy                           0.72       192
   macro avg       0.69      0.68      0.68       192
weighted avg       0.72      0.72      0.72       192



In [25]:
# Training data Evalutaion
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train,y_pred_train)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_train,y_pred_train)
print("Classification report :\n",clf_report)

Confision marix :
 [[329  46]
 [ 59 142]]
********************************************************************************
Accuracy score is 0.8177083333333334
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.85      0.88      0.86       375
           1       0.76      0.71      0.73       201

    accuracy                           0.82       576
   macro avg       0.80      0.79      0.80       576
weighted avg       0.82      0.82      0.82       576



# Hyperparameter tuning

In [29]:
knn_clf = KNeighborsClassifier()

hyperparameters = {"n_neighbors":np.arange(3,30),
                  "p" : [1,2]}

gscv_knn_clf = GridSearchCV(knn_clf,hyperparameters,cv = 5)
gscv_knn_clf.fit(x_train,y_train)
gscv_knn_clf.best_estimator_

KNeighborsClassifier(n_neighbors=7, p=1)

In [31]:
# Testing data Evalutaion


knn_clf = gscv_knn_clf.best_estimator_

cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_test,y_pred)
print("Classification report :\n",clf_report)

Confision marix :
 [[104  21]
 [ 32  35]]
********************************************************************************
Accuracy score is 0.7239583333333334
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.76      0.83      0.80       125
           1       0.62      0.52      0.57        67

    accuracy                           0.72       192
   macro avg       0.69      0.68      0.68       192
weighted avg       0.72      0.72      0.72       192



In [32]:
# Training data Evalutaion
y_pred_train = knn_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print("Confision marix :\n",cnf_matrix)
print("*"*80)
accuracy = accuracy_score(y_train,y_pred_train)
print("Accuracy score is",accuracy)
print("*"*80)
clf_report = classification_report(y_train,y_pred_train)
print("Classification report :\n",clf_report)

Confision marix :
 [[328  47]
 [ 66 135]]
********************************************************************************
Accuracy score is 0.8038194444444444
********************************************************************************
Classification report :
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       375
           1       0.74      0.67      0.70       201

    accuracy                           0.80       576
   macro avg       0.79      0.77      0.78       576
weighted avg       0.80      0.80      0.80       576



# Standardization

std_scalar= StandardScaler()
array = normal_scalar.fit_transform(x_df)
x_std_df = pd.DataFrame(array,columns = x_df.columns)
x_std_df