In [15]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from itertools import accumulate
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix
from sklearn.feature_selection import f_classif
from sklearn.utils import resample

# to supress warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

sns.set_context('notebook')
sns.set_style('white')



In [16]:
df = pd.read_excel('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/X4i8vXLw81g4wEH473zIFA/Diabetes-Classification.xlsx')

In [17]:
df.head()

Unnamed: 0,Patient number,Cholesterol,Glucose,HDL Chol,Chol/HDL ratio,Age,Gender,Height,Weight,BMI,Systolic BP,Diastolic BP,waist,hip,Waist/hip ratio,Diabetes,Unnamed: 16,Unnamed: 17
0,1,193,77,49,3.9,19,female,61,119,22.5,118,70,32,38,0.84,No diabetes,6.0,6.0
1,2,146,79,41,3.6,19,female,60,135,26.4,108,58,33,40,0.83,No diabetes,,
2,3,217,75,54,4.0,20,female,67,187,29.3,110,72,40,45,0.89,No diabetes,,
3,4,226,97,70,3.2,20,female,64,114,19.6,122,64,31,39,0.79,No diabetes,,
4,5,164,91,67,2.4,20,female,70,141,20.2,122,86,32,39,0.82,No diabetes,,


In [19]:
# last columns are not nescessary, hence dropping those
df.drop(columns=['Unnamed: 16', 'Unnamed: 17'])
df.columns

Index(['Patient number', 'Cholesterol', 'Glucose', 'HDL Chol',
       'Chol/HDL ratio', 'Age', 'Gender', 'Height', 'Weight', 'BMI',
       'Systolic BP', 'Diastolic BP', 'waist', 'hip', 'Waist/hip ratio',
       'Diabetes', 'Unnamed: 16', 'Unnamed: 17'],
      dtype='object')

In [20]:
# before analysing seeing what proportion of patient have diabetes and not
frequency_table = df['Diabetes'].value_counts(); #returns count of unique values
props = frequency_table.apply(lambda x: x/len(df['Diabetes']))
print(props)

Diabetes
No diabetes    0.846154
Diabetes       0.153846
Name: count, dtype: float64


In [22]:
# reducing becoz we're going to compute using thies fields only
attributes_required = ['Diabetes', 'Cholesterol', 'Glucose','BMI', 'Waist/hip ratio', 'HDL Chol', 'Chol/HDL ratio', 'Systolic BP', 'Diastolic BP', 'Weight' ]
df_reduced = df[attributes_required]

df_reduced.head()

Unnamed: 0,Diabetes,Cholesterol,Glucose,BMI,Waist/hip ratio,HDL Chol,Chol/HDL ratio,Systolic BP,Diastolic BP,Weight
0,No diabetes,193,77,22.5,0.84,49,3.9,118,70,119
1,No diabetes,146,79,26.4,0.83,41,3.6,108,58,135
2,No diabetes,217,75,29.3,0.89,54,4.0,110,72,187
3,No diabetes,226,97,19.6,0.79,70,3.2,122,64,114
4,No diabetes,164,91,20.2,0.82,67,2.4,122,86,141


In [25]:
# diabetes is target column (type is string in this case)
# hence filter only numeric cols for applying scaling
numerical_columns = df_reduced.iloc[:, 1:10]
numerical_columns

Unnamed: 0,Cholesterol,Glucose,BMI,Waist/hip ratio,HDL Chol,Chol/HDL ratio,Systolic BP,Diastolic BP,Weight
0,193,77,22.5,0.84,49,3.9,118,70,119
1,146,79,26.4,0.83,41,3.6,108,58,135
2,217,75,29.3,0.89,54,4.0,110,72,187
3,226,97,19.6,0.79,70,3.2,122,64,114
4,164,91,20.2,0.82,67,2.4,122,86,141
...,...,...,...,...,...,...,...,...,...
385,227,105,25.2,0.88,44,5.2,150,90,125
386,226,279,37.5,0.85,52,4.3,144,88,192
387,301,90,21.7,0.76,118,2.6,218,90,115
388,232,184,24.0,0.92,114,2.0,170,82,127


In [26]:
# use of StandardScaler is to scale the data to reduce the impact of extreme values
# apply scaler
scaler = StandardScaler()
preproc_reduced = scaler.fit(numerical_columns)
# preproc_reduced is a StandardScaler Object
# fit method returns a StandardScaler Object

In [27]:
df_standardized = preproc_reduced.transform(numerical_columns)
# transform returns a ndarray or spmatrix
# main parameter - a matrix like object (like StandardScaler())
df_standardized


array([[-0.31901328, -0.5646553 , -0.95194412, ..., -0.83807114,
        -0.98582201, -1.44731175],
       [-1.37261932, -0.52743157, -0.36035801, ..., -1.27608741,
        -1.87597193, -1.05084021],
       [ 0.21899831, -0.60187902,  0.07953936, ..., -1.18848416,
        -0.83746369,  0.23769232],
       ...,
       [ 2.10203888, -0.32270108, -1.07329512, ...,  3.54209161,
         0.49776118, -1.54642964],
       [ 0.55525555,  1.42681407, -0.724411  , ...,  1.43961349,
        -0.0956721 , -1.24907598],
       [-0.94669348, -0.24825362,  1.65710232, ...,  1.00159722,
        -0.0956721 ,  0.98107646]])

In [28]:
# converting the ndarray to pandas dataframe
df_standardized = pd.DataFrame(df_standardized, columns=numerical_columns.columns)
# Dataframe method takes 2 params, one is an Iterable Object, second is columns indexes
df_standardized.head()

Unnamed: 0,Cholesterol,Glucose,BMI,Waist/hip ratio,HDL Chol,Chol/HDL ratio,Systolic BP,Diastolic BP,Weight
0,-0.319013,-0.564655,-0.951944,-0.565995,-0.073401,-0.360132,-0.838071,-0.985822,-1.447312
1,-1.372619,-0.527432,-0.360358,-0.70276,-0.536983,-0.533102,-1.276087,-1.875972,-1.05084
2,0.218998,-0.601879,0.079539,0.117828,0.216339,-0.302476,-1.188484,-0.837464,0.237692
3,0.420753,-0.192418,-1.391841,-1.249818,1.143504,-0.763729,-0.662865,-1.430897,-1.571209
4,-0.969111,-0.304089,-1.300828,-0.839524,0.96966,-1.224982,-0.662865,0.201045,-0.902163


In [29]:
# concatatenating the removed col before applying scaler
df_stize = pd.concat([df_reduced['Diabetes'], df_standardized], axis=1)
df_stize

Unnamed: 0,Diabetes,Cholesterol,Glucose,BMI,Waist/hip ratio,HDL Chol,Chol/HDL ratio,Systolic BP,Diastolic BP,Weight
0,No diabetes,-0.319013,-0.564655,-0.951944,-0.565995,-0.073401,-0.360132,-0.838071,-0.985822,-1.447312
1,No diabetes,-1.372619,-0.527432,-0.360358,-0.702760,-0.536983,-0.533102,-1.276087,-1.875972,-1.050840
2,No diabetes,0.218998,-0.601879,0.079539,0.117828,0.216339,-0.302476,-1.188484,-0.837464,0.237692
3,No diabetes,0.420753,-0.192418,-1.391841,-1.249818,1.143504,-0.763729,-0.662865,-1.430897,-1.571209
4,No diabetes,-0.969111,-0.304089,-1.300828,-0.839524,0.969660,-1.224982,-0.662865,0.201045,-0.902163
...,...,...,...,...,...,...,...,...,...,...
385,No diabetes,0.443170,-0.043523,-0.542385,-0.018937,-0.363140,0.389404,0.563581,0.497761,-1.298635
386,Diabetes,0.420753,3.194941,1.323387,-0.429231,0.100443,-0.129506,0.300771,0.349403,0.361590
387,No diabetes,2.102039,-0.322701,-1.073295,-1.660112,3.924999,-1.109668,3.542092,0.497761,-1.546430
388,Diabetes,0.555256,1.426814,-0.724411,0.528122,3.693208,-1.455608,1.439613,-0.095672,-1.249076


In [30]:
# split the dataset now
# X will have all attributes but target(predictors)
# y will have only Diabetes attribute (target)
X = df_stize.drop(columns=['Diabetes'])
y = df_stize['Diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
# encoding target variable from string to numerical (0 or 1)
# becoz KNN requires numerical input

# Initialize LabelEncoder
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit_transform(y_test)

# fit_transform takes a arraylike param and returns ndarray (encoded)


In [36]:
# creating the knn model
knn = KNeighborsClassifier()
# fit the knn model
knn.fit(X_train, y_train_encoded)
# after fitting , predicting the target using test dataset
y_pred = knn.predict(X_test)
# calculate overall accuracy by comparing below two
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f'Accuracy: {accuracy:.2%}')
# i got 88.46% yours will differ


Accuracy: 88.46%


In [37]:
# lets see if we can work on tuning to improve accuracy
# hyperparameter tuning
# in this case the hyperparameter is number of neighbors

knn = KNeighborsClassifier()

param_grid = {'n_neighbors': range(1,12)}
param_grid # it is a dict we are gonna use

{'n_neighbors': range(1, 12)}

## GridSearchCV:

GridSearchCV is a function in the sklearn library that performs an exhaustive search over a specified parameter grid for an estimator (in this case, the K-Nearest Neighbors model knn).
It trains the model for every combination of the provided parameters and evaluates it using cross-validation to find the best combination of hyperparameters.

### expalining the parameters passed

##### knn:

This is the machine learning model you are trying to tune, which in this case is likely a K-Nearest Neighbors (KNN) classifier or regressor.


##### param_grid:

This is a dictionary or a list of dictionaries where the keys are the hyperparameter names, and the values are the lists of settings to be tested. For example:
python

param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
GridSearchCV will test all possible combinations of these hyperparameters.


##### cv=10:

cv stands for cross-validation. Setting cv=10 means that the grid search will use 10-fold cross-validation. This means that the dataset will be split into 10 parts (folds), and for each combination of parameters, the model will be trained on 9 folds and tested on the 10th. This process is repeated 10 times, with each fold being used as the test set once.

In [53]:
#performing grid search with cross validation
grid_search = GridSearchCV(knn,param_grid,cv=10)
# grid_search is a GridSearchCV object

In [54]:
#fitting the hyperparameters after cross-validation
grid_search.fit(X_train, y_train_encoded)


In [57]:
print("Best params found", grid_search.best_params_)
print(f"Best accuracy score: , {grid_search.best_score_:.3f}")

# i got n_neighbors = 7 as my best hence my k value is 7

Best params found {'n_neighbors': 7}
Best accuracy score: , 0.917


In [56]:
# full results
results = grid_search.cv_results_

for mean_score, std_score, params in zip(results['mean_test_score'],results['std_test_score'], results['params']):
    print(f"Mean accuracy: {mean_score:.3f} (std: {std_score:.3f}) with: {params}")

Mean accuracy: 0.875 (std: 0.053) with: {'n_neighbors': 1}
Mean accuracy: 0.820 (std: 0.047) with: {'n_neighbors': 2}
Mean accuracy: 0.901 (std: 0.037) with: {'n_neighbors': 3}
Mean accuracy: 0.897 (std: 0.038) with: {'n_neighbors': 4}
Mean accuracy: 0.913 (std: 0.038) with: {'n_neighbors': 5}
Mean accuracy: 0.913 (std: 0.038) with: {'n_neighbors': 6}
Mean accuracy: 0.917 (std: 0.043) with: {'n_neighbors': 7}
Mean accuracy: 0.917 (std: 0.043) with: {'n_neighbors': 8}
Mean accuracy: 0.917 (std: 0.036) with: {'n_neighbors': 9}
Mean accuracy: 0.917 (std: 0.036) with: {'n_neighbors': 10}
Mean accuracy: 0.913 (std: 0.038) with: {'n_neighbors': 11}
