## KNN - Predict Diabetes

In [7]:
# import required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [8]:
# Read the diabetes dataset into a DataFrame and display its number of rows
df = pd.read_csv("datasets/diabetes.csv")
df.shape[0]

768

In [9]:
# Preview the first five rows (head() returns five rows by default)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Impute zero values with the mean

In [10]:
# Columns in which a zero is treated as a missing measurement rather than a real value
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

# Impute each of those columns independently.
# Note: the mean is computed over the whole dataset, i.e. before the
# train/test split that happens later in the notebook.
for column in zero_not_accepted:
    # Mark zeros as missing. Use np.nan: the np.NaN alias was removed in NumPy 2.0
    # and raises AttributeError there.
    df[column] = df[column].replace(0, np.nan)

    # Mean of the remaining (non-missing) values, truncated to an integer
    mean = int(df[column].mean(skipna=True))

    # Fill the missing entries with that truncated mean
    df[column] = df[column].fillna(mean)


### Train test split

In [11]:
# Separate the first eight columns (features) from the ninth column (target)
X = df.iloc[:, :8]
y = df.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Feature Scaling

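- Some algorithms, such as k-nearest neighbors (KNN) and support vector machines (SVM), are sensitive to the scale of the features and may produce inaccurate results if the features are not scaled appropriately. KNN, for example, classifies by distance, so a feature with a large numeric range would otherwise dominate the distance calculation.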

In [12]:
# Learn the scaling parameters from the training features only
sc_X = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both the training and the test features
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

### Make Predictions

In [13]:
# k-nearest-neighbours model: vote among the 11 closest points, Euclidean distance
classifier = KNeighborsClassifier(
    metric='euclidean',
    p=2,
    n_neighbors=11,
)

In [14]:
# Train the KNN model on the training features and their labels
classifier.fit(X=X_train, y=y_train)