# K-Nearest Neighbors Classification on the Diabetes Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Read the diabetes CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
file_path = 'C:/Users/ratho/OneDrive/Desktop/diabetes.csv'
dataset = pd.read_csv(file_path)

# Sanity check: number of rows, then the first five rows.
print("Dataset length:", dataset.shape[0])
print(dataset.head(5))

# Zeros in the listed columns are treated as missing values and replaced with
# the column mean computed over the remaining (non-zero) entries.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split further down, so test rows contribute to the imputed value.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in zero_not_accepted:
    # Use `np.nan`: the `np.NaN` alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    dataset[column] = dataset[column].replace(0, np.nan)
    # The mean is truncated to an integer before being used as the fill value.
    mean = int(dataset[column].mean(skipna=True))
    dataset[column] = dataset[column].fillna(mean)


# Columns 0-7 are used as the feature matrix; column 8 is the target.
X = dataset.iloc[:, :8]
y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Report the size of each partition.
print("Training data points:", X_train.shape[0])
print("Training labels:", y_train.shape[0])
print("Testing data points:", X_test.shape[0])
print("Testing labels:", y_test.shape[0])

# Standardize the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

# k-nearest-neighbours classifier using the 11 closest training points under
# Euclidean distance. `p` only parameterizes the Minkowski metric (and 2 is
# its default), so it is not passed alongside metric='euclidean'.
classifier = KNeighborsClassifier(n_neighbors=11, metric='euclidean')
classifier.fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X_test)

# Compare the predictions with the true test labels and print the metrics.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:", cm, sep="\n")
print("F1 Score:", f1_score(y_true=y_test, y_pred=y_pred))
print("Accuracy Score:", accuracy_score(y_true=y_test, y_pred=y_pred))


Dataset length: 768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
Training data points: 614
Training labels: 614
Testing data points: 154
Testing labels: 154
Confusion Matrix:
[[94 13]
 [15 32]]
F1 Score: 0.6956521739130436
Accuracy Score: 0.8181818181818182
