# KNN: Predict Diabetes

In [2]:
# Standard Python Packages
import pandas as pd
import numpy as np

# SK Tools
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Reduce Bias of Large Numbers
from sklearn.preprocessing import StandardScaler

# Verify Model Accuracy
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [6]:
# Load the diabetes dataset from disk and preview its first rows
# NOTE(review): this is an absolute, machine-specific path — the notebook will
# not run on another machine without editing it.
csv_path = r"C:\Users\zacha\龙京\python\youtube\diabetes.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Row count of the loaded dataset
n_rows = df.shape[0]
print(n_rows)

768


In [11]:
# Eliminating Zero
# In the columns listed below a 0 is treated as a missing reading: it is
# masked out and replaced by the mean of the remaining values in that column.
no_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for column in no_zero:
    # Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
    # AttributeError there.
    observed = df[column].replace(0, np.nan)
    # The mean ignores the masked zeros and is truncated to an integer before
    # being used as the fill value (e.g. a mean of 32.9 fills with 32).
    mean = int(observed.mean(skipna=True))
    df[column] = observed.fillna(mean)

# NOTE(review): the fill values are computed over the whole dataset, i.e.
# before the train/test split performed later in the notebook, so test rows
# contribute to the statistics used to fill training rows. Confirm this is
# acceptable, or compute the means from the training split only.

In [14]:
# Separate the predictors (first eight columns) from the target (ninth column),
# then hold out 20% of the rows for testing with a fixed seed
# NOTE(review): columns are selected by position — presumably the ninth column
# is 'Outcome'; verify against the CSV layout.
X, y = df.iloc[:, :8], df.iloc[:, 8]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Standardise the features: the scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [16]:
# Starting point for K: square root of the number of test samples
# NOTE(review): this uses the size of the test split; confirm that the
# training-split size was not the intended input to this heuristic.
import math

n_test_samples = len(y_test)
math.sqrt(n_test_samples)

12.409673645990857

In [17]:
# Build the (not yet fitted) KNN classifier: 11 neighbours, Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
)

In [20]:
# Train the classifier on the training split; the call is the cell's last
# expression, so its return value is shown as the cell output
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=11)

In [21]:
# Predict a label for every row of the held-out test split and print them
y_pred = classifier.predict(X=X_test)
print(y_pred)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [24]:
# Compare the true test labels with the predictions as a confusion matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [25]:
# Accuracy of the predictions against the true test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.8181818181818182
