In [None]:
# Setup: third-party imports, plot theme, and the raw dataset.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import style
from sklearn import neighbors
from sklearn.neighbors import KNeighborsClassifier

# Dark plot theme, chosen to match the VSCode notebook view.
style.use('dark_background')

# Load the CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv('diabetes_binary_health_indicators_BRFSS2015.csv')

In [None]:
# Preprocessing: separate the feature matrix from the label column.
TARGET_COLUMN = 'Diabetes_binary'

# Features are every column except the label.
X = data.drop(columns=TARGET_COLUMN)
# Select the label by name rather than by position, so X and y always agree on
# which column is the target even if the column order of the CSV changes.
y = data[TARGET_COLUMN]


In [None]:
# Hold out 5% of the rows as the test set; the remaining rows are used for training.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [None]:

# Test-set misclassification rate for every K from 1 to 20, stored in order of K
# (error[0] belongs to K=1).
# NOTE(review): K is compared here on the same test split that the final evaluation
# further down uses, so those final metrics will be optimistic; consider a validation
# split or cross-validation for choosing K.
error = []
for k in range(1, 21):
    # fit() returns the estimator itself, so the call can be chained.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    mismatches = pred_i != y_test
    error.append(np.mean(mismatches))


In [None]:
# Turn each error rate into an accuracy percentage; avg[0] corresponds to K=1.
avg = (1 - np.array(error)) * 100

# Iterate over whatever the sweep produced instead of a hard-coded 20 entries, and
# print two decimals: "%d" would truncate the float to a whole percent and hide the
# differences between neighbouring K values.
for k, accuracy in enumerate(avg, start=1):
    print("If K = %d , then accuracy is %.2f%%" % (k, accuracy))

In [None]:
# Error-rate curve: one marker per K (1..20), followed by the raw error values.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    range(1, 21),
    error,
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=10,
)
ax.set_title('Error Rate K Value')
ax.set_xlabel('K Value')
ax.set_ylabel('Mean Error')
print(error)

In [None]:
# Train the final KNN classifier with k=16 on the training split.
# NOTE(review): the decision-surface cell further down fits its 2-D model with k=12 —
# confirm which K the error-rate curve actually favours and keep the two consistent.
classifier = KNeighborsClassifier(n_neighbors=16)
# Bare trailing expression: the notebook displays the fitted estimator.
classifier.fit(X_train, y_train)

In [None]:
# Predict a label for every row of the test split with the fitted classifier.
y_pred = classifier.predict(X_test)
# Bare trailing expression so the notebook displays the prediction array.
y_pred

In [None]:
# Evaluate the fitted `classifier` (built with n_neighbors=16) on the test split:
# the confusion matrix first, then the classification report.
from sklearn.metrics import classification_report, confusion_matrix

print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

In [None]:
# Decision surface of a KNN classifier in the 2-D PCA projection of the features.
# This is a visualisation aid: the model drawn here is fitted on the two principal
# components of all rows, so it is a different model from `classifier` evaluated above.
import numpy as np
from sklearn.datasets import make_blobs  # not used in this cell
from sklearn.linear_model import LogisticRegression  # not used in this cell
from matplotlib.colors import ListedColormap
from sklearn.decomposition import PCA

# Colour maps: light shades for the predicted regions, saturated ones for the points.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00'])

# Project the features onto their first two principal components so they can be
# drawn on a plane (fit and transform in a single pass).
pca = PCA(n_components=2)
PCAX = pca.fit_transform(X)

# Fit a KNN model on the projected points.
knn = neighbors.KNeighborsClassifier(n_neighbors=12)
knn.fit(PCAX, y)

# Bounds of the plotting domain, padded by 0.1 beyond the projected data range.
x_min, x_max = PCAX[:, 0].min() - .1, PCAX[:, 0].max() + .1
y_min, y_max = PCAX[:, 1].min() - .1, PCAX[:, 1].max() + .1

# 100 x 100 grid over the domain; the model is evaluated at every grid node and the
# flat prediction vector is folded back into the grid's shape for plotting.
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                     np.linspace(y_min, y_max, 100))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the projected samples, coloured by their true label.
plt.scatter(PCAX[:, 0], PCAX[:, 1], c=y, cmap=cmap_bold, s=1)

# Label the figure so it can be read on its own.
plt.title('KNN decision surface in PCA space (k=12)')
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.show()


In [None]:
# ROC curve for the fitted KNN `classifier` on the held-out test split.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Positive-class probability for every test row, computed once and shared by both
# metrics. The AUC has to be computed from these scores, not from the hard 0/1 labels
# returned by predict(): hard labels reduce the curve to a single operating point, so
# the value shown in the legend would not be the area under the curve plotted below.
y_score = classifier.predict_proba(X_test)[:, 1]

# Variable name kept as-is so any later cell that reads it keeps working.
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='KNN (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): the figure is written to 'Log_ROC' in the working directory.
plt.savefig('Log_ROC')
plt.show()