In [27]:
import matplotlib.pyplot as plt
import numpy
import pandas as pd

In [None]:
# Read the CSV at ./data/diabetes.csv into a DataFrame and report its (rows, columns) shape.
data = pd.read_csv("./data/diabetes.csv")
shape_message = "dataset shape{}".format(data.shape)
print(shape_message)
# The last expression is the cell output: a preview of the first five rows.
data.head(5)

In [None]:
# Count how many rows carry each distinct value of the "Outcome" column.
outcome_groups = data.groupby("Outcome")
outcome_groups.size()

In [None]:
# Split the frame by column position: columns 0..7 form X, column 8 forms Y.
feature_columns = slice(0, 8)
target_column = 8
X = data.iloc[:, feature_columns]
Y = data.iloc[:, target_column]
shapes_message = "shape of X {}; shape of Y {}".format(X.shape, Y.shape)
print(shapes_message)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# The split is seeded: without random_state, train_test_split shuffles
# differently on every run, so the scores printed by later cells would change
# each time the notebook is re-executed from a fresh kernel.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

# (label, unfitted estimator) pairs; the cells that loop over `models` fit and
# score each estimator in this order.
models = [
    ("KNN", KNeighborsClassifier(n_neighbors=2)),
    ("KNN with weights", KNeighborsClassifier(n_neighbors=2, weights="distance")),
    ("Radius Neighbors", RadiusNeighborsClassifier(radius=500.0)),
]

In [None]:
# Fit every candidate on the training split and record its score on the test split.
results = []
for name, model in models:
    model.fit(X_train, Y_train)
    holdout_score = model.score(X_test, Y_test)
    results.append((name, holdout_score))

# Report one line per candidate, in the order the candidates were defined.
for model_name, holdout_score in results:
    print("name: {};score: {}".format(model_name, holdout_score))

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Cross-validate each candidate on the full X / Y with 10 folds and keep the
# array of per-fold scores alongside the candidate's label.
results = []
for name, model in models:
    kfold = KFold(n_splits=10)
    fold_scores = cross_val_score(model, X, Y, cv=kfold)
    results.append((name, fold_scores))

# Print the average of the per-fold scores for every candidate.
for model_name, fold_scores in results:
    print("name: {};cross val score: {}".format(model_name, fold_scores.mean()))

In [None]:
# Fit a 2-neighbour KNN on the training split, then compare its score on the
# data it was fitted on with its score on the held-out split.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, Y_train)
train_score, test_score = (
    knn.score(X_train, Y_train),
    knn.score(X_test, Y_test),
)
score_message = "train score: {};test score: {}".format(train_score, test_score)
print(score_message)

In [None]:
from common.utils import plot_learning_curve
from sklearn.model_selection import ShuffleSplit

# Ten shuffled splits, each holding out 20% of the rows, with a fixed seed.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
knn = KNeighborsClassifier(n_neighbors=2)
curve_title = "Learn Curve for KNN Diabetes"

# NOTE(review): plot_learning_curve is a project helper not visible here;
# presumably it draws onto the figure created just before the call — confirm.
plt.figure(figsize=(10, 6))
plot_learning_curve(knn, curve_title, X, Y, ylim=(0.0, 1.01), cv=cv)
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest

# Reduce X to the two columns chosen by SelectKBest (k=2) with respect to Y.
selector = SelectKBest(k=2)
selector.fit(X, Y)
X_new = selector.transform(X)
# Cell output: the first five rows of the reduced feature matrix.
X_new[:5]

In [None]:
# Repeat the 10-fold cross-validation of every candidate, this time on the two
# selected features in X_new.
results = []
for name, model in models:
    kfold = KFold(n_splits=10)
    cv_result = cross_val_score(model, X_new, Y, cv=kfold)
    # Keep the whole array of per-fold scores (not just its first element) so
    # that .mean() below averages over all ten folds.
    results.append((name, cv_result))

# Print the mean of the per-fold scores for every candidate.
for model_name, fold_scores in results:
    print("name: {};cross val score: {}".format(model_name, fold_scores.mean()))

In [None]:
# Scatter the two selected feature columns against each other, using one
# colour/marker per outcome class (Y == 0: red circles, Y == 1: green triangles).
outcome_0_rows = X_new[Y == 0]
outcome_1_rows = X_new[Y == 1]

plt.figure(figsize=(10, 6))
# NOTE(review): the labels assume column 0 of X_new is Glucose and column 1 is
# BMI — confirm against the columns SelectKBest actually kept.
plt.xlabel("Glucose")
plt.ylabel("BMI")
plt.scatter(outcome_0_rows[:, 0], outcome_0_rows[:, 1], c="r", s=20, marker="o")
plt.scatter(outcome_1_rows[:, 0], outcome_1_rows[:, 1], c="g", s=20, marker="^")