In [41]:
import pandas as pd
import sklearn
import scipy
import numpy as np
import seaborn

## Baseline Models

In [42]:
# Load the dataset, binarise it, and hold out 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

BASELINE_ACC = []
# Map every categorical label to a binary value.
# Male -> 0 and Female -> 1 (seems to make the models work better)
binary_codes = {"Yes": 1, "No": 0, "Male": 0, "Female": 1, "Positive": 1, "Negative": 0}
data = pd.read_csv("diabetes_data.csv").replace(binary_codes)
train_set, test_set = train_test_split(data, test_size=0.2, random_state=43)
# Work on copies so the original splits survive any accidental mutation.
train_set_copy, test_set_copy = train_set.copy(), test_set.copy()
data.describe()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
count,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0,520.0
mean,48.028846,0.369231,0.496154,0.448077,0.417308,0.586538,0.455769,0.223077,0.448077,0.486538,0.242308,0.459615,0.430769,0.375,0.344231,0.169231,0.615385
std,12.151466,0.483061,0.500467,0.497776,0.493589,0.492928,0.498519,0.41671,0.497776,0.5003,0.428892,0.498846,0.495661,0.484589,0.475574,0.375317,0.486973
min,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,47.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,57.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
max,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### KNN Model

In [43]:
from sklearn.neighbors import KNeighborsClassifier

# The final column is the target; every column before it is a feature.
col_num = data.shape[1] - 1
# Default number of neighbors
num_neighbors = 5
train_features = train_set_copy.iloc[:, :col_num].to_numpy()
train_target = train_set_copy.iloc[:, -1].to_numpy()
# Unweighted KNN baseline
neigh = KNeighborsClassifier(n_neighbors=num_neighbors).fit(train_features, train_target)
# Distance-weighted KNN; the fit call stays last so the estimator repr is displayed.
neigh_weighted = KNeighborsClassifier(n_neighbors=num_neighbors, weights="distance")
neigh_weighted.fit(train_features, train_target)

KNeighborsClassifier(weights='distance')

In [44]:
# Test accuracy of both KNN variants on the held-out test set.
from sklearn.metrics import accuracy_score

test_features = test_set_copy.iloc[:, :col_num].values
actual = test_set_copy.iloc[:, -1]
knn_predictions = neigh.predict(test_features)
knn_weighted_predictions = neigh_weighted.predict(test_features)
# accuracy_score is documented as (y_true, y_pred): pass the ground truth
# first, the same order confusion_matrix_learn uses. Accuracy itself is
# symmetric, so the reported numbers do not change.
knn_acc = accuracy_score(actual, knn_predictions)
knn_weighted = accuracy_score(actual, knn_weighted_predictions)
print("KNN score: ", knn_acc)
print("KNN weighted score: ", knn_weighted)
# Only the distance-weighted score is recorded as the KNN baseline.
# NOTE: re-running this cell appends another entry to BASELINE_ACC.
BASELINE_ACC.append(knn_weighted)

KNN score:  0.8942307692307693
KNN weighted score:  0.9326923076923077


In [45]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def confusion_matrix_learn(actual, predictions):
    """Print the confusion matrix and classification report for one model.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    predictions : array-like
        Predicted labels, in the same order as ``actual``.

    Returns
    -------
    None
        Both reports are printed rather than returned.
    """
    matrix = confusion_matrix(actual, predictions)
    report = classification_report(actual, predictions)
    print(matrix)
    print(report)


# Confusion matrix for KNN weighted
confusion_matrix_learn(actual, knn_weighted_predictions)

[[38  1]
 [ 6 59]]
              precision    recall  f1-score   support

           0       0.86      0.97      0.92        39
           1       0.98      0.91      0.94        65

    accuracy                           0.93       104
   macro avg       0.92      0.94      0.93       104
weighted avg       0.94      0.93      0.93       104



### Naive Bayes

In [46]:
## Fit a Gaussian naive Bayes model on the training features
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_features, train_target)
## Observe initial accuracy on the held-out test set
naive_predictions = gnb.predict(test_features)
# Ground truth first: accuracy_score is documented as (y_true, y_pred).
naive_acc = accuracy_score(actual, naive_predictions)
print(f"Naive Bayes Accuracy: {naive_acc}" )
# NOTE: re-running this cell appends another entry to BASELINE_ACC.
BASELINE_ACC.append(naive_acc)

Naive Bayes Accuracy: 0.8942307692307693


In [47]:
# Print the confusion matrix and classification report for the naive Bayes test predictions.
confusion_matrix_learn(actual, naive_predictions)

[[34  5]
 [ 6 59]]
              precision    recall  f1-score   support

           0       0.85      0.87      0.86        39
           1       0.92      0.91      0.91        65

    accuracy                           0.89       104
   macro avg       0.89      0.89      0.89       104
weighted avg       0.89      0.89      0.89       104



### SVM

In [48]:
from sklearn import svm

# Support vector classifier with library-default settings.
# NOTE(review): in the recorded run this model predicts class 1 for every
# test row (see the confusion matrix printed by the next cell). The features
# are passed in unscaled - presumably that is the cause; consider
# standardising them before SVC. TODO confirm.
SVM_Clss = svm.SVC()
SVM_Clss.fit(train_features, train_target)
# Observe accuracy for SVM on the held-out test set
SVM_predictions = SVM_Clss.predict(test_features)
# Ground truth first: accuracy_score is documented as (y_true, y_pred).
svm_acc = accuracy_score(actual, SVM_predictions)
print(f"SVM acc: {svm_acc}" )
# NOTE: re-running this cell appends another entry to BASELINE_ACC.
BASELINE_ACC.append(svm_acc)

SVM acc: 0.625


In [49]:
# Print the confusion matrix and classification report for the SVM test predictions.
confusion_matrix_learn(actual, SVM_predictions)

[[ 0 39]
 [ 0 65]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        39
           1       0.62      1.00      0.77        65

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.39      0.62      0.48       104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [50]:
from sklearn import tree

# random_state is fixed so the fitted tree is reproducible between runs.
dec_tree = tree.DecisionTreeClassifier(random_state=0)
dec_tree = dec_tree.fit(train_features, train_target)
dec_tree_predictions = dec_tree.predict(test_features)
# Ground truth first: accuracy_score is documented as (y_true, y_pred).
dec_acc = accuracy_score(actual, dec_tree_predictions)
print(f"Decistion Tree Accuracy: {dec_acc}" )
# NOTE: re-running this cell appends another entry to BASELINE_ACC.
BASELINE_ACC.append(dec_acc)

Decistion Tree Accuracy: 0.9807692307692307


In [51]:
### 7-fold cross-validation of the decision tree on the training set
# cross_val_score was used here without being imported in any visible cell;
# importing it locally keeps this cell runnable after Restart & Run All.
from sklearn.model_selection import cross_val_score

X_values = train_set_copy.iloc[:,:col_num].values
Y_values = train_set_copy.iloc[:, -1].values
# Only the training split is cross-validated; the test split stays held out.
dec_scores = cross_val_score(dec_tree, X_values, Y_values, cv = 7)
print(f"Scores: {dec_scores}")
print(f"Mean: {dec_scores.mean()}")
print(f"Standard Deviation: {dec_scores.std()}")

Scores: [0.96666667 0.93333333 0.93333333 0.98305085 1.         0.98305085
 0.91525424]
Mean: 0.9592413236481033
Standard Deviation: 0.02958644466574478


In [52]:
# Print the confusion matrix and classification report for the decision tree test predictions.
confusion_matrix_learn(actual, dec_tree_predictions)

[[39  0]
 [ 2 63]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        39
           1       1.00      0.97      0.98        65

    accuracy                           0.98       104
   macro avg       0.98      0.98      0.98       104
weighted avg       0.98      0.98      0.98       104



In [53]:
# Test accuracies collected by the cells above. When each cell has been run
# exactly once, top to bottom, the order is: KNN weighted, Naive Bayes, SVM,
# Decision Tree.
BASELINE_ACC

[0.9326923076923077, 0.8942307692307693, 0.625, 0.9807692307692307]