In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# The last three imports (confusion_matrix, accuracy_score, f1_score) are used to evaluate the model's predictions.

In [13]:
# Read the diabetes CSV into a DataFrame.
data = pd.read_csv("data/diabetes.csv")

# Show the first five rows followed by the (rows, columns) shape.
print(
    data.head(),
    "\n",
    data.shape,
    "\n",
)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI   
0            6      148             72             35        0  33.6  \
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1   
 (768, 9) 



In [14]:
# List the distinct values found in every column; this makes suspicious
# entries (such as a 0 in Glucose or BloodPressure) easy to spot.
for column in data:  # iterating a DataFrame yields its column labels
    print(f"{column}: {pd.unique(data[column])}")

Pregnancies: [ 6  1  8  0  5  3 10  2  4  7  9 11 13 15 17 12 14]
Glucose: [148  85 183  89 137 116  78 115 197 125 110 168 139 189 166 100 118 107
 103 126  99 196 119 143 147  97 145 117 109 158  88  92 122 138 102  90
 111 180 133 106 171 159 146  71 105 101 176 150  73 187  84  44 141 114
  95 129  79   0  62 131 112 113  74  83 136  80 123  81 134 142 144  93
 163 151  96 155  76 160 124 162 132 120 173 170 128 108 154  57 156 153
 188 152 104  87  75 179 130 194 181 135 184 140 177 164  91 165  86 193
 191 161 167  77 182 157 178  61  98 127  82  72 172  94 175 195  68 186
 198 121  67 174 199  56 169 149  65 190]
BloodPressure: [ 72  66  64  40  74  50   0  70  96  92  80  60  84  30  88  90  94  76
  82  75  58  78  68 110  56  62  85  86  48  44  65 108  55 122  54  52
  98 104  95  46 102 100  61  24  38 106 114]
SkinThickness: [35 29  0 23 32 45 19 47 38 30 41 33 26 15 36 11 31 37 42 25 18 24 39 27
 21 34 10 60 13 20 22 28 54 40 51 56 14 17 50 44 12 46 16  7 52 43 48  8
 49 

In [15]:
# Zeros in these measurement columns are treated as missing values and are
# replaced by the mean of the remaining (non-zero) readings of that column.
# NOTE(review): the mean is computed over the whole dataset, i.e. before the
# train/test split performed later, so test rows influence the imputed
# values. Consider imputing after the split with training-set statistics only.
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0.
    data[column] = data[column].replace(0, np.nan)

    # Series.mean() skips NaN by default, so only real readings contribute.
    data[column] = data[column].fillna(data[column].mean())

In [17]:
# Features are the first eight columns; the target is the last column (Outcome).
X, y = data.iloc[:, :8], data.iloc[:, -1]

# Hold out 20% of the rows for testing.
# - random_state fixes the seed of the shuffle so the split is reproducible.
# - stratify=y keeps the proportion of each class in y the same in the
#   training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [18]:
#feature Scaling
# Learn each feature's mean and standard deviation from the training set only,
# then apply that same transformation to both the training and the test set.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [21]:
# Starting value for K used below: the square root of the number of test samples.
np.sqrt(len(y_test))

12.409673645990857

In [24]:
# Start from K = floor(sqrt(number of test samples)) and make sure it is odd,
# so that a majority vote between the two classes cannot end in a tie.
# Adding 1 unconditionally would produce an even K whenever the floor is
# already odd, so only bump it when it is even.
K = int(np.floor(np.sqrt(len(y_test))))
if K % 2 == 0:
    K += 1

# p=2 with the Euclidean metric: plain straight-line distance between rows.
classifier = KNeighborsClassifier(n_neighbors=K, p=2, metric='euclidean')
classifier.fit(X_train, y_train)

In [25]:
# Predict a class label for every row of the test set.
y_pred = classifier.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
y_pred

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
      dtype=int64)

In [26]:
# Rows are the true labels and columns the predicted labels, in sorted label
# order (0, 1), so the layout is [[TN, FP], [FN, TP]] when class 1 is positive.
cm = confusion_matrix(y_test,y_pred)
# Bare expression so the notebook displays the matrix.
cm

array([[83, 17],
       [23, 31]], dtype=int64)

## Analysis of Confusion Matrix

To analyze a confusion matrix, you can consider the following key metrics and interpretations:

scikit-learn's `confusion_matrix` puts the true labels on the rows and the predicted labels on the columns, so for the classes 0 and 1 the matrix reads `[[TN, FP], [FN, TP]]`. Taking `Outcome = 1` as the positive class:

1. True Positives (TP): The number of instances correctly predicted as positive. In this confusion matrix, it is 31.
2. True Negatives (TN): The number of instances correctly predicted as negative. In this confusion matrix, it is 83.
3. False Positives (FP): The number of instances incorrectly predicted as positive. In this confusion matrix, it is 17.
4. False Negatives (FN): The number of instances incorrectly predicted as negative. In this confusion matrix, it is 23.

Now, let's explore some evaluation metrics that can be derived from the confusion matrix:

### 1. Accuracy: 
It measures the overall correctness of the model's predictions.

Accuracy = $\frac{{TP + TN}}{{TP + TN + FP + FN}}$
In this case, accuracy = $\frac{{31 + 83}}{{31 + 83 + 17 + 23}} = \frac{{114}}{{154}} \approx 0.74$ or 74%.

### 2. Precision: 
It assesses the accuracy of the positive predictions made by the model.

Precision = $\frac{{TP}}{{TP + FP}}$
In this case, taking `Outcome = 1` as the positive class (TP = 31, FP = 17), precision = $\frac{{31}}{{31 + 17}} \approx 0.65$ or 65%.

### 3. Recall (Sensitivity or True Positive Rate): 
It measures the proportion of actual positives that were correctly identified by the model.

Recall = $\frac{{TP}}{{TP + FN}}$
In this case, taking `Outcome = 1` as the positive class (TP = 31, FN = 23), recall = $\frac{{31}}{{31 + 23}} \approx 0.57$ or 57%.

### 4. Specificity (True Negative Rate): 
It measures the proportion of actual negatives that were correctly identified by the model.

Specificity = $\frac{{TN}}{{TN + FP}}$
In this case, taking `Outcome = 1` as the positive class (TN = 83, FP = 17), specificity = $\frac{{83}}{{83 + 17}} = 0.83$ or 83%.

### 5. F1 Score: 
It is the harmonic mean of precision and recall, providing a balance between the two metrics.

F1 Score = $\frac{{2 \cdot (Precision \cdot Recall)}}{{Precision + Recall}}$
In this case, with precision ≈ 0.65 and recall ≈ 0.57 for the positive class (`Outcome = 1`), F1 Score = $\frac{{2 \cdot (0.65 \cdot 0.57)}}{{0.65 + 0.57}} \approx 0.61$ or 61%.