<a href="https://colab.research.google.com/github/surajkr214/Programming-For-Data-Science/blob/main/CN_5021_DS_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [104]:
import pandas as pd
import numpy as numpy
import math # Needed for square root calculation later

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # to avoid biases in data
from sklearn.neighbors import KNeighborsClassifier # the KNN algorithm

# Libraries for testing the model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

**Read the Data**

In [105]:
# Read the diabetes CSV into a DataFrame.
# The file must already be uploaded to the Colab session at this path.
dataset = pd.read_csv('/content/diabetes.csv')

# Report how many rows were loaded.
print(dataset.shape[0])

# Preview the first five rows.
print(dataset.head(5))

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


**Pre-Processing the Data (Cleaning)**

In [106]:
# Impute placeholder zeros.
# In these measurement columns a value of 0 is treated as "not recorded"
# rather than as a real reading.
list_no_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for column in list_no_zero:
    # Mark the zeros as missing so they do not drag the average down.
    with_gaps = dataset[column].replace(0, numpy.nan)

    # Mean of the recorded values only, truncated to a whole number by int().
    mean = int(with_gaps.mean(skipna=True))

    # Fill every missing entry with that mean and store the column back.
    dataset[column] = with_gaps.fillna(mean)

**Check the Dataset**

In [107]:
# Confirm that no zeros remain in the BloodPressure column after imputation.
# Printing the Series only shows a truncated preview of the rows, so the
# assertion below is what actually verifies every row.
assert (dataset['BloodPressure'] != 0).all(), "BloodPressure still contains zeros"

# Preview of the cleaned column.
print(dataset['BloodPressure'])

0      72.0
1      66.0
2      64.0
3      66.0
4      40.0
       ... 
763    76.0
764    70.0
765    72.0
766    60.0
767    70.0
Name: BloodPressure, Length: 768, dtype: float64


**Splitting the Dataset**

In [108]:
# Separate the predictors from the label, then carve out a test set.

# Columns 0-7 are the eight predictor attributes (Pregnancies through Age).
# The final column is excluded because it is the value being predicted.
X = dataset.iloc[:, :8]

# Column 8 (Outcome) is the label.
Y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed random_state makes the split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

**Feature Scaling**

In [109]:
# Standardise the features.
# The columns have very different numeric ranges, so they are put on a common
# scale before being used by the model.

scaler_X = StandardScaler()

# Learn the scaling statistics from the training rows only...
scaler_X.fit(X_train)

# ...then apply that same scaling to both sets. The test set is never used
# for fitting, so it is scaled exactly as the training data was.
X_train = scaler_X.transform(X_train)
X_test = scaler_X.transform(X_test)

**Determine Value of K (Neighbors)**

In [110]:
# Starting point for K: the square root of the number of test samples.
# NOTE(review): this rule of thumb is more commonly stated in terms of the
# training-set size - confirm that the test-set size is what is intended.
n_test_samples = len(Y_test)
print(math.sqrt(n_test_samples))

# The printed value is about 12.4, i.e. 12 as a whole number.
# 12 is even, so it is lowered by one to an odd value, which helps avoid
# tied votes between the two classes.
# K = 11 is therefore used for the model.

12.409673645990857


**Define and Fit the Model**

In [111]:
# Build the K-nearest-neighbours classifier.
#   n_neighbors=11     -> the K chosen in the previous step
#   metric='euclidean' -> straight-line (standard) distance between points
#   p=2                -> the power that corresponds to Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)

# Train on the scaled training features and their labels.
classifier.fit(X_train, Y_train)

**Predict Results**

In [112]:
# Classify every row of the scaled test set with the fitted model.
Y_pred = classifier.predict(X=X_test)

# Show the predicted labels.
print(Y_pred)

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0
 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0]


**Evaluation (Confusion Matrix)**

In [113]:
# Evaluate the model with a confusion matrix.
# labels=[0, 1] pins the row/column order so the layout described below is
# explicit rather than implied by the sorted label values.
cmatrix = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
print(cmatrix)

# Interpretation (rows = actual class, columns = predicted class,
# with Outcome 0 = not diabetic and Outcome 1 = diabetic):
# [0][0] = True Negative  (correctly predicted not diabetic)
# [1][1] = True Positive  (correctly predicted diabetic)
# [0][1] = False Positive (predicted diabetic, actually not diabetic)
# [1][0] = False Negative (predicted not diabetic, actually diabetic)

[[94 13]
 [15 32]]


**Evaluation (Scores)**

In [114]:
# F1 score: a single figure that is pulled down by both false positives
# and false negatives.
print("F1 Score:")
f1_value = f1_score(y_true=Y_test, y_pred=Y_pred)
print(f1_value)

# Accuracy: the share of test rows whose predicted label matches the true one.
print("Accuracy Score:")
accuracy_value = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(accuracy_value)

F1 Score:
0.6956521739130435
Accuracy Score:
0.8181818181818182
