K-Nearest Neighbors (KNN) Algorithm in Machine Learning

In [43]:
import math
import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

Explanation:
Pandas and NumPy are used for data manipulation.
Scikit-learn (sklearn) provides various machine learning tools:
train_test_split for dividing the data into training and testing sets.
StandardScaler for normalizing features.
KNeighborsClassifier for building a KNN classifier.
confusion_matrix, f1_score, and accuracy_score for evaluating model performance.
This code sets up the necessary imports for preparing, training, and evaluating a KNN model.

In [44]:
# Path to the diabetes CSV. Set the DIABETES_CSV_PATH environment variable to
# point at your own copy of the file; when it is not set, the original
# author's local path is used as the fallback.
file_path = os.environ.get(
    "DIABETES_CSV_PATH",
    r"C:\Users\Wsiwale\Desktop\Dataset\diabetes.csv",
)
diabetes_dataSet = pd.read_csv(file_path)

In [63]:
# Preview the first five rows of the dataset.
diabetes_dataSet.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [46]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
diabetes_dataSet.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Columns such as Glucose and BloodPressure cannot legitimately be zero: a zero there means the measurement is missing, and leaving it in would distort the outcome. We therefore replace such values with the mean of the respective column.

In [47]:
# Replace zeros with the column mean.
# Zeros in the columns listed below are treated as missing values: each zero
# is first turned into NaN (which keeps it out of the mean calculation) and
# then filled with the mean of the remaining values in that column.
zeros_not_accepted = ["Glucose", "BloodPressure", "SkinThickness", "BMI", "Insulin"]
for column in zeros_not_accepted:
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    diabetes_dataSet[column] = diabetes_dataSet[column].replace(0, np.nan)
    # The mean is truncated to a whole number before it is used as the fill
    # value (e.g. 155 for Insulin).
    mean = int(diabetes_dataSet[column].mean(skipna=True))
    diabetes_dataSet[column] = diabetes_dataSet[column].fillna(mean)

In [48]:
# Inspect the Insulin column after the zero replacement.
print(diabetes_dataSet.loc[:, "Insulin"])

0      155.0
1      155.0
2      155.0
3       94.0
4      168.0
       ...  
763    180.0
764    155.0
765    112.0
766    155.0
767    155.0
Name: Insulin, Length: 768, dtype: float64


In [49]:
# Split the dataset into training and testing sets.
# The first eight columns (positions 0-7) are the features the model trains
# on; the ninth column (position 8) is the answer it has to predict.
X = diabetes_dataSet.iloc[:, :8]
Y = diabetes_dataSet.iloc[:, 8]
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [50]:
# Feature scaling: learn the scaling parameters from the training set only,
# then apply that same transformation to both the training and the test set.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [51]:
# Define the model: a K-NN classifier with k = 11 and Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric="euclidean",
)

In [52]:
# Rule of thumb for choosing k: take the square root of the number of samples
# (the size of the test set is used here). The result is about 12.4; rounding
# down to 12 and subtracting 1 gives the odd value 11 used for n_neighbors.
# An odd k avoids tied votes between the two classes.
math.sqrt(len(Y_test))

12.409673645990857

In [57]:
# Fit the model on the scaled training features and their labels.
classifier.fit(X_train, Y_train)

In [59]:
# Predict the class label for every row of the test set.
Y_pred = classifier.predict(X=X_test)
Y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [60]:
# Evaluate the model with a confusion matrix
# (rows are the actual classes, columns are the predicted classes).
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)


[[94 13]
 [15 32]]


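The rows of the matrix are the actual classes and the columns are the predicted classes, so the diagonal values (94 and 32) are the correct predictions.
94 people who do not have diabetes were correctly predicted as non-diabetic (true negatives).
13 people who do not have diabetes were incorrectly predicted as diabetic (false positives).
32 people who have diabetes were correctly predicted as diabetic (true positives).
15 people who have diabetes were incorrectly predicted as non-diabetic (false negatives).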

In [61]:
# F1 score: balances precision and recall in a single number.
print(f1_score(y_true=Y_test, y_pred=Y_pred))

0.6956521739130435


In [62]:
# Accuracy: the fraction of test samples that were classified correctly.
print(accuracy_score(y_true=Y_test, y_pred=Y_pred))

0.8181818181818182
