# K-Nearest Neighbors (K-NN)

## Importing the libraries

In [148]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Importing the dataset

In [149]:
# Replace 'file_id' with the file ID you extracted from the sharing link
file_id = '1Gb3AiXXEKhkaZZtIXxqcV0jIdVckcrLB'

# Construct the direct download link for the CSV file
download_link = f'https://drive.google.com/uc?id={file_id}'

# Read the CSV file into a DataFrame
dataset = pd.read_csv(download_link)


In [150]:
dataset

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


## Encoding the Categorical columns

In [151]:
# Categorical columns
categorical_columns = ['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope', 'HeartDisease']

# Apply label encoding
label_encoder = LabelEncoder()
for col in categorical_columns:
    dataset[col] = label_encoder.fit_transform(dataset[col])

# Convert all columns to integers (except 'Age' and 'Cholesterol' which are already integers)
dataset = dataset.astype(int)
dataset


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0,2,0
1,49,0,2,160,180,0,1,156,0,1,1,1
2,37,1,1,130,283,0,2,98,0,0,2,0
3,48,0,0,138,214,0,1,108,1,1,1,1
4,54,1,2,150,195,0,1,122,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1,1,1
914,68,1,0,144,193,1,1,141,0,3,1,1
915,57,1,0,130,131,0,1,115,1,1,1,1
916,57,0,1,130,236,0,0,174,0,0,1,1


In [152]:
# Now 'dataset' contains the preprocessed data with all columns as integers.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state = 42)

## Feature Scaling

In [154]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test) #avoid data leakage

## Training the K-NN model on the Training set

In [155]:
from math import sqrt

class KNN():
    def __init__(self, k):
        self.k = k

    def fit(self, X_train, y_train):
        self.x_train = X_train
        self.y_train = y_train

    def calculate_euclidean(self, sample1, sample2):
        distance = np.sqrt(np.sum((sample1 - sample2)**2))
        return distance

    def nearest_neighbors(self, test_sample):
        distances = []
        for i in range(len(self.x_train)):
            distance = self.calculate_euclidean(self.x_train[i], test_sample)
            distances.append((self.y_train[i], distance))
        distances.sort(key=lambda x: x[1])
        return distances

    def predict(self, test_set):
        predictions = []
        for test_sample in test_set:
            neighbors = self.nearest_neighbors(test_sample)
            k_nearest_labels = [neighbor[0] for neighbor in neighbors[:self.k]]
            predictions.append((k_nearest_labels, neighbors[:self.k]))
        return predictions


    def majority_vote(self, predictions):
        counts = {}
        for labels, _ in predictions:
            for label in labels:
                if label in counts:
                    counts[label] += 1
                else:
                    counts[label] = 1
        max_label = max(counts, key=counts.get)
        return max_label


In [156]:
# Using our modified KNN model
model = KNN(25)
model.fit(X_train, y_train)

In [157]:
# Training the inbuilt K-NN model on the Training set
knn_classifier = KNeighborsClassifier(n_neighbors=25, p=2)
knn_classifier.fit(X_train, y_train)

## Predicting the Test set results

### a) Using the KNN Implemented Class

In [158]:
predictions = model.predict(X_test)  # our model's predictions
print(predictions)

[([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1], [(0, 1.5300445978952786), (0, 1.556039410480794), (0, 1.8172702540067305), (0, 1.832672614872204), (0, 1.9069246965831501), (0, 1.984154587174136), (0, 2.008354919117871), (0, 2.0223718991339394), (0, 2.02730947992847), (0, 2.0486771653593823), (0, 2.129622411735166), (0, 2.2222479407565627), (0, 2.292917919023391), (0, 2.2976488893203855), (0, 2.3234600502199845), (0, 2.3467991351157824), (1, 2.399903967333162), (0, 2.406937756718335), (0, 2.4153541335554434), (0, 2.439086443775984), (0, 2.4633565933331436), (0, 2.471525365467336), (0, 2.4815617972349786), (0, 2.4983996567736453), (1, 2.5079574748439426)]), ([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1], [(1, 2.3926340482717685), (0, 2.8833560694385767), (1, 2.916322592532682), (0, 3.034061951677914), (1, 3.0584302934929486), (0, 3.1181588972938297), (0, 3.135504979810072), (0, 3.1365836339343476), (0, 3.1397102163406454), (0, 

In [159]:
print("The majority class : ", model.majority_vote(predictions))

The majority class :  1


### b) Using the Inbuild KNN Class from sklearn

In [160]:
# Predicting the Test set results using inbuilt K-NN model
pred_knn_classifier = knn_classifier.predict(X_test)

## Making the Confusion Matrix to compare both models

In [161]:
# Extracting the k-nearest labels from the predictions
y_pred_custom_knn = [pred[0][0] for pred in predictions]

# Printing the confusion matrix and accuracy for our custom KNN model
cm_custom_knn = confusion_matrix(y_test, y_pred_custom_knn)
accuracy_custom_knn = accuracy_score(y_test, y_pred_custom_knn)

print("Confusion Matrix for Custom K-NN Model:")
print(cm_custom_knn)
print("Accuracy for Custom K-NN Model:", accuracy_custom_knn)

Confusion Matrix for Custom K-NN Model:
[[ 99  13]
 [ 31 133]]
Accuracy for Custom K-NN Model: 0.8405797101449275


In [162]:
# Calculating accuracy and confusion matrix for inbuilt K-NN model
accuracy_knn_classifier = accuracy_score(y_test, pred_knn_classifier)
cm_knn_classifier = confusion_matrix(y_test, pred_knn_classifier)

# Printing the accuracy and confusion matrix for inbuilt K-NN model
print("Accuracy for inbuilt K-NN Model:", accuracy_knn_classifier)
print("Confusion Matrix for inbuilt K-NN Model:")
print(cm_knn_classifier)

Accuracy for inbuilt K-NN Model: 0.8405797101449275
Confusion Matrix for inbuilt K-NN Model:
[[100  12]
 [ 32 132]]
