In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:

# Load the heart disease dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
# Column names are supplied explicitly via `names`, and missing entries in the
# file are written as '?'. Declaring that marker at parse time lets pandas read
# them as NaN, so no column falls back to string (object) dtype because of it.
data = pd.read_csv(url, names=names, na_values='?')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
# Preprocess the data
data = (
    data
    .replace('?', np.nan)  # '?' stands for a missing entry; turn it into NaN
    .dropna()              # discard every row that still has a gap
    .astype('float64')     # make every column numeric
)
# Collapse the target to binary: any value above 0 becomes 1, everything else 0
data['target'] = data['target'].gt(0).astype(int)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
# Split the data into training and testing sets
# Every column except the last is a feature; the last column holds the label.
X, y = data.iloc[:, :-1].to_numpy(), data.iloc[:, -1].to_numpy()
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)


In [5]:

# Define the distance metric (Euclidean distance)
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like
        Coordinates of the two points. Anything ``np.asarray`` accepts
        (ndarray, list, tuple, scalar) is allowed; the two inputs must be
        broadcastable against each other.

    Returns
    -------
    numpy.float64
        Square root of the summed squared coordinate differences.
    """
    # Coerce to float arrays first: plain lists do not support `-`, and
    # unsigned-integer arrays would wrap around on subtraction and give a
    # silently wrong distance.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))

In [6]:

# Define the KNN algorithm
def knn(X_train, y_train, X_test, k):
    """Classify one sample by majority vote among its k nearest training samples.

    Parameters
    ----------
    X_train : array-like
        Training samples, one per row.
    y_train : sequence
        Label of each training sample, indexed in step with ``X_train``.
    X_test : array-like
        The single sample to classify (same layout as one row of ``X_train``).
    k : int
        Number of neighbours that vote. Must be at least 1; a value larger
        than the number of training samples lets every training sample vote.

    Returns
    -------
    The label (taken from ``y_train``) that occurs most often among the k
    nearest neighbours. When several labels tie on count, the one whose first
    vote came from the closest neighbour wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or the training set is empty.
    """
    # A negative k would otherwise slice "everything except the last |k|"
    # neighbours and return a plausible-looking but wrong answer.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    train = np.asarray(X_train, dtype=float)
    query = np.asarray(X_test, dtype=float)
    if len(train) == 0:
        raise ValueError("X_train must contain at least one sample")

    # Euclidean distance from the query to every training row in one
    # vectorised pass; flattening per row keeps any per-sample shape working.
    diffs = (train - query).reshape(len(train), -1)
    dists = np.sqrt(np.sum(diffs ** 2, axis=1))

    # Select the k nearest neighbours. Sorting (distance, label) pairs means
    # equal distances are ordered by label, which keeps results deterministic.
    ranked = sorted((dist, y_train[i]) for i, dist in enumerate(dists.tolist()))
    nearest = ranked[:k]

    # Count the number of occurrences of each class among the k nearest neighbours
    votes = {}
    for _, label in nearest:
        votes[label] = votes.get(label, 0) + 1

    # Majority vote; max() returns the first label to reach the top count.
    return max(votes, key=votes.get)


In [7]:
# Make predictions on the testing set using the KNN algorithm
# Each test sample is classified independently against the full training set.
y_pred = [knn(X_train, y_train, sample, k=5) for sample in X_test]


In [8]:

# Calculate the accuracy of the KNN algorithm
# Accuracy = number of predictions equal to the true label / number of test samples.
n_correct = np.sum(y_pred == y_test)
n_total = len(y_test)
accuracy = n_correct / n_total
print(f'Accuracy: {accuracy:.2%}')


Accuracy: 70.00%
