In [124]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [125]:
# Load the heart disease dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
data = pd.read_csv(url, names=names)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [126]:
data.dtypes

age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca           object
thal         object
target        int64
dtype: object

In [127]:
# Show the number of "?" values for each column
print(data.eq('?').sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64


In [128]:
# Calculate the median of "ca" and "thal" columns
data = data.replace('?', np.nan) # replace '?' with NaN
data = data.astype(float) # convert all columns to float
median_ca = data["ca"].median()
median_thal = data["thal"].median()

print("Median of 'ca' column:", median_ca)
print("Median of 'thal' column:", median_thal)

Median of 'ca' column: 0.0
Median of 'thal' column: 3.0


In [129]:
# Preprocess the data
imputer = SimpleImputer(strategy='median')
data = imputer.fit_transform(data)
data = pd.DataFrame(data, columns=names)
data['target'] = np.where(data['target'] > 0, 1, 0) # convert target values to binary



In [130]:
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [131]:
print(data.eq('?').sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [132]:
data.dtypes

age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
target        int64
dtype: object

In [133]:
# Split the data into training and testing sets
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [134]:
# Show the train and test datasets
print("Training data:")
print(X_train)
print(y_train)
print("Test data:")
print(X_test)
print(y_test)

Training data:
[[58.  1.  4. ...  2.  3.  7.]
 [54.  1.  4. ...  2.  1.  7.]
 [56.  1.  4. ...  2.  1.  3.]
 ...
 [62.  1.  4. ...  2.  2.  7.]
 [54.  1.  4. ...  2.  2.  3.]
 [57.  1.  4. ...  2.  1.  6.]]
[1 1 1 0 1 0 0 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1 0 0 1 1
 0 0 1 0 1 1 0 1 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0
 1 0 0 1 0 1 0 1 1 0 0 1 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1
 0 1 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0 1 1
 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 1 1 0 1
 1 0 1 0 0 1 1 1 1 1 0 1 1 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 1 0 1 1 1 1 0 1 0
 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1]
Test data:
[[4.30e+01 1.00e+00 4.00e+00 1.10e+02 2.11e+02 0.00e+00 0.00e+00 1.61e+02
  0.00e+00 0.00e+00 1.00e+00 0.00e+00 7.00e+00]
 [6.80e+01 1.00e+00 3.00e+00 1.18e+02 2.77e+02 0.00e+00 0.00e+00 1.51e+02
  0.00e+00 1.00e+00 1.00e+00 1.00e+00 7.00e+00]
 [5.90e+01 1.00e+00 4.00e+00 1.38e+02 2.71e+02

In [135]:
# Define the distance metric (Euclidean distance)
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

In [136]:
# Define the KNN algorithm
def knn(X_train, y_train, X_test, k):
    # Calculate distances between X_test and all training samples
    distances = []
    for i in range(len(X_train)):
        dist = euclidean_distance(X_train[i], X_test)
        distances.append((dist, y_train[i]))
    
    # Print out the distance and class label for each training sample
    print("Distances to training samples:")
    for d, label in distances:
        print(f"Distance: {d:.2f}, Label: {label}")
    
    # Select the k nearest neighbors
    k_nearest_neighbors = sorted(distances)[:k]
    
    # Count the number of occurrences of each class among the k nearest neighbors
    class_counts = {}
    for _, label in k_nearest_neighbors:
        class_counts[label] = class_counts.get(label, 0) + 1
    
    # Assign the class label of the test sample based on majority vote
    predicted_label = max(class_counts, key=class_counts.get)
    return predicted_label

In [137]:
# Make predictions on the testing set using the KNN algorithm
y_pred = []
for i in range(len(X_test)):
    y_pred.append(knn(X_train, y_train, X_test[i], k=5))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Distance: 27.42, Label: 0
Distance: 106.66, Label: 1
Distance: 74.14, Label: 1
Distance: 99.33, Label: 1
Distance: 85.31, Label: 0
Distance: 39.04, Label: 1
Distance: 52.98, Label: 1
Distance: 64.47, Label: 1
Distance: 72.85, Label: 1
Distance: 79.18, Label: 0
Distance: 52.55, Label: 1
Distance: 99.19, Label: 1
Distance: 39.06, Label: 0
Distance: 62.23, Label: 0
Distance: 91.30, Label: 0
Distance: 102.35, Label: 0
Distance: 37.84, Label: 0
Distance: 47.56, Label: 0
Distance: 50.47, Label: 1
Distance: 159.16, Label: 0
Distance: 36.39, Label: 0
Distance: 158.04, Label: 1
Distance: 18.91, Label: 1
Distance: 53.45, Label: 0
Distance: 83.11, Label: 0
Distance: 107.89, Label: 0
Distance: 17.20, Label: 0
Distance: 261.56, Label: 0
Distance: 43.33, Label: 1
Distance: 53.61, Label: 0
Distance: 42.42, Label: 0
Distance: 29.05, Label: 1
Distance: 19.29, Label: 0
Distance: 72.46, Label: 0
Distance: 45.33, Label: 0
Distance: 68.80, La

In [138]:
# Calculate the accuracy of the KNN algorithm
accuracy = np.sum(y_pred == y_test) / len(y_test)
print(f'Accuracy: {accuracy:.2%}')

Accuracy: 72.13%


In [139]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

In [140]:
# Define the K value
k = 5

In [141]:
# Perform cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_idx, test_idx in kfold.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    y_pred = []
    for i in range(len(X_test)):
        y_pred.append(knn(X_train, y_train, X_test[i], k))
    cm = confusion_matrix(y_test, y_pred)
    confusion += cm
    score = np.sum(np.diagonal(cm)) / np.sum(cm)
    scores.append(score)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Distance: 121.79, Label: 0
Distance: 36.27, Label: 1
Distance: 103.64, Label: 0
Distance: 84.99, Label: 0
Distance: 61.93, Label: 0
Distance: 29.57, Label: 0
Distance: 35.09, Label: 1
Distance: 157.91, Label: 0
Distance: 79.15, Label: 1
Distance: 103.89, Label: 0
Distance: 83.17, Label: 1
Distance: 82.07, Label: 0
Distance: 110.41, Label: 1
Distance: 88.64, Label: 1
Distance: 86.83, Label: 0
Distance: 143.52, Label: 1
Distance: 70.67, Label: 0
Distance: 42.23, Label: 1
Distance: 95.86, Label: 1
Distance: 75.05, Label: 0
Distance: 70.06, Label: 0
Distance: 34.42, Label: 1
Distance: 123.27, Label: 0
Distance: 82.30, Label: 0
Distance: 89.30, Label: 1
Distance: 78.12, Label: 1
Distance: 69.12, Label: 1
Distance: 119.23, Label: 0
Distance: 45.62, Label: 1
Distance: 70.83, Label: 1
Distance: 63.32, Label: 1
Distance: 117.29, Label: 0
Distance: 80.13, Label: 0
Distance: 72.07, Label: 0
Distance: 65.59, Label: 1
Distance: 64.73,

In [142]:
# Print the results
print(f'Accuracy: {np.mean(scores):.2%} (std={np.std(scores):.2%})')
print(f'Confusion matrix:\n{confusion}')

Accuracy: 65.32% (std=5.38%)
Confusion matrix:
[[118  46]
 [ 59  80]]


In [143]:
# Perform cross-validation to find the best k
kf = KFold(n_splits=5, shuffle=True, random_state=1)
k_values = range(1, 5)

mean_accuracies = []

for k in k_values:
    fold_accuracies = []
    print("k = ",k)
    spl = 1
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        y_pred = []
        for i in range(len(X_test)):
            y_pred.append(knn(X_train, y_train, X_test[i], k))

        fold_accuracy = accuracy_score(y_test, y_pred)
        
        print(f'CV = {spl}, accuracy = {fold_accuracy:.2%}')
        spl+=1
        fold_accuracies.append(fold_accuracy)

    mean_accuracy = np.mean(fold_accuracies)
    mean_accuracies.append(mean_accuracy)
    print(f'k = {k}, mean accuracy = {mean_accuracy:.2%}')

best_k = k_values[np.argmax(mean_accuracies)]
print(f'\nBest k = {best_k}, best mean accuracy = {max(mean_accuracies):.2%}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Distance: 61.93, Label: 0
Distance: 29.57, Label: 0
Distance: 35.09, Label: 1
Distance: 157.91, Label: 0
Distance: 79.15, Label: 1
Distance: 103.89, Label: 0
Distance: 83.17, Label: 1
Distance: 82.07, Label: 0
Distance: 110.41, Label: 1
Distance: 88.64, Label: 1
Distance: 86.83, Label: 0
Distance: 143.52, Label: 1
Distance: 70.67, Label: 0
Distance: 42.23, Label: 1
Distance: 95.86, Label: 1
Distance: 75.05, Label: 0
Distance: 70.06, Label: 0
Distance: 34.42, Label: 1
Distance: 123.27, Label: 0
Distance: 82.30, Label: 0
Distance: 89.30, Label: 1
Distance: 78.12, Label: 1
Distance: 69.12, Label: 1
Distance: 119.23, Label: 0
Distance: 45.62, Label: 1
Distance: 70.83, Label: 1
Distance: 63.32, Label: 1
Distance: 117.29, Label: 0
Distance: 80.13, Label: 0
Distance: 72.07, Label: 0
Distance: 65.59, Label: 1
Distance: 64.73, Label: 0
Distance: 55.79, Label: 0
Distance: 190.94, Label: 0
Distance: 106.01, Label: 0
Distance: 20.97,