In [104]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random

import matplotlib.pyplot as plt

Data:
* age: age in years 
* sex: sex (1 = male; 0 = female) 
* cp: chest pain type 
 * -- Value 1: typical angina 
 * -- Value 2: atypical angina 
 * -- Value 3: non-anginal pain 
 * -- Value 4: asymptomatic 
* trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
* chol: serum cholestoral in mg/dl 
* fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false) 
* restecg: resting electrocardiographic results 
 * -- Value 0: normal 
 * -- Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
 * -- Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 
* thalach: maximum heart rate achieved 
* exang: exercise induced angina (1 = yes; 0 = no) 
* oldpeak = ST depression induced by exercise relative to rest 
* slope: the slope of the peak exercise ST segment 
 * -- Value 1: upsloping 
 * -- Value 2: flat 
 * -- Value 3: downsloping 
* ca: number of major vessels (0-3) colored by flourosopy 
* thal: 3 = normal; 6 = fixed defect; 7 = reversable defect 
* num: diagnosis of heart disease (angiographic disease status) 
 * -- Value 0: absence.
 * -- Value 1,2,3,4: presence of heart disease


In [None]:
# Comma-separated values
df = pd.read_csv('cleveland.csv')

# Rename 'num' column to 'disease'
df = df.rename({'num':'disease'}, axis=1)
df['disease'] = df.disease.apply(lambda x: min(x, 1))
display(df.head(5))

fig, (ax1, ax2) = plt.subplots(2, 1)
# make a little extra space between the subplots
fig.subplots_adjust(hspace=0.5)

ax1.hist(df[df.disease == 0].age);
# ax1.set_xlabel('age');
ax1.set_ylabel('number of patients');
ax1.set_xlim(20, 80);
ax1.set_ylim(0, 45);
ax1.set_title('healthy');

ax2.hist(df[df.disease == 1].age, color='orange');
ax2.set_xlabel('age');
ax2.set_ylabel('number of patients');
ax2.set_xlim(20, 80);
ax2.set_ylim(0, 45);
ax2.set_title('has heart disease');


In [None]:
# Use knn on age
nn = NearestNeighbors(n_neighbors=5, metric='euclidean', algorithm='auto')
X = [[x] for x in df.age]

# This builds an index data structure under the hood for query performance
fit = nn.fit(X)

# Find the k nearest neighbors
distances, indices = fit.kneighbors(44)
display(indices[0])

nbrs = df.iloc[indices[0]]
display(nbrs)

healthy = nbrs[nbrs.disease == 0].count().disease
sick = nbrs[nbrs.disease > 0].count().disease
print('healthy: {}\nsick: {}'.format(healthy, sick))


In [122]:
X = df[['age', 'trestbps', 'disease']].values

# This builds an index data structure under the hood for query performance
fit = nn.fit(X[:,:-1])

# Get a random patient to test on
patient = X[random.randint(0,len(X)-1)]
display(patient)

# Find the k nearest neighbors to the patient
distances, indices = fit.kneighbors([patient[:-1]])
nbrs = df.iloc[indices[0]]
display(nbrs)

healthy = nbrs[nbrs.disease == 0].count().disease
sick = nbrs[nbrs.disease == 1].count().disease
print('healthy: {}\nsick: {}'.format(healthy, sick))
predict = 0 if (healthy > sick) else 1
actual = 0 if (patient[-1] == 0) else 1
success = predict == actual
print(success)

array([ 69., 160.,   0.])

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
196,69.0,1.0,1.0,160.0,234.0,1.0,2.0,131.0,0.0,0.1,2.0,1.0,3.0,0
170,70.0,1.0,3.0,160.0,269.0,0.0,0.0,112.0,1.0,2.9,2.0,1.0,7.0,1
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
42,71.0,0.0,2.0,160.0,302.0,0.0,0.0,162.0,0.0,0.4,1.0,2.0,3.0,0
187,66.0,1.0,2.0,160.0,246.0,0.0,0.0,120.0,1.0,0.0,2.0,3.0,6.0,1


healthy: 2
sick: 3
False
