In [9]:
import pandas as pd
import numpy as np

# Read 'heart-numerical.csv' into a DataFrame.
data_df = pd.read_csv('heart-numerical.csv')

# Display the first three rows as the cell output.
data_df.head(n=3)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,ca,disease
0,63,145,233,150,2.3,0,absence
1,67,160,286,108,1.5,3,presence
2,67,120,229,129,2.6,2,presence


In [10]:
# Split the table into a feature matrix X and a target vector y.

# Features: every column except the 'disease' target, as a NumPy array.
X = data_df.drop(labels='disease', axis=1).values
# Target: the 'disease' column, as a NumPy array.
y = data_df['disease'].values

print('X:', X.shape, X.dtype)  # (303, 6) float64
print('y:', y.shape, y.dtype)  # (303,) object

# Distinct class labels present in y (np.unique returns them sorted).
labels = np.unique(y)
print('Labels:', labels)  # ['absence' 'presence']


X: (303, 6) float64
y: (303,) object
Labels: ['absence' 'presence']


In [11]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 30% of the rows as a test set; random_state=0 fixes the shuffle
# so the same split is produced on every run.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
)

print('Train set:', X_tr.shape, y_tr.shape)  # Train set: (212, 6) (212,)
print('Test set:', X_te.shape, y_te.shape)   # Test set: (91, 6) (91,)

Train set: (212, 6) (212,)
Test set: (91, 6) (91,)


In [18]:
# Count the training samples labelled 'absence': summing the boolean mask
# counts its True entries.
n_absence = (y_tr == 'absence').sum()
print("Number of absence:", n_absence)

Number of absence: 117


In [26]:
# Share of training samples labelled 'absence' (a fraction in [0, 1]).
p_absence = n_absence / y_tr.shape[0]

# Reported as a percentage with one decimal.
print('Probability of absence on train set: {:.1f}%'.format(100 * p_absence))
# Prints: Probability of absence on train set: 55.2%

Probability of absence on train set: 55.2%


In [25]:
# Same proportion, measured on the held-out test set.
p_absence_te = (y_te == 'absence').sum() / y_te.shape[0]

# Reported as a percentage with one decimal.
print('Probability of absence on test set: {:.1f}%'.format(100 * p_absence_te))
# Prints: Probability of absence on test set: 51.6%

Probability of absence on test set: 51.6%


In [27]:
from sklearn.dummy import DummyClassifier

# Baseline model: with strategy='most_frequent' it always predicts the class
# that was most common in the labels it was fitted on, ignoring the features.
dummy = DummyClassifier(
    strategy='most_frequent',
)

In [33]:
# Fit the baseline on the training split, then measure its accuracy on the
# held-out test split (fit() returns the estimator, so the calls can be chained).
accuracy = dummy.fit(X_tr, y_tr).score(X_te, y_te)

print('Accuracy: {:.2f}'.format(accuracy))
# Prints: Accuracy: 0.52

Accuracy: 0.52


In [34]:
# "Most-frequent" predictions
y_pred_absence = dummy.predict(X_te)

# Show the first five predictions next to the first five true labels.
print('Predicted:', y_pred_absence[0:5], '..')  # ['absence' 'absence' 'absence' 'absence' 'absence'] ..
print('True labels:', y_te[0:5], '..')  # ['absence' 'absence' 'presence' 'absence' 'presence'] ..

Predicted: ['absence' 'absence' 'absence' 'absence' 'absence'] ..
True labels: ['absence' 'absence' 'presence' 'absence' 'presence'] ..


In [35]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the "most-frequent" baseline on the test set.
# Rows are the true classes, columns are the predicted classes.
# The label order is passed explicitly so that it is always
# ['absence', 'presence'] and the matrix is always 2x2, instead of being
# inferred from whichever labels happen to occur in y_te / y_pred_absence.
matrix = confusion_matrix(
    y_true=y_te,
    y_pred=y_pred_absence,
    labels=['absence', 'presence'],
)
print(matrix)
# [[47  0]
#  [44  0]]

[[47  0]
 [44  0]]


In [36]:
# Wrap the confusion matrix in a DataFrame with readable axis labels:
# rows are the true classes, columns are the predicted classes.
matrix_df = pd.DataFrame(
    data=matrix,
    index=['true: absence', 'true: presence'],
    columns=['pred: absence', 'pred: presence'],
)
matrix_df

Unnamed: 0,pred: absence,pred: presence
true: absence,47,0
true: presence,44,0


In [37]:
from sklearn.metrics import precision_score

# Precision for the 'presence' class of the "most-frequent" baseline.
# That baseline never predicts 'presence', so there are no predicted positives:
# scikit-learn emits "UndefinedMetricWarning: Precision is ill-defined and being
# set to 0.0 due to no predicted samples." and the call returns 0.0.
precision_score(y_te, y_pred_absence, pos_label='presence')

  'precision', 'predicted', average, warn_for)


0.0

In [38]:
# Baseline that predicts 'presence' for every test sample: an array with the
# same shape and dtype as y_te, filled with the constant label.
y_pred_presence = np.full(y_te.shape, fill_value='presence', dtype=y_te.dtype)

# Precision of this baseline for the 'presence' class.
precision_score(y_te, y_pred_presence, pos_label='presence')  # ~ 0.48

0.4835164835164835

In [39]:
from sklearn.metrics import recall_score

# Recall for the 'presence' class of the "most-frequent" baseline:
# it never predicts 'presence', so no 'presence' sample is recovered.
recall_score(y_te, y_pred_absence, pos_label='presence')  # 0.0

0.0

In [40]:
# Recall for the 'presence' class of the "always predicts presence" baseline:
# every 'presence' sample is predicted as such.
recall_score(y_te, y_pred_presence, pos_label='presence')  # 1.0

1.0

In [41]:
from sklearn.metrics import f1_score

# F1 score (harmonic mean of precision and recall) for the 'presence' class
# of the "always predicts presence" baseline.
f1_score(y_te, y_pred_presence, pos_label='presence')  # ~0.65

0.6518518518518518

In [42]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support for the "always predicts presence"
# baseline, as a text table. 'absence' is never predicted by this baseline, so
# its row shows 0.00 for every metric.
report = classification_report(y_te, y_pred_presence)
print(report)

              precision    recall  f1-score   support

     absence       0.00      0.00      0.00        47
    presence       0.48      1.00      0.65        44

   micro avg       0.48      0.48      0.48        91
   macro avg       0.24      0.50      0.33        91
weighted avg       0.23      0.48      0.32        91



  'precision', 'predicted', average, warn_for)
