Baseline for classification
---

In [None]:
import pandas as pd

# Read 'heart-numerical.csv' (path is relative to the working directory)
data_df = pd.read_csv(filepath_or_buffer='heart-numerical.csv')

# Preview the first five rows
data_df.head(n=5)

In [None]:
import numpy as np

# Features: every column except 'disease'; target: the 'disease' column
X = data_df.drop(columns=['disease']).values
y = data_df['disease'].values

# Report shape and dtype of both arrays
print('X:', X.shape, X.dtype)
print('y:', y.shape, y.dtype)

# Distinct target values (np.unique returns them sorted)
labels = np.unique(y)
print('Labels:', labels)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set; the fixed random_state makes
# the split reproducible across runs.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Shapes of the resulting partitions
print('Train set:', X_tr.shape, y_tr.shape)
print('Test set:', X_te.shape, y_te.shape)

In [None]:
# Number of training samples whose label equals 'absence'
n_absence = (y_tr == 'absence').sum()

print('Total absence:', n_absence)

In [None]:
# Fraction of the training labels that are 'absence'
p_absence = n_absence / y_tr.shape[0]

print('Probability of absence: {:.2f}'.format(p_absence))

In [None]:
# Same proportion, measured on the test labels
# (the mean of a boolean mask is the fraction of True entries)
p_absence_te = np.mean(y_te == 'absence')
print('Probability of absence: {:.2f}'.format(p_absence_te))

In [None]:
# Class proportions in the training labels: per-label counts divided by
# the total number of training samples
pd.Series(y_tr).value_counts().div(len(y_tr))

In [None]:
# Same class proportions, this time letting value_counts() normalize
# the counts itself
(
    pd.Series(data=y_tr)
    .value_counts(normalize=True)
)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline classifier using the 'most_frequent' strategy.
# No feature matrix is supplied (None) for fitting or scoring: only the
# labels are passed in.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X=None, y=y_tr)

# Accuracy of the baseline on the test labels
accuracy = dummy.score(X=None, y=y_te)
print('Accuracy: {:.2f}'.format(accuracy))

In [None]:
# "Most-frequent" predictions
# One prediction per test sample from the fitted baseline
# NOTE(review): the variable name presumes 'absence' is the majority
# training class -- confirm against the distribution computed earlier.
y_pred_absence = dummy.predict(X=X_te)

# Compare the first five predictions with the first five true labels
print('Predicted:', y_pred_absence[0:5], '..')
print('True labels:', y_te[0:5], '..')

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix (rows: true labels, columns: predicted labels).
# `labels` (the sorted unique values of y) is passed explicitly so the
# row/column order is pinned and the matrix keeps one row and column per
# class even if a class is absent from both y_te and the predictions.
matrix = confusion_matrix(y_true=y_te, y_pred=y_pred_absence, labels=labels)
print(matrix)

In [None]:
# Wrap the confusion matrix in a DataFrame so that rows (true labels)
# and columns (predicted labels) carry readable names
matrix_df = pd.DataFrame(
    data=matrix,
    index=['true: absence', 'true: presence'],
    columns=['pred: absence', 'pred: presence'],
)
matrix_df

In [None]:
from sklearn.metrics import precision_score

# Precision for the 'presence' class, using the "most-frequent" predictions
precision_score(y_te, y_pred_absence, pos_label='presence')

In [None]:
# Precision of the "always predicts presence" baseline.
# The prediction array is built with an explicit object dtype rather than
# np.full_like(y_te, ...): full_like inherits y_te's dtype, and a fixed-width
# string dtype narrower than 'presence' would silently truncate the fill value.
y_pred_presence = np.full(np.shape(y_te), 'presence', dtype=object)
precision_score(y_true=y_te, y_pred=y_pred_presence, pos_label='presence')

In [None]:
from sklearn.metrics import recall_score

# Recall for the 'presence' class, using the "most-frequent" predictions
recall_score(y_te, y_pred_absence, pos_label='presence')

In [None]:
# Recall for the 'presence' class when every prediction is 'presence'
recall_score(y_te, y_pred_presence, pos_label='presence')

In [None]:
from sklearn.metrics import f1_score

# F1 score for the 'presence' class when every prediction is 'presence'
f1_score(y_te, y_pred_presence, pos_label='presence')

In [None]:
from sklearn.metrics import classification_report

# Text summary of the classification metrics for the
# "always predicts presence" baseline
report = classification_report(y_te, y_pred_presence)
print(report)