In [31]:
import pandas as pd
import numpy as np
ha = pd.read_csv("https://www.dropbox.com/s/aohbr6yb9ifmc8w/heart_attack.csv?dl=1")

In [32]:
cp_dummies = pd.get_dummies(ha["cp"], prefix="cp")
ha = pd.concat([ha.drop(columns=["cp"]), cp_dummies], axis=1)
ha["asymptomatic"] = ha["cp_0"]
ha["typical_angina"] = ha["cp_1"]
ha["atypical_angina"] = ha["cp_2"]
ha["non-anginal_pain"] = ha["cp_3"]
ha = pd.concat([ha.drop(columns=["cp_0","cp_1","cp_2","cp_3"])])
cp_dummies = pd.get_dummies(ha["restecg"], prefix="re")
ha = pd.concat([ha.drop(columns=["restecg"]), cp_dummies], axis=1)
ha["ecg_normal"] = ha["re_0"]
ha["ecg_abnormality"] = ha["re_1"]
ha["ecg_hypertrophy"] = ha["re_2"]
ha = pd.concat([ha.drop(columns=["re_0","re_1","re_2"])])


In [33]:
ha.head()

Unnamed: 0,age,sex,trtbps,chol,thalach,output,asymptomatic,typical_angina,atypical_angina,non-anginal_pain,ecg_normal,ecg_abnormality,ecg_hypertrophy
0,63,1,145,233,150,1,False,False,False,True,True,False,False
1,37,1,130,250,187,1,False,False,True,False,False,True,False
2,56,1,120,236,178,1,False,True,False,False,False,True,False
3,57,0,120,354,163,1,True,False,False,False,False,True,False
4,57,1,140,192,148,1,True,False,False,False,False,True,False


## Part 1

In [34]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, ConfusionMatrixDisplay, confusion_matrix, RocCurveDisplay

X = ha.drop("output", axis=1)
y = ha["output"]

num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()

preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Part 1 Q1: KNN 

In [35]:
knn_pipe = Pipeline([
    ("preprocess", preprocess),
    ("knn", KNeighborsClassifier())
])

knn_grid = {
    "knn__n_neighbors": [3,5,7,9,11],
    "knn__weights": ["uniform", "distance"]
}

knn_cv = GridSearchCV(knn_pipe, knn_grid, cv=5, scoring="roc_auc")
knn_cv.fit(X_train, y_train)

print("Best KNN Params:", knn_cv.best_params_)
print("Best CV ROC AUC:", knn_cv.best_score_)

knn_best = knn_cv.best_estimator_
y_pred_knn = knn_best.predict(X_test)
print(confusion_matrix(y_test, y_pred_knn))


Best KNN Params: {'knn__n_neighbors': 9, 'knn__weights': 'distance'}
Best CV ROC AUC: 0.8665967908902692
[[18  8]
 [13 16]]


Part 1 Q2: Logistic Regression

In [36]:
log_pipe = Pipeline([
    ("preprocess", preprocess),
    ("logreg", LogisticRegression())
])

log_grid = {
    "logreg__C": [0.01, 0.1, 1, 5, 10],
    "logreg__penalty": ["l2"]
}

log_cv = GridSearchCV(log_pipe, log_grid, cv=5, scoring="roc_auc")
log_cv.fit(X_train, y_train)

print("Best Logistic Regression Params:", log_cv.best_params_)
print("Best CV ROC AUC:", log_cv.best_score_)

log_best = log_cv.best_estimator_
y_pred_log = log_best.predict(X_test)
print(confusion_matrix(y_test, y_pred_log))

final_lr = log_cv.best_estimator_.named_steps["logreg"]
print("Logistic Regression Coefficients:")
print(final_lr.coef_)


Best Logistic Regression Params: {'logreg__C': 0.1, 'logreg__penalty': 'l2'}
Best CV ROC AUC: 0.8970807453416147
[[18  8]
 [11 18]]
Logistic Regression Coefficients:
[[-0.23739389 -0.73383244 -0.09770441 -0.12483801  0.79112446  0.52842378 -0.52854617
  -0.27401039  0.27388799 -0.18963823  0.18951583 -0.06514235  0.06501996  0.01316148
  -0.01328388 -0.0472266   0.0471042   0.03382033 -0.03394272]]


Part 1 Q3: Descision Tree

In [37]:
tree_pipe = Pipeline([
    ("preprocess", preprocess),
    ("tree", DecisionTreeClassifier(random_state=42))
])

tree_grid = {
    "tree__max_depth": [3,5,7,10, None],
    "tree__min_samples_split": [2,5,10],
    "tree__min_samples_leaf": [1,3,5]
}

tree_cv = GridSearchCV(tree_pipe, tree_grid, cv=5, scoring="roc_auc")
tree_cv.fit(X_train, y_train)

print("Best Decision Tree Params:", tree_cv.best_params_)
print("Best CV ROC AUC:", tree_cv.best_score_)

tree_best = tree_cv.best_estimator_
y_pred_tree = tree_best.predict(X_test)
print(confusion_matrix(y_test, y_pred_tree))


Best Decision Tree Params: {'tree__max_depth': 3, 'tree__min_samples_leaf': 1, 'tree__min_samples_split': 2}
Best CV ROC AUC: 0.8276695134575569
[[17  9]
 [11 18]]


Part 1 Q4: Interpretation

In [None]:
ohe = tree_best.named_steps["preprocess"].named_transformers_["cat"]
cat_feature_names = ohe.get_feature_names_out(cat_cols)
feature_names = np.concatenate([num_cols, cat_feature_names])

importances = tree_best.named_steps["tree"].feature_importances_
importance_df = pd.DataFrame({"feature": feature_names, "importance": importances})
importance_df.sort_values("importance", ascending=False).head(10)


Unnamed: 0,feature,importance
5,asymptomatic_False,0.568116
4,thalach,0.210348
1,sex,0.100022
2,trtbps,0.060913
0,age,0.060602
3,chol,0.0
6,asymptomatic_True,0.0
7,typical_angina_False,0.0
8,typical_angina_True,0.0
9,atypical_angina_False,0.0


asymptomatic_False,thalach,sex,trtbps, and age were the 5 most important variables.

Part 1 Q5: ROC Curve

## Part 2 Metrics

In [39]:
from sklearn.metrics import make_scorer, recall_score, precision_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

tpr_scorer = make_scorer(recall_score, pos_label=1)

precision_scorer = make_scorer(precision_score, pos_label=1)

tnr_scorer = make_scorer(recall_score, pos_label=0)

knn_tpr = cross_val_score(knn_best, X, y, cv=5, scoring=tpr_scorer).mean()
knn_precision = cross_val_score(knn_best, X, y, cv=5, scoring=precision_scorer).mean()
knn_tnr = cross_val_score(knn_best, X, y, cv=5, scoring=tnr_scorer).mean()

print("KNN")
print("TPR (Sensitivity):", knn_tpr)
print("Precision:", knn_precision)
print("TNR (Specificity):", knn_tnr)
print("\n")


log_tpr = cross_val_score(log_best, X, y, cv=5, scoring=tpr_scorer).mean()
log_precision = cross_val_score(log_best, X, y, cv=5, scoring=precision_scorer).mean()
log_tnr = cross_val_score(log_best, X, y, cv=5, scoring=tnr_scorer).mean()

print("LOGISTIC REGRESSION")
print("TPR (Sensitivity):", log_tpr)
print("Precision:", log_precision)
print("TNR (Specificity):", log_tnr)
print("\n")

tree_tpr = cross_val_score(tree_best, X, y, cv=5, scoring=tpr_scorer).mean()
tree_precision = cross_val_score(tree_best, X, y, cv=5, scoring=precision_scorer).mean()
tree_tnr = cross_val_score(tree_best, X, y, cv=5, scoring=tnr_scorer).mean()

print("DECISION TREE")
print("TPR (Sensitivity):", tree_tpr)
print("Precision:", tree_precision)
print("TNR (Specificity):", tree_tnr)


KNN
TPR (Sensitivity): 0.7673563218390804
Precision: 0.7936904761904762
TNR (Specificity): 0.7630769230769231


LOGISTIC REGRESSION
TPR (Sensitivity): 0.7949425287356322
Precision: 0.7818471351881663
TNR (Specificity): 0.7301538461538462


DECISION TREE
TPR (Sensitivity): 0.7397701149425286
Precision: 0.7957657923175165
TNR (Specificity): 0.7787692307692308


## Part 3 Discussion

1. In this case I would focus on True Positive Rate (Sensitivity / Recall). We mainly want to avoid false negatives—patients who truly have high heart attack risk but are predicted as “no heart attack.” A high sensitivity means the model is catching most of the patients who are actually positive, even if that means we raise more false alarms.

Based on my cross-validated results, the Logistic Regression model has the highest sensitivity, with a TPR of about 0.795, compared to about 0.767 for KNN and 0.740 for the Decision Tree. I would recommend Logistic Regression.

From the cross-validation, I would expect a sensitivity of around 0.79 on new patients, with a ROC AUC of about 0.90 (0.897) giving strong overall discrimination between high-risk and low-risk patients.


2. Here I would focus on Precision (Positive Predictive Value). When we predict that someone is “high risk,” we want that prediction to be correct as often as possible, because each “high-risk” label uses up a hospital bed. High precision means fewer false positives, so we don’t waste limited resources on people who are actually low risk.

Looking at the cross-validated metrics, the Decision Tree model has the highest precision at about 0.796, slightly higher than KNN at about 0.794 and Logistic Regression at about 0.782. I would recommend the Decision Tree model, since it gives the most reliable positive predictions.

I would expect a precision of around 0.80 for future patients using the Decision Tree, meaning that roughly 80% of patients the model marks as high risk would truly be high risk.

3. For this goal, the main priority is interpretability, not just predictive performance. Instead of focusing on sensitivity or precision, I would focus on models where the relationship between predictors and risk is easy to explain, especially the sign and size of coefficients.

I would recommend the Logistic Regression model here. Its coefficients tell us how each variable relates to the odds of a heart attack (positive coefficients increase risk, negative coefficients decrease risk). Combined with the importance results from the Decision Tree, we see that features like asymptomatic chest pain status, maximum heart rate achieved (thalach), sex, resting blood pressure (trtbps), and age are especially important in predicting heart attack risk. 

Even though explanation is the main goal, it is good that the Logistic Regression still performs well, with a cross-validated ROC AUC of about 0.897. This means the model not only helps us understand root causes, but also separates higher-risk and lower-risk patients quite effectively.

4. In this setting, the model is being used as a benchmark to compare against human diagnoses. I would want a model with strong overall discriminative ability and fairly balanced performance, so the main metric I would use is ROC AUC, which summarizes how well the model ranks patients from low to high risk across all possible thresholds. I might also look at overall accuracy or both sensitivity and specificity together, but ROC AUC is the clean summary.

Since Logistic Regression has the highest cross-validated ROC AUC (about 0.897), better than KNN (~0.867) and the Decision Tree (~0.828), it is the best candidate to serve as the “gold standard” reference model for comparing to new doctors. 

I would expect the Logistic Regression model to maintain a ROC AUC of around 0.90 on future patients. That means, in random pairs of patients where one has a heart attack and one does not, the model should correctly rank the higher-risk patient above the lower-risk patient about 90% of the time. This makes it a strong and consistent benchmark for evaluating how well the new doctors’ diagnoses line up with a well-performing algorithm.

## Part 4: Validation

In [40]:
ha_validation = pd.read_csv("https://www.dropbox.com/s/jkwqdiyx6o6oad0/heart_attack_validation.csv?dl=1")
cp_dummies = pd.get_dummies(ha_validation["cp"], prefix="cp")
ha_validation = pd.concat([ha_validation.drop(columns=["cp"]), cp_dummies], axis=1)
ha_validation["asymptomatic"] = ha_validation["cp_0"]
ha_validation["typical_angina"] = ha_validation["cp_1"]
ha_validation["atypical_angina"] = ha_validation["cp_2"]
ha_validation["non-anginal_pain"] = ha_validation["cp_3"]
ha_validation = pd.concat([ha_validation.drop(columns=["cp_0","cp_1","cp_2","cp_3"])])
restecg_dummies = pd.get_dummies(ha_validation["restecg"], prefix="re")
ha_validation = pd.concat([ha_validation.drop(columns=["restecg"]), restecg_dummies], axis=1)
ha_validation["ecg_normal"] = ha_validation["re_0"]
ha_validation["ecg_abnormality"] = ha_validation["re_1"]
ha_validation["ecg_hypertrophy"] = 0
ha_validation = pd.concat([ha_validation.drop(columns=["re_0","re_1"])])

In [41]:
X_val = ha_validation.drop("output", axis=1)
y_val = ha_validation["output"]

y_pred_knn_val = knn_best.predict(X_val)
y_proba_knn_val = knn_best.predict_proba(X_val)[:, 1]

cm_knn_val = confusion_matrix(y_val, y_pred_knn_val)
knn_val_auc = roc_auc_score(y_val, y_proba_knn_val)
knn_val_precision = precision_score(y_val, y_pred_knn_val, pos_label=1)
knn_val_recall = recall_score(y_val, y_pred_knn_val, pos_label=1)

print("KNN - Validation")
print("Confusion Matrix:")
print(cm_knn_val)
print("ROC AUC:", knn_val_auc)
print("Precision:", knn_val_precision)
print("Recall:", knn_val_recall)
print("\n")

y_pred_log_val = log_best.predict(X_val)
y_proba_log_val = log_best.predict_proba(X_val)[:, 1]

cm_log_val = confusion_matrix(y_val, y_pred_log_val)
log_val_auc = roc_auc_score(y_val, y_proba_log_val)
log_val_precision = precision_score(y_val, y_pred_log_val, pos_label=1)
log_val_recall = recall_score(y_val, y_pred_log_val, pos_label=1)

print("Logistic Regression - Validation")
print("Confusion Matrix:")
print(cm_log_val)
print("ROC AUC:", log_val_auc)
print("Precision:", log_val_precision)
print("Recall:", log_val_recall)
print("\n")

y_pred_tree_val = tree_best.predict(X_val)
y_proba_tree_val = tree_best.predict_proba(X_val)[:, 1]

cm_tree_val = confusion_matrix(y_val, y_pred_tree_val)
tree_val_auc = roc_auc_score(y_val, y_proba_tree_val)
tree_val_precision = precision_score(y_val, y_pred_tree_val, pos_label=1)
tree_val_recall = recall_score(y_val, y_pred_tree_val, pos_label=1)

print("Decision Tree - Validation")
print("Confusion Matrix:")
print(cm_tree_val)
print("ROC AUC:", tree_val_auc)
print("Precision:", tree_val_precision)
print("Recall:", tree_val_recall)

KNN - Validation
Confusion Matrix:
[[ 9  2]
 [ 8 11]]
ROC AUC: 0.8421052631578947
Precision: 0.8461538461538461
Recall: 0.5789473684210527


Logistic Regression - Validation
Confusion Matrix:
[[ 8  3]
 [ 5 14]]
ROC AUC: 0.8708133971291865
Precision: 0.8235294117647058
Recall: 0.7368421052631579


Decision Tree - Validation
Confusion Matrix:
[[ 8  3]
 [ 6 13]]
ROC AUC: 0.7583732057416267
Precision: 0.8125
Recall: 0.6842105263157895


Part 4: Comparison

Overall, the cross-validated metrics did a pretty good job of predicting how the models would perform on the separate validation set. For all three models, the ROC AUC on the validation data was a little lower than the cross-validated ROC AUC, but the ordering stayed the same: Logistic Regression was best, then KNN, then the Decision Tree. For example, Logistic Regression went from a cross-validated ROC AUC of about 0.90 to about 0.87 on the validation set, while KNN went from about 0.87 to 0.84, and the Decision Tree dropped from about 0.83 to about 0.76.

Looking at precision and recall, the patterns are similar. Precision on the validation set is slightly higher than the cross-validated values for all three models, while recall is slightly lower. For KNN, recall dropped more noticeably (from about 0.77 in cross-validation to about 0.58 on validation), which suggests KNN was a bit optimistic about how many true positives it could catch. Logistic Regression and the Decision Tree had smaller changes in recall (both within about 0.06 of their cross-validated values) and similar or slightly better precision. Overall, the differences are within a reasonable range given the small sample size of the validation set, so I’d say our cross-validated measures of model success were approximately correct, especially for Logistic Regression.

## Part 5: Cohen’s Kappa

In [42]:
from sklearn.metrics import cohen_kappa_score

y_pred_knn = knn_best.predict(X_test)       
y_true      = y_test                        

kappa_knn = cohen_kappa_score(y_true, y_pred_knn)
print("KNN Cohen’s Kappa:", kappa_knn)

y_pred_log = log_best.predict(X_test)
kappa_log  = cohen_kappa_score(y_true, y_pred_log)
print("Logistic Regression Cohen’s Kappa:", kappa_log)

y_pred_tree = tree_best.predict(X_test)
kappa_tree  = cohen_kappa_score(y_true, y_pred_tree)
print("Decision Tree Cohen’s Kappa:",  kappa_tree)



KNN Cohen’s Kappa: 0.2416283650689428
Logistic Regression Cohen’s Kappa: 0.31114040870138426
Decision Tree Cohen’s Kappa: 0.27344782034346105


Logistic Regression has the highest Cohen’s Kappa, followed by the Decision Tree, then KNN. This matches the idea from earlier parts that Logistic Regression is the strongest overall model. It also shows that all three models are doing better than chance, which makes sense given that this is a noisy medical outcome and the dataset is not huge.

Cohen’s Kappa is especially useful in scenarios where class imbalance or chance agreement can make accuracy look deceptively good. For example, if almost everyone were “no heart attack,” a model that predicts “no heart attack” for everyone would have high accuracy but a Kappa close to 0. In a hospital setting, Kappa is helpful when we care about overall agreement with the true labels (or with a human expert) and want to discount “easy” agreement that comes from the dominant class.

Judging the models by Cohen’s Kappa does not really change my main conclusions from earlier parts. Logistic Regression was already the best model by ROC AUC, and it is still best by Kappa. The only small change is that KNN and the Decision Tree swap order (KNN had slightly better ROC AUC than the Tree, but a lower Kappa), which just reflects that Kappa and ROC AUC emphasize different things. ROC AUC focuses on ranking patients by risk across thresholds, while Kappa focuses on the actual 0/1 classification at the chosen cutoff and adjusts for chance. It makes sense that they give slightly different rankings for KNN vs. the Tree, but the big picture, Logistic Regression, the strongest overall model stays the same.