In [34]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, cv, Pool
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [35]:
all_data = pd.read_csv('heart.csv')

In [36]:
o2 = pd.read_csv('o2Saturation.csv')

In [37]:
all_data['satur'] = o2

In [38]:
all_data.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,satur
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,98.6
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,98.6
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,98.6
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,98.1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,97.5


### [Dataset Source](https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset?select=heart.csv)


### Attribute Information:
<ul>
<li><b>Age</b>: Age of the patient</li>
<li><b>Sex</b>: Sex of the patient</li>
<li><b>exang</b>: exercise induced angina (1 = yes; 0 = no)</li>
<li><b>ca</b>: number of major vessels (0-3)</li>
<li><b>cp</b>: Chest Pain type chest pain type
<ul>
<li>Value 1: typical angina</li>
<li>Value 2: atypical angina</li>
<li>Value 3: non-anginal pain</li>
<li>Value 4: asymptomatic</li>
</ul>
</li>
<li><b>trtbps</b>: resting blood pressure (in mm Hg)</li>
<li><b>chol</b>: cholestoral in mg/dl fetched via BMI sensor</li>
<li><b>fbs</b>: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)</li>
<li><b>rest_ecg</b>: resting electrocardiographic results</li>
<ul>
<li>Value 0: normal</li>
<li>Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)</li>
<li>Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria</li>
</ul>
</li>
<li><b>thalach</b>: maximum heart rate achieved</li>
<li><b>target</b>: 0= less chance of heart attack 1= more chance of heart attack</li>
<li><b>satur</b>: oxygen saturation</li>

</ul>

In [39]:
s = ['sex', 'exng', 'cp', 'fbs', 'restecg']
for feature in s:
    all_data[feature] = all_data[feature].astype('object')

ordinal_encoder = OrdinalEncoder(dtype = int)
tmp = all_data.copy()
tmp[s] = ordinal_encoder.fit_transform(tmp[s])

X, y = tmp.drop(columns = 'output'), tmp['output']

In [40]:
skf = StratifiedKFold(shuffle = True, random_state=42)

## Training the models

In [41]:
results = pd.DataFrame(columns = ['f1', 'accuracy'])

### Decision tree

In [42]:
dt_params = {'max_depth': [5, 10, 50, 100, 200],
             'min_samples_split': [2, 10, 20],
             'min_samples_leaf': [1, 5, 10]}

In [43]:
dt_clf = GridSearchCV(tree.DecisionTreeClassifier(),
                      dt_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)
dt_clf.fit(X, y)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


In [44]:
dt_f1_mean = dt_clf.cv_results_['mean_test_f1'][dt_clf.cv_results_['rank_test_f1'].argmin()]
dt_acc_mean = dt_clf.cv_results_['mean_test_accuracy'][dt_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['Decision Tree'] = [dt_f1_mean, dt_acc_mean]

In [45]:
results

Unnamed: 0,f1,accuracy
Decision Tree,0.828439,0.79847


### Random forest

In [46]:
rf_params = {'n_estimators': [100, 200],
             'max_depth': [10, None],
             'min_samples_split': [2, 10],
             'min_samples_leaf': [1, 10]}

In [47]:
rf_clf = GridSearchCV(RandomForestClassifier(),
                      rf_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)
rf_clf.fit(X, y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [48]:
rf_f1_mean = rf_clf.cv_results_['mean_test_f1'][rf_clf.cv_results_['rank_test_f1'].argmin()]
rf_acc_mean = rf_clf.cv_results_['mean_test_accuracy'][rf_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['Random Forest'] = [rf_f1_mean, rf_acc_mean]

### xGboost

In [49]:
xgb_params = {'min_child_weight': [1, 5, 10],
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'colsample_bytree': [0.6, 0.8, 1.0],
              'max_depth': [3, 4, 5]}

In [50]:
xgb = XGBClassifier(learning_rate=0.02,
                    n_estimators=600,
                    objective='binary:logistic',
                    silent=True)

In [51]:
xgb_rs = RandomizedSearchCV(xgb,
                            param_distributions = xgb_params,
                            n_iter = 30,
                            scoring = ['f1', 'accuracy'],
                            refit = 'f1',
                            n_jobs = -1,
                            cv = skf,
                            verbose = 1)
xgb_rs.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Parameters: { "silent" } are not used.



In [52]:
xgb_f1_mean = xgb_rs.cv_results_['mean_test_f1'][xgb_rs.cv_results_['rank_test_f1'].argmin()]
xgb_acc_mean = xgb_rs.cv_results_['mean_test_accuracy'][xgb_rs.cv_results_['rank_test_f1'].argmin()]

results.loc['xGboost'] = [xgb_f1_mean, xgb_acc_mean]

### CatBoost

In [53]:
cat_features = X.select_dtypes(include=['object']).columns.tolist()
cat = CatBoostClassifier(cat_features=cat_features)

In [54]:
def cv_scores(cv_data):
    cv_data.head(10)

    best_acc_value = cv_data['test-Accuracy-mean'].max()
    best_acc_iter = cv_data['test-Accuracy-mean'].values.argmax()

    best_f1_value = cv_data['test-F1-mean'].max()
    best_f1_iter = cv_data['test-F1-mean'].values.argmax()
    return best_f1_value, best_acc_value

In [55]:
cat_params = {'loss_function': 'Logloss',
              'iterations': 100,
              'custom_loss': ['F1', 'Accuracy'],
              'learning_rate': 0.5,}

In [56]:
train_pool = Pool(data=X, label=y, cat_features=cat_features, has_header=True)
cv_data = cv(params = cat_params,
             pool = train_pool,
             verbose = False,
             folds = skf,
             plot=True)

cat_f1_mean, cat_acc_mean = cv_scores(cv_data)

results.loc['CatBoost'] = [cat_f1_mean, cat_acc_mean]

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]

bestTest = 0.2711022776
bestIteration = 9

Training on fold [1/5]

bestTest = 0.3763781613
bestIteration = 4

Training on fold [2/5]

bestTest = 0.5291622349
bestIteration = 6

Training on fold [3/5]

bestTest = 0.3803022072
bestIteration = 10

Training on fold [4/5]

bestTest = 0.3674683109
bestIteration = 16



### kNN

Since KNN cant handle categorical features, we will one-hot-encode every categorical feature and then reduce dimensionality with svd decomposition.

In [57]:
oh_enc = OneHotEncoder(drop = 'first')
X_oh = oh_enc.fit_transform(X)

In [58]:
decomp = TruncatedSVD()
knn = KNeighborsClassifier()

knn_pipe = Pipeline(steps=[('svd', decomp),
                           ('knn', knn)])

knn_params = {"svd__n_components": [2, 5, 10],
              "svd__n_iter": [5],
              "knn__n_neighbors": [1, 2, 3, 4, 5],
              "knn__weights" : ["uniform", "distance"],
              "knn__metric" : ["euclidean"]}

In [59]:
knn_clf = GridSearchCV(knn_pipe,
                      knn_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)

knn_clf.fit(X_oh, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [60]:
knn_f1_mean = knn_clf.cv_results_['mean_test_f1'][knn_clf.cv_results_['rank_test_f1'].argmin()]
knn_acc_mean = knn_clf.cv_results_['mean_test_accuracy'][knn_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['KNN'] = [knn_f1_mean, knn_acc_mean]

### Naive Bayes

In [61]:
gnb = GaussianNB()
gnb_pipe = Pipeline(steps=[('svd', decomp),
                           ('gnb', gnb)])
gnb_params = {"svd__n_components": [2, 5, 10],
              "svd__n_iter": [5]}

In [62]:
gnb_clf = GridSearchCV(gnb_pipe,
                       gnb_params,
                       cv = skf,
                       n_jobs = -1,
                       scoring = ['f1', 'accuracy'],
                       refit = 'f1',
                       verbose = 1)

gnb_clf.fit(X_oh, y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [63]:
gnb_f1_mean = gnb_clf.cv_results_['mean_test_f1'][gnb_clf.cv_results_['rank_test_f1'].argmin()]
gnb_acc_mean = gnb_clf.cv_results_['mean_test_accuracy'][gnb_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['GaussianNB'] = [gnb_f1_mean, gnb_acc_mean]

### Logistic Regression

In [64]:
lr = LogisticRegression(max_iter=1000)
lr_params = [{'penalty': ['l1', 'l2'],
              'C': [0.1, 1, 10],
              'solver': ['liblinear']},

             {'penalty': [None],
              'solver': ['lbfgs']},]

lr_clf = GridSearchCV(lr,
                      lr_params,
                      cv = skf,
                      n_jobs = -1,
                      scoring = ['f1', 'accuracy'],
                      refit = 'f1',
                      verbose = 1)

lr_clf.fit(X_oh, y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [65]:
lr_f1_mean = lr_clf.cv_results_['mean_test_f1'][lr_clf.cv_results_['rank_test_f1'].argmin()]
lr_acc_mean = lr_clf.cv_results_['mean_test_accuracy'][lr_clf.cv_results_['rank_test_f1'].argmin()]

results.loc['LogisticRegression'] = [lr_f1_mean, lr_acc_mean]

## Results

In [66]:
results

Unnamed: 0,f1,accuracy
Decision Tree,0.828439,0.79847
Random Forest,0.864594,0.844699
xGboost,0.861714,0.841585
CatBoost,0.862593,0.844863
KNN,0.819923,0.805246
GaussianNB,0.831905,0.808689
LogisticRegression,0.862314,0.844973


In [67]:
results.to_csv('heart_attack_resuts.csv')

In [68]:
results.round(3)

Unnamed: 0,f1,accuracy
Decision Tree,0.828,0.798
Random Forest,0.865,0.845
xGboost,0.862,0.842
CatBoost,0.863,0.845
KNN,0.82,0.805
GaussianNB,0.832,0.809
LogisticRegression,0.862,0.845
