In [2]:
#Stratified k-fold cross validation with the aim to improve overall performance

import pandas as pd

df = pd.read_csv('heart.csv')

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [3]:
from sklearn.preprocessing import LabelEncoder

categorical = df.select_dtypes(include='object').columns.to_list()
print(categorical)
label_encoders = {}
print(df.head())
for col in categorical:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

df.head()

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


In [4]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

#drop all missing values
df = df.dropna()
cols = df.columns.to_list()
cols.remove('HeartDisease')

trans = Pipeline([('scaler', MinMaxScaler())])

preprocessor = ColumnTransformer(
    transformers=[('all', trans, cols)]
)
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score, log_loss, cohen_kappa_score
import numpy as np

def evaluate_classifier(clf_name, classifier_or_pipeline, X_test, y_test, threshold=0.5):
    # Check if predict_proba is implemented
    last_step = classifier_or_pipeline[-1] if isinstance(classifier_or_pipeline, Pipeline) else classifier_or_pipeline
    has_predict_proba = hasattr(last_step, "predict_proba")

    # Generate predictions
    y_pred = classifier_or_pipeline.predict(X_test)

    if has_predict_proba:
        y_pred_proba = classifier_or_pipeline.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = None

    # Calculate metrics
    cm = confusion_matrix(y_test, y_pred)
    metrics = {
        "Name": clf_name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0),
        "ROC AUC Score": roc_auc_score(y_test, y_pred_proba) if has_predict_proba else None,
        "Log Loss": log_loss(y_test, y_pred_proba) if has_predict_proba else None,
        "Cohen's Kappa": cohen_kappa_score(y_test, y_pred) if has_predict_proba else None,
        "CM True Negative": cm[0][0],
        "CM False Positive": cm[0][1],
        "CM False Negative": cm[1][0],
        "CM True Positive": cm[1][1]
    }
    return metrics


In [6]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier, NearestCentroid
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.semi_supervised import LabelPropagation, LabelSpreading

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd


classifiers = {
    # Linear Models
    "LRG": LogisticRegression(),
    "RDC": RidgeClassifier(),
    "SGD": SGDClassifier(),

    # Support Vector Machines
    "SVC": SVC(),
    "NSV": NuSVC(),
    "LSV": LinearSVC(),

    # Nearest Neighbors
    "KNN": KNeighborsClassifier(),
#    "RNC": RadiusNeighborsClassifier(),
    "NCT": NearestCentroid(),

    # Naive Bayes
    "GNB": GaussianNB(),
    "MNB": MultinomialNB(),
    "BNB": BernoulliNB(),
    "CNB": ComplementNB(),

    # Decision Trees
    "DTC": DecisionTreeClassifier(),

    # Ensemble Methods
    "RFC": RandomForestClassifier(),
    "GBC": GradientBoostingClassifier(),
    "ABC": AdaBoostClassifier(),
    "ETC": ExtraTreesClassifier(),
    "BGC": BaggingClassifier(),
#    "VTC": VotingClassifier(estimators=[]), # Requires list of (name, estimator) tuples

    # Neural Network Models
#    "MLP": MLPClassifier(),

    # Other
    "QDA": QuadraticDiscriminantAnalysis(),
#    "LBP": LabelPropagation(),
#    "LBS": LabelSpreading()
}




skf = StratifiedKFold(n_splits=5)
#results_df = pd.DataFrame()
records = []

for name, clf in classifiers.items():
    print(f"Classifier: {name}")

    # Handle special cases or default values
    if name == 'VTC':
        print(f"Skipping {name} (VotingClassifier) - requires specific base estimators.")
        continue

    # Full pipeline with preprocessing and model
    pipeline_dt = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', clf)])


    # Perform cross-validation
    cv_scores = cross_val_score(pipeline_dt, X_train, y_train, cv=skf, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    print(f"Average Accuracy: {mean_accuracy:.4f}")

    # Train the classifier on the entire dataset for evaluation
    pipeline_dt.fit(X_train, y_train)
    metrics = evaluate_classifier(name, pipeline_dt, X_test, y_test)
    metrics['Average Training Accuracy'] = mean_accuracy
    records.append(metrics)
    #results_df = pd.concat([results_df, pd.DataFrame([metrics])], ignore_index=True)
    print("-"*30)

results_df = pd.DataFrame.from_records(records)

results_df = results_df.sort_values(by='Average Training Accuracy', ascending=False)

print(results_df)



Classifier: LRG
Average Accuracy: 0.8419
------------------------------
Classifier: RDC
Average Accuracy: 0.8419
------------------------------
Classifier: SGD
Average Accuracy: 0.8228
------------------------------
Classifier: SVC
Average Accuracy: 0.8623
------------------------------
Classifier: NSV
Average Accuracy: 0.8473
------------------------------
Classifier: LSV
Average Accuracy: 0.8446
------------------------------
Classifier: KNN
Average Accuracy: 0.8419
------------------------------
Classifier: NCT
Average Accuracy: 0.8051
------------------------------
Classifier: GNB
Average Accuracy: 0.8473
------------------------------
Classifier: MNB
Average Accuracy: 0.8092
------------------------------
Classifier: BNB
Average Accuracy: 0.7983
------------------------------
Classifier: CNB




Average Accuracy: 0.7956
------------------------------
Classifier: DTC
Average Accuracy: 0.7970
------------------------------
Classifier: RFC
Average Accuracy: 0.8555
------------------------------
Classifier: GBC
Average Accuracy: 0.8446
------------------------------
Classifier: ABC




Average Accuracy: 0.8405
------------------------------
Classifier: ETC
Average Accuracy: 0.8596
------------------------------
Classifier: BGC
Average Accuracy: 0.8228
------------------------------
Classifier: QDA
Average Accuracy: 0.8323
------------------------------
   Name  Accuracy  Precision    Recall  F1 Score  ROC AUC Score  Log Loss  \
3   SVC  0.875000   0.855856  0.931373  0.892019            NaN       NaN   
16  ETC  0.896739   0.895238  0.921569  0.908213       0.940937  0.479322   
13  RFC  0.907609   0.904762  0.931373  0.917874       0.931432  0.339400   
4   NSV  0.858696   0.833333  0.931373  0.879630            NaN       NaN   
8   GNB  0.891304   0.894231  0.911765  0.902913       0.928025  0.506862   
14  GBC  0.885870   0.900990  0.892157  0.896552       0.938008  0.307374   
5   LSV  0.869565   0.848214  0.931373  0.887850            NaN       NaN   
6   KNN  0.880435   0.863636  0.931373  0.896226       0.899749  2.148065   
1   RDC  0.875000   0.876190  0.901

In [7]:
results_df

Unnamed: 0,Name,Accuracy,Precision,Recall,F1 Score,ROC AUC Score,Log Loss,Cohen's Kappa,CM True Negative,CM False Positive,CM False Negative,CM True Positive,Average Training Accuracy
3,SVC,0.875,0.855856,0.931373,0.892019,,,,66,16,7,95,0.862333
16,ETC,0.896739,0.895238,0.921569,0.908213,0.940937,0.479322,0.790257,71,11,8,94,0.859622
13,RFC,0.907609,0.904762,0.931373,0.917874,0.931432,0.3394,0.812335,72,10,7,95,0.85554
4,NSV,0.858696,0.833333,0.931373,0.87963,,,,63,19,7,95,0.847349
8,GNB,0.891304,0.894231,0.911765,0.902913,0.928025,0.506862,0.779482,71,11,9,93,0.847339
14,GBC,0.88587,0.90099,0.892157,0.896552,0.938008,0.307374,0.769286,72,10,11,91,0.844646
5,LSV,0.869565,0.848214,0.931373,0.88785,,,,65,17,7,95,0.844618
6,KNN,0.880435,0.863636,0.931373,0.896226,0.899749,2.148065,0.755674,67,15,7,95,0.841916
1,RDC,0.875,0.87619,0.901961,0.888889,,,,69,13,10,92,0.841869
0,LRG,0.875,0.869159,0.911765,0.889952,0.897657,0.396991,0.74549,68,14,9,93,0.841869
