In [None]:
from google.colab import drive

# Colab-only step: attach the user's Google Drive to the VM filesystem so the
# dataset CSVs stored there become readable as ordinary files.
drive.mount("/content/drive")

# Folder holding the cleaned datasets. Keep this a plain string WITH the
# trailing slash: the loading cell builds file paths by string concatenation.
base_path = "/content/drive/MyDrive/heartriskx/data/"


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Load the three pre-cleaned datasets from the Drive data folder.
# `base_path` already ends with "/", so the file name is appended directly.
heart2020 = pd.read_csv(f"{base_path}heart_2020_clean.csv")
cardio = pd.read_csv(f"{base_path}cardio_train_clean.csv")
uci = pd.read_csv(f"{base_path}uci_cleveland_clean.csv")

# Quick sanity check: one "<name>: (rows, columns)" line per dataset.
print(
    f"Heart2020: {heart2020.shape}",
    f"Cardio: {cardio.shape}",
    f"UCI: {uci.shape}",
    sep="\n",
)


Heart2020: (319795, 19)
Cardio: (70000, 13)
UCI: (297, 14)


In [None]:
def _classification_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    ``zero_division=0`` keeps the reported value at 0 (what scikit-learn
    returns by default) but suppresses the ``UndefinedMetricWarning`` raised
    when a model never predicts the positive class.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }


def _label_copy_columns(X, y):
    """Return the columns of ``X`` that are a one-to-one re-encoding of ``y``.

    A column qualifies when it has exactly as many distinct values as ``y`` and
    every one of its values co-occurs with a single target value. Such a
    feature is the label itself under another name.
    """
    target = pd.Series(np.asarray(y))
    n_classes = target.nunique()
    copies = []
    # .items() yields one Series per column, even if column labels repeat.
    for name, values in X.items():
        if values.nunique() != n_classes:
            continue
        # Group positionally (plain array) so index alignment cannot interfere.
        if target.groupby(values.to_numpy()).nunique().max() == 1:
            copies.append(name)
    return copies


def run_baseline(X, y, dataset_name):
    """Train and evaluate two baseline classifiers on one dataset.

    The features are one-hot encoded, split 80/20 (stratified on ``y``,
    ``random_state=42``), and used to fit a logistic regression on
    standardized features and a random forest on the unscaled features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; string/categorical columns are one-hot encoded here.
    y : array-like
        Target labels. The metrics use scikit-learn's default binary
        averaging, so a binary label is expected.
    dataset_name : str
        Label used in the printed report and in warnings.

    Returns
    -------
    dict
        ``{"LogReg": {...}, "RandomForest": {...}}`` where each inner dict has
        the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``,
        all measured on the held-out 20% test split.

    Warns
    -----
    UserWarning
        If an encoded feature is a one-to-one copy of the target (label
        leakage). The models are still trained so existing callers keep
        working, but the resulting scores are not meaningful.
    """
    import warnings  # local import: the notebook's import cell does not provide it

    # One-hot encode categorical columns (strings)
    X = pd.get_dummies(X, drop_first=True)

    # Perfect scores are almost always caused by the label sneaking into X;
    # surface that loudly instead of reporting a misleading 1.000.
    leaked = _label_copy_columns(X, y)
    if leaked:
        warnings.warn(
            f"{dataset_name}: feature column(s) {leaked} are a one-to-one encoding "
            "of the target; metrics will be inflated by label leakage.",
            stacklevel=2,
        )

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features; the scaler is fitted on the training split only so no
    # test-set statistics leak into training.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}

    # Logistic Regression (needs scaled inputs to converge reliably)
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train_scaled, y_train)
    results["LogReg"] = _classification_metrics(y_test, logreg.predict(X_test_scaled))

    # Random Forest: tree splits are scale-invariant, so the unscaled features
    # are used. n_jobs=-1 only parallelizes tree building; with a fixed
    # random_state the fitted forest is the same as with a single job.
    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    results["RandomForest"] = _classification_metrics(y_test, rf.predict(X_test))

    print(f"\n📊 Results for {dataset_name}:")
    for model, metrics in results.items():
        print(f"{model}: Acc={metrics['accuracy']:.3f}, Prec={metrics['precision']:.3f}, "
              f"Rec={metrics['recall']:.3f}, F1={metrics['f1']:.3f}")

    return results


In [None]:
# Run the same baseline pipeline on each dataset. X / y are deliberately
# re-bound per dataset; after this cell they refer to the UCI Cleveland data.

# Heart2020
# NOTE(review): the frame has 19 columns, one more than the public 18-column
# dataset, and the recorded baseline scored a perfect 1.000 with both models.
# That points to the raw 'HeartDisease' label still sitting next to the
# derived 'target' column -- confirm against the cleaning notebook.
# errors='ignore' keeps this a no-op if the column is not present.
X = heart2020.drop(columns=['target']).drop(columns=['HeartDisease'], errors='ignore')
y = heart2020['target']
res_heart2020 = run_baseline(X, y, "Heart2020")

# Cardio
X = cardio.drop(columns=['target', 'id'])   # 'id' is a row identifier, not a feature
y = cardio['target']
res_cardio = run_baseline(X, y, "Cardio")

# UCI Cleveland
X = uci.drop(columns=['target'])
y = uci['target']
res_uci = run_baseline(X, y, "UCI Cleveland")



📊 Results for Heart2020:
LogReg: Acc=1.000, Prec=1.000, Rec=1.000, F1=1.000
RandomForest: Acc=1.000, Prec=1.000, Rec=1.000, F1=1.000

📊 Results for Cardio:
LogReg: Acc=0.714, Prec=0.731, Rec=0.675, F1=0.702
RandomForest: Acc=0.712, Prec=0.718, Rec=0.699, F1=0.708

📊 Results for UCI Cleveland:
LogReg: Acc=0.833, Prec=0.846, Rec=0.786, F1=0.815
RandomForest: Acc=0.850, Prec=0.880, Rec=0.786, F1=0.830
