In [None]:
# Mount Google Drive so the project's data folder is reachable from this Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

# Data folder on the mounted drive; the trailing slash matters because later
# cells build file paths by plain string concatenation.
base_path = "/content/drive/MyDrive/heartriskx/data/"

Mounted at /content/drive


In [None]:
# COLAB: load the Cardio file again, this time passing its ';' field separator
# explicitly (pandas would otherwise assume ',').
import pandas as pd

cardio = pd.read_csv(base_path + "cardio_train.csv", sep=";")

# Quick sanity check: overall shape plus the first ten column names.
print("Cardio shape:", cardio.shape)
print("Cardio columns:", cardio.columns[:10].tolist(), " ...")


Cardio shape: (70000, 13)
Cardio columns: ['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke']  ...


In [None]:
# NOTE(review): `heart2020` and `uci` are not loaded in any cell visible here;
# they must already exist in the kernel before this cell runs — confirm the
# loading cells are still part of the notebook.

# Heart2020: encode the Yes/No HeartDisease label as target 1/0.
# The original HeartDisease column is kept in the frame next to `target`.
heart2020['target'] = heart2020['HeartDisease'].eq('Yes').astype(int)

# Cardio: the label column is named 'cardio'; expose it as 'target' instead.
cardio = cardio.rename(columns={'cardio': 'target'})

# UCI: name the 14 columns, drop every row that has a "?" placeholder (or any
# other missing value), then convert all columns to numeric dtypes.
uci.columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]
uci = uci.replace("?", pd.NA).dropna()
uci = uci.assign(**{col: pd.to_numeric(uci[col]) for col in uci.columns})

# UCI: collapse the multiclass label 0..4 to binary (0 = no disease, >0 = disease).
uci['target'] = uci['target'].gt(0).astype(int)

# Show the class balance of each dataset.
for label, frame in (("Heart2020", heart2020), ("Cardio", cardio), ("UCI", uci)):
    print(f"{label} target counts:\n", frame['target'].value_counts(), "\n")


Heart2020 target counts:
 target
0    292422
1     27373
Name: count, dtype: int64 

Cardio target counts:
 target
0    35021
1    34979
Name: count, dtype: int64 

UCI target counts:
 target
0    160
1    137
Name: count, dtype: int64 



In [None]:
import os

# Write each cleaned dataset back to Drive (same folder as the raw data),
# without the pandas index column.
clean_outputs = [
    (heart2020, base_path + "heart_2020_clean.csv"),
    (cardio, base_path + "cardio_train_clean.csv"),
    (uci, base_path + "uci_cleveland_clean.csv"),
]
for frame, out_path in clean_outputs:
    frame.to_csv(out_path, index=False)

# Confirm that all three files are now present on disk.
print("Saved files exist?",
      *(os.path.exists(out_path) for _, out_path in clean_outputs))


Saved files exist? True True True


In [None]:
# Validation cell: reload every cleaned CSV from Drive and check that it has a
# strictly binary `target` column. It re-declares its own imports and
# `base_path`, so it does not rely on variables from the cleaning cells.
import os
import pandas as pd

base_path = "/content/drive/MyDrive/heartriskx/data/"
paths = {
    key: base_path + file_name
    for key, file_name in (
        ("heart2020", "heart_2020_clean.csv"),
        ("cardio", "cardio_train_clean.csv"),
        ("uci", "uci_cleveland_clean.csv"),
    )
}

for name in paths:
    p = paths[name]
    assert os.path.exists(p), f"❌ Missing: {p}"

    df = pd.read_csv(p)
    assert "target" in df.columns, f"❌ {name} missing 'target' column"

    # Report the observed labels first, then fail if anything other than 0/1 is present.
    uniq = set(df["target"].unique())
    print(f"{name} -> shape={df.shape}, target uniques={uniq}")
    assert uniq.issubset({0, 1}), f"❌ {name} target is not binary: {uniq}"

    # The five columns with the most missing values (all zeros means no gaps).
    null_counts = df.isna().sum()
    print(null_counts.sort_values(ascending=False).head(), "\n")

print("✅ All good. Clean datasets with binary targets are ready.")


heart2020 -> shape=(319795, 19), target uniques={np.int64(0), np.int64(1)}
HeartDisease       0
BMI                0
Smoking            0
AlcoholDrinking    0
Stroke             0
dtype: int64 

cardio -> shape=(70000, 13), target uniques={np.int64(0), np.int64(1)}
id        0
age       0
gender    0
height    0
weight    0
dtype: int64 

uci -> shape=(297, 14), target uniques={np.int64(0), np.int64(1)}
age         0
sex         0
cp          0
trestbps    0
chol        0
dtype: int64 

✅ All good. Clean datasets with binary targets are ready.
