# Load Cleaned Dataset

In [8]:
import pandas as pd

# Read the cleaned cardio dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_cardio.csv")

In [9]:
# Body-mass index: weight divided by the square of (height / 100).
# NOTE(review): presumably weight is in kg and height in cm — confirm against the dataset.
df["bmi"] = df["weight"].div(df["height"].div(100).pow(2))

# Gender: recode 1 -> 0 (Female) and 2 -> 1 (Male).
# Caution: run this cell once per load. Re-applying the mapping to values that
# are already recoded turns 0 into NaN and 1 into 0.
df["gender"] = df["gender"].map({1: 0, 2: 1})

# Lifestyle flags: 0 and 1 pass through unchanged; any other value becomes NaN.
for flag in ("smoke", "alco", "active"):
    df[flag] = df[flag].map({0: 0, 1: 1})

In [10]:
# Predictor columns, in the order they are handed to the scaler and the models.
features = "age gender ap_hi ap_lo cholesterol gluc smoke alco active bmi".split()

# X holds the predictor columns, y the "cardio" target column.
X, y = df[features], df["cardio"]


# Train-Test Split and Scaling

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define Multiple Models

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Candidate classifiers keyed by display name. Insertion order matters: it is
# the order in which the models are trained and reported, and it decides ties
# when the best model is picked.
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=6),
    "KNN": KNeighborsClassifier(n_neighbors=7),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=200),
}

# Train & Compare All Models

In [13]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of every candidate, keyed by the model's display name.
results = {}

for model_name, estimator in models.items():
    # Fit on the scaled training split, then score on the scaled test split.
    estimator.fit(X_train_scaled, y_train)
    test_accuracy = accuracy_score(y_test, estimator.predict(X_test_scaled))
    results[model_name] = test_accuracy
    print(f"{model_name} Accuracy: {test_accuracy:.4f}")

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


Logistic Regression Accuracy: 0.7281
Decision Tree Accuracy: 0.7341
KNN Accuracy: 0.7112
Random Forest Accuracy: 0.6921


# Select Best Model Automatically

In [17]:
# Pick the entry of `results` with the highest accuracy. On a tie, max()
# returns the first maximal key in the dict's iteration order.
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

# NOTE(review): the accuracy printed here was measured on the same test split
# that was used to choose the winner, so it is likely an optimistic estimate.
print("✅ Best Model Selected:", best_model_name)
print("🎯 Accuracy:", results[best_model_name])


✅ Best Model Selected: Decision Tree
🎯 Accuracy: 0.7341383095499451


In [18]:
import json

# Summary of the comparison: every model's accuracy plus the winner's name.
model_report = dict(models=results, best_model=best_model_name)

# Write the summary as JSON; "w" mode overwrites any existing report file.
with open("model_report.json", "w") as report_file:
    json.dump(model_report, report_file)

# Save the Best Model and Its Scaler

In [15]:
import joblib

# Persist the selected estimator together with the fitted scaler: the model was
# trained on scaled features, so both are needed to score new data.
joblib.dump(value=best_model, filename="cardio_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")

['scaler.pkl']