In [3]:
pip install pandas scikit-learn xgboost joblib tqdm




In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import joblib
import zipfile

# Upload kaggle.json file before this step
from google.colab import files
uploaded = files.upload()

# ‚úÖ Set up custom Kaggle config path
print("üîë Setting up Kaggle API...")
os.makedirs("/content/.kaggle", exist_ok=True)
with open("/content/.kaggle/kaggle.json", "wb") as f:
    f.write(uploaded["kaggle.json"])
os.environ["KAGGLE_CONFIG_DIR"] = "/content/.kaggle"

# ‚úÖ Download dataset
print("‚¨áÔ∏è Downloading dataset...")
!kaggle datasets download -d sulianova/cardiovascular-disease-dataset

# ‚úÖ Unzip dataset
print("üóÇÔ∏è Unzipping dataset...")
with zipfile.ZipFile("cardiovascular-disease-dataset.zip", "r") as zip_ref:
    zip_ref.extractall("data")

# ‚úÖ Load dataset
print("üìÑ Loading dataset...")
df = pd.read_csv("data/cardio_train.csv", sep=";")

# ‚úÖ Preprocessing
print("üßº Preprocessing...")
df["age"] = (df["age"] / 365).astype(int)
df = df[(df["ap_hi"] > 0) & (df["ap_hi"] < 300)]
df = df[(df["ap_lo"] > 0) & (df["ap_lo"] < 200)]
df = df[df["ap_hi"] >= df["ap_lo"]]
df.drop("id", axis=1, inplace=True)

cat_cols = ["gender", "cholesterol", "gluc", "smoke", "alco", "active"]
num_cols = ["age", "height", "weight", "ap_hi", "ap_lo"]

for col in tqdm(cat_cols, desc="üî§ Encoding categorical"):
    df[col] = LabelEncoder().fit_transform(df[col])

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

X = df.drop("cardio", axis=1)
y = df["cardio"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ‚úÖ Train Random Forest
print("üå≤ Training RandomForest...")
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
print(f"‚úÖ RF Acc: {accuracy_score(y_val, rf.predict(X_val)):.4f} | ROC-AUC: {roc_auc_score(y_val, rf.predict_proba(X_val)[:,1]):.4f}")

# ‚úÖ Train XGBoost
print("‚ö° Training XGBoost...")
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
print(f"‚úÖ XGB Acc: {accuracy_score(y_val, xgb.predict(X_val)):.4f} | ROC-AUC: {roc_auc_score(y_val, xgb.predict_proba(X_val)[:,1]):.4f}")

# ‚úÖ Train Neural Network
print("üß† Training Neural Network...")
nn = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, early_stopping=True, random_state=42)
nn.fit(X_train, y_train)
print(f"‚úÖ NN Acc: {accuracy_score(y_val, nn.predict(X_val)):.4f} | ROC-AUC: {roc_auc_score(y_val, nn.predict_proba(X_val)[:,1]):.4f}")

# Save best model (XGBoost)
joblib.dump(xgb, "best_model.pkl")

# ‚úÖ Prediction function
def predict_from_input(input_dict):
    try:
        input_df = pd.DataFrame([input_dict])
        input_df[num_cols] = scaler.transform(input_df[num_cols])
        for col in cat_cols:
            input_df[col] = LabelEncoder().fit_transform([input_dict[col]])[0]
        input_df = input_df[X.columns.tolist()]
        pred = xgb.predict(input_df)[0]
        return f"üîç Prediction: {'HIGH risk' if pred else 'LOW risk'}"
    except Exception as e:
        return f"‚ùå Prediction error: {e}"

# ‚úÖ Sample prediction
sample = {
    "age": 50, "height": 170, "weight": 70, "ap_hi": 120, "ap_lo": 80,
    "gender": 1, "cholesterol": 1, "gluc": 1, "smoke": 0, "alco": 0, "active": 1
}
print("üì¶ Sample prediction:")
print(predict_from_input(sample))


Saving kaggle.json to kaggle.json
üîë Setting up Kaggle API...
‚¨áÔ∏è Downloading dataset...
Dataset URL: https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset
License(s): unknown
Downloading cardiovascular-disease-dataset.zip to /content
  0% 0.00/742k [00:00<?, ?B/s]
100% 742k/742k [00:00<00:00, 264MB/s]
üóÇÔ∏è Unzipping dataset...
üìÑ Loading dataset...
üßº Preprocessing...


üî§ Encoding categorical: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:00<00:00, 258.54it/s]

üå≤ Training RandomForest...





‚úÖ RF Acc: 0.7279 | ROC-AUC: 0.7981
‚ö° Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ XGB Acc: 0.7296 | ROC-AUC: 0.7955
üß† Training Neural Network...
‚úÖ NN Acc: 0.7290 | ROC-AUC: 0.7955
üì¶ Sample prediction:
üîç Prediction: LOW risk


In [5]:
# STEP 0: Install dependencies
!pip install -q kaggle xgboost tqdm scikit-learn

# STEP 1: Upload your kaggle.json manually or ensure it exists in the runtime
import os, zipfile
from google.colab import files
uploaded = files.upload()

os.makedirs("/root/.kaggle", exist_ok=True)
!mv kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

# STEP 2: Download dataset from Kaggle
print("üîë Setting up Kaggle API...")
!kaggle datasets download -d sulianova/cardiovascular-disease-dataset

# STEP 3: Unzip dataset
print("üóÇÔ∏è Unzipping dataset...")
with zipfile.ZipFile("cardiovascular-disease-dataset.zip", "r") as zip_ref:
    zip_ref.extractall("/content")

# STEP 4: Import and preprocess
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from tqdm import tqdm

print("üìÑ Loading dataset...")
df = pd.read_csv("/content/cardio_train.csv", sep=';')
df.drop(columns=['id'], inplace=True)

print("üßº Preprocessing...")
# Upsample minority class
df_major = df[df.cardio == 0]
df_minor = df[df.cardio == 1]
df_minor_upsampled = resample(df_minor, replace=True, n_samples=len(df_major), random_state=42)
df = pd.concat([df_major, df_minor_upsampled]).sample(frac=1, random_state=42)

# Features and labels
X = df.drop("cardio", axis=1)
y = df["cardio"]

# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Models
print("üå≤ Training RandomForest...")
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print("‚ö° Training XGBoost...")
xgb = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, use_label_encoder=False, eval_metric="logloss")
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)

print("üß† Training Neural Network...")
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', max_iter=300, random_state=42)
mlp.fit(X_train, y_train)
mlp_pred = mlp.predict(X_test)

# Voting Ensemble
print("üß† Combining all (Voting Classifier)...")
ensemble = VotingClassifier(estimators=[
    ('rf', rf), ('xgb', xgb), ('mlp', mlp)
], voting='soft')

ensemble.fit(X_train, y_train)
ensemble_pred = ensemble.predict(X_test)

# Metrics
acc = accuracy_score(y_test, ensemble_pred)
roc = roc_auc_score(y_test, ensemble_pred)
cm = confusion_matrix(y_test, ensemble_pred)

print(f"\n‚úÖ Ensemble Accuracy: {acc:.4f}")
print(f"‚úÖ ROC-AUC Score: {roc:.4f}")
print(f"üßæ Confusion Matrix:\n{cm}")


Saving kaggle.json to kaggle (1).json
üîë Setting up Kaggle API...
Dataset URL: https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset
License(s): unknown
cardiovascular-disease-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
üóÇÔ∏è Unzipping dataset...
üìÑ Loading dataset...
üßº Preprocessing...
üå≤ Training RandomForest...
‚ö° Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


üß† Training Neural Network...




üß† Combining all (Voting Classifier)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



‚úÖ Ensemble Accuracy: 0.7468
‚úÖ ROC-AUC Score: 0.7468
üßæ Confusion Matrix:
[[5497 1509]
 [2038 4965]]


In [6]:
# -------------------- SETUP --------------------
print("üîß Installing & importing dependencies...")
!pip install -q kaggle tqdm xgboost scikit-learn pandas matplotlib seaborn

import os
import zipfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

# -------------------- AUTH & DOWNLOAD --------------------
print("üîë Setting up Kaggle API...")
os.makedirs("/root/.kaggle", exist_ok=True)
!mv kaggle.json /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

print("‚¨áÔ∏è Downloading dataset...")
!kaggle datasets download -d sulianova/cardiovascular-disease-dataset
with zipfile.ZipFile("cardiovascular-disease-dataset.zip", "r") as zip_ref:
    zip_ref.extractall("data")

# -------------------- LOAD & PREPROCESS --------------------
print("üìÑ Loading dataset...")
df = pd.read_csv("data/cardio_train.csv", sep=";")

print("üßº Preprocessing...")
df.drop("id", axis=1, inplace=True)
df["age"] = df["age"] // 365  # Convert age from days to years

# Remove outliers (height and weight beyond quantile thresholds)
df = df[(df["height"] > df["height"].quantile(0.01)) & (df["height"] < df["height"].quantile(0.99))]
df = df[(df["weight"] > df["weight"].quantile(0.01)) & (df["weight"] < df["weight"].quantile(0.99))]

X = df.drop("cardio", axis=1)
y = df["cardio"]

print("üìä Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------- TRAIN MODELS --------------------
print("üå≤ Training RandomForest...")
rf = RandomForestClassifier(n_estimators=150, max_depth=12, random_state=42)
rf.fit(X_train, y_train)

print("‚ö° Training XGBoost...")
xgb_model = xgb.XGBClassifier(n_estimators=150, learning_rate=0.08, random_state=42)
xgb_model.fit(X_train, y_train)

print("üß† Training NeuralNet...")
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# -------------------- ENSEMBLE --------------------
print("üß† Combining all (Voting Classifier)...")
ensemble = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb_model),
    ('nn', mlp)
], voting='soft')

ensemble.fit(X_train, y_train)

# -------------------- EVALUATE --------------------
print("üìà Evaluating ensemble model...")
y_pred = ensemble.predict(X_test)
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, ensemble.predict_proba(X_test)[:, 1])
cm = confusion_matrix(y_test, y_pred)

print(f"\n‚úÖ Ensemble Accuracy: {acc:.4f}")
print(f"‚úÖ ROC-AUC Score: {roc_auc:.4f}")
print("üßæ Confusion Matrix:")
print(cm)

# -------------------- TEST SAMPLE --------------------
def predict_user_input(model, input_dict):
    input_df = pd.DataFrame([input_dict])
    return model.predict(input_df)[0]

sample = {
    "age": 50, "gender": 1, "height": 165, "weight": 72,
    "ap_hi": 120, "ap_lo": 80, "cholesterol": 1,
    "gluc": 1, "smoke": 0, "alco": 0, "active": 1
}

print("\nüì¶ Sample prediction:")
result = predict_user_input(ensemble, sample)
print("üîç Prediction:", "HIGH risk" if result == 1 else "LOW risk")


üîß Installing & importing dependencies...
üîë Setting up Kaggle API...
mv: cannot stat 'kaggle.json': No such file or directory
‚¨áÔ∏è Downloading dataset...
Dataset URL: https://www.kaggle.com/datasets/sulianova/cardiovascular-disease-dataset
License(s): unknown
cardiovascular-disease-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
üìÑ Loading dataset...
üßº Preprocessing...
üìä Splitting dataset...
üå≤ Training RandomForest...
‚ö° Training XGBoost...
üß† Training NeuralNet...
üß† Combining all (Voting Classifier)...
üìà Evaluating ensemble model...

‚úÖ Ensemble Accuracy: 0.7368
‚úÖ ROC-AUC Score: 0.8036
üßæ Confusion Matrix:
[[5234 1500]
 [2025 4633]]

üì¶ Sample prediction:
üîç Prediction: LOW risk


In [7]:
# -------------------- SETUP --------------------
print("üîß Installing dependencies...")
!pip install -q kagglehub tqdm xgboost scikit-learn pandas matplotlib seaborn

import kagglehub
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import xgboost as xgb

# -------------------- DOWNLOAD DATA --------------------
print("‚¨áÔ∏è Downloading dataset from KaggleHub...")
dataset_path = kagglehub.dataset_download("sulianova/cardiovascular-disease-dataset")
print("üìÅ Path to dataset files:", dataset_path)

# -------------------- LOAD & CLEAN --------------------
print("üìÑ Loading dataset...")
df = pd.read_csv(f"{dataset_path}/cardio_train.csv", sep=";")
df.drop("id", axis=1, inplace=True)
df["age"] = df["age"] // 365  # convert days to years

print("üßº Cleaning and preprocessing...")
# Remove outliers
df = df[(df["height"] > df["height"].quantile(0.01)) & (df["height"] < df["height"].quantile(0.99))]
df = df[(df["weight"] > df["weight"].quantile(0.01)) & (df["weight"] < df["weight"].quantile(0.99))]

# Split data
X = df.drop("cardio", axis=1)
y = df["cardio"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------- TRAIN MODELS --------------------
print("üå≤ Training RandomForest...")
rf = RandomForestClassifier(n_estimators=150, max_depth=12, random_state=42)
rf.fit(X_train, y_train)

print("‚ö° Training XGBoost...")
xgb_model = xgb.XGBClassifier(n_estimators=150, learning_rate=0.08, random_state=42)
xgb_model.fit(X_train, y_train)

print("üß† Training Neural Network...")
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
mlp.fit(X_train, y_train)

# -------------------- ENSEMBLE --------------------
print("üß† Combining all models with VotingClassifier...")
ensemble = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb_model),
    ('mlp', mlp)
], voting='soft')

ensemble.fit(X_train, y_train)

# -------------------- EVALUATE --------------------
print("üìä Evaluating ensemble...")
y_pred = ensemble.predict(X_test)
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, ensemble.predict_proba(X_test)[:, 1])
cm = confusion_matrix(y_test, y_pred)

print(f"\n‚úÖ Ensemble Accuracy: {acc:.4f}")
print(f"‚úÖ ROC-AUC Score: {roc_auc:.4f}")
print("üßæ Confusion Matrix:")
print(cm)

# -------------------- PREDICT SAMPLE --------------------
def predict_sample(model, features: dict):
    df_input = pd.DataFrame([features])
    prediction = model.predict(df_input)[0]
    return "HIGH risk" if prediction == 1 else "LOW risk"

sample = {
    "age": 50, "gender": 1, "height": 165, "weight": 72,
    "ap_hi": 120, "ap_lo": 80, "cholesterol": 1,
    "gluc": 1, "smoke": 0, "alco": 0, "active": 1
}

print("\nüîé Sample Prediction:")
print("üîç Risk level:", predict_sample(ensemble, sample))


üîß Installing dependencies...
‚¨áÔ∏è Downloading dataset from KaggleHub...
Downloading from https://www.kaggle.com/api/v1/datasets/download/sulianova/cardiovascular-disease-dataset?dataset_version_number=1...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 742k/742k [00:00<00:00, 78.5MB/s]

Extracting files...
üìÅ Path to dataset files: /root/.cache/kagglehub/datasets/sulianova/cardiovascular-disease-dataset/versions/1
üìÑ Loading dataset...
üßº Cleaning and preprocessing...





üå≤ Training RandomForest...
‚ö° Training XGBoost...
üß† Training Neural Network...
üß† Combining all models with VotingClassifier...
üìä Evaluating ensemble...

‚úÖ Ensemble Accuracy: 0.7368
‚úÖ ROC-AUC Score: 0.8036
üßæ Confusion Matrix:
[[5234 1500]
 [2025 4633]]

üîé Sample Prediction:
üîç Risk level: LOW risk


In [8]:
# üîß Install dependencies
!pip install -q kagglehub tqdm imbalanced-learn xgboost

# üì¶ Imports
import pandas as pd
import numpy as np
import kagglehub
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# üîÑ Load dataset
print("‚¨áÔ∏è Downloading dataset from KaggleHub...")
path = kagglehub.dataset_download("sulianova/cardiovascular-disease-dataset")
print("üìÅ Path to dataset files:", path)

# üìÑ Load CSV
df = pd.read_csv("/kaggle/input/cardiovascular-disease-dataset/cardio_train.csv", sep=";")

# üßº Preprocess
print("üßº Cleaning and preprocessing...")
if 'id' in df.columns:
    df.drop('id', axis=1, inplace=True)

X = df.drop('cardio', axis=1)
y = df['cardio']

# ‚öñÔ∏è Balance data
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# üéØ Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# üß™ Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# üß† Models
print("üå≤ Training RandomForest...")
rf = RandomForestClassifier(n_estimators=300, max_depth=15, random_state=42)

print("‚ö° Training XGBoost...")
xgb = XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.05, subsample=0.8,
                    use_label_encoder=False, eval_metric='logloss', random_state=42)

print("üß† Training Neural Network...")
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=400, alpha=0.0005, solver='adam', random_state=42)

# ü§ù Voting Classifier
print("üß† Combining all models with VotingClassifier...")
ensemble = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb),
    ('mlp', mlp)
], voting='soft')

# üìà Fit model with tqdm
print("üèãÔ∏è Training Ensemble...")
for _ in tqdm(range(1), desc="Training"):
    ensemble.fit(X_train_scaled, y_train)

# üéØ Evaluate
y_pred = ensemble.predict(X_test_scaled)
acc = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, ensemble.predict_proba(X_test_scaled)[:, 1])
conf_matrix = confusion_matrix(y_test, y_pred)

print("\n‚úÖ Ensemble Accuracy:", round(acc, 4))
print("‚úÖ ROC-AUC Score:", round(roc_auc, 4))
print("üßæ Confusion Matrix:\n", conf_matrix)

# üîç Prediction sample
sample = X_test.iloc[0:1]
risk = ensemble.predict(scaler.transform(sample))[0]
print("\nüîé Sample Prediction:")
print("üîç Risk level:", "HIGH risk" if risk else "LOW risk")


‚¨áÔ∏è Downloading dataset from KaggleHub...
üìÅ Path to dataset files: /kaggle/input/cardiovascular-disease-dataset
üßº Cleaning and preprocessing...
üå≤ Training RandomForest...
‚ö° Training XGBoost...
üß† Training Neural Network...
üß† Combining all models with VotingClassifier...
üèãÔ∏è Training Ensemble...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [06:11<00:00, 371.66s/it]



‚úÖ Ensemble Accuracy: 0.7412
‚úÖ ROC-AUC Score: 0.8025
üßæ Confusion Matrix:
 [[5372 1552]
 [2074 5011]]

üîé Sample Prediction:
üîç Risk level: HIGH risk


In [9]:
# ‚úÖ Full Improved Cardiovascular Disease Classification Pipeline
# ‚úÖ Includes: Scaling, Outlier Handling, SMOTE, RF+XGB+MLP, Stacking, EarlyStopping, TQDM

# ---- STEP 1: Install Required Libraries ----
!pip install -q kaggle kagglehub xgboost lightgbm imbalanced-learn tqdm

# ---- STEP 2: Import Libraries ----
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from tqdm.notebook import tqdm
import kagglehub

# ---- STEP 3: Download Dataset ----
print("‚¨áÔ∏è Downloading dataset from KaggleHub...")
path = kagglehub.dataset_download("sulianova/cardiovascular-disease-dataset")
print("üìÅ Path to dataset files:", path)

# ---- STEP 4: Load Data ----
df = pd.read_csv(f"{path}/cardio_train.csv", sep=';')
df.drop(columns=['id'], inplace=True)

# ---- STEP 5: Clean & Engineer Features ----
print("üßº Cleaning and preprocessing...")
# Remove outliers
q_low, q_hi = df.quantile(0.01), df.quantile(0.99)
for col in ['ap_hi', 'ap_lo', 'weight', 'height']:
    df = df[(df[col] >= q_low[col]) & (df[col] <= q_hi[col])]

# Fix invalid blood pressure
df = df[df['ap_hi'] > df['ap_lo']]

# Feature engineering
df['bmi'] = df['weight'] / ((df['height']/100)**2)

# ---- STEP 6: Split Features ----
X = df.drop(columns=['cardio'])
y = df['cardio']

# Normalize features
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# ---- STEP 7: Train-Test Split ----
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, test_size=0.2, random_state=42)

# ---- STEP 8: Balance Data ----
print("üß™ Applying SMOTE to balance classes...")
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# ---- STEP 9: Initialize Models ----
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, early_stopping=True, random_state=42)

# ---- STEP 10: Build Ensemble (Stacking) ----
print("üß† Combining all models with StackingClassifier...")
ensemble = StackingClassifier(
    estimators=[
        ('rf', rf),
        ('xgb', xgb),
        ('mlp', mlp)
    ],
    final_estimator=LogisticRegression(),
    passthrough=True,
    cv=5,
    n_jobs=-1
)

# ---- STEP 11: Train Model ----
print("üèãÔ∏è Training Ensemble...")
for _ in tqdm(range(1), desc="Training"):
    ensemble.fit(X_train_res, y_train_res)

# ---- STEP 12: Evaluation ----
y_pred = ensemble.predict(X_test)
y_prob = ensemble.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_prob)
cm = confusion_matrix(y_test, y_pred)

print(f"\n‚úÖ Ensemble Accuracy: {acc:.4f}")
print(f"‚úÖ ROC-AUC Score: {roc:.4f}")
print("üßæ Confusion Matrix:")
print(cm)

# ---- STEP 13: Sample Prediction ----
sample = X_test[0].reshape(1, -1)
sample_pred = ensemble.predict(sample)[0]
print("\nüîé Sample Prediction:")
print("üîç Risk level:", "HIGH risk" if sample_pred else "LOW risk")


‚¨áÔ∏è Downloading dataset from KaggleHub...
üìÅ Path to dataset files: /kaggle/input/cardiovascular-disease-dataset
üßº Cleaning and preprocessing...
üß™ Applying SMOTE to balance classes...
üß† Combining all models with StackingClassifier...
üèãÔ∏è Training Ensemble...


Training:   0%|          | 0/1 [00:00<?, ?it/s]


‚úÖ Ensemble Accuracy: 0.7308
‚úÖ ROC-AUC Score: 0.7976
üßæ Confusion Matrix:
[[5253 1422]
 [2123 4372]]

üîé Sample Prediction:
üîç Risk level: HIGH risk


In [10]:
!pip install -q imbalanced-learn


In [11]:
# üîß Install dependencies
!pip install -q kagglehub imbalanced-learn xgboost tqdm

# üìö Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import kagglehub

# ‚¨áÔ∏è Download dataset using KaggleHub
print("‚¨áÔ∏è Downloading dataset from KaggleHub...")
path = kagglehub.dataset_download("sulianova/cardiovascular-disease-dataset")
print("üìÅ Path to dataset files:", path)

# üìÑ Load the dataset
print("üìÑ Loading dataset...")
df = pd.read_csv(os.path.join(path, "cardio_train.csv"), sep=';')

# üßº Preprocessing
print("üßº Cleaning and preprocessing...")
df.drop(columns=["id"], inplace=True)
df["age"] = (df["age"] / 365).astype(int)

# Separate features and target
X = df.drop("cardio", axis=1)
y = df["cardio"]

# üß™ Apply SMOTE
print("üß™ Applying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# üîÄ Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# üî® Define models
print("‚öôÔ∏è Defining base models...")
rf = RandomForestClassifier(n_estimators=150, max_depth=20, random_state=42, n_jobs=-1)
xgb = XGBClassifier(n_estimators=150, learning_rate=0.05, max_depth=5, use_label_encoder=False, eval_metric='logloss')
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=300, random_state=42)

# üß† Combine models using Stacking
print("üß† Combining all models with StackingClassifier...")
estimators = [
    ("rf", rf),
    ("xgb", xgb),
    ("mlp", mlp)
]
stacked_model = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier(n_estimators=100, random_state=42))

# üèãÔ∏è Training
print("üèãÔ∏è Training Ensemble...")
for _ in tqdm(range(1), desc="Training"):
    stacked_model.fit(X_train, y_train)

# üìà Evaluation
y_pred = stacked_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"\n‚úÖ Ensemble Accuracy: {acc:.4f}")
print(f"‚úÖ ROC-AUC Score: {roc:.4f}")
print("üßæ Confusion Matrix:")
print(cm)

# üîç Prediction sample
sample_input = {
    "age": 50, "gender": 1, "height": 165, "weight": 70, "ap_hi": 120,
    "ap_lo": 80, "cholesterol": 1, "gluc": 1, "smoke": 0, "alco": 0, "active": 1
}
sample_df = pd.DataFrame([sample_input])
sample_pred = stacked_model.predict(sample_df)[0]
print("\nüîé Sample Prediction:")
print("üîç Risk level:", "HIGH risk" if sample_pred == 1 else "LOW risk")


‚¨áÔ∏è Downloading dataset from KaggleHub...
üìÅ Path to dataset files: /kaggle/input/cardiovascular-disease-dataset
üìÑ Loading dataset...
üßº Cleaning and preprocessing...
üß™ Applying SMOTE to balance classes...
‚öôÔ∏è Defining base models...
üß† Combining all models with StackingClassifier...
üèãÔ∏è Training Ensemble...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [03:35<00:00, 215.76s/it]



‚úÖ Ensemble Accuracy: 0.7176
‚úÖ ROC-AUC Score: 0.7178
üßæ Confusion Matrix:
[[5110 1814]
 [2142 4943]]

üîé Sample Prediction:
üîç Risk level: LOW risk


In [12]:
!pip install kagglehub lightgbm xgboost imbalanced-learn tqdm




In [13]:
# üîß Install dependencies
!pip install -q kagglehub lightgbm xgboost imbalanced-learn tqdm

# üìö Imports
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")

# ‚¨áÔ∏è Download dataset from KaggleHub
print("‚¨áÔ∏è Downloading dataset from KaggleHub...")
path = kagglehub.dataset_download("sulianova/cardiovascular-disease-dataset")
print("üìÅ Path to dataset files:", path)

# üìÑ Load dataset
print("üìÑ Loading dataset...")
df = pd.read_csv(f"{path}/cardio_train.csv", sep=';')
df.drop('id', axis=1, inplace=True)

# üßº Preprocessing
print("üßº Cleaning and preprocessing...")
df = df[(df['ap_hi'] > 0) & (df['ap_lo'] > 0)]
df = df[(df['ap_hi'] < 250) & (df['ap_lo'] < 200)]
df['age'] = (df['age'] / 365).astype(int)  # convert days to years

X = df.drop("cardio", axis=1)
y = df["cardio"]

# üß™ Apply SMOTE to balance classes
print("üß™ Applying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# üìä Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

# üîÑ Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# ‚öôÔ∏è Define base models
print("‚öôÔ∏è Defining base models...")
base_models = [
    ('rf', RandomForestClassifier(n_estimators=150, max_depth=12, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss')),
    ('lgb', LGBMClassifier(n_estimators=120, max_depth=6, learning_rate=0.1))
]

# üß† Define meta learner
meta_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)

# üß† Create stacking ensemble
print("üß† Combining all models with StackingClassifier...")
stack = StackingClassifier(estimators=base_models, final_estimator=meta_model, passthrough=True)

# üèãÔ∏è Train ensemble with tqdm
print("üèãÔ∏è Training Ensemble...")
for _ in tqdm(range(1), desc="Training"):
    stack.fit(X_train, y_train)

# üìà Predict and evaluate
y_pred = stack.predict(X_test)
y_proba = stack.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_proba)
cm = confusion_matrix(y_test, y_pred)

print(f"\n‚úÖ Ensemble Accuracy: {acc:.4f}")
print(f"‚úÖ ROC-AUC Score: {roc:.4f}")
print(f"üßæ Confusion Matrix:\n{cm}")

# üîç Sample prediction
print("\nüîé Sample Prediction:")
sample = X_test[0].reshape(1, -1)
sample_pred = stack.predict(sample)[0]
risk = "HIGH" if sample_pred else "LOW"
print(f"üîç Risk level: {risk} risk")


‚¨áÔ∏è Downloading dataset from KaggleHub...
üìÅ Path to dataset files: /kaggle/input/cardiovascular-disease-dataset
üìÑ Loading dataset...
üßº Cleaning and preprocessing...
üß™ Applying SMOTE to balance classes...
‚öôÔ∏è Defining base models...
üß† Combining all models with StackingClassifier...
üèãÔ∏è Training Ensemble...


Training:   0%|          | 0/1 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 27875, number of negative: 27875
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 545
[LightGBM] [Info] Number of data points in the train set: 55750, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 22300, number of negative: 22300
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004287 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 526
[LightGBM] [Info] Number of data points in the train set: 44600, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000