# Imports (consolidated at top)

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Load Data

In [2]:
df = pd.read_csv('cardio_train.csv', sep=';')
df = df.drop('id', axis=1)

# Data Cleaning

In [3]:
df = df.drop_duplicates(ignore_index=True)  # Removed duplicated sum check; do it once
df = df[(df['ap_hi'] >= 90) & (df['ap_hi'] <= 200) &
        (df['ap_lo'] >= 60) & (df['ap_lo'] <= 140) &
        (df['weight'] >= 40) & (df['weight'] <= 200) &
        (df['height'] >= 140) & (df['height'] <= 210)]  # Converted height to cm for consistency
df = df[df['ap_hi'] > df['ap_lo']]

# Feature Engineering

In [4]:
df['age'] = df['age'] / 365  # Age in years
df['height'] = df['height'] / 100  # Height in meters
df['bmi'] = df['weight'] / (df['height'] ** 2)
df['pulse_pressure'] = df['ap_hi'] - df['ap_lo']
df['map'] = (df['ap_hi'] + 2 * df['ap_lo']) / 3

# Categories (as before)

In [5]:
def bmi_category(bmi):
    if bmi < 18.5: return 0
    elif 18.5 <= bmi < 25: return 1
    elif 25 <= bmi < 30: return 2
    else: return 3
df['bmi_category'] = df['bmi'].apply(bmi_category)

def bp_category(row):
    if row['ap_hi'] < 120 and row['ap_lo'] < 80: return 0
    elif 120 <= row['ap_hi'] < 130 and row['ap_lo'] < 80: return 1
    elif (130 <= row['ap_hi'] < 140) or (80 <= row['ap_lo'] < 90): return 2
    else: return 3
df['bp_category'] = df.apply(bp_category, axis=1)

df['age_group'] = pd.cut(df['age'], bins=[0, 40, 50, 60, 70, 100], labels=[0,1,2,3,4])

# Column Lists

In [6]:
num_col = ['age', 'height', 'weight', 'bmi', 'pulse_pressure', 'map']
cat_col = ['cholesterol', 'gluc', 'smoke', 'alco', 'active', 'bmi_category', 'bp_category', 'age_group']

# Preprocessing Pipeline

In [7]:
preprocessor = ColumnTransformer(transformers=[
    ('categorical', OneHotEncoder(drop='first', sparse_output=False), cat_col),
    ('numeric', StandardScaler(), num_col)
])

# Data Split

In [8]:
X = df.drop('cardio', axis=1)
y = df['cardio']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# XGBoost Model with Tuning (your best performer)

In [9]:
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False))
])

param_grid_xgb = {
    'classifier__n_estimators': [200, 300, 500],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [4, 6, 8],
    'classifier__min_child_weight': [1, 3, 5],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0]
}

grid_search_xgb = GridSearchCV(xgb_pipeline, param_grid_xgb, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

print("Best XGB Params:", grid_search_xgb.best_params_)
print("Best CV AUC:", grid_search_xgb.best_score_)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best XGB Params: {'classifier__colsample_bytree': 1.0, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 4, 'classifier__min_child_weight': 5, 'classifier__n_estimators': 200, 'classifier__subsample': 0.8}
Best CV AUC: 0.8001619369435081


# Evaluate Best Model

In [12]:
best_xgb = grid_search_xgb.best_estimator_
y_pred = best_xgb.predict(X_test)
y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test AUC:", roc_auc_score(y_test, y_pred_proba))
print(classification_report(y_test, y_pred))

Test Accuracy: 0.7363329913527774
Test AUC: 0.8051696972992293
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      6889
           1       0.75      0.69      0.72      6757

    accuracy                           0.74     13646
   macro avg       0.74      0.74      0.74     13646
weighted avg       0.74      0.74      0.74     13646



# Cross-Validation for Robustness

In [13]:
cv_scores = cross_val_score(best_xgb, X, y, cv=StratifiedKFold(n_splits=5), scoring='roc_auc')
print("CV Mean AUC:", cv_scores.mean())

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


CV Mean AUC: 0.8009405021049549


In [18]:
import joblib

# Save your best trained XGBoost model (the one from GridSearchCV)
joblib.dump(grid_search_xgb.best_estimator_, 'cardio_xgboost_model.joblib')

print("Your model is now saved as 'cardio_xgboost_model.joblib'")
print("You can find this file in your notebook folder!")

Your model is now saved as 'cardio_xgboost_model.joblib'
You can find this file in your notebook folder!


In [20]:
import joblib
import pandas as pd

# Load the model
model = joblib.load('cardio_xgboost_model.joblib')

# New patient data — NOW with all engineered features
new_patient = {
    'age': 55 / 365,                    # Age in years (original was in days, you divided by 365)
    'gender': 1,                        # 1 = female, 2 = male
    'height': 1.65,                     # in meters
    'weight': 70.0,                     # in kg
    'ap_hi': 140,
    'ap_lo': 90,
    'cholesterol': 2,
    'gluc': 1,
    'smoke': 0,
    'alco': 0,
    'active': 1,
    
    # === Add these engineered features manually ===
    'bmi': 70 / (1.65 ** 2),            # BMI = weight / height² ≈ 25.71
    
    'pulse_pressure': 140 - 90,          # ap_hi - ap_lo = 50
    
    'map': (140 + 2*90) / 3,             # Mean Arterial Pressure ≈ 106.67
    
    # Categorical features (use same logic as training)
    'bmi_category': 2,                  # 0:<18.5, 1:18.5-25, 2:25-30, 3:>=30 → 25.71 → 2 (overweight)
    
    'bp_category': 3,                   # From your function: ap_hi=140, ap_lo=90 → Stage 2 hypertension → 3
    
    'age_group': 3                      # pd.cut: 50-60 years → label 3 (check your bins: [0,40,50,60,70,100] → labels [0,1,2,3,4])
}

# Create DataFrame
new_df = pd.DataFrame([new_patient])

# Now predict — this will work!
prediction = model.predict(new_df)
probability = model.predict_proba(new_df)[0][1]

print("Prediction (0=No Disease, 1=Disease):", prediction[0])
print(f"Probability of Cardiovascular Disease: {probability:.2f} ({probability*100:.1f}%)")

Prediction (0=No Disease, 1=Disease): 1
Probability of Cardiovascular Disease: 0.81 (81.4%)
