# Scaling / Standardization, Baseline Modeling

### FINAL MODEL — DO NOT RETUNE
### Recall-optimized, calibrated Gradient Boosting

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.append('../src')
import os
from preprocessing import prepare_features_target, split_data, scale_data

In [47]:
# Load the processed cardio dataset from disk.
processed_csv_path = "../data/processed/cardio_processed.csv"
df_final = pd.read_csv(processed_csv_path)

In [48]:


# Build the feature matrix X and target vector y from the loaded frame.
X, y = prepare_features_target(df_final)
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Partition into train / test (split policy lives in preprocessing.split_data).
X_train, X_test, y_train, y_test = split_data(X, y)
print(f"\nTraining set: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Test set: X_test={X_test.shape}, y_test={y_test.shape}")

# The scaler is written to ../models below, so the directory must exist first.
os.makedirs('../models', exist_ok=True)

# Scale both partitions; save_scaler=True asks scale_data to persist the scaler
# at scaler_path.
X_train_scaled, X_test_scaled, scaler = scale_data(
    X_train,
    X_test,
    save_scaler=True,
    scaler_path='../models/scaler.pkl',
)

# Summary of the resulting shapes and class balance.
banner = "=" * 60
print("\n" + banner)
print("DATA PREPROCESSING COMPLETE")
print(banner)
print(f"Training set shape: X_train_scaled={X_train_scaled.shape}, y_train={y_train.shape}")
print(f"Test set shape: X_test_scaled={X_test_scaled.shape}, y_test={y_test.shape}")
print(f"\nTarget distribution in training set:\n{y_train.value_counts(normalize=True)}")
print(f"\nTarget distribution in test set:\n{y_test.value_counts(normalize=True)}")

Features shape: (65429, 11)
Target shape: (65429,)

Training set: X_train=(52343, 11), y_train=(52343,)
Test set: X_test=(13086, 11), y_test=(13086,)

DATA PREPROCESSING COMPLETE
Training set shape: X_train_scaled=(52343, 11), y_train=(52343,)
Test set shape: X_test_scaled=(13086, 11), y_test=(13086,)

Target distribution in training set:
cardio
1    0.507097
0    0.492903
Name: proportion, dtype: float64

Target distribution in test set:
cardio
1    0.507107
0    0.492893
Name: proportion, dtype: float64


# Baseline Modeling

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

# Baseline classifier: class-balanced logistic regression on the scaled features.
log_reg = LogisticRegression(
    random_state=42,
    class_weight='balanced',   # important for healthcare (recall focus)
    max_iter=1000,
)
log_reg.fit(X_train_scaled, y_train)

print("✓ Logistic Regression model trained")

# Hard labels for both partitions, plus positive-class probabilities for ROC-AUC.
y_train_pred = log_reg.predict(X_train_scaled)
y_test_pred = log_reg.predict(X_test_scaled)
y_test_proba = log_reg.predict_proba(X_test_scaled)[:, 1]

# Train-side metrics (a quick overfitting check against the test metrics below).
train_report = (
    ("Accuracy :", accuracy_score(y_train, y_train_pred)),
    ("Recall   :", recall_score(y_train, y_train_pred)),
)
print("\n--- TRAIN METRICS ---")
for label, value in train_report:
    print(label, value)

# Test-side metrics.
test_report = (
    ("Accuracy :", accuracy_score(y_test, y_test_pred)),
    ("Precision:", precision_score(y_test, y_test_pred)),
    ("Recall   :", recall_score(y_test, y_test_pred)),
    ("F1-score :", f1_score(y_test, y_test_pred)),
    ("ROC-AUC  :", roc_auc_score(y_test, y_test_proba)),
)
print("\n--- TEST METRICS ---")
for label, value in test_report:
    print(label, value)

# Confusion matrix and per-class report on the test set.
cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix (Test Set):")
print(cm)

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


✓ Logistic Regression model trained

--- TRAIN METRICS ---
Accuracy : 0.7256175610874425
Recall   : 0.6799532833515428

--- TEST METRICS ---
Accuracy : 0.7174079168577105
Precision: 0.7459812458137978
Recall   : 0.6713381555153707
F1-score : 0.7066941624365483
ROC-AUC  : 0.7808365925115999

Confusion Matrix (Test Set):
[[4933 1517]
 [2181 4455]]

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.69      0.76      0.73      6450
           1       0.75      0.67      0.71      6636

    accuracy                           0.72     13086
   macro avg       0.72      0.72      0.72     13086
weighted avg       0.72      0.72      0.72     13086



In [6]:
# ======================================================================
# STEP 15: BASELINE MODELING (Recall-focused)
# ======================================================================

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, accuracy_score, classification_report, confusion_matrix

# Store results: model name -> {'recall': ..., 'accuracy': ...}, both measured
# on the test set.
results = {}


def _fit_and_score(name, model, X_fit, X_eval):
    """Fit `model`, predict on `X_eval`, and record its test metrics.

    Parameters
    ----------
    name : str
        Key under which the metrics are stored in the module-level `results`.
    model : estimator
        Unfitted classifier exposing `fit` and `predict`.
    X_fit, X_eval : array-like
        Training and evaluation features. Labels come from the module-level
        `y_train` / `y_test`.

    Returns
    -------
    The predictions for `X_eval`, so the caller can keep them for inspection.
    """
    model.fit(X_fit, y_train)
    y_pred = model.predict(X_eval)
    results[name] = {
        'recall': recall_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred)
    }
    return y_pred


# ===============================
# 1. Logistic Regression
# ===============================
# Note: this rebinds `log_reg` to an un-weighted model (no class_weight).
log_reg = LogisticRegression(max_iter=1000)
y_pred_lr = _fit_and_score('LogisticRegression', log_reg, X_train_scaled, X_test_scaled)

# ===============================
# 2. KNN
# ===============================
knn = KNeighborsClassifier()
y_pred_knn = _fit_and_score('KNN', knn, X_train_scaled, X_test_scaled)

# ===============================
# 3. SVM
# ===============================
svm = SVC()
y_pred_svm = _fit_and_score('SVM', svm, X_train_scaled, X_test_scaled)

# ===============================
# 4. Decision Tree (trained on the unscaled features)
# ===============================
dt = DecisionTreeClassifier(random_state=42)
y_pred_dt = _fit_and_score('DecisionTree', dt, X_train, X_test)

# ===============================
# 5. Random Forest (unscaled features)
# ===============================
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
y_pred_rf = _fit_and_score('RandomForest', rf, X_train, X_test)

# ===============================
# 6. Gradient Boosting (unscaled features)
# ===============================
# The comparison table lists a GradientBoosting row, so the baseline has to be
# trained here for the table to be reproducible from a fresh kernel.
gb_baseline = GradientBoostingClassifier(random_state=42)
y_pred_gb = _fit_and_score('GradientBoosting', gb_baseline, X_train, X_test)


In [16]:
def Table(metrics=None, sort_by='recall'):
    """Return the model-comparison table, best model first.

    Parameters
    ----------
    metrics : dict, optional
        Mapping of model name -> {metric name -> value}. Defaults to the
        module-level `results` dict, so a bare `Table()` call behaves as before.
    sort_by : str, default 'recall'
        Metric column used for ordering (descending).

    Returns
    -------
    pandas.DataFrame
        One row per model, one column per metric, sorted by `sort_by`
        from highest to lowest.
    """
    if metrics is None:
        # Fall back to the notebook-wide results collected by the baseline cell.
        metrics = results
    # Outer dict keys become columns, so transpose to get one row per model.
    table = pd.DataFrame(metrics).T
    return table.sort_values(by=sort_by, ascending=False)

In [21]:
# Display the baseline comparison; Table() orders the rows by recall, highest first.
Table()

Unnamed: 0,recall,accuracy
GradientBoosting,0.697709,0.725737
RandomForest,0.687613,0.686841
LogisticRegression,0.680681,0.718707
SVM,0.67616,0.722757
KNN,0.673448,0.677442
DecisionTree,0.608047,0.614091


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values for the C hyperparameter.
param_grid_lr = {'C': [0.01, 0.1, 1, 10]}

# Same class-balanced configuration as the first baseline cell.
lr = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')

# 5-fold grid search that ranks candidates by recall.
grid_lr = GridSearchCV(
    estimator=lr,
    param_grid=param_grid_lr,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_lr.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search for the later evaluation cells.
best_lr = grid_lr.best_estimator_
print("Best Logistic Regression Params:", grid_lr.best_params_)


Best Logistic Regression Params: {'C': 0.01}


In [28]:
from sklearn.ensemble import GradientBoostingClassifier

# 2 x 2 x 2 grid over ensemble size, learning rate and tree depth.
param_grid_gb = {
    'max_depth': [3, 4],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
}

gb = GradientBoostingClassifier(random_state=42)

# 5-fold grid search that ranks candidates by recall; fit on the unscaled features.
grid_gb = GridSearchCV(
    estimator=gb,
    param_grid=param_grid_gb,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_gb.fit(X_train, y_train)

# Keep the best estimator found by the search for the later evaluation cells.
best_gb = grid_gb.best_estimator_
print("Best Gradient Boosting Params:", grid_gb.best_params_)


Best Gradient Boosting Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


In [29]:
from sklearn.metrics import (
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

# Tuned logistic regression was fit on scaled features, so it is scored on them.
lr_preds = best_lr.predict(X_test_scaled)
lr_probs = best_lr.predict_proba(X_test_scaled)[:, 1]

# Tuned gradient boosting was fit on the unscaled features.
gb_preds = best_gb.predict(X_test)
gb_probs = best_gb.predict_proba(X_test)[:, 1]

# Same three test-set diagnostics for each model: recall, ROC-AUC, confusion matrix.
for section_title, model_preds, model_probs in (
    ("\n--- Logistic Regression ---", lr_preds, lr_probs),
    ("\n--- Gradient Boosting (FINAL MODEL) ---", gb_preds, gb_probs),
):
    print(section_title)
    print("Recall:", recall_score(y_test, model_preds))
    print("ROC-AUC:", roc_auc_score(y_test, model_probs))
    print(confusion_matrix(y_test, model_preds))



--- Logistic Regression ---
Recall: 0.6725437010247137
ROC-AUC: 0.780720500348113
[[4923 1527]
 [2173 4463]]

--- Gradient Boosting (FINAL MODEL) ---
Recall: 0.7008740204942736
ROC-AUC: 0.790705734751952
[[4867 1583]
 [1985 4651]]


In [30]:
# Confusion matrix of the tuned gradient boosting model at its default cut-off.
cm = confusion_matrix(y_test, gb_preds)

# For binary labels the matrix rows are [tn, fp] and [fn, tp].
(tn, fp), (fn, tp) = cm

print("False Negatives (Critical):", fn)
print("False Positives:", fp)


False Negatives (Critical): 1985
False Positives: 1583


In [31]:
import pandas as pd

# Pair each importance score of the tuned gradient boosting model with its
# feature name, then rank from most to least important.
feature_importance = pd.Series(data=best_gb.feature_importances_, index=X_train.columns)
feature_importance = feature_importance.sort_values(ascending=False)

feature_importance


ap_hi          0.729311
age            0.124260
cholesterol    0.076493
bmi            0.029059
ap_lo          0.019007
height         0.006038
gluc           0.005583
active         0.004650
smoke          0.002742
alco           0.002065
gender         0.000793
dtype: float64

In [36]:
# Overfitting check for gradient boosting: train-side metrics next to test recall.
train_preds = best_gb.predict(X_train)
gb_fit_summary = (
    ("Train Recall:", recall_score(y_train, train_preds)),
    ("Test Recall :", recall_score(y_test, gb_preds)),
    ("Train Accuracy:", accuracy_score(y_train, train_preds)),
)
for label, value in gb_fit_summary:
    print(label, value)

Train Recall: 0.7102060806992427
Test Recall : 0.7008740204942736
Train Accuracy: 0.7373478784173624


In [37]:
# Overfitting check for logistic regression: train-side metrics next to test recall.
train_preds1 = best_lr.predict(X_train_scaled)
lr_fit_summary = (
    ("Train Recall:", recall_score(y_train, train_preds1)),
    ("Test Recall :", recall_score(y_test, lr_preds)),
    ("Train Accuracy:", accuracy_score(y_train, train_preds1)),
)
for label, value in lr_fit_summary:
    print(label, value)

Train Recall: 0.6801039822175338
Test Recall : 0.6725437010247137
Train Accuracy: 0.7255029325793325


In [38]:
import numpy as np
from sklearn.metrics import recall_score

# Candidate decision cut-offs: 0.10 up to (but excluding) 0.90 in steps of 0.05.
thresholds = np.arange(0.1, 0.9, 0.05)

# Recall of the gradient boosting probabilities at each cut-off.
# NOTE(review): this is scored against y_test, so any threshold picked from this
# sweep has been tuned on the test split.
recalls = [
    recall_score(y_test, (gb_probs >= cutoff).astype(int))
    for cutoff in thresholds
]

list(zip(thresholds, recalls))


[(np.float64(0.1), 0.995177817962628),
 (np.float64(0.15000000000000002), 0.982218203737191),
 (np.float64(0.20000000000000004), 0.9632308619650392),
 (np.float64(0.25000000000000006), 0.9332429174201327),
 (np.float64(0.30000000000000004), 0.8847197106690777),
 (np.float64(0.3500000000000001), 0.8417721518987342),
 (np.float64(0.40000000000000013), 0.78541289933695),
 (np.float64(0.45000000000000007), 0.7438215792646172),
 (np.float64(0.5000000000000001), 0.7008740204942736),
 (np.float64(0.5500000000000002), 0.642706449668475),
 (np.float64(0.6000000000000002), 0.591922845087402),
 (np.float64(0.6500000000000001), 0.5513863773357445),
 (np.float64(0.7000000000000002), 0.5207956600361664),
 (np.float64(0.7500000000000002), 0.488245931283906),
 (np.float64(0.8000000000000002), 0.41787221217600967),
 (np.float64(0.8500000000000002), 0.20524412296564196)]

In [39]:
# Apply a lowered cut-off to the uncalibrated gradient boosting probabilities.
final_threshold = 0.35
above_cutoff = gb_probs >= final_threshold
final_preds = above_cutoff.astype(int)

print("Recall:", recall_score(y_test, final_preds))
print(confusion_matrix(y_test, final_preds))


Recall: 0.8417721518987342
[[3544 2906]
 [1050 5586]]


In [40]:
from sklearn.calibration import CalibratedClassifierCV

# Wrap the tuned gradient boosting model in an isotonic calibrator (cv=5) and
# fit it on the unscaled training features.
cal_gb = CalibratedClassifierCV(best_gb, cv=5, method='isotonic')
cal_gb.fit(X_train, y_train)

# Calibrated probabilities of the positive class (column 1) on the test set.
positive_class_column = 1
cal_probs = cal_gb.predict_proba(X_test)[:, positive_class_column]


In [41]:
# Peek at the calibrated positive-class probabilities for the test set.
cal_probs

array([0.23294827, 0.51889497, 0.29235561, ..., 0.55447044, 0.48213095,
       0.12305351], shape=(13086,))

In [49]:
from sklearn.metrics import precision_recall_curve

# Precision/recall at every distinct cut-off of the calibrated probabilities.
# Note: this rebinds `thresholds`, which previously held the np.arange sweep grid.
precision, recall, thresholds = precision_recall_curve(y_test, cal_probs)



In [50]:
# Sanity check on array lengths: thresholds is one element shorter than
# precision/recall, which is why the next cell slices them with [:-1].
print(len(precision), len(recall), len(thresholds))


4535 4535 4534


In [51]:

# precision and recall each carry one trailing element with no matching
# threshold, so drop it to align the three arrays row by row.
pr_df = pd.DataFrame({
    "threshold": thresholds,
    "precision": precision[:-1],
    "recall": recall[:-1]
})

# Seeded so the previewed rows are identical on every run of the notebook.
pr_df.sample(10, random_state=42)


Unnamed: 0,threshold,precision,recall
3657,0.811639,0.840096,0.368143
1325,0.309442,0.63577,0.86513
1365,0.313698,0.638395,0.86091
4248,0.856127,0.850355,0.180681
2524,0.513196,0.75772,0.672996
1138,0.286868,0.614957,0.899638
998,0.266161,0.597424,0.922694
1986,0.406364,0.696409,0.783303
2506,0.508763,0.751028,0.688216
3886,0.827367,0.847817,0.301386


In [52]:
from sklearn.metrics import recall_score, precision_score, confusion_matrix

# Operating point applied to the calibrated probabilities.
FINAL_THRESHOLD = 0.36
flagged_positive = cal_probs >= FINAL_THRESHOLD
final_preds = flagged_positive.astype(int)

# Test-set performance at the chosen operating point.
print("FINAL Recall   :", recall_score(y_test, final_preds))
print("FINAL Precision:", precision_score(y_test, final_preds))
print(confusion_matrix(y_test, final_preds))


FINAL Recall   : 0.8340867992766727
FINAL Precision: 0.6626361786184605
[[3632 2818]
 [1101 5535]]


In [55]:
import pickle
import os

# NOTE(review): the scaler was saved to '../models/scaler.pkl', but this bundle
# is written to 'models/' relative to the working directory. Confirm which
# location downstream loaders expect.
os.makedirs("models", exist_ok=True)

# Everything a consumer needs to score new rows with the final model.
# NOTE(review): cal_gb was fit on the unscaled X_train, so the bundled scaler
# must not be applied to inputs before calling the model. It is kept in the
# bundle only so existing readers of the "scaler" key keep working.
model_bundle = {
    "model": cal_gb,
    "scaler": scaler,
    # Reuse the constant that was evaluated above, so the saved threshold
    # cannot drift from the reported metrics.
    "threshold": FINAL_THRESHOLD,
    "features": list(X_train.columns)
}

with open("models/ultra_final_model.pkl", "wb") as f:
    pickle.dump(model_bundle, f)


print("Final model and scaler saved.")


Final model and scaler saved.
