In [14]:
import os
import json
import platform
from datetime import datetime, timezone

import joblib
import numpy as np
import pandas as pd

import optuna
import xgboost as xgb
import lightgbm as lgb

from sklearn import set_config
from sklearn.datasets import fetch_openml
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import IsolationForest

In [15]:
# standard libs
import os
import json
import platform

# core libs
import numpy as np
import pandas as pd
import optuna

# sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import KFold, train_test_split
from sklearn import set_config

# helpers
from helper_func import (
    rmsle,
    build_preprocessor,
    build_model,
    suggest_params,
    train_with_es_get_best_iter,
    refit_on_full_training,
    predict_artifact,
)


In [16]:
set_config(transform_output="pandas")
RANDOM_STATE = 41
try:
    from google.colab import drive
    drive.mount('/content/drive')
    BASE_PATH = "/content/drive/MyDrive/Ames_ML_Production_Final/"
except ImportError:
    BASE_PATH = "./artifacts"
    
os.makedirs(BASE_PATH, exist_ok=True)

ARTIFACT_MODEL  = os.path.join(BASE_PATH, "production_model_final.joblib")
ARTIFACT_CARD   = os.path.join(BASE_PATH, "model_card.json")
ARTIFACT_REPORT = os.path.join(BASE_PATH, "nestedcv_report.json")

OUTER_SPLITS = 4
INNER_SPLITS = 3
N_TRIALS_INNER = 10
EARLY_STOPPING_ROUNDS = 50
ES_SPLIT = 0.15

In [17]:
data = fetch_openml(name="house_prices", as_frame=True, parser="auto")
X_full = data.data.copy()
y_full = pd.to_numeric(data.target, errors="coerce").astype(float)

if y_full.name in X_full.columns:
    raise ValueError(f"Leakage: target column '{y_full.name}' exists in X.")

num_cols = X_full.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_full.select_dtypes(exclude=np.number).columns.tolist()

In [18]:
# nested cv arch
outer_cv = KFold(n_splits=OUTER_SPLITS, shuffle=True, random_state=RANDOM_STATE)
inner_cv = KFold(n_splits=INNER_SPLITS, shuffle=True, random_state=RANDOM_STATE)

outer_scores = []
outer_best_params = []
outer_best_inner_scores = []

sampler = optuna.samplers.TPESampler(multivariate=False,seed=RANDOM_STATE)
pruner = optuna.pruners.SuccessiveHalvingPruner(min_resource=1,reduction_factor=3,min_early_stopping_rate=0)


In [19]:
outer_scores = []
outer_best_params = []   

In [20]:
for outer_fold, (tr_idx, te_idx) in enumerate(
        outer_cv.split(X_full, y_full), start=1):

    X_outer_tr = X_full.iloc[tr_idx].copy()
    y_outer_tr = y_full.iloc[tr_idx].copy()
    X_outer_te = X_full.iloc[te_idx].copy()
    y_outer_te = y_full.iloc[te_idx].copy()

    def inner_objective(trial):
        params = suggest_params(trial)
        scores = []

        for inner_fold, (itr_idx, iva_idx) in enumerate(
                inner_cv.split(X_outer_tr, y_outer_tr)):

            X_in_tr = X_outer_tr.iloc[itr_idx].copy()
            y_in_tr = y_outer_tr.iloc[itr_idx].copy()
            X_in_va = X_outer_tr.iloc[iva_idx].copy()
            y_in_va = y_outer_tr.iloc[iva_idx].copy()

            artifact_tmp = refit_on_full_training(
                X_train=X_in_tr,
                y_train=y_in_tr,
                params=params,
                num_cols=num_cols,
                cat_cols=cat_cols,
                random_state=RANDOM_STATE,
                es_split=ES_SPLIT,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )

            preds = predict_artifact(artifact_tmp, X_in_va)
            scores.append(rmsle(y_in_va.values, preds))

            trial.report(float(np.mean(scores)), step=inner_fold)
            if trial.should_prune():
                raise optuna.exceptions.TrialPruned()

        return float(np.mean(scores))

    study = optuna.create_study(
        direction="minimize",
        sampler=sampler,
        pruner=pruner
    )
    study.optimize(inner_objective, n_trials=N_TRIALS_INNER)

    best_params = study.best_params
    outer_best_params.append(best_params)

    final_art = refit_on_full_training(
        X_train=X_outer_tr,
        y_train=y_outer_tr,
        params=best_params,
        num_cols=num_cols,
        cat_cols=cat_cols,
        random_state=RANDOM_STATE,
        es_split=ES_SPLIT,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )

    outer_preds = predict_artifact(final_art, X_outer_te)
    outer_score = rmsle(y_outer_te.values, outer_preds)

    outer_scores.append(outer_score)

    print(
        f"[Outer {outer_fold}/{OUTER_SPLITS}] "
        f"OuterTest={outer_score:.5f}"
    )


[I 2026-01-04 14:14:43,066] A new study created in memory with name: no-name-73f324ca-b48b-4a06-85ad-589f7971b1d9


[I 2026-01-04 14:21:54,621] Trial 0 finished with value: 0.35751457792688673 and parameters: {'model_type': 'XGB', 'pca_var': 0.9379861113443726, 'loo_smooth': 1.1738779425790082, 'rare_min_freq': 0.00791059257916328, 'n_estimators': 1800, 'learning_rate': 0.008489330288212587, 'xgb_max_depth': 7, 'xgb_subsample': 0.9669791395483587, 'xgb_colsample': 0.7675120347797212, 'xgb_reg_lambda': 0.021332384366491854, 'xgb_min_child_weight': 1.167361197480349}. Best is trial 0 with value: 0.35751457792688673.
[I 2026-01-04 14:22:19,474] Trial 1 pruned. 
[I 2026-01-04 14:22:47,950] Trial 2 pruned. 
[I 2026-01-04 14:23:15,481] Trial 3 pruned. 
[I 2026-01-04 14:23:47,447] Trial 4 pruned. 
[I 2026-01-04 14:24:13,974] Trial 5 pruned. 
[I 2026-01-04 14:24:43,745] Trial 6 pruned. 
[I 2026-01-04 14:25:15,469] Trial 7 finished with value: 0.36679018719034034 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9578440815999327, 'loo_smooth': 4.0669590260487505, 'rare_min_freq': 0.018164694301604323, 'n_e

[Outer 1/4] OuterTest=0.31597


[I 2026-01-04 14:32:37,141] Trial 0 finished with value: 0.4336310677634783 and parameters: {'model_type': 'XGB', 'pca_var': 0.9031555599383071, 'loo_smooth': 1.5316930998203855, 'rare_min_freq': 0.006110926682073255, 'n_estimators': 1800, 'learning_rate': 0.053797701944560454, 'xgb_max_depth': 7, 'xgb_subsample': 0.9044872741110384, 'xgb_colsample': 0.8683458703130046, 'xgb_reg_lambda': 0.0023337097845099212, 'xgb_min_child_weight': 3.2968648822238884}. Best is trial 0 with value: 0.4336310677634783.
[I 2026-01-04 14:34:16,845] Trial 1 pruned. 
[I 2026-01-04 14:34:53,951] Trial 2 finished with value: 0.36386344618174743 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9462384846318956, 'loo_smooth': 2.775060688937487, 'rare_min_freq': 0.0174461038778595, 'n_estimators': 600, 'learning_rate': 0.037011540374944515, 'lgb_num_leaves': 84, 'lgb_min_child_samples': 6, 'lgb_subsample': 0.8501936671852284, 'lgb_colsample': 0.9564358335369122, 'lgb_reg_lambda': 0.005455865545232723}. Best i

[Outer 2/4] OuterTest=0.36045


[I 2026-01-04 14:45:42,641] Trial 0 finished with value: 0.35245718700254325 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9167376473796037, 'loo_smooth': 3.171368985402559, 'rare_min_freq': 0.012321901458845822, 'n_estimators': 2400, 'learning_rate': 0.0604792285796818, 'lgb_num_leaves': 75, 'lgb_min_child_samples': 39, 'lgb_subsample': 0.7031828306120208, 'lgb_colsample': 0.7612251620393458, 'lgb_reg_lambda': 0.0011766972553881791}. Best is trial 0 with value: 0.35245718700254325.
[I 2026-01-04 14:46:29,637] Trial 1 finished with value: 0.3454514417861776 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9376970887789999, 'loo_smooth': 1.2152672340892225, 'rare_min_freq': 0.02212365911354152, 'n_estimators': 1800, 'learning_rate': 0.009849624169902179, 'lgb_num_leaves': 81, 'lgb_min_child_samples': 9, 'lgb_subsample': 0.7992121263117878, 'lgb_colsample': 0.6748621910035884, 'lgb_reg_lambda': 2.749057705185422}. Best is trial 1 with value: 0.3454514417861776.
[I 2026-01-04 14:

[Outer 3/4] OuterTest=0.36656


[I 2026-01-04 14:54:35,233] Trial 0 finished with value: 0.36496054745217793 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9441539776374148, 'loo_smooth': 2.140411637770543, 'rare_min_freq': 0.02101648399108131, 'n_estimators': 2400, 'learning_rate': 0.005143995614157298, 'lgb_num_leaves': 49, 'lgb_min_child_samples': 58, 'lgb_subsample': 0.8500922917189233, 'lgb_colsample': 0.8766046232906693, 'lgb_reg_lambda': 0.011799271480435771}. Best is trial 0 with value: 0.36496054745217793.
[I 2026-01-04 14:54:57,740] Trial 1 pruned. 
[I 2026-01-04 14:55:21,122] Trial 2 pruned. 
[I 2026-01-04 14:55:50,448] Trial 3 finished with value: 0.3578970177963275 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9701746615314029, 'loo_smooth': 3.0871614286418025, 'rare_min_freq': 0.006431954982694783, 'n_estimators': 600, 'learning_rate': 0.03288133901227763, 'lgb_num_leaves': 28, 'lgb_min_child_samples': 30, 'lgb_subsample': 0.6737471096840747, 'lgb_colsample': 0.9506343726769089, 'lgb_reg_lamb

[Outer 4/4] OuterTest=0.37231


In [21]:
report = {
    "created_at_utc": datetime.now(timezone.utc).isoformat(),
    "outer_splits": OUTER_SPLITS,
    "inner_splits": INNER_SPLITS,
    "n_trials_inner_per_outer_fold": N_TRIALS_INNER,
    "outer_scores_rmsle": outer_scores,
    "outer_mean_rmsle": float(np.mean(outer_scores)),
    "outer_std_rmsle": float(np.std(outer_scores)),
    "outer_best_inner_scores": outer_best_inner_scores,
    "outer_best_params": outer_best_params,
}


In [None]:
from collections import Counter

def aggregate_outer_params(outer_best_params):
    if not outer_best_params:
        raise ValueError("outer_best_params is empty")

    model_types = [p["model_type"] for p in outer_best_params]
    dominant_model = Counter(model_types).most_common(1)[0][0]

    filtered = [p for p in outer_best_params if p["model_type"] == dominant_model]

    agg = {"model_type": dominant_model}

    for k in filtered[0]:
        if k == "model_type":
            continue

        values = [p[k] for p in filtered]

        if isinstance(values[0], (int, float)):
            agg[k] = float(np.median(values))
        else:
            agg[k] = Counter(values).most_common(1)[0][0]

    return agg


agg_params = aggregate_outer_params(outer_best_params)

print(agg_params)

{'model_type': 'LGBM', 'pca_var': 0.9462384846318956, 'loo_smooth': 2.775060688937487, 'rare_min_freq': 0.0174461038778595, 'n_estimators': 600.0, 'learning_rate': 0.03288133901227763, 'lgb_num_leaves': 81.0, 'lgb_min_child_samples': 9.0, 'lgb_subsample': 0.7992121263117878, 'lgb_colsample': 0.9506343726769089, 'lgb_reg_lambda': 0.007002791522009404}


In [26]:
def full_objective(trial):
    params = suggest_params(trial)
    cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    scores = []

    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_full, y_full)):
        X_tr = X_full.iloc[tr_idx].copy()
        y_tr = y_full.iloc[tr_idx].copy()
        X_va = X_full.iloc[va_idx].copy()
        y_va = y_full.iloc[va_idx].copy()

        art = refit_on_full_training(
            X_train=X_tr,
            y_train=y_tr,
            params=params,
            num_cols=num_cols,
            cat_cols=cat_cols,
            random_state=RANDOM_STATE,
            es_split=ES_SPLIT,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        )

        preds = predict_artifact(art, X_va)
        scores.append(rmsle(y_va.values, preds))

        trial.report(float(np.mean(scores)), step=fold)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return float(np.mean(scores))



final_study = optuna.create_study(
    direction="minimize",
    sampler=sampler,
    pruner=pruner,
)

#warm-start using robust aggregated params
final_study.enqueue_trial(agg_params)

#also enqueue each outer-fold best param set
for p in outer_best_params:
    final_study.enqueue_trial(p)

final_study.optimize(
    full_objective,
    n_trials=max(30, N_TRIALS_INNER),
    show_progress_bar=True,
)

final_params = final_study.best_params
final_artifact = refit_on_full_training(
    X_train=X_full,
    y_train=y_full,
    params=final_params,
    num_cols=num_cols,
    cat_cols=cat_cols,
    random_state=RANDOM_STATE,
    es_split=ES_SPLIT,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
)

joblib.dump(final_artifact, ARTIFACT_MODEL)
print("\nSaved production model artifact:", ARTIFACT_MODEL)


[I 2026-01-04 15:44:55,425] A new study created in memory with name: no-name-f502040c-b403-4e0d-b666-44b19bb2c57b
Best trial: 0. Best value: 0.358592:   3%|▎         | 1/30 [02:11<1:03:28, 131.32s/it]

[I 2026-01-04 15:47:06,916] Trial 0 finished with value: 0.358591658677695 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9462384846318956, 'loo_smooth': 2.775060688937487, 'rare_min_freq': 0.0174461038778595, 'n_estimators': 600.0, 'learning_rate': 0.03288133901227763, 'lgb_num_leaves': 81.0, 'lgb_min_child_samples': 9.0, 'lgb_subsample': 0.7992121263117878, 'lgb_colsample': 0.9506343726769089, 'lgb_reg_lambda': 0.007002791522009404}. Best is trial 0 with value: 0.358591658677695.


Best trial: 0. Best value: 0.358592:   7%|▋         | 2/30 [08:04<2:02:11, 261.84s/it]

[I 2026-01-04 15:53:00,116] Trial 1 pruned. 


Best trial: 2. Best value: 0.358309:  10%|█         | 3/30 [09:48<1:25:24, 189.78s/it]

[I 2026-01-04 15:54:44,155] Trial 2 finished with value: 0.35830859466654696 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9462384846318956, 'loo_smooth': 2.775060688937487, 'rare_min_freq': 0.0174461038778595, 'n_estimators': 600, 'learning_rate': 0.037011540374944515, 'lgb_num_leaves': 84, 'lgb_min_child_samples': 6, 'lgb_subsample': 0.8501936671852284, 'lgb_colsample': 0.9564358335369122, 'lgb_reg_lambda': 0.005455865545232723}. Best is trial 2 with value: 0.35830859466654696.


Best trial: 2. Best value: 0.358309:  13%|█▎        | 4/30 [10:32<57:19, 132.27s/it]  

[I 2026-01-04 15:55:28,260] Trial 3 pruned. 


Best trial: 4. Best value: 0.346979:  17%|█▋        | 5/30 [11:45<46:09, 110.77s/it]

[I 2026-01-04 15:56:40,912] Trial 4 finished with value: 0.34697890321762503 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9701746615314029, 'loo_smooth': 3.0871614286418025, 'rare_min_freq': 0.006431954982694783, 'n_estimators': 600, 'learning_rate': 0.03288133901227763, 'lgb_num_leaves': 28, 'lgb_min_child_samples': 30, 'lgb_subsample': 0.6737471096840747, 'lgb_colsample': 0.9506343726769089, 'lgb_reg_lambda': 0.007002791522009404}. Best is trial 4 with value: 0.34697890321762503.


Best trial: 4. Best value: 0.346979:  20%|██        | 6/30 [24:19<2:11:49, 329.55s/it]

[I 2026-01-04 16:09:15,140] Trial 5 pruned. 


Best trial: 4. Best value: 0.346979:  23%|██▎       | 7/30 [26:54<1:44:26, 272.46s/it]

[I 2026-01-04 16:11:50,057] Trial 6 pruned. 


Best trial: 4. Best value: 0.346979:  27%|██▋       | 8/30 [27:40<1:13:29, 200.44s/it]

[I 2026-01-04 16:12:36,281] Trial 7 pruned. 


Best trial: 4. Best value: 0.346979:  30%|███       | 9/30 [30:27<1:06:31, 190.07s/it]

[I 2026-01-04 16:15:23,567] Trial 8 pruned. 


Best trial: 4. Best value: 0.346979:  33%|███▎      | 10/30 [30:57<46:49, 140.48s/it] 

[I 2026-01-04 16:15:53,007] Trial 9 pruned. 


Best trial: 4. Best value: 0.346979:  37%|███▋      | 11/30 [32:12<38:07, 120.38s/it]

[I 2026-01-04 16:17:07,802] Trial 10 finished with value: 0.34697979323010375 and parameters: {'model_type': 'LGBM', 'pca_var': 0.972982386855553, 'loo_smooth': 4.92689083893039, 'rare_min_freq': 0.011695557351027168, 'n_estimators': 1200, 'learning_rate': 0.07865593868349696, 'lgb_num_leaves': 22, 'lgb_min_child_samples': 54, 'lgb_subsample': 0.6190444397187912, 'lgb_colsample': 0.8448656419961645, 'lgb_reg_lambda': 0.06550997410914035}. Best is trial 4 with value: 0.34697890321762503.


Best trial: 11. Best value: 0.339661:  40%|████      | 12/30 [33:09<30:19, 101.08s/it]

[I 2026-01-04 16:18:04,738] Trial 11 finished with value: 0.3396611374921218 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9796090660269953, 'loo_smooth': 4.886387910813595, 'rare_min_freq': 0.012457242817826378, 'n_estimators': 1200, 'learning_rate': 0.07110731447912398, 'lgb_num_leaves': 24, 'lgb_min_child_samples': 58, 'lgb_subsample': 0.616596760473645, 'lgb_colsample': 0.833177005551333, 'lgb_reg_lambda': 0.05741675273635176}. Best is trial 11 with value: 0.3396611374921218.


Best trial: 11. Best value: 0.339661:  43%|████▎     | 13/30 [33:42<22:48, 80.52s/it] 

[I 2026-01-04 16:18:37,930] Trial 12 pruned. 


Best trial: 11. Best value: 0.339661:  47%|████▋     | 14/30 [33:59<16:23, 61.46s/it]

[I 2026-01-04 16:18:55,379] Trial 13 pruned. 


Best trial: 11. Best value: 0.339661:  50%|█████     | 15/30 [34:15<11:56, 47.79s/it]

[I 2026-01-04 16:19:11,500] Trial 14 pruned. 


Best trial: 11. Best value: 0.339661:  53%|█████▎    | 16/30 [34:32<08:56, 38.32s/it]

[I 2026-01-04 16:19:27,826] Trial 15 pruned. 


Best trial: 11. Best value: 0.339661:  57%|█████▋    | 17/30 [34:49<06:55, 31.95s/it]

[I 2026-01-04 16:19:44,965] Trial 16 pruned. 


Best trial: 11. Best value: 0.339661:  60%|██████    | 18/30 [35:08<05:35, 27.96s/it]

[I 2026-01-04 16:20:03,621] Trial 17 pruned. 


Best trial: 11. Best value: 0.339661:  63%|██████▎   | 19/30 [36:00<06:27, 35.26s/it]

[I 2026-01-04 16:20:55,901] Trial 18 pruned. 


Best trial: 11. Best value: 0.339661:  67%|██████▋   | 20/30 [36:31<05:40, 34.00s/it]

[I 2026-01-04 16:21:26,963] Trial 19 pruned. 


Best trial: 11. Best value: 0.339661:  70%|███████   | 21/30 [36:51<04:29, 29.92s/it]

[I 2026-01-04 16:21:47,369] Trial 20 pruned. 


Best trial: 11. Best value: 0.339661:  73%|███████▎  | 22/30 [37:34<04:30, 33.86s/it]

[I 2026-01-04 16:22:30,423] Trial 21 pruned. 


Best trial: 11. Best value: 0.339661:  77%|███████▋  | 23/30 [38:28<04:38, 39.82s/it]

[I 2026-01-04 16:23:24,136] Trial 22 finished with value: 0.3412962427770093 and parameters: {'model_type': 'LGBM', 'pca_var': 0.9786601187406067, 'loo_smooth': 4.978048856369792, 'rare_min_freq': 0.015994249560642444, 'n_estimators': 1200, 'learning_rate': 0.06677518698670457, 'lgb_num_leaves': 20, 'lgb_min_child_samples': 60, 'lgb_subsample': 0.6537486551463045, 'lgb_colsample': 0.8543444205288419, 'lgb_reg_lambda': 0.11053775945479093}. Best is trial 11 with value: 0.3396611374921218.


Best trial: 11. Best value: 0.339661:  80%|████████  | 24/30 [39:08<03:59, 39.95s/it]

[I 2026-01-04 16:24:04,410] Trial 23 pruned. 


Best trial: 11. Best value: 0.339661:  83%|████████▎ | 25/30 [39:32<02:55, 35.13s/it]

[I 2026-01-04 16:24:28,267] Trial 24 pruned. 


Best trial: 11. Best value: 0.339661:  87%|████████▋ | 26/30 [40:12<02:25, 36.39s/it]

[I 2026-01-04 16:25:07,609] Trial 25 pruned. 


Best trial: 11. Best value: 0.339661:  90%|█████████ | 27/30 [41:08<02:06, 42.29s/it]

[I 2026-01-04 16:26:03,668] Trial 26 pruned. 


Best trial: 11. Best value: 0.339661:  93%|█████████▎| 28/30 [41:36<01:16, 38.20s/it]

[I 2026-01-04 16:26:32,317] Trial 27 pruned. 


Best trial: 11. Best value: 0.339661:  97%|█████████▋| 29/30 [42:05<00:35, 35.24s/it]

[I 2026-01-04 16:27:00,661] Trial 28 pruned. 


Best trial: 11. Best value: 0.339661: 100%|██████████| 30/30 [42:43<00:00, 85.46s/it]


[I 2026-01-04 16:27:39,256] Trial 29 pruned. 

Saved production model artifact: ./artifacts\production_model_final.joblib


In [27]:
card = {
    "model_name": "Ames Housing - Nested CV Production Artifact",
    "created_at_utc": datetime.now(timezone.utc).isoformat(),
    "task": "Regression with log1p target transform",
    "primary_metric": "RMSLE",

    "nested_cv": {
        "outer_mean_rmsle": report["outer_mean_rmsle"],
        "outer_std_rmsle": report["outer_std_rmsle"],
        "outer_scores": report["outer_scores_rmsle"],
    },

    "final_training": {
        "final_tuning_cv_rmsle": float(final_study.best_value),
        "final_params": final_params,
        "final_best_iter": int(final_artifact["best_iter"]),
        "note": "Final model refit on full data with ES-derived best iteration."
    },

    "leakage_controls": [
        "Outer test fold never used in tuning.",
        "All preprocessing + LOO encoding fitted only on training partitions inside CV.",
        "Early stopping uses validation split from training only (never outer test)."
    ],

    "data_summary": {
        "source": "OpenML: house_prices (Ames Housing)",
        "n_rows": int(X_full.shape[0]),
        "n_features": int(X_full.shape[1]),
        "n_numeric": int(len(num_cols)),
        "n_categorical": int(len(cat_cols)),
    },

    "environment": {
        "python": platform.python_version(),
        "numpy": np.__version__,
        "pandas": pd.__version__,
        "optuna": optuna.__version__,
        "xgboost": xgb.__version__,
        "lightgbm": lgb.__version__,
    }
}

with open(ARTIFACT_CARD, "w") as f:
    json.dump(card, f, indent=2)

print("Saved model card:", ARTIFACT_CARD)

Saved model card: ./artifacts\model_card.json
