In [10]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.linear_model import RidgeClassifierCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# -----------------------------
# XGBoost (‡∏ó‡∏≥‡πÉ‡∏´‡πâ‡πÑ‡∏°‡πà‡∏•‡πâ‡∏°‡πÅ‡∏°‡πâ‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏ï‡∏¥‡∏î‡∏ï‡∏±‡πâ‡∏á)
# -----------------------------
try:
    from xgboost import XGBClassifier  # pip install xgboost
    HAS_XGB = True
except Exception:
    HAS_XGB = False

# =============================
# PATH / Utils / Labels / Flags
# =============================
file_path = r"C:\Users\piriy\Desktop\dataset.xlsx"
out_path  = r"C:\Users\piriy\Desktop\dataset_with_final_weights_groupaware_1.xlsx"

RANDOM_SEED = 20250913
rng = np.random.default_rng(RANDOM_SEED)

# --- Production flags & Policies ---
ENABLE_NESTED_CV_CHECK   = True     # ‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô Nested-CV ‡∏Ç‡∏≠‡∏á threshold tuning (‡∏Ñ‡∏ß‡∏£‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏≠‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡∏°‡∏≤‡∏Å‡∏Å‡∏ß‡πà‡∏≤)
MIN_GAP                  = 0.08     # ‡∏•‡∏î‡∏Å‡∏±‡∏ô‡∏ä‡πà‡∏ß‡∏á‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡πà‡∏≥‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÑ‡∏°‡πà‡πÉ‡∏´‡πâ threshold ‡πÅ‡∏Ç‡πá‡∏á‡πÄ‡∏Å‡∏¥‡∏ô‡πÑ‡∏õ
COARSE_STEP              = 0.15     # coarse grid (‡πÉ‡∏ä‡πâ‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ï‡∏≠‡∏ô‡∏Ñ‡πâ‡∏ô‡∏´‡∏≤ ‡πÑ‡∏°‡πà‡πÉ‡∏ä‡πâ‡∏Å‡∏±‡∏ö policy-lock)
POLICY_LOCK_QUANTILES    = True     # ‡πÉ‡∏ä‡πâ q20/40/60/80 (‡∏´‡∏£‡∏∑‡∏≠ target-quantile) ‡∏à‡∏≤‡∏Å‡∏ù‡∏±‡πà‡∏á damage ‡πÇ‡∏î‡∏¢‡∏ï‡∏£‡∏á
USE_TARGET_PREVALENCE    = True     # ‡πÉ‡∏ä‡πâ "‡πÄ‡∏õ‡πâ‡∏≤‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô" ‡∏ï‡πà‡∏≠‡∏ä‡∏±‡πâ‡∏ô‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏Ñ‡∏∏‡∏°‡∏†‡∏≤‡∏£‡∏∞‡∏á‡∏≤‡∏ô (‡∏ñ‡πâ‡∏≤ False ‡∏à‡∏∞‡πÉ‡∏ä‡πâ q20/40/60/80)
OPTIMIZE_FOR             = "f1"     # "f1" ‡∏´‡∏£‡∏∑‡∏≠ "cost" (‡∏ñ‡πâ‡∏≤ "cost" ‡∏à‡∏∞‡πÉ‡∏ä‡πâ COST_MATRIX ‡∏î‡πâ‡∏≤‡∏ô‡∏•‡πà‡∏≤‡∏á)

# ‡πÄ‡∏õ‡πâ‡∏≤‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô (‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏ù‡∏±‡πà‡∏á "‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢") ‡∏£‡∏ß‡∏° 100
# [‡∏ô‡πâ‡∏≠‡∏¢‡∏°‡∏≤‡∏Å, ‡∏ô‡πâ‡∏≠‡∏¢, ‡∏õ‡∏≤‡∏ô‡∏Å‡∏•‡∏≤‡∏á, ‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á, ‡∏ß‡∏¥‡∏Å‡∏§‡∏ï]
# ‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏´‡∏ï‡∏∏: top2 = 8%+4% = 12%
TARGET_PREVALENCE = np.array([46, 30, 12, 8, 4], dtype=float)

# Cost matrix ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡πÇ‡∏´‡∏°‡∏î "cost" (‡∏Ñ‡πà‡∏≤‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á ‡∏´‡∏•‡∏á‡∏ó‡∏≤‡∏á‡∏´‡∏ô‡∏±‡∏Å‡πÄ‡∏°‡∏∑‡πà‡∏≠‡∏û‡∏•‡∏≤‡∏î‡∏ä‡∏±‡πâ‡∏ô‡∏ö‡∏ô)
# ‡πÅ‡∏Å‡∏ô‡πÅ‡∏ñ‡∏ß = true, ‡πÅ‡∏Å‡∏ô‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå = pred ; ‡∏•‡∏≥‡∏î‡∏±‡∏ö‡∏Ñ‡∏•‡∏≤‡∏™ = labels_6_order ‡∏î‡πâ‡∏≤‡∏ô‡∏•‡πà‡∏≤‡∏á
COST_MATRIX = np.array([
# pred:   0    1    2    3    4    5
          [0,   1,   2,   3,   4,   5],  # true 0 (‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢)
          [1,   0,   1,   2,   3,   4],  # true 1
          [2,   1,   0,   1,   2,   3],  # true 2
          [3,   2,   1,   0,   2,   3],  # true 3
          [5,   4,   3,   2,   0,   2],  # true 4
          [8,   6,   5,   4,   2,   0],  # true 5 (‡∏ß‡∏¥‡∏Å‡∏§‡∏ï)
], dtype=float)

labels_5 = ["‡∏ô‡πâ‡∏≠‡∏¢‡∏°‡∏≤‡∏Å", "‡∏ô‡πâ‡∏≠‡∏¢", "‡∏õ‡∏≤‡∏ô‡∏Å‡∏•‡∏≤‡∏á", "‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á", "‡∏ß‡∏¥‡∏Å‡∏§‡∏ï"]
labels_6_order = ["‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢"] + labels_5
code_map = {"‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢": 0, "‡∏ô‡πâ‡∏≠‡∏¢‡∏°‡∏≤‡∏Å": 1, "‡∏ô‡πâ‡∏≠‡∏¢": 2, "‡∏õ‡∏≤‡∏ô‡∏Å‡∏•‡∏≤‡∏á": 3, "‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á": 4, "‡∏ß‡∏¥‡∏Å‡∏§‡∏ï": 5}

# ---- sanitize thresholds (‡∏°‡∏µ‡∏Å‡∏±‡∏ô‡∏ä‡πà‡∏ß‡∏á‡∏ö‡∏≤‡∏á) & ‡∏Ñ‡πà‡∏≤‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ï‡πâ‡∏ô ----
def sanitize_thresholds(th, eps=1e-6, min_gap=MIN_GAP):
    th = np.asarray(th, dtype=float).tolist()
    th = [min(max(0.0, t), 1.0) for t in th]  # clip [0,1]
    th = sorted(th)

    # ‡πÄ‡∏î‡∏¥‡∏ô‡∏´‡∏ô‡πâ‡∏≤: ‡∏ö‡∏±‡∏á‡∏Ñ‡∏±‡∏ö‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ô‡πâ‡∏≠‡∏¢ min_gap
    for i in range(1, len(th)):
        need = th[i-1] + max(eps, min_gap)
        if th[i] < need:
            th[i] = min(1.0, need)

    # ‡πÑ‡∏•‡πà‡∏Å‡∏•‡∏±‡∏ö: ‡∏Ñ‡∏á min_gap ‡∏à‡∏≤‡∏Å‡∏Ç‡∏ß‡∏≤‡πÑ‡∏õ‡∏ã‡πâ‡∏≤‡∏¢
    for i in range(len(th)-2, -1, -1):
        need = th[i+1] - max(eps, min_gap)
        if th[i] > need:
            th[i] = max(0.0, need)
    return th

THRESHOLDS = sanitize_thresholds([0.20, 0.40, 0.60, 0.80])  # ‡∏à‡∏∞‡∏ñ‡∏π‡∏Å‡∏≠‡∏±‡∏û‡πÄ‡∏î‡∏ï‡∏†‡∏≤‡∏¢‡∏´‡∏•‡∏±‡∏á

# ---- nudge_top2: ‡∏à‡∏π‡∏ô t3/t4 ‡πÉ‡∏´‡πâ‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô top2 ‡πÉ‡∏Å‡∏•‡πâ‡πÄ‡∏õ‡πâ‡∏≤‡∏´‡∏°‡∏≤‡∏¢ ----
def nudge_top2(th, X01, mask_dmg, target_pct=12.0, tol=0.2, step=0.002, max_iter=800):
    """
    ‡∏Ç‡∏¢‡∏±‡∏ö t3/t4 ‡πÄ‡∏•‡πá‡∏Å‡∏ô‡πâ‡∏≠‡∏¢‡πÄ‡∏û‡∏∑‡πà‡∏≠‡∏î‡∏±‡∏ô‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô top2 (‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á+‡∏ß‡∏¥‡∏Å‡∏§‡∏ï) ‡πÉ‡∏´‡πâ‡πÉ‡∏Å‡∏•‡πâ target_pct (‡∏ö‡∏ô‡∏ù‡∏±‡πà‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢)
    - ‡∏Ç‡∏∂‡πâ‡∏ô t3/t4  -> ‡∏•‡∏î top2
    - ‡∏•‡∏á t3/t4    -> ‡πÄ‡∏û‡∏¥‡πà‡∏° top2
    """
    th = np.array(sanitize_thresholds(th), dtype=float)
    X01 = np.asarray(X01, dtype=float)
    mb = np.asarray(mask_dmg, dtype=bool)
    Xd = X01[mb]
    if Xd.size == 0:
        return th.tolist()

    def top2_pct(th_):
        lab = bin_by_thresholds(Xd, thresholds=th_, labels=labels_5)  # pandas.Categorical
        arr = np.asarray(lab, dtype=object)                            # -> ndarray (object)
        p = np.isin(arr, ["‡∏£‡∏∏‡∏ô‡πÅ‡∏£‡∏á", "‡∏ß‡∏¥‡∏Å‡∏§‡∏ï"]).mean() * 100.0         # ‡πÉ‡∏ä‡πâ np.isin ‡πÅ‡∏ó‡∏ô .isin
        return float(p)

    cur = top2_pct(th)
    for _ in range(max_iter):
        if abs(cur - target_pct) <= tol:
            break
        if cur > target_pct:   # ‡∏°‡∏≤‡∏Å‡πÑ‡∏õ ‚Üí ‡∏î‡∏±‡∏ô‡πÄ‡∏Å‡∏ì‡∏ë‡πå‡∏Ç‡∏∂‡πâ‡∏ô
            th[3] = min(1.0, th[3] + step)                   # ‡∏Ç‡∏¢‡∏±‡∏ö t4 ‡∏Å‡πà‡∏≠‡∏ô
            th[2] = min(th[3] - MIN_GAP, th[2] + step/2)     # ‡∏£‡∏±‡∏Å‡∏©‡∏≤ MIN_GAP
        else:                  # ‡∏ô‡πâ‡∏≠‡∏¢‡πÑ‡∏õ ‚Üí ‡∏ú‡πà‡∏≠‡∏ô‡πÄ‡∏Å‡∏ì‡∏ë‡πå‡∏•‡∏á
            th[3] = max(th[2] + MIN_GAP, th[3] - step)
            th[2] = max(th[1] + MIN_GAP, th[2] - step/2)
        th = np.array(sanitize_thresholds(th), dtype=float)
        cur = top2_pct(th)
    return th.tolist()


def normalize_weights(w: np.ndarray) -> np.ndarray:
    w = np.asarray(w, dtype=float)
    w[w < 0] = 0.0
    s = w.sum()
    return (w / s) if s > 0 else np.ones_like(w) / len(w)

def new_splitter(X, y, groups):
    unique_groups = getattr(groups, "nunique", lambda: pd.Series(groups).nunique())()
    if unique_groups >= 5:
        return GroupKFold(n_splits=5).split(X, y, groups)
    else:
        print(f"‚ö†Ô∏è ‡∏Å‡∏•‡∏∏‡πà‡∏°‡∏°‡∏µ‡πÄ‡∏û‡∏µ‡∏¢‡∏á {unique_groups} < 5 ‚Üí ‡πÉ‡∏ä‡πâ StratifiedKFold(5)")
        return StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y)

# -------- robust CV helper (‡πÉ‡∏ä‡πâ LogisticRegression ‡πÄ‡∏õ‡πá‡∏ô meta-evaluator) --------
def macro_f1_cv_robust(X: np.ndarray, y: np.ndarray, groups: pd.Series) -> float:
    splitter = list(new_splitter(X, y, groups))
    scores = []
    for tr_idx, va_idx in splitter:
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        if np.unique(y_va).shape[0] < 2:
            scores.append(0.0)
            continue

        clf = LogisticRegression(
            max_iter=2000,
            class_weight="balanced",
            random_state=RANDOM_SEED,
            solver="lbfgs",
        )
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_va)
        f1 = f1_score(y_va, y_pred, average="macro", zero_division=0)
        scores.append(float(f1))
    return float(np.mean(scores)) if len(scores) > 0 else 0.0

# -------- ‡πÉ‡∏ä‡πâ robust CV ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô weights --------
def evaluate_weights(w: np.ndarray, X, y, groups) -> float:
    w = np.asarray(w, dtype=float)
    if w.shape[0] != X.shape[1]:
        raise ValueError(f"weight length={len(w)} != n_features={X.shape[1]}")
    dmg = X @ w
    mn, mx = np.nanmin(dmg), np.nanmax(dmg)
    dmg_norm = (dmg - mn) / (mx - mn) if mx > mn else np.zeros_like(dmg)
    X_feat = dmg_norm.reshape(-1, 1)
    return macro_f1_cv_robust(X_feat, np.asarray(y, dtype=int), groups)

def bin_by_thresholds(arr01: np.ndarray, thresholds=THRESHOLDS, labels=labels_5) -> pd.Categorical:
    """
    ‡∏ï‡∏¥‡∏î‡∏õ‡πâ‡∏≤‡∏¢ 5 ‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏ö‡∏ô‡∏™‡πÄ‡∏Å‡∏• [0,1]: [0,t1) [t1,t2) [t2,t3) [t3,t4) [t4,1]
    """
    thresholds = sanitize_thresholds(thresholds)
    arr = np.nan_to_num(np.asarray(arr01, dtype=float), nan=0.0, posinf=1.0, neginf=0.0)
    bins = np.digitize(arr, thresholds, right=False)  # 0..len(thresholds)
    bins = np.clip(bins, 0, len(labels)-1)
    lab = np.array(labels, dtype=object)[bins]
    return pd.Categorical(lab, categories=labels, ordered=True)

# =============================
# Load + Check
# =============================
df = pd.read_excel(file_path)

base_cols = ["UP_DMG", "WALL_DMG", "FLOOR_DMG", "PILLAR_DMG", "STRUCTURE_"]
missing = [c for c in base_cols if c not in df.columns]
if missing:
    raise ValueError(f"‡πÑ‡∏°‡πà‡∏û‡∏ö‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏ó‡∏µ‡πà‡∏à‡∏≥‡πÄ‡∏õ‡πá‡∏ô: {missing}")

df[base_cols] = df[base_cols].apply(pd.to_numeric, errors="coerce").fillna(0)

# =============================
# ‡πÅ‡∏¢‡∏Å‡πÅ‡∏ñ‡∏ß‡∏°‡∏µ/‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢
# =============================
mask_has_damage = df[base_cols].sum(axis=1) > 0
print(f"‚úÖ ‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢: {int(mask_has_damage.sum())} ‡πÅ‡∏ñ‡∏ß | ‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢: {int((~mask_has_damage).sum())} ‡πÅ‡∏ñ‡∏ß")

# =============================
# Normalize: q95 ‡∏à‡∏≤‡∏Å‡∏ù‡∏±‡πà‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢ ‡πÅ‡∏•‡πâ‡∏ß‡πÉ‡∏ä‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏Å‡πâ‡∏≠‡∏ô
# =============================
norm_q95 = {}
df_damage_for_norm = df.loc[mask_has_damage].copy()
for c in base_cols:
    q95 = df_damage_for_norm[c].quantile(0.95)
    q95 = 1.0 if pd.isna(q95) or q95 == 0 else float(q95)
    norm_q95[c] = q95
    df[f"{c}_NORM"] = df[c].clip(upper=q95) / q95

norm_cols = [f"{c}_NORM" for c in base_cols]
df[norm_cols] = df[norm_cols].fillna(0.0)

# =============================
# Target 6 ‡∏Ñ‡∏•‡∏≤‡∏™ (seed label ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ï‡πâ‡∏ô‡∏î‡πâ‡∏ß‡∏¢ thresholds ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏ï‡πâ‡∏ô)
# =============================
tmp_damage_score = df.loc[mask_has_damage, norm_cols].mean(axis=1).to_numpy()
dmg_levels_5_fixed = bin_by_thresholds(tmp_damage_score, thresholds=THRESHOLDS, labels=labels_5)

y6 = pd.Series(index=df.index, dtype=object)
y6.loc[~mask_has_damage] = "‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢"
y6.loc[mask_has_damage]  = dmg_levels_5_fixed.astype(str)

le6 = LabelEncoder()
le6.fit(labels_6_order)  # ‡∏•‡πá‡∏≠‡∏Å‡∏•‡∏≥‡∏î‡∏±‡∏ö‡∏Ñ‡∏•‡∏≤‡∏™ (‡πÑ‡∏ó‡∏¢)
y6_enc = le6.transform(y6.astype(str))

# =============================
# ‡∏Å‡∏•‡∏∏‡πà‡∏°‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö GroupKFold
# =============================
if "DISTRICT" in df.columns and "SUB_DISTRI" in df.columns:
    groups_all = df["DISTRICT"].astype(str) + " | " + df["SUB_DISTRI"].astype(str)
else:
    print("‚ö†Ô∏è ‡πÑ‡∏°‡πà‡∏û‡∏ö DISTRICT ‡∏´‡∏£‡∏∑‡∏≠ SUB_DISTRI ‚Üí ‡∏à‡∏∞‡πÉ‡∏ä‡πâ StratifiedKFold")
    groups_all = pd.Series(["__all__"] * len(df))

# =============================
# ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå + Weights ‡∏à‡∏≤‡∏Å‡∏´‡∏•‡∏≤‡∏¢‡∏ß‡∏¥‡∏ò‡∏µ (6 ‡∏Ñ‡∏•‡∏≤‡∏™‡∏ó‡∏±‡πâ‡∏á‡∏Å‡πâ‡∏≠‡∏ô)
# =============================
X_all = df[norm_cols].to_numpy()

# 1) Expert prior
expert_weights = normalize_weights(np.array([0.15, 0.10, 0.10, 0.25, 0.40]))

# 2) Mutual Information
mi6 = mutual_info_classif(X_all, y6_enc, random_state=42, discrete_features=False)
mi6_weights = normalize_weights(mi6)

# 3) Ridge (‡∏™‡∏£‡∏∏‡∏õ‡∏Ñ‡πà‡∏≤‡∏™‡∏±‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå‡∏Ç‡πâ‡∏≤‡∏°‡∏ä‡∏±‡πâ‡∏ô)
ridge6 = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
ridge6.fit(X_all, y6_enc)
ridge6_coef = ridge6.coef_
ridge6_feat = np.abs(ridge6_coef) if ridge6_coef.ndim == 1 else np.abs(ridge6_coef).sum(axis=0)
ridge6_weights = normalize_weights(ridge6_feat)

# 4) RandomForest
rf6 = RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced", n_jobs=-1)
rf6.fit(X_all, y6_enc)
rf6_weights = normalize_weights(rf6.feature_importances_)

# 5) XGBoost (‡∏ñ‡πâ‡∏≤‡πÑ‡∏°‡πà‡∏°‡∏µ ‡πÉ‡∏´‡πâ‡∏Ç‡πâ‡∏≤‡∏°)
if HAS_XGB:
    xgb6 = XGBClassifier(
        n_estimators=400, learning_rate=0.05, max_depth=3,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric="mlogloss", random_state=42,
        n_jobs=-1,
    )
    xgb6.fit(X_all, y6_enc)
    xgb6_weights = normalize_weights(xgb6.feature_importances_)
else:
    xgb6_weights = None

weights_dict6 = {
    "‡∏ú‡∏π‡πâ‡πÄ‡∏ä‡∏µ‡πà‡∏¢‡∏ß‡∏ä‡∏≤‡∏ç": expert_weights,
    "MI":     mi6_weights,
    "Ridge":  ridge6_weights,
    "RF":     rf6_weights,
}
if xgb6_weights is not None:
    weights_dict6["XGB"] = xgb6_weights

# =============================
# Evaluate weights (6-class, group-aware) ‚Üí Best method
# =============================
scores6 = {name: evaluate_weights(w, X_all, y6_enc, groups_all) for name, w in weights_dict6.items()}
cv_df6 = pd.DataFrame.from_dict(scores6, orient="index", columns=["Macro-F1 (6 ‡∏Ñ‡∏•‡∏≤‡∏™)"]).sort_values("Macro-F1 (6 ‡∏Ñ‡∏•‡∏≤‡∏™)", ascending=False)
print("üìä ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô CV Macro-F1 (6 ‡∏Ñ‡∏•‡∏≤‡∏™, group-aware):")
print(cv_df6)

best_method6 = cv_df6.index[0]
final_weights6 = weights_dict6[best_method6]
print(f"\n‚úÖ ‡∏ß‡∏¥‡∏ò‡∏µ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (6 ‡∏Ñ‡∏•‡∏≤‡∏™): {best_method6}")

# =============================
# ‡∏Ñ‡∏≥‡∏ô‡∏ß‡∏ì‡∏™‡∏Å‡∏≠‡∏£‡πå‡∏£‡∏ß‡∏°‡∏î‡πâ‡∏ß‡∏¢ weights ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (‡πÑ‡∏ß‡πâ‡πÉ‡∏ä‡πâ '‡∏à‡∏π‡∏ô thresholds')
# =============================
TOTAL_all = X_all @ final_weights6
mn_all, mx_all = np.nanmin(TOTAL_all), np.nanmax(TOTAL_all)
total_all_norm = (TOTAL_all - mn_all) / (mx_all - mn_all) if mx_all > mn_all else np.zeros_like(TOTAL_all)

# --------- ‡∏ü‡∏±‡∏á‡∏Å‡πå‡∏ä‡∏±‡∏ô‡∏ß‡∏±‡∏î‡∏ú‡∏•‡∏ï‡∏≤‡∏° thresholds ----------
def macro_f1_for_thresholds_full(th):
    th = sanitize_thresholds(th)
    y6_th = pd.Series(index=df.index, dtype=object)
    y6_th.loc[~mask_has_damage] = "‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢"
    y6_th.loc[mask_has_damage] = bin_by_thresholds(
        total_all_norm[mask_has_damage], thresholds=th, labels=labels_5
    ).astype(str)
    le = LabelEncoder().fit(labels_6_order)
    y_enc = le.transform(y6_th.astype(str))
    X_feat = total_all_norm.reshape(-1, 1)
    return macro_f1_cv_robust(X_feat, y_enc, groups_all)

def expected_cost_for_thresholds(th):
    th = sanitize_thresholds(th)
    y_pred = pd.Series("‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢", index=df.index, dtype=object)
    y_pred.loc[mask_has_damage] = bin_by_thresholds(
        total_all_norm[mask_has_damage], thresholds=th, labels=labels_5
    ).astype(str)
    enc = LabelEncoder().fit(labels_6_order)
    yp = enc.transform(y_pred.astype(str))
    yt = enc.transform(y6.astype(str))
    return COST_MATRIX[yt, yp].mean()

def coarse_grid_candidates(step=COARSE_STEP):
    grid = np.arange(step, 1.0, step)
    cand = []
    for t1 in grid:
        for t2 in grid:
            if t2 <= t1: continue
            for t3 in grid:
                if t3 <= t2: continue
                for t4 in grid:
                    if t4 <= t3: continue
                    cand.append([t1, t2, t3, t4])
    return cand

def local_refine(th_best, n_iter=300, sigma=0.04, objective="f1"):
    best = np.array(th_best, dtype=float)
    if objective == "f1":
        best_score = macro_f1_for_thresholds_full(best)
        better = lambda s, b: s > b
    else:
        best_score = expected_cost_for_thresholds(best)
        better = lambda s, b: s < b  # minimize cost

    for _ in range(n_iter):
        prop = best + rng.normal(0, sigma, size=4)
        prop = sanitize_thresholds(prop)
        score = macro_f1_for_thresholds_full(prop) if objective == "f1" else expected_cost_for_thresholds(prop)
        if better(score, best_score):
            best, best_score = np.array(prop), score
    return best.tolist(), best_score

# ============ 1) ‡∏´‡∏≤ "‡∏à‡∏∏‡∏î‡πÄ‡∏£‡∏¥‡πà‡∏°" ‡∏Ç‡∏≠‡∏á thresholds ============
if mask_has_damage.any():
    if POLICY_LOCK_QUANTILES and USE_TARGET_PREVALENCE:
        q = np.cumsum(TARGET_PREVALENCE[:-1] / TARGET_PREVALENCE.sum())
        start_th = sanitize_thresholds(np.quantile(total_all_norm[mask_has_damage], q))
        # ‡∏à‡∏π‡∏ô‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î‡πÉ‡∏´‡πâ top2 ‚âà 12% (‡∏ö‡∏ô‡∏ù‡∏±‡πà‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢)
        start_th = nudge_top2(start_th, total_all_norm, mask_has_damage,
                              target_pct=12.0, tol=0.2)
    else:
        q = np.quantile(total_all_norm[mask_has_damage], [0.2, 0.4, 0.6, 0.8])
        start_th = sanitize_thresholds(q)
else:
    start_th = THRESHOLDS

# ============ 2) ‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÄ‡∏Å‡∏ì‡∏ë‡πå‡∏ï‡∏≤‡∏° Policy / ‡∏´‡∏£‡∏∑‡∏≠‡∏õ‡∏£‡∏±‡∏ö‡∏à‡∏π‡∏ô ============
if POLICY_LOCK_QUANTILES:
    THRESHOLDS = start_th
    if OPTIMIZE_FOR == "cost":
        score_txt = expected_cost_for_thresholds(THRESHOLDS)
        print(f"üîí ‡πÄ‡∏Å‡∏ì‡∏ë‡πå(Thresholds) policy-lock: {THRESHOLDS} | Expected Cost = {score_txt:.4f}")
    else:
        score_txt = macro_f1_for_thresholds_full(THRESHOLDS)
        print(f"üîí ‡πÄ‡∏Å‡∏ì‡∏ë‡πå(Thresholds) policy-lock: {THRESHOLDS} | CV Macro-F1(6 ‡∏Ñ‡∏•‡∏≤‡∏™) = {score_txt:.4f}")
else:
    cands = [[*start_th]] + coarse_grid_candidates(step=COARSE_STEP)
    if OPTIMIZE_FOR == "cost":
        best_th, best_score = None, np.inf
        for th in cands:
            s = expected_cost_for_thresholds(th)
            if s < best_score:
                best_th, best_score = th, s
        best_th, best_score = local_refine(best_th, n_iter=300, sigma=0.04, objective="cost")
        THRESHOLDS = sanitize_thresholds(best_th)
        print(f"üéØ ‡πÄ‡∏Å‡∏ì‡∏ë‡πå(Thresholds) ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (min Expected Cost, step={COARSE_STEP}, min_gap={MIN_GAP:.02f}): {THRESHOLDS} | Cost = {best_score:.4f}")
    else:
        best_th, best_score = None, -1.0
        for th in cands:
            s = macro_f1_for_thresholds_full(th)
            if s > best_score:
                best_th, best_score = th, s
        best_th, best_score = local_refine(best_th, n_iter=300, sigma=0.04, objective="f1")
        THRESHOLDS = sanitize_thresholds(best_th)
        print(f"üéØ ‡πÄ‡∏Å‡∏ì‡∏ë‡πå(Thresholds) ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (‡∏õ‡∏£‡∏±‡∏ö‡∏à‡∏π‡∏ô F1, step={COARSE_STEP}, min_gap={MIN_GAP:.02f}): {THRESHOLDS} | CV Macro-F1 = {best_score:.4f}")

# ============ 3) Nested-CV (‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏≠‡∏±‡∏ô‡∏ô‡∏µ‡πâ‡∏°‡∏≤‡∏Å‡∏Å‡∏ß‡πà‡∏≤) ============
nested_cv_score = None
if ENABLE_NESTED_CV_CHECK:
    splitter_outer = list(new_splitter(total_all_norm.reshape(-1,1), le6.transform(y6.astype(str)), groups_all))
    scores_nested = []
    groups_series = groups_all if isinstance(groups_all, pd.Series) else pd.Series(groups_all)

    for tr_idx, va_idx in splitter_outer:
        X_tr_norm = total_all_norm[tr_idx]
        mask_tr = mask_has_damage.values[tr_idx]
        groups_tr = groups_series.iloc[tr_idx]

        # ‡∏ï‡∏±‡πâ‡∏á‡∏ï‡πâ‡∏ô threshold ‡∏ö‡∏ô train
        if mask_tr.any():
            if POLICY_LOCK_QUANTILES and USE_TARGET_PREVALENCE:
                qt = np.cumsum(TARGET_PREVALENCE[:-1] / TARGET_PREVALENCE.sum())
                start_th0 = sanitize_thresholds(np.quantile(X_tr_norm[mask_tr], qt))
            else:
                start_th0 = sanitize_thresholds(np.quantile(X_tr_norm[mask_tr], [0.2,0.4,0.6,0.8]))
        else:
            start_th0 = THRESHOLDS

        if POLICY_LOCK_QUANTILES:
            best_th0 = start_th0
        else:
            def score_th_on_train(th):
                y_tr_series = pd.Series("‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢", index=groups_tr.index, dtype=object)
                y_tr_series.loc[mask_tr] = bin_by_thresholds(
                    X_tr_norm[mask_tr], thresholds=th, labels=labels_5
                ).astype(str)
                le_tmp = LabelEncoder().fit(labels_6_order)
                y_tr_enc = le_tmp.transform(y_tr_series.astype(str))
                # ‡πÉ‡∏ô‡πÇ‡∏´‡∏°‡∏î cost ‡πÉ‡∏´‡πâ‡∏Å‡∏•‡∏±‡∏ö‡∏´‡∏±‡∏ß‡πÄ‡∏õ‡πá‡∏ô -cost ‡πÄ‡∏û‡∏∑‡πà‡∏≠‡πÉ‡∏ä‡πâ logic ‡πÄ‡∏î‡∏¥‡∏° (maximize)
                if OPTIMIZE_FOR == "cost":
                    yp = le_tmp.transform(y_tr_series.astype(str))
                    yt = le_tmp.transform(y6.iloc[tr_idx].astype(str))
                    return -COST_MATRIX[yt, yp].mean()
                return macro_f1_cv_robust(X_tr_norm.reshape(-1,1), y_tr_enc, groups_tr)

            cands0 = [[*start_th0]] + coarse_grid_candidates(step=COARSE_STEP)
            best_th0, best_sc0 = None, -np.inf
            for th in cands0:
                sc = score_th_on_train(th)
                if sc > best_sc0:
                    best_th0, best_sc0 = th, sc
            # refine
            obj = "cost" if OPTIMIZE_FOR == "cost" else "f1"
            best_th0, _ = local_refine(best_th0, n_iter=150, sigma=0.03, objective=obj)

        # ‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡∏ö‡∏ô valid fold
        X_va_norm = total_all_norm[va_idx]
        mask_va = mask_has_damage.values[va_idx]
        groups_va = groups_series.iloc[va_idx]

        y_va_pred = pd.Series("‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢", index=groups_va.index, dtype=object)
        y_va_pred.loc[mask_va] = bin_by_thresholds(
            X_va_norm[mask_va], thresholds=best_th0, labels=labels_5
        ).astype(str)

        y_va_true = y6.iloc[va_idx]
        le_tmp = LabelEncoder().fit(labels_6_order)

        if OPTIMIZE_FOR == "cost":
            yt = le_tmp.transform(y_va_true.astype(str))
            yp = le_tmp.transform(y_va_pred.astype(str))
            scores_nested.append(float(-COST_MATRIX[yt, yp].mean()))  # ‡πÄ‡∏Å‡πá‡∏ö‡πÄ‡∏õ‡πá‡∏ô‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏°‡∏≤‡∏Å‡∏î‡∏µ‡∏Å‡∏ß‡πà‡∏≤
        else:
            f1_va = f1_score(
                le_tmp.transform(y_va_true.astype(str)),
                le_tmp.transform(y_va_pred.astype(str)),
                average="macro", zero_division=0
            )
            scores_nested.append(float(f1_va))

    nested_cv_score = float(np.mean(scores_nested)) if scores_nested else None
    mode_txt = ("policy-lock + target-quantile" if (POLICY_LOCK_QUANTILES and USE_TARGET_PREVALENCE)
                else ("policy-lock q20/40/60/80" if POLICY_LOCK_QUANTILES
                      else f"tuned (step={COARSE_STEP})"))
    metric_txt = "Macro-F1" if OPTIMIZE_FOR == "f1" else "-ExpectedCost (‡∏¢‡∏¥‡πà‡∏á‡∏°‡∏≤‡∏Å‡∏¢‡∏¥‡πà‡∏á‡∏î‡∏µ)"
    print(f"üß™ Nested-CV {metric_txt} (threshold mode: {mode_txt}, min_gap={MIN_GAP:.02f}): {nested_cv_score:.4f}")

# =============================
# Bootstrap Stability (‡πÄ‡∏ä‡πá‡∏Ñ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏ñ‡∏µ‡∏¢‡∏£‡∏Ç‡∏≠‡∏á thresholds/‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô)
# =============================
def bootstrap_threshold_stability(X01, mask_dmg, groups, thresholds, B=200, seed=2025):
    rng = np.random.default_rng(seed)
    groups = pd.Series(groups)
    uniq = groups.unique()
    th_list = []
    counts = []
    for _ in range(B):
        sel_g = rng.choice(uniq, size=len(uniq), replace=True)
        idx = groups.isin(sel_g).values
        Xb = X01[idx]; mb = mask_dmg.values[idx]
        if mb.any():
            qb = np.quantile(Xb[mb], [0.2,0.4,0.6,0.8]) if not USE_TARGET_PREVALENCE else \
                 np.quantile(Xb[mb], np.cumsum(TARGET_PREVALENCE[:-1] / TARGET_PREVALENCE.sum()))
            qb = sanitize_thresholds(qb)
        else:
            qb = sanitize_thresholds(thresholds)
        th_list.append(qb)
        # class counts under this qb
        lab = pd.Series("‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢", index=np.arange(Xb.shape[0]), dtype=object)
        lab.loc[mb] = bin_by_thresholds(Xb[mb], thresholds=qb, labels=labels_5).astype(str)
        counts.append(lab.value_counts().reindex(["‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢"]+labels_5, fill_value=0).to_dict())
    th_arr = np.array(th_list)
    return th_arr.mean(0), th_arr.std(0), counts

th_mean, th_std, boot_counts = bootstrap_threshold_stability(
    total_all_norm, mask_has_damage, groups_all, THRESHOLDS, B=200, seed=2025
)
print(f"üîÅ Bootstrap thresholds mean: {np.round(th_mean, 4).tolist()} | std: {np.round(th_std, 4).tolist()}")

# =============================
# Apply weights ‚Üí Final Score & Class (‡πÉ‡∏ä‡πâ thresholds ‡∏ó‡∏µ‡πà‡∏™‡∏£‡∏∏‡∏õ‡πÑ‡∏î‡πâ)
# =============================
df["TOTAL_DMG_FINAL"] = TOTAL_all
df["TOTAL_DMG_FINAL_NORM"] = np.nan_to_num(total_all_norm, nan=0.0, posinf=1.0, neginf=0.0)

df["DMG_LEVEL_FINAL"] = None
df.loc[~mask_has_damage, "DMG_LEVEL_FINAL"] = "‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢"
df.loc[mask_has_damage, "DMG_LEVEL_FINAL"] = bin_by_thresholds(
    df.loc[mask_has_damage, "TOTAL_DMG_FINAL_NORM"].to_numpy(),
    thresholds=THRESHOLDS, labels=labels_5
).astype(str)

df["DMG_CODE"] = df["DMG_LEVEL_FINAL"].map(code_map).astype("Int64")

print("\n‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢ (‡∏´‡∏•‡∏±‡∏á‡∏ï‡∏±‡πâ‡∏á‡πÄ‡∏Å‡∏ì‡∏ë‡πå):")
dist_final = df["DMG_LEVEL_FINAL"].value_counts(dropna=False)
print(dist_final)

# =============================
# ‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏£‡∏ß‡∏° (‡∏≠‡∏¥‡∏á label seed y6)
# =============================
le_final = LabelEncoder().fit(labels_6_order)
y_true_enc = le_final.transform(y6.astype(str))

pred_labels = pd.Series("‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢", index=df.index, dtype=object)
pred_labels.loc[mask_has_damage] = bin_by_thresholds(
    df.loc[mask_has_damage, "TOTAL_DMG_FINAL_NORM"].to_numpy(),
    thresholds=THRESHOLDS, labels=labels_5
).astype(str)
y_pred_enc = le_final.transform(pred_labels.astype(str))

cls_report = classification_report(
    y_true_enc, y_pred_enc, target_names=labels_6_order, zero_division=0, output_dict=True
)
report_df = pd.DataFrame(cls_report).transpose()
# ‡πÄ‡∏õ‡∏•‡∏µ‡πà‡∏¢‡∏ô‡∏ä‡∏∑‡πà‡∏≠‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡πÄ‡∏õ‡πá‡∏ô‡πÑ‡∏ó‡∏¢
report_df = report_df.rename(columns={
    "precision": "‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥ (Precision)",
    "recall": "‡∏Å‡∏≤‡∏£‡∏Ñ‡∏£‡∏≠‡∏ö‡∏Ñ‡∏•‡∏∏‡∏° (Recall)",
    "f1-score": "F1-Score",
    "support": "‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á",
})

cm = confusion_matrix(
    y_true_enc, y_pred_enc,
    labels=list(range(len(labels_6_order)))
)
cm_df = pd.DataFrame(cm, index=labels_6_order, columns=labels_6_order)

# =============================
# ‡∏ï‡∏≤‡∏£‡∏≤‡∏á‡πÄ‡∏™‡∏£‡∏¥‡∏°: ‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å, ‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå, ‡πÄ‡∏°‡∏ï‡∏≤, Stability ‡∏™‡∏£‡∏∏‡∏õ
# =============================
weights_table = pd.DataFrame({"‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå": base_cols, "‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å": final_weights6})
norm_params = pd.DataFrame({
    "‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå": list(norm_q95.keys()),
    "‡∏Ñ‡πà‡∏≤ q95 ‡∏ó‡∏µ‡πà‡πÉ‡∏ä‡πâ": list(norm_q95.values()),
    "‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏´‡∏ï‡∏∏": ["‡∏ï‡∏±‡∏î‡∏ö‡∏ô‡∏ó‡∏µ‡πà q95 (‡∏à‡∏≤‡∏Å‡∏ù‡∏±‡πà‡∏á‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢) ‡πÅ‡∏•‡πâ‡∏ß‡∏´‡∏≤‡∏£‡∏î‡πâ‡∏ß‡∏¢ q95"] * len(norm_q95)
})

# ‡πÅ‡∏û‡πá‡∏Ñ‡∏£‡∏≤‡∏¢‡∏•‡∏∞‡πÄ‡∏≠‡∏µ‡∏¢‡∏î stability ‡∏Ç‡∏≠‡∏á‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏ä‡∏±‡πâ‡∏ô‡∏à‡∏≤‡∏Å bootstrap (‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢)
def summarize_boot_counts(boot_counts_list):
    keys = ["‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢"] + labels_5
    arr = np.array([[d[k] for k in keys] for d in boot_counts_list], dtype=float)
    mean = arr.mean(0); std = arr.std(0)
    total = mean.sum()
    pct = mean / total * 100.0
    return pd.DataFrame({
        "‡∏ä‡∏±‡πâ‡∏ô": keys,
        "‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ (bootstrap)": np.round(mean, 2),
        "‡∏™‡πà‡∏ß‡∏ô‡πÄ‡∏ö‡∏µ‡πà‡∏¢‡∏á‡πÄ‡∏ö‡∏ô‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô": np.round(std, 2),
        "‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ (%)": np.round(pct, 2),
    })

stability_df = summarize_boot_counts(boot_counts)
th_stability_df = pd.DataFrame({
    "Threshold": ["t1","t2","t3","t4"],
    "‡∏Ñ‡πà‡∏≤‡πÄ‡∏â‡∏•‡∏µ‡πà‡∏¢ (bootstrap)": np.round(th_mean, 6),
    "‡∏™‡πà‡∏ß‡∏ô‡πÄ‡∏ö‡∏µ‡πà‡∏¢‡∏á‡πÄ‡∏ö‡∏ô‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô": np.round(th_std, 6),
})

# ‡∏™‡∏£‡πâ‡∏≤‡∏á meta
meta = pd.DataFrame({
    "‡∏ß‡∏¥‡∏ò‡∏µ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (6 ‡∏Ñ‡∏•‡∏≤‡∏™)": [best_method6],
    "‡πÇ‡∏´‡∏°‡∏î Threshold": [("policy-lock + target-quantile" if (POLICY_LOCK_QUANTILES and USE_TARGET_PREVALENCE)
                    else ("policy-lock q20/40/60/80" if POLICY_LOCK_QUANTILES else f"tuned (step={COARSE_STEP})"))],
    "‡πÄ‡∏Å‡∏ì‡∏ë‡πå Thresholds": [str(THRESHOLDS)],
    "‡∏ï‡∏±‡∏ß‡∏ä‡∏µ‡πâ‡∏ß‡∏±‡∏î‡∏´‡∏•‡∏±‡∏Å": [("Macro-F1" if OPTIMIZE_FOR == "f1" else "Expected Cost (‡∏¢‡∏¥‡πà‡∏á‡∏ô‡πâ‡∏≠‡∏¢‡∏¢‡∏¥‡πà‡∏á‡∏î‡∏µ)")],
    "‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô CV Macro-F1 (6 ‡∏Ñ‡∏•‡∏≤‡∏™)": [float(cv_df6.iloc[0, 0])],
    "‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô Nested-CV ‡∏´‡∏•‡∏±‡∏Å": [nested_cv_score if nested_cv_score is not None else np.nan],
    "‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏ñ‡∏ß‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î": [len(df)],
    "‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢": [int(mask_has_damage.sum())],
    "‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏ñ‡∏ß‡∏ó‡∏µ‡πà‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢": [int((~mask_has_damage).sum())],
    "‡∏Ñ‡πà‡∏≤ Random Seed": [RANDOM_SEED],
    "‡∏ö‡∏±‡∏ô‡∏ó‡∏∂‡∏Å": [
        ("‡∏≠‡πâ‡∏≤‡∏á‡∏≠‡∏¥‡∏á Nested-CV ‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡∏±‡∏ß‡∏ï‡∏±‡∏î‡∏™‡∏¥‡∏ô‡πÉ‡∏à‡∏´‡∏•‡∏±‡∏Å; "
         + ("‡∏Ñ‡∏∏‡∏°‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô‡∏î‡πâ‡∏ß‡∏¢ target-quantile" if USE_TARGET_PREVALENCE else "‡πÉ‡∏ä‡πâ‡∏Ñ‡∏ß‡∏≠‡∏ô‡πÑ‡∏ó‡∏•‡πå 20/40/60/80")
         + f"; min_gap={MIN_GAP:.02f}; optimize_for={OPTIMIZE_FOR}")
    ],
})

# =============================
# Export ‚Üí Excel (‡∏ä‡∏∑‡πà‡∏≠‡πÑ‡∏ó‡∏¢)
# =============================
df_out = df.rename(columns={
    "TOTAL_DMG_FINAL": "‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏£‡∏ß‡∏°‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢",
    "TOTAL_DMG_FINAL_NORM": "‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô‡∏£‡∏ß‡∏°‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢(‡∏õ‡∏£‡∏±‡∏ö‡∏™‡πÄ‡∏Å‡∏•)",
    "DMG_LEVEL_FINAL": "‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢(‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢)",
    "DMG_CODE": "‡∏£‡∏´‡∏±‡∏™‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢",
})

with pd.ExcelWriter(out_path) as writer:
    df_out.to_excel(writer, sheet_name="‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•", index=False)
    weights_table.to_excel(writer, sheet_name="‡∏ô‡πâ‡∏≥‡∏´‡∏ô‡∏±‡∏Å‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢", index=False)
    cv_df6.to_excel(writer, sheet_name="‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ôCV-6‡∏Ñ‡∏•‡∏≤‡∏™")
    norm_params.to_excel(writer, sheet_name="‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏Å‡∏≤‡∏£‡∏ó‡∏≥‡∏°‡∏≤‡∏ï‡∏£‡∏ê‡∏≤‡∏ô", index=False)
    meta.to_excel(writer, sheet_name="‡πÄ‡∏°‡∏ï‡∏≤‡∏î‡∏≤‡∏ï‡πâ‡∏≤", index=False)
    report_df.to_excel(writer, sheet_name="‡∏£‡∏≤‡∏¢‡∏á‡∏≤‡∏ô‡∏Å‡∏≤‡∏£‡∏à‡∏≥‡πÅ‡∏ô‡∏Å")
    cm_df.to_excel(writer, sheet_name="‡πÄ‡∏°‡∏ó‡∏£‡∏¥‡∏Å‡∏ã‡πå‡∏™‡∏±‡∏ö‡∏™‡∏ô")
    stability_df.to_excel(writer, sheet_name="Bootstrap-‡∏ä‡∏±‡πâ‡∏ô", index=False)
    th_stability_df.to_excel(writer, sheet_name="Bootstrap-Thresholds", index=False)



print(f"\nüìÅ ‡πÄ‡∏Ç‡∏µ‡∏¢‡∏ô‡πÑ‡∏ü‡∏•‡πå‡∏´‡∏•‡∏≤‡∏¢‡∏ä‡∏µ‡∏ï‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢ (‡∏ä‡∏∑‡πà‡∏≠‡∏ä‡∏µ‡∏ï/‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢): {out_path}")


‚úÖ ‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢: 2248 ‡πÅ‡∏ñ‡∏ß | ‡πÑ‡∏°‡πà‡∏°‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢: 4317 ‡πÅ‡∏ñ‡∏ß
üìä ‡∏Ñ‡∏∞‡πÅ‡∏ô‡∏ô CV Macro-F1 (6 ‡∏Ñ‡∏•‡∏≤‡∏™, group-aware):
              Macro-F1 (6 ‡∏Ñ‡∏•‡∏≤‡∏™)
XGB                    0.946563
RF                     0.863923
Ridge                  0.830022
MI                     0.670828
‡∏ú‡∏π‡πâ‡πÄ‡∏ä‡∏µ‡πà‡∏¢‡∏ß‡∏ä‡∏≤‡∏ç           0.614804

‚úÖ ‡∏ß‡∏¥‡∏ò‡∏µ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (6 ‡∏Ñ‡∏•‡∏≤‡∏™): XGB
üîí ‡πÄ‡∏Å‡∏ì‡∏ë‡πå(Thresholds) policy-lock: [0.20724839898564215, 0.3258889165172987, 0.44001517176695937, 0.6472635707526015] | CV Macro-F1(6 ‡∏Ñ‡∏•‡∏≤‡∏™) = 0.9706
üß™ Nested-CV Macro-F1 (threshold mode: policy-lock + target-quantile, min_gap=0.08): 0.6952
üîÅ Bootstrap thresholds mean: [0.2055, 0.3251, 0.4335, 0.6487] | std: [0.0022, 0.0017, 0.0173, 0.0209]

‡∏Å‡∏≤‡∏£‡∏Å‡∏£‡∏∞‡∏à‡∏≤‡∏¢‡∏£‡∏∞‡∏î‡∏±‡∏ö‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡∏¢‡∏´‡∏≤‡∏¢ (‡∏´‡∏•‡∏±‡∏á‡∏ï‡∏±‡πâ‡∏á‡πÄ‡∏Å‡∏ì‡∏ë‡πå):
DMG_LEVEL_FINAL
‡πÑ‡∏°‡πà