## XGBoost

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# =====================================================================
# XGBOOST — PRECISION-CONTROLLED v72s4 (GPU) con BIG-MODE (TRAIN/TEST)
#   • TRAIN big: streaming + filtro (positivos, cercanos, lejanos muestreados)
#   • TEST 2024 streaming: sin rolling (delta, cummax, age_days)
#   • SSD (RAM): rolling + normalización por modelo + categóricas nativas
#   • HDD (BIG): sin rolling + categóricos mapeados a códigos estables
# =====================================================================

import os, gc, json, warnings, re
from datetime import datetime
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_recall_curve, average_precision_score, precision_score,
    recall_score, f1_score, confusion_matrix
)
import joblib
import xgboost as xgb

warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None


# ===================== UTILS =====================

def cleanup():
    """Trigger a garbage-collection pass; nothing is returned to the caller."""
    gc.collect()
    return None

def downcast_df(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast the numeric columns of ``df`` in place and return the same frame.

    Float columns go through ``pd.to_numeric(..., downcast="float")`` and
    integer columns through ``downcast="integer"``; every other column is
    left untouched.
    """
    is_float = pd.api.types.is_float_dtype
    is_integer = pd.api.types.is_integer_dtype
    for name in df.columns:
        if is_float(df[name]):
            target = "float"
        elif is_integer(df[name]):
            target = "integer"
        else:
            continue
        df[name] = pd.to_numeric(df[name], downcast=target)
    return df


# ===================== IO Helpers (streaming parquet) =====================

def _list_parquet_files(path: str) -> List[str]:
    """Return the parquet files to read for ``path``.

    If ``path`` is a directory, every entry whose name ends with ``.parquet``
    is returned (joined with the directory), sorted by path. Otherwise
    ``path`` itself is returned as a single-element list.

    ``os.listdir`` yields entries in arbitrary order, so the result is sorted
    to make the streaming order reproducible across runs and machines.
    """
    if os.path.isdir(path):
        return sorted(
            os.path.join(path, name)
            for name in os.listdir(path)
            if name.endswith(".parquet")
        )
    return [path]

def _peek_row_group(path: str) -> pd.DataFrame:
    """Return up to 100 rows from row group 0 of the first file that has row groups.

    Files are taken in the order given by ``_list_parquet_files``. An empty
    DataFrame is returned when no file contains any row group.
    """
    for file_path in _list_parquet_files(path):
        parquet_file = pq.ParquetFile(file_path)
        if parquet_file.num_row_groups <= 0:
            continue
        first_group = parquet_file.read_row_group(0, columns=None)
        return first_group.to_pandas().head(100)
    return pd.DataFrame()

def discover_smart_columns(path: str) -> List[str]:
    """Return the sorted column names that contain "smart" and end with "_raw".

    The "smart" test is case-insensitive; the "_raw" suffix test is
    case-sensitive. Column names come from a small sample read by
    ``_peek_row_group``; an empty sample yields an empty list.
    """
    sample = _peek_row_group(path)
    if sample.empty:
        return []
    smart_raw = []
    for name in sample.columns.tolist():
        if "smart" in name.lower() and name.endswith("_raw"):
            smart_raw.append(name)
    smart_raw.sort()
    return smart_raw

def _iter_parquet_row_groups(path: str, years: List[int], columns: Optional[List[str]] = None):
    """Stream the parquet data at ``path`` one row group at a time.

    For every row group, the ``date`` column is parsed (unparseable values
    become NaT) and only rows whose year is in ``years`` are kept. Row groups
    with at least one matching row are yielded as a downcast DataFrame with a
    fresh index. Row groups that do not contain a ``date`` column are skipped.
    """
    for file_path in _list_parquet_files(path):
        parquet_file = pq.ParquetFile(file_path)
        for group_idx in range(parquet_file.num_row_groups):
            if columns:
                tb = parquet_file.read_row_group(group_idx, columns=columns)
            else:
                tb = parquet_file.read_row_group(group_idx)
            df = tb.to_pandas()
            if "date" not in df.columns:
                continue
            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            in_years = df["date"].dt.year.isin(years)
            if in_years.any():
                selected = df.loc[in_years].reset_index(drop=True)
                yield downcast_df(selected)
            # Release the raw row group before reading the next one.
            del df, tb
            cleanup()

def load_data(path: str, years: List[int], columns: Optional[List[str]] = None) -> pd.DataFrame:
    """Load every row for ``years`` from ``path`` into a single DataFrame.

    Row groups are streamed and buffered; whenever 16 pieces have piled up
    they are merged into one frame to bound the number of live chunks.
    Returns an empty DataFrame when nothing matches.
    """
    print(f"Loading {years} from {path}...")
    pending = []
    for piece in _iter_parquet_row_groups(path, years, columns=columns):
        pending.append(piece)
        if len(pending) < 16:
            continue
        pending = [pd.concat(pending, ignore_index=True)]
        cleanup()
    if pending:
        out = pd.concat(pending, ignore_index=True)
    else:
        out = pd.DataFrame()
    print(f"Loaded {len(out):,} rows")
    return out


# ===================== BIG MODE (TRAIN y TEST) =====================

def scan_fail_dates(path: str, years: List[int]) -> Dict[str, pd.Timestamp]:
    """First pass: earliest failure date per serial number, computed in streaming.

    Only rows with ``failure == 1`` are considered; rows whose date cannot be
    parsed are ignored. Serial numbers are keyed as strings.
    """
    print("Scanning earliest failure dates (streaming)...")
    fail_map: Dict[str, pd.Timestamp] = {}
    needed = ["serial_number", "date", "failure"]
    for chunk in _iter_parquet_row_groups(path, years, columns=needed):
        failed = chunk[chunk["failure"] == 1]
        if failed.empty:
            continue
        serial_values = failed["serial_number"].astype(str).values
        fail_dates = pd.to_datetime(failed["date"], errors="coerce")
        for serial, when in zip(serial_values, fail_dates):
            if pd.isna(when):
                continue
            known = fail_map.get(serial)
            if known is None or when < known:
                fail_map[serial] = when
    print(f"  Found {len(fail_map):,} failed serials")
    return fail_map


def load_data_big_filtered(
    path: str,
    years: List[int],
    fail_map: Dict[str, pd.Timestamp],
    lookahead_days: int,
    hard_window: int,
    neg_random_keep_rate: float = 0.0025,
    columns: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Stream the TRAIN data and keep only the rows worth training on.

    Kept rows are:
      * positives: rows with ``failure == 1`` or within ``lookahead_days``
        before the serial's failure date in ``fail_map``;
      * near ("hard") negatives: more than ``lookahead_days`` and at most
        ``hard_window`` days before that failure date;
      * a random sample (probability ``neg_random_keep_rate``, fixed seed 42)
        of the remaining non-failure rows.

    Serials absent from ``fail_map`` (or mapped to NaT) get a sentinel
    distance of 10**9 days, so they can only be kept through random sampling.

    Returns the concatenation of all kept rows, or an empty DataFrame when no
    row group was read.
    """
    print(f"Loading BIG-FILTERED {years} from {path} (keep_rate={neg_random_keep_rate})...")
    rng = np.random.default_rng(42)
    kept = []
    pending_rows = 0   # rows appended since the last consolidation
    total_rows = 0
    kept_rows = 0

    for df in _iter_parquet_row_groups(path, years, columns=columns):
        total_rows += len(df)
        sn_ser = df["serial_number"].astype(str)
        dt = pd.to_datetime(df["date"], errors="coerce").to_numpy(dtype="datetime64[D]")

        # Days-to-failure from fail_map; rows without a known failure keep the sentinel.
        fdt = pd.to_datetime(sn_ser.map(fail_map), errors="coerce").to_numpy(dtype="datetime64[D]")
        dtf = np.full(len(df), 10**9, dtype=np.int64)
        valid = ~np.isnat(fdt)
        if valid.any():
            dd = (fdt[valid] - dt[valid]).astype("timedelta64[D]").astype("int64")
            dtf[valid] = dd

        failure = (df["failure"].values == 1)
        pos_mask = failure | ((dtf >= 0) & (dtf <= lookahead_days))
        near_mask = (dtf > lookahead_days) & (dtf <= hard_window)

        keep = pos_mask | near_mask
        far_neg = (~keep) & (~failure)
        if far_neg.any():
            sample = rng.random(far_neg.sum()) < neg_random_keep_rate
            sel = np.zeros_like(far_neg, dtype=bool)
            sel[np.where(far_neg)[0]] = sample
            keep = keep | sel

        n_keep = int(keep.sum())
        kept.append(df.loc[keep].reset_index(drop=True))
        kept_rows += n_keep
        pending_rows += n_keep

        # Consolidate only after 3M *new* rows. Checking the total buffered
        # size would re-concatenate the whole buffer on every row group once
        # the total first exceeds the limit.
        if pending_rows > 3_000_000:
            kept = [pd.concat(kept, ignore_index=True)]
            pending_rows = 0
            cleanup()

    out = pd.concat(kept, ignore_index=True) if kept else pd.DataFrame()
    rate = 100 * kept_rows / max(1, total_rows)
    print(f"  BIG-FILTERED kept {kept_rows:,}/{total_rows:,} rows (~{rate:.2f}%)")
    return out


# ===================== PREP & LABELS =====================

def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` sorted by ``serial_number`` then ``date``.

    Steps:
      1. parse ``date`` (unparseable values become NaT);
      2. drop rows with a missing ``serial_number`` or ``date``;
      3. cast ``serial_number`` to ``str``;
      4. sort and reset the index.

    Missing serials are dropped *before* the string cast: casting first turns
    them into the literal text "nan"/"None", which ``dropna`` cannot see, and
    all such rows would then be grouped together as one disk.
    """
    out = df.copy()
    out["date"] = pd.to_datetime(out["date"], errors="coerce")
    out = out.dropna(subset=["serial_number", "date"])
    out = out.assign(serial_number=out["serial_number"].astype(str))
    return out.sort_values(["serial_number", "date"]).reset_index(drop=True)

def compute_days_to_failure(dfs: pd.DataFrame) -> np.ndarray:
    """Return, for every row, the number of days until its serial's first failure.

    The first failure date of a serial is the minimum ``date`` among its rows
    with ``failure == 1``. The result is an ``int64`` array aligned with the
    rows of ``dfs``: ``first_failure_date - date`` in whole days (negative for
    rows after the failure). Rows whose serial never fails, or whose distance
    cannot be computed (NaT date), get the sentinel ``10**9``.

    Vectorized with groupby/map rather than a per-row Python loop.
    """
    dtf = np.full(len(dfs), 10**9, dtype=np.int64)
    fails = dfs.loc[dfs["failure"] == 1, ["serial_number", "date"]]
    if len(dfs) == 0 or fails.empty:
        return dtf

    first_fail = fails.groupby("serial_number", sort=False)["date"].min()
    fail_dates = pd.to_datetime(dfs["serial_number"].map(first_fail), errors="coerce")
    gap_days = (fail_dates - pd.to_datetime(dfs["date"], errors="coerce")).dt.days.to_numpy()

    # ``gap_days`` is float with NaN where either date is missing; keep the sentinel there.
    known = ~pd.isna(gap_days)
    dtf[known] = gap_days[known].astype(np.int64)
    return dtf

def create_labels_from_dtf(dtf: np.ndarray, lookahead: int = 7) -> np.ndarray:
    """Label a row 1 when its days-to-failure lies in ``[0, lookahead]``, else 0 (int8)."""
    not_past_failure = dtf >= 0
    inside_horizon = dtf <= lookahead
    return np.logical_and(not_past_failure, inside_horizon).astype(np.int8)


# ===================== CATEGÓRICOS (map a códigos para BIG) =====================

def extract_vendor(series: pd.Series) -> pd.Series:
    """Take the leading run of ASCII letters of each (stripped) value as the vendor.

    Values that do not start with a letter map to "UNK".
    """
    cleaned = series.astype(str).str.strip()
    leading_letters = cleaned.str.extract(r"^([A-Za-z]+)", expand=False)
    return leading_letters.fillna("UNK")

def fit_category_maps(df: pd.DataFrame) -> Dict[str, Dict[str, int]]:
    """Build value->code dictionaries for ``model`` and the vendor derived from it.

    Codes are assigned in order of first appearance in ``df``. Returns an
    empty dict when ``df`` has no ``model`` column.
    """
    if "model" not in df.columns:
        return {}
    model_names = pd.Index(df["model"].astype(str).unique())
    vendor_names = pd.Index(extract_vendor(df["model"]).unique())
    return {
        "model": {name: code for code, name in enumerate(model_names)},
        "vendor": {name: code for code, name in enumerate(vendor_names)},
    }

def apply_category_maps(df: pd.DataFrame, maps: Optional[Dict[str, Dict[str, int]]]) -> Tuple[pd.Series, pd.Series]:
    """Translate ``model`` and its vendor into integer codes using fitted maps.

    Values missing from the maps are coded as -1. When ``maps`` is empty or
    ``None``, lacks the "model"/"vendor" entries, or ``df`` has no ``model``
    column, two all-zero int32 Series indexed like ``df`` are returned.
    """
    usable = bool(maps) and "model" in df.columns and "model" in maps and "vendor" in maps
    if not usable:
        zero_model = pd.Series(np.zeros(len(df), dtype=np.int32), index=df.index)
        zero_vendor = pd.Series(np.zeros(len(df), dtype=np.int32), index=df.index)
        return zero_model, zero_vendor

    model_names = df["model"].astype(str)
    vendor_names = extract_vendor(model_names)
    model_codes = model_names.map(maps["model"]).fillna(-1).astype(int)
    vendor_codes = vendor_names.map(maps["vendor"]).fillna(-1).astype(int)
    return model_codes, vendor_codes


# ===================== FEATURES: RAM (SSD) vs BIG (HDD) =====================

def create_features_joined_ram_v72(
    df_train: pd.DataFrame, df_dev: pd.DataFrame, df_test: pd.DataFrame,
    add_rolling: bool
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, List[str], List[str]]:
    """Build in-memory (RAM / SSD) features over TRAIN ∪ DEV ∪ TEST.

    The three frames are concatenated and sorted by serial and date so that
    per-disk features see the full history, then split back by subset.

    Features produced:
      * the SMART ``*_raw`` columns (non-numeric / missing -> 0.0);
      * ``delta_<col>`` (per-disk diff, first row 0.0) and ``max_<col>``
        (per-disk running max);
      * when ``add_rolling``: ``rmean7_<col>`` / ``rstd7_<col>`` over a 7-row
        window (min 2 observations, otherwise 0.0);
      * ``age_days`` (row counter per disk), ``month``, ``day_of_week``;
      * native categoricals ``model`` and ``vendor`` (``category`` dtype),
        intended for XGBoost with ``enable_categorical=True``.

    Numeric columns are cast to float32 and zero-variance numeric columns
    (measured over the union) are removed.

    Returns ``(X_train, X_dev, X_test, feature_names, cat_cols)``. Rows of
    each ``X_*`` follow the (serial, date) sort order of the union.
    """
    df_train = df_train.copy(); df_train["__subset__"] = "train"
    df_dev   = df_dev.copy();   df_dev["__subset__"] = "dev"
    df_test  = df_test.copy();  df_test["__subset__"] = "test"
    df_all = pd.concat([df_train, df_dev, df_test], ignore_index=True)
    df_all.sort_values(["serial_number", "date"], inplace=True)

    all_cols = df_all.columns.tolist()
    smart_cols = [c for c in all_cols if ("smart" in c.lower()) and ("_raw" in c.lower())]
    for c in smart_cols:
        df_all[c] = pd.to_numeric(df_all[c], errors="coerce").fillna(0.0)

    for c in smart_cols:
        g = df_all.groupby("serial_number", sort=False)[c]
        df_all[f"delta_{c}"] = g.diff().fillna(0.0)
        df_all[f"max_{c}"]   = g.cummax()

    if add_rolling:
        for c in smart_cols:
            # Build the rolling window once and reuse it for both statistics.
            roll = df_all.groupby("serial_number", sort=False)[c].rolling(window=7, min_periods=2)
            df_all[f"rmean7_{c}"] = roll.mean().reset_index(level=0, drop=True).fillna(0.0)
            df_all[f"rstd7_{c}"]  = roll.std().reset_index(level=0, drop=True).fillna(0.0)

    df_all["age_days"] = df_all.groupby("serial_number", sort=False).cumcount()
    d = df_all["date"]
    df_all["month"] = d.dt.month
    df_all["day_of_week"] = d.dt.dayofweek

    # Native categoricals. Missing models must be replaced *before/independently
    # of* the string cast: ``astype(str)`` turns NaN into the text "nan", after
    # which ``fillna`` has nothing left to fill.
    if "model" not in df_all.columns:
        df_all["model"] = "UNK"
    model_raw = df_all["model"]
    df_all["model"] = model_raw.astype(str).where(model_raw.notna(), "UNK")
    df_all["vendor"] = extract_vendor(df_all["model"]).astype(str)

    # Assemble X. The subset tag is dropped explicitly instead of relying on
    # the zero-variance filter to remove it after a numeric coercion.
    drop_cols = ["serial_number", "date", "failure", "__subset__"]
    X_all = df_all.drop(columns=[c for c in drop_cols if c in df_all.columns], errors="ignore")

    # Dtypes: numeric -> float32, categorical -> category
    cat_cols = [c for c in ["model", "vendor"] if c in X_all.columns]
    for c in X_all.columns:
        if c in cat_cols:
            X_all[c] = X_all[c].astype("category")
        else:
            X_all[c] = pd.to_numeric(X_all[c], errors="coerce").fillna(0.0).astype(np.float32)

    # Remove constant numeric columns
    num_cols = [c for c in X_all.columns if c not in cat_cols]
    var = X_all[num_cols].var()
    keep_num = var[var > 0].index.tolist()
    X_all = pd.concat([X_all[cat_cols], X_all[keep_num]], axis=1)

    # X_all and df_all share the same index, so the subset masks align.
    X_tr = X_all[df_all["__subset__"] == "train"].reset_index(drop=True)
    X_dev = X_all[df_all["__subset__"] == "dev"].reset_index(drop=True)
    X_te = X_all[df_all["__subset__"] == "test"].reset_index(drop=True)

    feature_names = list(X_all.columns)
    return X_tr, X_dev, X_te, feature_names, cat_cols


def create_features_joined_big_codes(
    df_train: pd.DataFrame, df_dev: pd.DataFrame, df_test: pd.DataFrame,
    add_rolling: bool, fit_cats_on_train: bool = True, category_maps: Optional[Dict[str, Dict[str, int]]] = None
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, List[str], Dict[str, Dict[str, int]]]:
    """Build BIG-mode (HDD) features over TRAIN ∪ DEV ∪ TEST without rolling stats.

    Produces the SMART ``*_raw`` columns, per-disk ``delta_*`` / ``max_*``,
    ``age_days``, ``month``, ``day_of_week`` and integer-coded categoricals
    ``model_code`` / ``vendor_code``. All columns are float32 and columns with
    zero variance over the union are removed.

    Category maps are fitted on ``df_train`` when ``fit_cats_on_train`` is
    true; otherwise ``category_maps`` (or an empty dict) is used.
    ``add_rolling`` is accepted but not read by this function.

    Returns ``(X_train, X_dev, X_test, feature_names, cat_maps)``.
    """
    tagged = []
    for frame, tag in ((df_train, "train"), (df_dev, "dev"), (df_test, "test")):
        part = frame.copy()
        part["__subset__"] = tag
        tagged.append(part)
    train_part = tagged[0]

    df_all = pd.concat(tagged, ignore_index=True)
    df_all.sort_values(["serial_number", "date"], inplace=True)

    smart_cols = [
        name for name in df_all.columns.tolist()
        if ("smart" in name.lower()) and ("_raw" in name.lower())
    ]
    for name in smart_cols:
        df_all[name] = pd.to_numeric(df_all[name], errors="coerce").fillna(0.0)

    # Per-disk deltas and running maxima (BIG mode never adds rolling stats).
    for name in smart_cols:
        per_disk = df_all.groupby("serial_number", sort=False)[name]
        df_all[f"delta_{name}"] = per_disk.diff().fillna(0.0)
        df_all[f"max_{name}"] = per_disk.cummax()

    df_all["age_days"] = df_all.groupby("serial_number", sort=False).cumcount()
    dates = df_all["date"]
    df_all["month"] = dates.dt.month
    df_all["day_of_week"] = dates.dt.dayofweek

    # Categoricals -> integer codes
    cat_maps = fit_category_maps(train_part) if fit_cats_on_train else (category_maps or {})
    model_code, vendor_code = apply_category_maps(df_all, cat_maps)
    df_all["model_code"] = model_code
    df_all["vendor_code"] = vendor_code

    to_drop = [c for c in ["serial_number", "date", "failure", "model"] if c in df_all.columns]
    X_all = df_all.drop(columns=to_drop, errors="ignore")

    for name in X_all.columns:
        X_all[name] = pd.to_numeric(X_all[name], errors="coerce").fillna(0.0).astype(np.float32)

    # Drop constant columns. The non-numeric subset tag coerces to all zeros
    # above, so it is removed here as well.
    variances = X_all.var()
    varying = variances[variances > 0].index.tolist()
    X_all = X_all[varying].astype(np.float32)

    subset = df_all["__subset__"]
    X_tr = X_all[subset == "train"].reset_index(drop=True)
    X_dev = X_all[subset == "dev"].reset_index(drop=True)
    X_te = X_all[subset == "test"].reset_index(drop=True)

    feature_names = list(X_all.columns)
    return X_tr, X_dev, X_te, feature_names, cat_maps


# ========== STREAMING FEATURES (TEST BIG MODE, sin rolling) ==========

class StreamDeltaCummaxBuilderXGB:
    """Streaming feature builder for TEST in BIG mode, with fixed category maps.

    Output columns, in order: the SMART columns, ``delta_*``, ``max_*``,
    ``age_days``, ``month``, ``day_of_week``, ``model_code``, ``vendor_code``
    (all float32).

    Per-serial state (last value, running max, row counter) is kept in memory
    between calls, so chunks must be fed in chronological order per serial.

    The first observation of a serial yields ``delta = 0`` and
    ``max = value``, matching the batch features, which use a per-disk
    ``diff().fillna(0)`` and ``cummax()``. Starting the state at 0.0 would
    instead make the first delta equal to the raw value.
    """
    def __init__(self, smart_cols: List[str], cat_maps: Dict[str, Dict[str, int]]):
        self.smart_cols = smart_cols
        self.cat_maps = cat_maps
        self.last_vals: Dict[str, Dict[str, float]] = {}
        self.cummax_vals: Dict[str, Dict[str, float]] = {}
        self.age: Dict[str, int] = {}

    def _ensure_serial(self, sn: str):
        """Create empty state for ``sn`` the first time it is seen."""
        if sn not in self.last_vals:
            self.last_vals[sn] = {c: 0.0 for c in self.smart_cols}
            self.cummax_vals[sn] = {c: 0.0 for c in self.smart_cols}
            self.age[sn] = 0

    def _advance(self, sn: str, values) -> Tuple[int, List[float], List[float]]:
        """Consume one observation of ``sn`` and update its state.

        ``values`` holds one number per entry of ``smart_cols``, in the same
        order. Returns ``(age, deltas, running_maxes)`` for this observation.
        """
        self._ensure_serial(sn)
        age = self.age[sn]
        first_seen = age == 0   # no previous observation to diff against
        self.age[sn] = age + 1

        last = self.last_vals[sn]
        peak = self.cummax_vals[sn]
        deltas: List[float] = []
        maxes: List[float] = []
        for c, raw in zip(self.smart_cols, values):
            v = float(raw)
            if first_seen:
                deltas.append(0.0)
                peak[c] = v
            else:
                deltas.append(v - last[c])
                peak[c] = max(peak[c], v)
            last[c] = v
            maxes.append(peak[c])
        return age, deltas, maxes

    def transform_chunk(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the float32 feature frame for one chunk, updating per-serial state.

        The chunk is sorted by serial and date first, so the output rows follow
        that order rather than the input order.
        """
        df = df.sort_values(["serial_number", "date"]).reset_index(drop=True)

        # Categoricals (fixed codes)
        model_code, vendor_code = apply_category_maps(df, self.cat_maps)
        df["model_code"] = model_code
        df["vendor_code"] = vendor_code

        # Calendar
        df["month"] = df["date"].dt.month
        df["day_of_week"] = df["date"].dt.dayofweek

        # Cast SMART columns to numeric; missing / non-numeric -> 0.0
        for c in self.smart_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)

        n_rows = len(df)
        n_feats = len(self.smart_cols)
        # Pull the SMART block out once instead of a per-cell ``df.at`` lookup.
        if n_feats:
            raw = df[self.smart_cols].to_numpy(dtype=np.float64)
        else:
            raw = np.empty((n_rows, 0), dtype=np.float64)
        delta_arr = np.zeros((n_rows, n_feats), dtype=np.float64)
        max_arr = np.zeros((n_rows, n_feats), dtype=np.float64)
        ages = np.zeros(n_rows, dtype=np.int64)

        serials = df["serial_number"].astype(str).values
        for i, sn in enumerate(serials):
            ages[i], delta_arr[i], max_arr[i] = self._advance(sn, raw[i])

        for j, c in enumerate(self.smart_cols):
            df[f"delta_{c}"] = delta_arr[:, j]
            df[f"max_{c}"] = max_arr[:, j]
        df["age_days"] = ages

        feat_cols = (
            self.smart_cols
            + [f"delta_{c}" for c in self.smart_cols]
            + [f"max_{c}" for c in self.smart_cols]
            + ["age_days", "month", "day_of_week", "model_code", "vendor_code"]
        )
        return df[feat_cols].astype(np.float32)


# ===================== CV, SAMPLING, THRESHOLDING =====================

def make_group_folds(serials: pd.Series, y: np.ndarray, n_splits: int = 5, random_state: int = 42):
    """Yield ``(train_rows, valid_rows)`` index arrays with folds split by disk.

    A disk is labelled positive when any of its rows has ``y == 1``; disks are
    then split with a shuffled ``StratifiedKFold`` so that all rows of a disk
    land in the same fold.
    """
    serial_values = serials.astype(str).values
    disks, row_to_disk = np.unique(serial_values, return_inverse=True)

    # Disk-level label: 1 if the disk owns at least one positive row.
    disk_label = np.zeros(len(disks), dtype=np.int8)
    disk_label[row_to_disk[np.asarray(y) == 1]] = 1

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_disks, valid_disks in splitter.split(disks, disk_label):
        train_rows = np.where(np.isin(row_to_disk, train_disks))[0]
        valid_rows = np.where(np.isin(row_to_disk, valid_disks))[0]
        yield train_rows, valid_rows

def sample_negatives_hard(
    X: pd.DataFrame, y: np.ndarray, dtf: np.ndarray, lookahead: int,
    neg_pos_ratio: int = 3, hard_window: int = 60, hard_fraction: float = 0.7,
    seed: int = 42,
):
    """Keep all positives plus a sample of negatives biased towards "hard" ones.

    Hard negatives are rows with ``y == 0`` whose days-to-failure lies in
    ``(lookahead, hard_window]``; every other negative is "easy". The target
    is ``n_pos * neg_pos_ratio`` negatives (at least 1), of which a
    ``hard_fraction`` share is drawn from the hard pool and the remainder
    from the easy pool, each capped by the pool size and drawn without
    replacement.

    Returns ``(X_sampled, y_sampled, selected_row_positions)`` with the
    selected positions sorted ascending.

    Raises:
        ValueError: if ``y`` contains no positive.
    """
    rng = np.random.default_rng(seed)

    positives = np.where(y == 1)[0]
    if len(positives) == 0:
        raise ValueError("No positives in training fold for sampling")

    quota = max(len(positives) * neg_pos_ratio, 1)
    in_hard_window = (dtf > lookahead) & (dtf <= hard_window)
    is_negative = (y == 0)
    hard_pool = np.where(is_negative & in_hard_window)[0]
    easy_pool = np.where(is_negative & (~in_hard_window))[0]

    n_hard = min(int(quota * hard_fraction), len(hard_pool))
    n_easy = min(quota - n_hard, len(easy_pool))

    # Draw order (hard first, then easy) is kept so the RNG stream is unchanged.
    if n_hard > 0:
        picked_hard = rng.choice(hard_pool, size=n_hard, replace=False)
    else:
        picked_hard = np.empty(0, dtype=int)
    if n_easy > 0:
        picked_easy = rng.choice(easy_pool, size=n_easy, replace=False)
    else:
        picked_easy = np.empty(0, dtype=int)

    sel_idx = np.sort(np.concatenate([positives, picked_hard, picked_easy]))
    Xb = X.iloc[sel_idx].reset_index(drop=True)
    yb = y[sel_idx]
    return Xb, yb, sel_idx

def pick_threshold_precision_first(
    y_true: np.ndarray, proba: np.ndarray,
    min_precision: float = 0.90, min_recall: float = 0.03,
    top_k_rate: float = 1e-4, min_alerts: int = 5
):
    """Choose a decision threshold that favours precision.

    Among the points of the precision-recall curve that satisfy both
    ``precision >= min_precision`` and ``recall >= min_recall``, the one with
    the highest curve index is taken (thresholds are returned in increasing
    order by ``precision_recall_curve``). If no point qualifies, the
    threshold falls back to the k-th highest score, with
    ``k = max(min_alerts, 1, int(len(proba) * top_k_rate))``.

    Returns ``(threshold, average_precision)`` as floats.
    """
    precision, recall, thr = precision_recall_curve(y_true, proba)
    pr_auc = average_precision_score(y_true, proba)
    valid = (precision >= min_precision) & (recall >= min_recall)
    if valid.any():
        idxs = np.where(valid)[0]
        # The curve has one more point than thresholds; map the final point
        # back onto the last threshold.
        idx = idxs[-1] - 1 if idxs[-1] >= len(thr) else idxs[-1]
        idx = max(0, min(idx, len(thr)-1))
        chosen = thr[idx]
    else:
        k = max(max(1, min_alerts), int(len(proba) * max(top_k_rate, 1e-6)))
        # With fewer than k scores, np.partition raises on an out-of-range
        # kth; alert on every row instead.
        k = min(k, len(proba))
        chosen = float(np.partition(proba, -k)[-k])
    return float(chosen), float(pr_auc)

def metrics_at_threshold(y_true: np.ndarray, proba: np.ndarray, thr: float) -> Dict:
    """Compute binary metrics for the predictions ``proba >= thr``.

    Returns a dict with ``precision``, ``recall``, ``f1`` (each 0 when
    undefined), ``confusion_matrix`` as ``[[tn, fp], [fn, tp]]`` and ``fpr``
    (0.0 when there are no negatives).
    """
    y_pred = (proba >= thr).astype(int)
    # Force a 2x2 matrix: without explicit labels, a slice where truth and
    # predictions contain a single class yields a 1x1 matrix and the
    # four-way unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        'precision': float(precision_score(y_true, y_pred, zero_division=0)),
        'recall': float(recall_score(y_true, y_pred, zero_division=0)),
        'f1': float(f1_score(y_true, y_pred, zero_division=0)),
        'confusion_matrix': [[int(tn), int(fp)], [int(fn), int(tp)]],
        'fpr': float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0
    }


# ===================== XGBoost GPU (xgb.train + wrapper robusto) =====================

def get_xgb_default_params(
    random_state: int = 42,
    enable_categorical: bool = True,
    max_depth: int = 8,
    n_estimators: int = 1500,
    learning_rate: float = 0.06,
    min_child_weight: float = 8.0,
    subsample: float = 0.8,
    colsample_bytree: float = 0.6,
    reg_lambda: float = 1.0,
    reg_alpha: float = 0.0,
    max_bin: int = 256,
    max_cat_to_onehot: int = 16,
):
    """Return ``(params, n_estimators)`` for GPU training with ``xgb.train``.

    ``params`` combines fixed settings (``gpu_hist`` tree method, GPU
    predictor, logistic objective, ``aucpr`` metric, 4 threads, silent
    logging) with the tunable arguments. ``n_estimators`` is returned
    separately as an ``int`` because it is the boosting-round count rather
    than a booster parameter.
    """
    params = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'objective': 'binary:logistic',
        'eval_metric': 'aucpr',
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha,
        'max_bin': max_bin,
        'enable_categorical': enable_categorical,
        'max_cat_to_onehot': max_cat_to_onehot,
        'random_state': random_state,
        'nthread': 4,
        'verbosity': 0,
    }
    rounds = int(n_estimators)
    return params, rounds

class XGBBoosterWrapper:
    """Thin wrapper around ``xgb.Booster`` exposing ``predict_proba``.

    Prediction works whether or not the booster carries a ``best_iteration``
    (i.e. with or without early stopping). Pickling stores only the raw
    model bytes.
    """
    def __init__(self, booster: xgb.Booster):
        self.booster = booster

    def predict_proba(self, X: pd.DataFrame | np.ndarray) -> np.ndarray:
        """Return an ``(n, 2)`` array of ``[P(class 0), P(class 1)]``."""
        if isinstance(X, pd.DataFrame):
            # Object columns (RAM path) are converted to categoricals on a copy.
            frame = X.copy()
            for name in frame.columns:
                if str(frame[name].dtype) == 'object' and frame[name].nunique() < 10000:
                    frame[name] = frame[name].astype('category')
            d = xgb.DMatrix(frame, enable_categorical=True)
        else:
            d = xgb.DMatrix(X)

        best_it = getattr(self.booster, "best_iteration", None)
        if best_it is None:
            pred = self.booster.predict(d)
        else:
            try:
                pred = self.booster.predict(d, iteration_range=(0, int(best_it)+1))
            except TypeError:
                # Fallback for an API without ``iteration_range``.
                ntree_limit = getattr(self.booster, "best_ntree_limit", int(best_it)+1)
                pred = self.booster.predict(d, ntree_limit=int(ntree_limit))
        return np.vstack([1 - pred, pred]).T

    def __getstate__(self):
        return {'raw': self.booster.save_raw()}

    def __setstate__(self, state):
        restored = xgb.Booster()
        restored.load_model(bytearray(state['raw']))
        self.booster = restored

def train_xgb_gpu(
    X_tr: pd.DataFrame, y_tr: np.ndarray,
    X_va: Optional[pd.DataFrame] = None, y_va: Optional[np.ndarray] = None,
    params: Optional[Dict] = None, n_estimators: Optional[int] = None,
    early_stopping_rounds: int = 200
) -> XGBBoosterWrapper:
    """Train a booster with ``xgb.train`` and return it wrapped.

    ``params`` override the defaults from ``get_xgb_default_params``;
    ``n_estimators`` defaults to the default round count. When both ``X_va``
    and ``y_va`` are given they are used as the "valid" evaluation set with
    early stopping; otherwise no evaluation set is passed and
    ``early_stopping_rounds`` is sent as 0.
    """
    base_params, base_n = get_xgb_default_params()
    if params:
        base_params.update(params)
    if n_estimators is None:
        n_estimators = base_n
    use_categorical = base_params.get("enable_categorical", False)

    def _as_dmatrix(features, labels):
        # Work on a copy; object columns become categoricals when enabled.
        local = features.copy()
        if use_categorical and isinstance(local, pd.DataFrame):
            for name in local.columns:
                if str(local[name].dtype) == 'object':
                    local[name] = local[name].astype('category')
        return xgb.DMatrix(local, label=labels, enable_categorical=use_categorical)

    dtrain = _as_dmatrix(X_tr, y_tr)
    evals = []
    if X_va is not None and y_va is not None:
        dvalid = _as_dmatrix(X_va, y_va)
        evals = [(dtrain, 'train'), (dvalid, 'valid')]

    booster = xgb.train(
        params=base_params,
        dtrain=dtrain,
        num_boost_round=int(n_estimators),
        evals=evals,
        early_stopping_rounds=early_stopping_rounds if evals else 0,
        verbose_eval=False
    )
    return XGBBoosterWrapper(booster)


# ===================== MAIN PIPELINE (v72s4 con BIG-MODE) =====================

def train_xgb_precision_pipeline_v72s4(
    train_parquet: str,
    test_parquet: Optional[str],
    dataset_type: str,

    # Year-based splits
    train_years: List[int] = [2020, 2021, 2022, 2023],
    dev_years:   List[int] = [2024],
    test_years:  List[int] = [2025],

    lookahead_days: int = 7,
    n_splits: int = 5,

    # Hard-negative sampling
    neg_pos_ratio: int = 3,        # raised to at least 5 when dataset_type is not SSD
    hard_window: int = 60,         # raised to at least 90 when dataset_type is not SSD
    hard_fraction: float = 0.7,

    # XGB
    xgb_params: Optional[Dict] = None,
    xgb_n_estimators: int = 1500,
    early_stopping_rounds: int = 200,

    # Threshold objective
    min_precision: float = 0.90,
    min_recall: float = 0.03,
    top_k_rate: float = 1e-4,
    min_alerts: int = 20,

    # BIG switches
    big_mode: bool = False,              # TRAIN in big mode (streamed + filtered)
    test_big_mode: bool = False,         # TEST is evaluated later in streaming
    neg_random_keep_rate: float = 0.0025,
    add_rolling_ram: bool = True,        # RAM path may use rolling stats; BIG never does

    output_dir: str = './models_xgb_precision_v72s4',
    random_state: int = 42,
):
    """Train, calibrate, save and (optionally) test the precision-first XGBoost model.

    Stages:
      1. Load TRAIN (streamed and filtered when ``big_mode``), plus DEV and
         TEST in memory when the corresponding big switches are off.
      2. Label rows positive when they fall within ``lookahead_days`` before
         the serial's first failure seen in that same split.
      3. Build features: integer-coded categoricals without rolling stats in
         ``big_mode``; native categoricals (+ optional rolling) otherwise.
      4. Disk-grouped stratified CV with hard-negative sampling; an
         out-of-fold threshold is derived as a diagnostic / fallback.
      5. Train the final model on all of TRAIN (hard-negative sampled).
      6. If DEV is available (and not ``big_mode``), calibrate the production
         threshold on DEV using that same final model; otherwise keep the
         out-of-fold threshold.
      7. Save the model (``.pkl`` and ``.json``), a ``*_features.json`` file
         and a ``*_metadata.json`` file under ``output_dir``.
      8. Evaluate on TEST when it was loaded in memory.

    For any ``dataset_type`` other than "SSD" (case-insensitive) the sampler
    uses ``max(neg_pos_ratio, 5)`` and ``max(hard_window, 90)``; these
    effective values are applied consistently to CV, the final model and
    therefore the DEV calibration.

    Returns:
        The metadata dict that is also written to ``*_metadata.json``.

    Raises:
        ValueError: if TRAIN is empty or has fewer than 50 positive rows.
    """
    print("="*100)
    print(f"XGBOOST (GPU, PRECISION v72s4) - {dataset_type.upper()}")
    print(f"Big-mode TRAIN={big_mode} | TEST streaming={test_big_mode}")
    print("="*100)
    os.makedirs(output_dir, exist_ok=True)

    # Effective sampling settings, computed once so every training run
    # (CV folds, final model) uses the same values.
    is_ssd = dataset_type.upper() == 'SSD'
    eff_neg_pos_ratio = neg_pos_ratio if is_ssd else max(neg_pos_ratio, 5)
    eff_hard_window = hard_window if is_ssd else max(hard_window, 90)

    # Discover SMART columns so that only the needed columns are read
    smart_cols = discover_smart_columns(train_parquet)
    base_cols = ['serial_number','date','failure','model']
    read_cols = base_cols + smart_cols

    # ---------- LOAD TRAIN ----------
    if big_mode:
        fail_map_train = scan_fail_dates(train_parquet, train_years)
        # NOTE(review): the filter uses the raw ``hard_window`` while the
        # sampler below uses ``eff_hard_window`` — confirm this is intended.
        df_train_raw = load_data_big_filtered(
            train_parquet, train_years, fail_map_train,
            lookahead_days=lookahead_days, hard_window=hard_window,
            neg_random_keep_rate=neg_random_keep_rate, columns=read_cols
        )
    else:
        df_train_raw = load_data(train_parquet, train_years, columns=read_cols)

    # ---------- LOAD DEV/TEST (in memory only when the big switches are off) ----------
    df_dev_raw  = load_data(train_parquet, dev_years, columns=read_cols) if dev_years and (not big_mode) else pd.DataFrame()
    df_test_raw = load_data(test_parquet, test_years, columns=read_cols) if (test_parquet and test_years and (not test_big_mode)) else pd.DataFrame()

    if df_train_raw.empty:
        raise ValueError("Training data is empty!")

    # ---------- PREP ----------
    df_tr  = prepare_df(df_train_raw)
    df_dev = prepare_df(df_dev_raw) if not df_dev_raw.empty else pd.DataFrame()
    df_te  = prepare_df(df_test_raw) if not df_test_raw.empty else pd.DataFrame()

    # ---------- LABELS ----------
    dtf_tr = compute_days_to_failure(df_tr)
    y_tr   = create_labels_from_dtf(dtf_tr, lookahead_days)
    print(f"TRAIN labels: {int(y_tr.sum()):,} pos ({100*y_tr.mean():.4f}%)")
    if y_tr.sum() < 50:
        raise ValueError(f"Insufficient positive samples in TRAIN: {y_tr.sum()}")

    # ---------- FEATURES ----------
    if big_mode:
        # BIG: no rolling + integer codes
        X_tr, X_dev, X_te, feature_names, cat_maps = create_features_joined_big_codes(
            df_tr, df_dev, df_te, add_rolling=False, fit_cats_on_train=True, category_maps=None
        )
        feature_style = "codes_no_rolling"
        enable_categorical = False
    else:
        # RAM: rolling + native categoricals
        X_tr, X_dev, X_te, feature_names, _cat_cols = create_features_joined_ram_v72(
            df_tr, df_dev, df_te, add_rolling=add_rolling_ram
        )
        cat_maps = {}    # not used on the RAM path
        feature_style = "categorical_with_rolling"
        enable_categorical = True

    # XGB params are identical for every training run below, so build them once.
    params, _ = get_xgb_default_params(enable_categorical=enable_categorical)
    if xgb_params: params.update(xgb_params)

    # ---------- CV grouped by disk ----------
    serials_tr = df_tr['serial_number']
    oof_proba = np.zeros(len(y_tr), dtype=np.float32)
    fold_metrics = []

    for fold, (tr_idx, va_idx) in enumerate(make_group_folds(serials_tr, y_tr, n_splits=n_splits, random_state=random_state), start=1):
        X_tr_fold, y_tr_fold = X_tr.iloc[tr_idx].reset_index(drop=True), y_tr[tr_idx]
        dtf_tr_fold = dtf_tr[tr_idx]
        X_va_fold, y_va_fold = X_tr.iloc[va_idx].reset_index(drop=True), y_tr[va_idx]

        print(f"\nFold {fold}/{n_splits}: train={len(y_tr_fold):,} (pos={int(y_tr_fold.sum()):,}) | "
              f"val={len(y_va_fold):,} (pos={int(y_va_fold.sum()):,})")

        # Hard-negative sampling on the training part of the fold only
        Xb, yb, _ = sample_negatives_hard(
            X_tr_fold, y_tr_fold, dtf_tr_fold,
            lookahead=lookahead_days,
            neg_pos_ratio=eff_neg_pos_ratio,
            hard_window=eff_hard_window,
            hard_fraction=hard_fraction,
            seed=random_state,
        )
        print(f"  After hard-neg sampling: {len(yb):,} (pos={int(yb.sum()):,}, neg={len(yb)-int(yb.sum()):,})")

        model = train_xgb_gpu(
            Xb, yb, X_va_fold, y_va_fold,
            params=params, n_estimators=xgb_n_estimators,
            early_stopping_rounds=early_stopping_rounds
        )

        proba_va = model.predict_proba(X_va_fold)[:, 1]
        oof_proba[va_idx] = proba_va

        thr_fold, pr_auc = pick_threshold_precision_first(
            y_va_fold, proba_va, min_precision=min_precision, min_recall=min_recall,
            top_k_rate=top_k_rate, min_alerts=min_alerts
        )
        m = metrics_at_threshold(y_va_fold, proba_va, thr_fold)
        m.update({'pr_auc': float(pr_auc), 'threshold': float(thr_fold), 'fold': int(fold)})
        fold_metrics.append(m)
        print(f"  Fold {fold} @thr={thr_fold:.4f} | P={m['precision']:.3f} R={m['recall']:.3f} F1={m['f1']:.3f} PR-AUC={pr_auc:.4f}")

        del X_tr_fold, y_tr_fold, X_va_fold, y_va_fold, Xb, yb, model
        cleanup()

    # ---------- OOF (provisional global threshold) ----------
    thr_oof_global, pr_auc_oof = pick_threshold_precision_first(
        y_tr, oof_proba, min_precision=min_precision, min_recall=min_recall,
        top_k_rate=top_k_rate, min_alerts=min_alerts
    )
    agg_metrics = metrics_at_threshold(y_tr, oof_proba, thr_oof_global)
    agg_metrics.update({'pr_auc': float(pr_auc_oof), 'threshold': float(thr_oof_global)})
    print("\nOOF (diag):")
    print(json.dumps(agg_metrics, indent=2))

    # ---------- FINAL model (trained on all of TRAIN, hard-negative sampled) ----------
    Xb_full, yb_full, _ = sample_negatives_hard(
        X_tr, y_tr, dtf_tr,
        lookahead=lookahead_days,
        neg_pos_ratio=eff_neg_pos_ratio,
        hard_window=eff_hard_window,
        hard_fraction=hard_fraction, seed=random_state
    )
    final_model = train_xgb_gpu(
        Xb_full, yb_full, params=params, n_estimators=xgb_n_estimators, early_stopping_rounds=0
    )

    # ---------- DEV calibration (when available and not in big_mode) ----------
    # The production threshold is calibrated on the model that is actually
    # saved. A separate calibration model trained with a different
    # negative/positive ratio would yield a threshold that does not match the
    # shipped model's score distribution.
    thr_prod = float(thr_oof_global)
    dev_metrics = None
    if not df_dev.empty and not big_mode:
        y_dev = create_labels_from_dtf(compute_days_to_failure(df_dev), lookahead_days)
        proba_dev = final_model.predict_proba(X_dev)[:, 1]
        thr_prod, pr_auc_dev = pick_threshold_precision_first(
            y_true=y_dev,
            proba=proba_dev, min_precision=min_precision, min_recall=min_recall,
            top_k_rate=top_k_rate, min_alerts=min_alerts
        )
        dev_metrics = metrics_at_threshold(y_dev, proba_dev, thr_prod)
        dev_metrics.update({'pr_auc': float(pr_auc_dev), 'threshold': float(thr_prod)})
        print("\nDEV calibration metrics (used for PROD threshold):")
        print(json.dumps(dev_metrics, indent=2))

    # ---------- Save artifacts ----------
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_prefix = os.path.join(output_dir, f"{dataset_type}_xgb_v72s4_{timestamp}")
    joblib.dump(final_model, f"{model_prefix}_model.pkl")
    final_model.booster.save_model(f"{model_prefix}_model.json")

    features_meta = {
        'feature_names': feature_names,
        'feature_style': feature_style,    # 'codes_no_rolling' | 'categorical_with_rolling'
        'threshold': float(thr_prod),
        'category_maps': cat_maps,         # empty on the RAM path
        'smart_cols': smart_cols,
        'xgb_params': params | {'n_estimators': int(xgb_n_estimators)}
    }
    with open(f"{model_prefix}_features.json", 'w') as f:
        json.dump(features_meta, f, indent=2)
    print(f"\n✓ Final model saved: {model_prefix}_model.pkl/.json")

    # ---------- TEST (only when it is not evaluated in streaming) ----------
    test_metrics = None
    if not df_te.empty and (not test_big_mode):
        y_test = create_labels_from_dtf(compute_days_to_failure(df_te), lookahead_days)
        proba_test = final_model.predict_proba(X_te)[:, 1]
        test_metrics = metrics_at_threshold(y_test, proba_test, float(thr_prod))
        test_metrics.update({'pr_auc': float(average_precision_score(y_test, proba_test)), 'threshold_used': float(thr_prod)})
        print("\nTEST metrics:")
        print(json.dumps(test_metrics, indent=2))
    else:
        print("\nTEST skipped (streaming test can be run later with evaluate_saved_model_2024_streaming_xgb).")

    # ---------- Metadata ----------
    metadata = {
        'dataset_type': dataset_type,
        'train_years': train_years,
        'dev_years': dev_years,
        'test_years': test_years,
        'lookahead_days': lookahead_days,
        'n_splits': n_splits,
        'hard_negative_sampling': {'neg_pos_ratio': neg_pos_ratio, 'hard_window': hard_window, 'hard_fraction': hard_fraction},
        'xgb_params': params | {'n_estimators': int(xgb_n_estimators)},
        'min_precision': min_precision, 'min_recall': min_recall,
        'top_k_rate': top_k_rate, 'min_alerts': min_alerts,
        'oof_metrics': {
            'precision': agg_metrics['precision'],
            'recall': agg_metrics['recall'],
            'f1': agg_metrics['f1'],
            'confusion_matrix': agg_metrics['confusion_matrix'],
            'fpr': agg_metrics['fpr'],
            'pr_auc': agg_metrics['pr_auc'],
            'threshold': agg_metrics['threshold']
        },
        'dev_metrics': dev_metrics,
        'test_metrics': test_metrics,
        'feature_names': feature_names,
        'feature_style': feature_style,
        'smart_cols': smart_cols
    }
    meta_path = os.path.join(output_dir, f"{dataset_type}_xgb_v72s4_{timestamp}_metadata.json")
    with open(meta_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"\n✓ Metadata saved: {meta_path}")
    print("="*100)
    return metadata


# ===================== TEST-ONLY 2024 (STREAMING, BIG) =====================

def evaluate_saved_model_2024_streaming_xgb(
    model_path: str,
    features_meta_path: str,   # *_features.json (feature_names, feature_style, threshold, category_maps, smart_cols)
    parquet_path: str,
    test_years: Optional[List[int]] = None,
    lookahead_days: int = 7,
    chunk_limit_rows: int = 0   # 0 = every row group; N > 0 stops after N row groups (quick smoke run)
):
    """
    Evaluate a saved model on the TEST years by streaming parquet row groups.

    Steps, per row group:
      * labels: a row is positive when the serial's failure date (from
        ``scan_fail_dates``) lies between 0 and ``lookahead_days`` days ahead
        of the row date (inclusive);
      * features: built by ``StreamDeltaCummaxBuilderXGB.transform_chunk`` and
        aligned to the saved ``feature_names`` (missing columns are filled
        with 0.0);
      * metrics: a row-level confusion matrix is accumulated, and for every
        serial the maximum probability and maximum label seen so far are kept
        for the disk-level metrics.

    Args:
        model_path: Path of the joblib-serialised model (must expose
            ``predict_proba``).
        features_meta_path: JSON file with ``feature_names`` and optionally
            ``feature_style``, ``threshold``, ``category_maps``, ``smart_cols``.
        parquet_path: Parquet file or directory to stream.
        test_years: Years to evaluate; ``None`` means ``[2024]``.
        lookahead_days: Size of the positive-label window in days.
        chunk_limit_rows: Despite the name, this caps the number of ROW GROUPS
            processed (0 = no cap).

    Returns:
        ``{"row": row_metrics, "disk": disk_metrics}``.

    Raises:
        ValueError: If the metadata's ``feature_style`` is not
            ``'codes_no_rolling'``.
    """
    print("="*100)
    print("EVALUATE SAVED MODEL — TEST 2024 (STREAMING, XGB)")
    print("="*100)

    # None sentinel instead of a mutable list default shared across calls.
    if test_years is None:
        test_years = [2024]

    # SECURITY NOTE: joblib.load unpickles the file and can execute arbitrary
    # code; only load model files that come from a trusted source.
    model: XGBBoosterWrapper = joblib.load(model_path)
    with open(features_meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)
    feature_names = meta["feature_names"]
    feature_style = meta.get("feature_style", "codes_no_rolling")
    if feature_style != "codes_no_rolling":
        raise ValueError(f"evaluate_saved_model_2024_streaming_xgb requiere feature_style='codes_no_rolling', got '{feature_style}'")
    thr = float(meta.get("threshold", 0.5))
    cat_maps = meta.get("category_maps", {})
    # Only fall back to discovery when the metadata lacks the column list;
    # passing the call as a dict.get default would always run it.
    if "smart_cols" in meta:
        smart_cols = meta["smart_cols"]
    else:
        smart_cols = discover_smart_columns(parquet_path)

    base_cols = ['serial_number','date','failure','model']
    read_cols = base_cols + smart_cols

    # Labels come from the failure dates found in the TEST years.
    fail_map_test = scan_fail_dates(parquet_path, test_years)

    builder = StreamDeltaCummaxBuilderXGB(smart_cols=smart_cols, cat_maps=cat_maps)

    tn=fp=fn=tp=0
    # serial -> (max probability seen, max label seen)
    disk_stat: Dict[str, Tuple[float, int]] = {}

    processed_groups = 0
    for df in _iter_parquet_row_groups(parquet_path, test_years, columns=read_cols):
        if chunk_limit_rows and processed_groups >= chunk_limit_rows:
            break

        sn_ser = df["serial_number"].astype(str)
        dt = pd.to_datetime(df["date"], errors="coerce").to_numpy(dtype="datetime64[D]")
        fdt = pd.to_datetime(sn_ser.map(fail_map_test), errors="coerce").to_numpy(dtype="datetime64[D]")

        # Days to failure; 10**9 marks serials without a known failure date.
        dtf = np.full(len(df), 10**9, dtype=np.int64)
        valid = ~np.isnat(fdt)
        if valid.any():
            dd = (fdt[valid] - dt[valid]).astype("timedelta64[D]").astype("int64")
            dtf[valid] = dd
        y_chunk = ((dtf >= 0) & (dtf <= lookahead_days)).astype(np.int8)

        X_chunk = builder.transform_chunk(df)

        # Align to the saved column order; absent features become 0.0.
        X_chunk = X_chunk.reindex(columns=feature_names, fill_value=0.0).astype(np.float32)

        proba = model.predict_proba(X_chunk)[:, 1]
        y_pred = (proba >= thr).astype(np.int8)

        cm = confusion_matrix(y_chunk, y_pred, labels=[0,1])
        tn += int(cm[0,0]); fp += int(cm[0,1]); fn += int(cm[1,0]); tp += int(cm[1,1])

        # Reduce the chunk to one (max proba, max label) pair per serial first,
        # so the Python-level merge runs once per disk instead of once per row.
        chunk_disk = (
            pd.DataFrame({"proba": proba, "label": y_chunk})
            .groupby(sn_ser.to_numpy(), sort=False)
            .max()
        )
        for s, p, y in zip(chunk_disk.index,
                           chunk_disk["proba"].to_numpy(),
                           chunk_disk["label"].to_numpy()):
            prev = disk_stat.get(s)
            if prev is None:
                disk_stat[s] = (p, int(y))
            else:
                disk_stat[s] = (max(prev[0], p), max(prev[1], int(y)))

        processed_groups += 1
        cleanup()

    # Row metrics (max(1, ...) guards against division by zero)
    row_metrics = {
        'precision': float(tp / max(1, tp + fp)),
        'recall': float(tp / max(1, tp + fn)),
        'f1': float((2*tp) / max(1, 2*tp + fp + fn)),
        'confusion_matrix': [[tn, fp], [fn, tp]],
        'fpr': float(fp / max(1, fp + tn))
    }
    # Disk metrics: a disk is flagged when its maximum probability reaches thr.
    y_disk = np.array([v[1] for v in disk_stat.values()], dtype=np.int8)
    yhat_disk = np.array([1 if v[0] >= thr else 0 for v in disk_stat.values()], dtype=np.int8)
    cm_d = confusion_matrix(y_disk, yhat_disk, labels=[0,1])
    tn_d, fp_d, fn_d, tp_d = int(cm_d[0,0]), int(cm_d[0,1]), int(cm_d[1,0]), int(cm_d[1,1])
    disk_metrics = {
        'precision': float(tp_d / max(1, tp_d + fp_d)),
        'recall': float(tp_d / max(1, tp_d + fn_d)),
        'f1': float((2*tp_d) / max(1, 2*tp_d + fp_d + fn_d)),
        'confusion_matrix': [[tn_d, fp_d], [fn_d, tp_d]],
        'n_disks': int(len(disk_stat))
    }

    print("\nRow-level TEST metrics (streaming 2024):")
    print(json.dumps(row_metrics, indent=2))
    print("\nDisk-level TEST metrics (streaming 2024):")
    print(json.dumps(disk_metrics, indent=2))
    print("="*100)
    return {"row": row_metrics, "disk": disk_metrics}


# ===================== WRAPPERS =====================

def train_ssd_precision_v72s4_xgb():
    """
    Run the v72s4 training pipeline with the SSD preset.

    The preset keeps everything in RAM (big_mode=False, test_big_mode=False),
    turns rolling features on (add_rolling_ram=True) and enables native
    categorical handling in XGBoost. Train years are 2020-2023, DEV is 2024
    and TEST is 2025. According to the original note, the pipeline calibrates
    the threshold on DEV when it is available.

    Returns whatever train_xgb_precision_pipeline_v72s4 returns.
    """
    ssd_parquet = './Procesados/finales/SSD_FULL_CLEAN.parquet'

    booster_params = {
        'learning_rate': 0.06,
        'max_depth': 8,
        'min_child_weight': 8.0,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'reg_lambda': 1.2,
        'reg_alpha': 0.0,
        'max_bin': 256,
        'enable_categorical': True,     # RAM path
        'max_cat_to_onehot': 16,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'eval_metric': 'aucpr',
        'random_state': 42,
        'verbosity': 0,
    }

    pipeline_kwargs = {
        'train_parquet': ssd_parquet,
        'test_parquet': ssd_parquet,
        'dataset_type': 'SSD',
        'train_years': [2020, 2021, 2022, 2023],
        'dev_years': [2024],
        # May be switched to [2024]; test_big_mode is already off here.
        'test_years': [2025],
        'lookahead_days': 7,
        'n_splits': 5,
        'neg_pos_ratio': 3,
        'hard_window': 60,
        'hard_fraction': 0.7,
        'xgb_params': booster_params,
        'xgb_n_estimators': 1500,
        'early_stopping_rounds': 200,
        'min_precision': 0.90,
        'min_recall': 0.03,
        'top_k_rate': 1e-4,
        'min_alerts': 20,
        'big_mode': False,              # in-RAM training
        'test_big_mode': False,         # regular (non-streaming) test
        'neg_random_keep_rate': 0.0025,
        'add_rolling_ram': True,
        'output_dir': './models_xgb_ssd_v72s4',
    }
    return train_xgb_precision_pipeline_v72s4(**pipeline_kwargs)


def train_hdd_precision_v72s4_xgb(neg_random_keep_rate: float = 0.0025):
    """
    Run the v72s4 training pipeline with the HDD preset.

    The preset trains in big-mode (big_mode=True), without rolling features
    (add_rolling_ram=False) and with XGBoost's native categorical support
    switched off. TEST (2024) is flagged as streaming (test_big_mode=True);
    per the original note it is meant to be evaluated separately with
    evaluate_saved_model_2024_streaming_xgb(...).

    Args:
        neg_random_keep_rate: Forwarded unchanged to the pipeline.

    Returns whatever train_xgb_precision_pipeline_v72s4 returns.
    """
    hdd_parquet = './Procesados/finales/HDD_FULL_CLEAN.parquet'

    booster_params = {
        'learning_rate': 0.06,
        'max_depth': 8,
        'min_child_weight': 10.0,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'reg_lambda': 1.0,
        'reg_alpha': 0.0,
        'max_bin': 256,
        'enable_categorical': False,    # BIG path (everything numeric / codes)
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'eval_metric': 'aucpr',
        'random_state': 42,
        'verbosity': 0,
    }

    pipeline_kwargs = {
        'train_parquet': hdd_parquet,
        'test_parquet': hdd_parquet,        # per the original note, not loaded into RAM when test_big_mode=True
        'dataset_type': 'HDD',
        'train_years': [2020, 2021, 2022, 2023],
        'dev_years': [],                    # left empty to avoid extra RAM
        'test_years': [2024],               # evaluated by the separate streaming function
        'lookahead_days': 7,
        'n_splits': 5,
        'neg_pos_ratio': 5,
        'hard_window': 90,
        'hard_fraction': 0.7,
        'xgb_params': booster_params,
        'xgb_n_estimators': 1500,
        'early_stopping_rounds': 200,
        'min_precision': 0.90,
        'min_recall': 0.03,
        'top_k_rate': 7.5e-5,
        'min_alerts': 30,
        'big_mode': True,                   # TRAIN in parts
        'test_big_mode': True,              # TEST via evaluate_saved_model_2024_streaming_xgb
        'neg_random_keep_rate': neg_random_keep_rate,
        'add_rolling_ram': False,           # no rolling in BIG mode
        'output_dir': './models_xgb_hdd_v72s4',
    }
    return train_xgb_precision_pipeline_v72s4(**pipeline_kwargs)


In [18]:
train_ssd_xgb_precision_v5()

XGBOOST (PRECISION-CONTROLLED v5, GPU) - SSD
Loading [2020, 2021, 2022, 2023] from ./Procesados/finales/SSD_FULL_CLEAN.parquet...
Loaded 2,124,111 rows
Loading [2024] from ./Procesados/finales/SSD_FULL_CLEAN.parquet...
Loaded 1,220,745 rows
Labels (train): 1,325 positive (0.0624%)
Creating features (temporal join) for SSD...
  Found 13 SMART attributes (_raw)
  Final features: 72
  Partitioning enabled: SSD young

Fold 1/5: train=1,709,653 (pos=1,079) | val=414,458 (pos=246)
  [G1] After sampling: n=1,476 (pos=738, neg=738)
  [G0] After sampling: n=682 (pos=341, neg=341)

Fold 2/5: train=1,707,701 (pos=1,054) | val=416,410 (pos=271)
  [G1] After sampling: n=1,522 (pos=761, neg=761)
  [G0] After sampling: n=586 (pos=293, neg=293)

Fold 3/5: train=1,682,081 (pos=1,061) | val=442,030 (pos=264)
  [G1] After sampling: n=1,464 (pos=732, neg=732)
  [G0] After sampling: n=658 (pos=329, neg=329)

Fold 4/5: train=1,691,757 (pos=1,061) | val=432,354 (pos=264)
  [G1] After sampling: n=1,560 (pos=7

{'dataset_type': 'SSD',
 'train_years': [2020, 2021, 2022, 2023],
 'test_years': [2024],
 'lookahead_days': 7,
 'n_splits': 5,
 'neg_pos_ratio': 1,
 'hard_window': 60,
 'hard_fraction': 0.7,
 'xgb_params': {'n_estimators': 1300,
  'learning_rate': 0.05,
  'max_depth': 9,
  'min_child_weight': 8,
  'subsample': 0.8,
  'colsample_bytree': 0.6,
  'reg_lambda': 1.5,
  'reg_alpha': 0.0,
  'max_bin': 256,
  'tree_method': 'gpu_hist',
  'predictor': 'gpu_predictor',
  'objective': 'binary:logistic',
  'eval_metric': 'aucpr',
  'nthread': 4,
  'verbosity': 0,
  'random_state': 42},
 'min_precision': 0.85,
 'min_recall': 0.05,
 'top_k_rate': 0.0002,
 'min_alerts': 20,
 'partitioning': {'enabled': True,
  'ssd_age_days_thresh': 90,
  'hdd_hfh_thresh': 40000},
 'oof_metrics': {'precision': 0.8571428571428571,
  'recall': 0.07245283018867925,
  'f1': 0.1336116910229645,
  'confusion_matrix': [[2122770, 16], [1229, 96]],
  'fpr': 7.5372647077943795e-06,
  'pr_auc': 0.10935226487534482,
  'threshold

In [2]:
train_hdd_precision_v72s4_xgb()

XGBOOST (GPU, PRECISION v72s4) - HDD
Big-mode TRAIN=True | TEST streaming=True
Scanning earliest failure dates (streaming)...
  Found 8,044 failed serials
Loading BIG-FILTERED [2020, 2021, 2022, 2023] from ./Procesados/finales/HDD_FULL_CLEAN.parquet (keep_rate=0.0025)...
  BIG-FILTERED kept 1,184,183/215,762,596 rows (~0.55%)
TRAIN labels: 62,131 pos (5.2467%)

Fold 1/5: train=948,389 (pos=49,699) | val=235,794 (pos=12,432)
  After hard-neg sampling: 298,194 (pos=49,699, neg=248,495)
  Fold 1 @thr=0.9993 | P=1.000 R=0.030 F1=0.058 PR-AUC=0.9008

Fold 2/5: train=946,580 (pos=49,693) | val=237,603 (pos=12,438)
  After hard-neg sampling: 298,158 (pos=49,693, neg=248,465)
  Fold 2 @thr=0.9995 | P=0.995 R=0.030 F1=0.058 PR-AUC=0.9002

Fold 3/5: train=948,183 (pos=49,752) | val=236,000 (pos=12,379)
  After hard-neg sampling: 298,512 (pos=49,752, neg=248,760)
  Fold 3 @thr=0.9981 | P=1.000 R=0.030 F1=0.058 PR-AUC=0.8970

Fold 4/5: train=946,121 (pos=49,655) | val=238,062 (pos=12,476)
  After 

{'dataset_type': 'HDD',
 'train_years': [2020, 2021, 2022, 2023],
 'dev_years': [],
 'test_years': [2024],
 'lookahead_days': 7,
 'n_splits': 5,
 'hard_negative_sampling': {'neg_pos_ratio': 5,
  'hard_window': 90,
  'hard_fraction': 0.7},
 'xgb_params': {'tree_method': 'gpu_hist',
  'predictor': 'gpu_predictor',
  'objective': 'binary:logistic',
  'eval_metric': 'aucpr',
  'learning_rate': 0.06,
  'max_depth': 8,
  'min_child_weight': 10.0,
  'subsample': 0.8,
  'colsample_bytree': 0.6,
  'reg_lambda': 1.0,
  'reg_alpha': 0.0,
  'max_bin': 256,
  'enable_categorical': False,
  'max_cat_to_onehot': 16,
  'random_state': 42,
  'nthread': 4,
  'verbosity': 0,
  'n_estimators': 1500},
 'min_precision': 0.9,
 'min_recall': 0.03,
 'top_k_rate': 7.5e-05,
 'min_alerts': 30,
 'oof_metrics': {'precision': 0.9989281886387996,
  'recall': 0.030001126651751944,
  'f1': 0.058252730596746724,
  'confusion_matrix': [[1122050, 2], [60267, 1864]],
  'fpr': 1.7824485852705578e-06,
  'pr_auc': 0.900259060

# Demás Técnicas

In [11]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
XGBOOST — PRECISION-CONTROLLED v7.2 (GPU) con estrategias de balanceo
----------------------------------------------------------------------
Refactor del pipeline CatBoost v7.2 hacia XGBoost manteniendo técnicas:
 - Join temporal TRAIN∪DEV∪TEST con features causales y normalización robusta por modelo
 - Folds estratificados por DISCO (Group K-Fold estratificado)
 - Muestreo de negativos "duros" (hard negatives) cercano al fallo
 - Balanceo: 'none' | 'under' | 'smote_knn' | 'smote_enn' (SMOTENC respeta categóricas model/vendor)
 - Umbral "precision-first" (min_precision/min_recall + fallback top-k/min_alerts)
 - Guardrails VRAM para RTX 2060 6GB (gpu_hist, single-precision, max_bin, subsample/colsample)

Salida:
 - Métricas OOF + DEV + TEST
 - Modelos y metadata (JSON / PKL) con threshold para despliegue

Requisitos:
 - pandas, numpy, pyarrow, scikit-learn, imblearn, xgboost>=2.0, joblib
"""

import os, gc, json, warnings, pickle
from datetime import datetime
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    precision_recall_curve, average_precision_score, precision_score,
    recall_score, f1_score, confusion_matrix
)

# XGBoost (GPU)
import xgboost as xgb

# Imbalance: under/over/combined (con soporte para categóricas)
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.over_sampling import SMOTENC
from imblearn.combine import SMOTEENN

import joblib
warnings.filterwarnings("ignore")


# ===================== UTILS =====================

def cleanup():
    """Trigger a full garbage-collection pass and discard the collected-object count."""
    _ = gc.collect()
    return None


# ===================== IO =====================

def load_data(path: str, years: List[int]) -> pd.DataFrame:
    """
    Load a parquet file (or every ``*.parquet`` file in a directory) and keep
    only the rows whose 'date' falls in one of ``years``.

    Row groups are read one at a time; row groups without a 'date' column are
    skipped. Accumulated chunks are merged every 20 pieces to bound the
    length of the pending list.

    Args:
        path: Parquet file or directory. An empty path yields an empty frame.
        years: Calendar years to keep. An empty list yields an empty frame
            without touching the files.

    Returns:
        The concatenated rows with 'date' converted to datetime, or an empty
        DataFrame when nothing matches.
    """
    print(f"Loading {years} from {path}...")
    if not path:
        return pd.DataFrame()
    if len(years) == 0:
        # No row can match an empty year filter, so avoid reading every row
        # group only to throw all of it away.
        print("Loaded 0 rows")
        return pd.DataFrame()

    if os.path.isdir(path):
        # Sorted so the row order of the result does not depend on the
        # (arbitrary) order in which the filesystem lists the directory.
        files = sorted(
            os.path.join(path, f) for f in os.listdir(path) if f.endswith('.parquet')
        )
    else:
        files = [path]

    chunks = []
    for f in files:
        pf = pq.ParquetFile(f)
        for i in range(pf.num_row_groups):
            df = pf.read_row_group(i).to_pandas()
            if 'date' not in df.columns:
                continue
            df['date'] = pd.to_datetime(df['date'], errors='coerce')
            df = df[df['date'].dt.year.isin(years)]
            if len(df) > 0:
                chunks.append(df)
            if len(chunks) >= 20:
                # Compact the pending pieces into a single frame.
                chunks = [pd.concat(chunks, ignore_index=True)]
                cleanup()

    result = pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()
    print(f"Loaded {len(result):,} rows")
    return result


# ===================== PREP =====================

def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Type the key columns and sort rows by (serial_number, date).

    Rows with a missing serial number or an unparseable/missing date are
    dropped. The input frame is not modified.

    Args:
        df: Frame with at least 'serial_number' and 'date' columns.

    Returns:
        A new frame with 'serial_number' as str, 'date' as datetime, sorted by
        (serial_number, date) and re-indexed from 0.
    """
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')

    # Decide which rows survive BEFORE casting serials to str: the cast turns
    # a missing value into the literal text 'nan'/'None', which dropna can no
    # longer detect, so all such rows would be merged into one fake disk.
    keep = (df['serial_number'].notna() & parsed_dates.notna()).to_numpy()

    out = df.loc[keep].copy()
    out['serial_number'] = out['serial_number'].astype(str)
    # .array assigns positionally, so a non-unique input index cannot misalign.
    out['date'] = parsed_dates[keep].array
    return out.sort_values(['serial_number', 'date']).reset_index(drop=True)


# ===================== LABELS / DTF =====================

def compute_days_to_failure(dfs: pd.DataFrame) -> np.ndarray:
    """
    Days from each row's date to the earliest failure date of its disk.

    The value is >= 0 on or before the first failure, < 0 after it, and the
    sentinel 1_000_000_000 for disks with no failure row in ``dfs`` (and for
    rows whose day difference cannot be computed, e.g. a missing date).

    Args:
        dfs: Frame with 'serial_number', 'date' (datetime) and 'failure'.

    Returns:
        int64 array with one entry per row of ``dfs``, in row order.
    """
    no_failure = 1_000_000_000
    dtf = np.full(len(dfs), no_failure, dtype=np.int64)

    failed = dfs.loc[dfs['failure'] == 1, ['serial_number', 'date']]
    if failed.empty:
        return dtf

    # Earliest failure date per disk.
    first_fail = failed.groupby('serial_number', sort=False)['date'].min()
    if first_fail.empty:
        return dtf

    # Work positionally so a non-unique input index cannot misalign rows.
    serials = dfs['serial_number'].reset_index(drop=True)
    dates = dfs['date'].reset_index(drop=True)

    # NaN wherever the disk has no known failure or the date is missing.
    days = (serials.map(first_fail) - dates).dt.days
    known = days.notna().to_numpy()
    dtf[known] = days.to_numpy()[known].astype(np.int64)
    return dtf


def create_labels_from_dtf(dtf: np.ndarray, lookahead: int = 7) -> np.ndarray:
    """Return int8 labels: 1 where 0 <= dtf <= lookahead, otherwise 0."""
    not_yet_failed = dtf >= 0
    inside_window = dtf <= lookahead
    positive = not_yet_failed & inside_window
    return positive.astype(np.int8)


# ===================== CATEGORÍAS =====================

def extract_vendor(series: pd.Series) -> pd.Series:
    """
    Derive a vendor tag from model names: the run of ASCII letters at the
    start of the stripped string, or "UNK" when the string does not start
    with a letter.
    """
    as_text = series.astype(str)
    trimmed = as_text.str.strip()
    leading_letters = trimmed.str.extract(r"^([A-Za-z]+)", expand=False)
    return leading_letters.fillna("UNK")


# ===================== NORMALIZACIÓN POR MODELO =====================

def fit_model_stats(df_train: pd.DataFrame, smart_cols: List[str]) -> Dict[str, Dict[str, Tuple[float, float]]]:
    """
    Learn the median and IQR of every SMART column, separately per drive model.

    Only finite values are used; a (model, column) pair with no finite value
    gets no entry. The IQR is floored at 1e-6.

    Returns:
        stats[model][col] = (median, iqr). Empty when the frame is empty or
        has no 'model' column.
    """
    per_model: Dict[str, Dict[str, Tuple[float, float]]] = {}
    if df_train.empty or 'model' not in df_train.columns:
        return per_model

    model_key = df_train['model'].astype(str)
    for model_name, rows in df_train.groupby(model_key, sort=False):
        col_stats = {}
        for attr in smart_cols:
            values = pd.to_numeric(rows[attr], errors='coerce')
            finite = values[np.isfinite(values)]
            if len(finite) == 0:
                continue
            centre = float(np.median(finite))
            lower, upper = np.percentile(finite, [25, 75])
            spread = float(max(upper - lower, 1e-6))
            col_stats[attr] = (centre, spread)
        per_model[str(model_name)] = col_stats
    return per_model


def apply_model_normalization(df_all: pd.DataFrame, smart_cols: List[str], stats: Dict[str, Dict[str, Tuple[float, float]]]):
    """
    Add, in place, two float32 features per SMART column:

    - z_{col}     = (col - median) / (IQR + 1e-6), using the per-model
                    (median, IQR) from ``stats``
    - log1p_{col} = log1p(max(col, 0))

    Models (or columns) missing from ``stats`` fall back to a median/IQR
    computed over the whole of ``df_all`` with missing values taken as 0.
    Returns None.
    """
    def _numeric_filled(name):
        # Non-numeric and missing entries count as 0.0.
        return pd.to_numeric(df_all[name], errors='coerce').fillna(0.0)

    # Pass 1: frame-wide fallback statistics for every column, computed
    # before any derived column is written.
    fallback = {}
    for name in smart_cols:
        raw = _numeric_filled(name).values
        centre = float(np.median(raw))
        lower, upper = np.percentile(raw, [25, 75])
        fallback[name] = (centre, float(max(upper - lower, 1e-6)))

    models = df_all['model'].astype(str)

    # Pass 2: per-row lookup of the model statistics and feature creation.
    for name in smart_cols:
        fallback_median, fallback_iqr = fallback[name]

        median_by_model = {}
        iqr_by_model = {}
        for model_name, per_col in stats.items():
            if name in per_col:
                median_by_model[model_name] = per_col[name][0]
                iqr_by_model[model_name] = per_col[name][1]

        row_median = models.map(median_by_model).fillna(fallback_median).astype(np.float32)
        row_iqr = models.map(iqr_by_model).fillna(fallback_iqr).astype(np.float32)

        values = _numeric_filled(name).astype(np.float32)
        scaled = (values - row_median) / (row_iqr + 1e-6)
        df_all[f'z_{name}'] = scaled.values
        df_all[f'log1p_{name}'] = np.log1p(np.maximum(values.values, 0.0)).astype(np.float32)


# ===================== FEATURES (JOINED TRAIN∪DEV∪TEST) =====================

def create_features_joined_xgb_v72(
    df_train: pd.DataFrame,
    df_dev: pd.DataFrame,
    df_test: pd.DataFrame,
    dataset_type: str,
    add_rolling: bool,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray, np.ndarray, List[str], List[str]]:
    """
    Join TRAIN∪DEV∪TEST, build per-disk features and split back.

    Features (computed over rows sorted by serial_number, date):
      * delta_{c} / max_{c}: per-disk first difference and running maximum;
      * rmean7_{c} / rstd7_{c} (only if ``add_rolling``): rolling mean/std
        over the last 7 ROWS of the disk (not 7 calendar days);
      * age_days: number of earlier rows of the same disk (a row count);
      * month, day_of_week;
      * z_{c} / log1p_{c} from apply_model_normalization, with per-model
        statistics fitted on TRAIN only;
      * 'model' and 'vendor' as pandas 'category' columns.
    Numeric columns with zero variance over the joined frame are dropped;
    this also removes non-numeric helper columns, which coerce to 0.

    Args:
        df_train, df_dev, df_test: Prepared frames for each subset.
        dataset_type: Only used in the progress message.
        add_rolling: Whether to add the 7-row rolling features.

    Returns:
        (X_train, X_dev, X_test, ts_train, ts_dev, ts_test, feature_names,
        cat_cols). Rows of each X follow the joined (serial_number, date)
        sort order. ts_* are int64 values derived from 'date' and are kept
        out of the feature matrices.
    """
    def _clean_model(col: pd.Series) -> pd.Series:
        # Fill missing values BEFORE the str cast: casting first turns NaN
        # into the text 'nan', which a later fillna can no longer replace.
        return col.astype(object).where(col.notna(), 'UNK').astype(str)

    print(f"Creating features (temporal join) for {dataset_type} with TRAIN∪DEV∪TEST...")

    df_train = df_train.copy(); df_train['__subset__'] = 'train'
    df_dev   = df_dev.copy();   df_dev['__subset__']   = 'dev'
    df_test  = df_test.copy();  df_test['__subset__']  = 'test'
    # Same cleaning on the TRAIN copy so the keys of the per-model statistics
    # match the cleaned 'model' values of the joined frame.
    if 'model' in df_train.columns:
        df_train['model'] = _clean_model(df_train['model'])
    df_all = pd.concat([df_train, df_dev, df_test], ignore_index=True)
    df_all.sort_values(['serial_number','date'], inplace=True)

    all_cols = df_all.columns.tolist()
    smart_cols = [c for c in all_cols if ('smart' in c.lower()) and ('_raw' in c.lower())]
    print(f"  Found {len(smart_cols)} SMART attributes (raw)")

    for c in smart_cols:
        df_all[c] = pd.to_numeric(df_all[c], errors='coerce').fillna(0.0)

    # Per-disk deltas / running max (only look backwards in time)
    for c in smart_cols:
        g = df_all.groupby('serial_number', sort=False)[c]
        df_all[f'delta_{c}'] = g.diff().fillna(0.0)
        df_all[f'max_{c}']   = g.cummax()

    # Rolling window of 7 rows per disk
    if add_rolling:
        for c in smart_cols:
            r = df_all.groupby('serial_number', sort=False)[c]
            df_all[f'rmean7_{c}'] = r.rolling(window=7, min_periods=2).mean().reset_index(level=0, drop=True).fillna(0.0)
            df_all[f'rstd7_{c}']  = r.rolling(window=7, min_periods=2).std().reset_index(level=0, drop=True).fillna(0.0)

    # Age (row counter per disk) and calendar features
    df_all['age_days'] = df_all.groupby('serial_number', sort=False).cumcount()
    d = df_all['date']
    df_all['month'] = d.dt.month
    df_all['day_of_week'] = d.dt.dayofweek

    # Native categoricals + vendor
    if 'model' not in df_all.columns:
        df_all['model'] = 'UNK'
    df_all['model'] = _clean_model(df_all['model'])
    # extract_vendor already substitutes "UNK" when no vendor prefix is found.
    df_all['vendor'] = extract_vendor(df_all['model']).astype(str)

    # Timestamp, kept for reference only (not a predictor).
    # NOTE(review): the division by 10**9 assumes nanosecond-resolution
    # datetimes — confirm for non-ns datetime units.
    df_all['ts_sec'] = (df_all['date'].astype('int64') // 10**9).astype(np.int64)

    # Per-model normalisation learned on TRAIN
    stats = fit_model_stats(df_train, smart_cols)
    apply_model_normalization(df_all, smart_cols, stats)

    # Drop non-predictive columns for X
    drop_cols = ['serial_number', 'date', 'failure']
    X_all = df_all.drop(columns=[c for c in drop_cols if c in df_all.columns], errors='ignore')

    # Native categoricals (XGBoost accepts dtype 'category' with enable_categorical=True)
    cat_cols = [c for c in ['model','vendor'] if c in X_all.columns]
    num_cols = [c for c in X_all.columns if c not in cat_cols + ['ts_sec']]

    # Numeric typing; anything non-numeric (e.g. '__subset__') coerces to 0.0
    for c in num_cols:
        X_all[c] = pd.to_numeric(X_all[c], errors='coerce').fillna(0.0).astype(np.float32)

    # Remove constant columns
    var = X_all[num_cols].var()
    keep_num = var[var > 0].index.tolist()
    X_all = pd.concat([X_all[cat_cols], X_all[keep_num], df_all[['ts_sec']]], axis=1)

    # Ensure categorical dtype for XGBoost
    for c in cat_cols:
        X_all[c] = X_all[c].astype('category')

    feature_names = list(X_all.columns)

    # Split back (positional masks over the sorted joined frame)
    tr_mask  = (df_all['__subset__']=='train').values
    dev_mask = (df_all['__subset__']=='dev').values
    te_mask  = (df_all['__subset__']=='test').values

    X_train = X_all.loc[tr_mask].reset_index(drop=True)
    X_dev   = X_all.loc[dev_mask].reset_index(drop=True)
    X_test  = X_all.loc[te_mask].reset_index(drop=True)

    ts_train = X_train['ts_sec'].values.astype(np.int64)
    ts_dev   = X_dev['ts_sec'].values.astype(np.int64)
    ts_test  = X_test['ts_sec'].values.astype(np.int64)

    # ts_sec is returned separately and removed from the features
    X_train = X_train.drop(columns=['ts_sec'])
    X_dev   = X_dev.drop(columns=['ts_sec'])
    X_test  = X_test.drop(columns=['ts_sec'])
    feature_names.remove('ts_sec')

    print(f"  Final features: {X_train.shape[1]} (cat={len(cat_cols)}, num≈{len(keep_num)})")
    return X_train, X_dev, X_test, ts_train, ts_dev, ts_test, feature_names, cat_cols


# ===================== GROUPED CV (por DISCO) =====================

def make_group_folds(serials: pd.Series, y: np.ndarray, n_splits: int = 5, random_state: int = 42):
    """
    Yield (train_row_idx, val_row_idx) pairs where all rows of a disk stay in
    the same fold.

    Disks are split with a shuffled StratifiedKFold on a per-disk label that
    is 1 when any row of the disk has y == 1.
    """
    serial_values = serials.astype(str).values
    disks, row_to_disk = np.unique(serial_values, return_inverse=True)

    # A disk is positive as soon as one of its rows is positive.
    disk_label = np.zeros(len(disks), dtype=np.int8)
    positive_rows = np.flatnonzero(np.asarray(y) == 1)
    disk_label[row_to_disk[positive_rows]] = 1

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_disks, val_disks in splitter.split(disks, disk_label):
        train_rows = np.flatnonzero(np.isin(row_to_disk, train_disks))
        val_rows = np.flatnonzero(np.isin(row_to_disk, val_disks))
        yield train_rows, val_rows


# ===================== HARD NEGATIVES =====================

def sample_negatives_hard(
    X: pd.DataFrame, y: np.ndarray, dtf: np.ndarray, lookahead: int,
    neg_pos_ratio: int = 3, hard_window: int = 60, hard_fraction: float = 0.7,
    seed: int = 42,
):
    """
    Keep every positive and draw negatives, favouring "hard" ones whose
    days-to-failure lies in (lookahead, hard_window].

    Up to ``hard_fraction`` of the requested negatives come from the hard
    pool; the remainder comes from the other negatives. Each pool is sampled
    without replacement and is capped by its own size.

    Returns:
        (Xb, yb, sel_idx): the selected rows of X (index reset), their labels
        and the sorted positional indices that were selected.

    Raises:
        ValueError: If y contains no positive.
    """
    positives = np.where(y == 1)[0]
    if len(positives) == 0:
        raise ValueError("No positives in training fold for hard-negative sampling")

    generator = np.random.default_rng(seed)
    wanted_neg = max(len(positives) * neg_pos_ratio, 1)

    is_negative = (y == 0)
    near_failure = (dtf > lookahead) & (dtf <= hard_window)
    hard_pool = np.where(is_negative & near_failure)[0]
    easy_pool = np.where(is_negative & (~near_failure))[0]

    take_hard = min(int(wanted_neg * hard_fraction), len(hard_pool))
    take_easy = min(wanted_neg - take_hard, len(easy_pool))

    def _draw(pool, count):
        # Hard pool is always drawn first so the random stream is consumed
        # in a fixed order.
        if count > 0:
            return generator.choice(pool, size=count, replace=False)
        return np.empty(0, dtype=int)

    picked_hard = _draw(hard_pool, take_hard)
    picked_easy = _draw(easy_pool, take_easy)

    sel_idx = np.sort(np.concatenate([positives, picked_hard, picked_easy]))
    return X.iloc[sel_idx].reset_index(drop=True), y[sel_idx], sel_idx


# ===================== BALANCING =====================

def _encode_categoricals_for_smote(X: pd.DataFrame, cat_cols: List[str]):
    """
    Replace each categorical column by integer codes (one mapping per column).

    Codes follow the order of ``pd.Categorical``'s categories; one extra code,
    equal to the number of categories, is reserved for "__UNK__" and used for
    any entry pandas marks with code -1.

    Returns:
        (encoded copy of X, enc_maps[col][label] -> code,
        dec_maps[col][code] -> label, positional indices of cat_cols).
    """
    encoded = X.copy()
    enc_maps = {}
    dec_maps = {}
    for name in cat_cols:
        categorical = pd.Categorical(encoded[name].astype(str).fillna("UNK"))
        unk_code = categorical.categories.size

        int_codes = categorical.codes.astype(np.int64)
        int_codes[int_codes < 0] = unk_code  # -1 becomes the "__UNK__" slot
        encoded[name] = int_codes

        labels = list(categorical.categories) + ["__UNK__"]
        label_to_code = dict(zip(labels, range(len(labels))))
        enc_maps[name] = label_to_code
        dec_maps[name] = {code: label for label, code in label_to_code.items()}

    cat_indices = [encoded.columns.get_loc(name) for name in cat_cols]
    return encoded, enc_maps, dec_maps, cat_indices


def _decode_categoricals_after_smote(X_res: pd.DataFrame, dec_maps: Dict[str, Dict[int, str]], cat_cols: List[str]):
    """
    Map integer codes in the categorical columns back to their string labels.

    Codes absent from the column's decode map become "__UNK__". Works on a
    copy; X_res itself is left untouched.
    """
    decoded = X_res.copy()
    for name in cat_cols:
        code_to_label = dec_maps[name]
        int_codes = decoded[name].astype(int).values
        labels = []
        for code in int_codes:
            labels.append(code_to_label.get(int(code), "__UNK__"))
        decoded[name] = labels
    return decoded


def balance_with_strategy(
    X: pd.DataFrame,
    y: np.ndarray,
    cat_cols: List[str],
    strategy: str = "under",
    neg_pos_ratio: int = 3,        # target final NEG:POS ratio
    smote_k_neighbors: int = 5,
    enn_k_neighbors: int = 3,
    random_state: int = 42,
    max_total_samples: Optional[int] = None,
):
    """
    Rebalance a training fold towards ``neg_pos_ratio`` negatives per positive.

    Strategies:
      * "none":      return the data unchanged (index reset, y copied);
      * "under":     random under-sampling of negatives (and of positives too
                     when ``max_total_samples`` requires it);
      * "smote_knn": SMOTENC over-sampling of positives;
      * "smote_enn": SMOTENC followed by Edited Nearest Neighbours cleaning.

    For the SMOTE strategies the function falls back to plain under-sampling
    when the fold is already at (or above) the target ratio, or when there
    are fewer than two positives (SMOTE needs at least one neighbour to
    interpolate with). A fold with no positives or no negatives is returned
    unchanged.

    Args:
        X: Feature frame; columns in ``cat_cols`` are treated as categorical.
        y: Binary labels aligned with X.
        cat_cols: Names of the categorical columns.
        strategy: One of "none", "under", "smote_knn", "smote_enn".
        neg_pos_ratio: Target negatives per positive (values < 1 act as 1).
        smote_k_neighbors: Requested SMOTE neighbours (clamped to n_pos - 1).
        enn_k_neighbors: Neighbours used by ENN.
        random_state: Seed for the samplers.
        max_total_samples: Optional cap on the size of the result.

    Returns:
        (X_balanced, y_balanced) with X's index reset.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
    """
    if strategy not in {"none", "under", "smote_knn", "smote_enn"}:
        raise ValueError(f"Unknown balancing strategy: {strategy}")

    if strategy == "none":
        return X.reset_index(drop=True), y.copy()

    n_pos = int((y == 1).sum())
    n_neg = int((y == 0).sum())
    if n_pos == 0 or n_neg == 0:
        return X.reset_index(drop=True), y.copy()

    desired_neg_pos = max(1, int(neg_pos_ratio))
    desired_pos_over_neg = 1.0 / float(desired_neg_pos)  # target pos/neg
    cur_pos_over_neg = n_pos / float(n_neg)
    eps = 1e-6

    # ---------------- UNDER-SAMPLING ----------------
    if strategy == "under":
        target_neg = min(n_neg, int(round(n_pos * desired_neg_pos)))
        target_pos = n_pos
        if max_total_samples:
            per_pos = max(1, max_total_samples // (1 + desired_neg_pos))
            target_pos = min(n_pos, per_pos)
            target_neg = min(n_neg, per_pos * desired_neg_pos)
        rus = RandomUnderSampler(
            sampling_strategy={0: int(target_neg), 1: int(target_pos)},
            random_state=random_state
        )
        X_res, y_res = rus.fit_resample(X, y)
        return pd.DataFrame(X_res, columns=X.columns).reset_index(drop=True), y_res.astype(np.int8)

    # ---------------- SMOTENC / SMOTEENN ----------------
    # Guard 1: do not over-sample when the target ratio is already met, and
    # do not attempt SMOTE with a single positive (no neighbour exists, so
    # the sampler would raise). Both cases are trimmed by under-sampling.
    already_at_ratio = cur_pos_over_neg + eps >= desired_pos_over_neg
    if already_at_ratio or n_pos < 2:
        if not already_at_ratio:
            warnings.warn(
                f"balance_with_strategy: only {n_pos} positive sample(s); "
                f"'{strategy}' needs at least 2, falling back to under-sampling."
            )
        target_neg = min(n_neg, int(round(n_pos * desired_neg_pos)))
        rus = RandomUnderSampler(
            sampling_strategy={0: int(target_neg), 1: int(n_pos)},
            random_state=random_state
        )
        X_res, y_res = rus.fit_resample(X, y)
        X_res = pd.DataFrame(X_res, columns=X.columns)
        # Re-assert dtypes after the resample
        for c in X.columns:
            if c in cat_cols:
                X_res[c] = pd.Categorical(X_res[c])
            else:
                X_res[c] = pd.to_numeric(X_res[c], errors='coerce').fillna(0.0).astype(np.float32)
        return X_res.reset_index(drop=True), y_res.astype(np.int8)

    # Guard 2: make sure SMOTE is asked to create at least one sample
    target_min = max(n_pos + 1, int(np.ceil(desired_pos_over_neg * n_neg)))
    sampling_strategy = min(1.0, target_min / float(n_neg))

    # Integer-encode the categoricals for SMOTENC via the shared helper
    X_enc, _, dec_maps, cat_idx = _encode_categoricals_for_smote(X, cat_cols)

    # Clamp k_neighbors to n_pos - 1 (n_pos >= 2 here, so k >= 1 is valid)
    k_smote = max(1, min(smote_k_neighbors, n_pos - 1))
    smote = SMOTENC(
        categorical_features=cat_idx,
        sampling_strategy=float(sampling_strategy),
        k_neighbors=k_smote,
        random_state=random_state
    )

    if strategy == "smote_knn":
        X_res, y_res = smote.fit_resample(X_enc.values, y)
    else:  # smote_enn
        enn = EditedNearestNeighbours(n_neighbors=enn_k_neighbors)
        comb = SMOTEENN(smote=smote, enn=enn)
        X_res, y_res = comb.fit_resample(X_enc.values, y)

    # Optional size cap, keeping the target NEG:POS proportion
    if max_total_samples and len(y_res) > max_total_samples:
        rng = np.random.default_rng(random_state)
        pos_idx = np.where(y_res == 1)[0]; neg_idx = np.where(y_res == 0)[0]
        keep_pos = min(len(pos_idx), max_total_samples // (1 + desired_neg_pos))
        keep_neg = min(len(neg_idx), keep_pos * desired_neg_pos)
        sel_pos = rng.choice(pos_idx, size=keep_pos, replace=False)
        sel_neg = rng.choice(neg_idx, size=keep_neg, replace=False)
        sel = np.sort(np.concatenate([sel_pos, sel_neg]))
        X_res = X_res[sel]; y_res = y_res[sel]

    # Decode the categoricals back to labels and fix dtypes
    X_res_df = pd.DataFrame(X_res, columns=X_enc.columns)
    X_res_df = _decode_categoricals_after_smote(X_res_df, dec_maps, cat_cols)
    for c in X.columns:
        if c in cat_cols:
            X_res_df[c] = pd.Categorical(X_res_df[c])
        else:
            X_res_df[c] = pd.to_numeric(X_res_df[c], errors='coerce').fillna(0.0).astype(np.float32)

    return X_res_df.reset_index(drop=True), y_res.astype(np.int8)



# ===================== XGBOOST (GPU) =====================

def get_xgb_default_params(
    random_state: int = 42,
    enable_categorical: bool = True,
    max_depth: int = 8,
    n_estimators: int = 1500,
    learning_rate: float = 0.06,
    min_child_weight: float = 8.0,
    subsample: float = 0.8,
    colsample_bytree: float = 0.6,
    reg_lambda: float = 1.0,
    reg_alpha: float = 0.0,
    max_bin: int = 256,
    max_cat_to_onehot: int = 16,
):
    """
    Build the baseline XGBoost parameter dictionary used by this pipeline.

    Fixed entries: GPU histogram tree method ('gpu_hist'), GPU predictor,
    'binary:logistic' objective, 'aucpr' evaluation metric, 4 threads and
    verbosity 0. All remaining entries are copied from the arguments.

    Returns:
        A tuple ``(params, n_estimators)`` where ``params`` is the dict to hand
        to XGBoost and ``n_estimators`` is the number of boosting rounds cast
        to ``int``.
    """
    # Key order is kept stable because this dict is also dumped to JSON.
    params = {
        # Core
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'objective': 'binary:logistic',
        'eval_metric': 'aucpr',
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_lambda': reg_lambda,
        'reg_alpha': reg_alpha,
        'max_bin': max_bin,
        # Categorical handling
        'enable_categorical': enable_categorical,
        'max_cat_to_onehot': max_cat_to_onehot,
        # Miscellaneous
        'random_state': random_state,
        'nthread': 4,
        'verbosity': 0,
    }
    n_rounds = int(n_estimators)
    return params, n_rounds


class XGBBoosterWrapper:
    """
    Thin wrapper around ``xgb.Booster`` exposing a scikit-learn style
    ``predict_proba`` and pickling through the booster's raw serialization.

    NOTE(review): object columns are converted to ``category`` on every call,
    so the resulting category codes depend on the values present in the frame
    being scored - confirm this matches the encoding used at training time.
    """
    def __init__(self, booster: xgb.Booster):
        self.booster = booster

    def predict_proba(self, X: pd.DataFrame | np.ndarray) -> np.ndarray:
        """
        Return an ``(n_samples, 2)`` array with columns ``[1 - p, p]`` where
        ``p`` is the booster's prediction for each row.

        When the booster exposes ``best_iteration`` the prediction is limited
        to trees up to and including that iteration.
        """
        if not isinstance(X, pd.DataFrame):
            dmat = xgb.DMatrix(X)
        else:
            # Work on a copy so the caller's frame is never modified.
            frame = X.copy()
            for col in frame.columns:
                series = frame[col]
                # Only object columns with fewer than 10,000 distinct values are converted.
                if str(series.dtype) == 'object' and series.nunique() < 10_000:
                    frame[col] = series.astype('category')
            dmat = xgb.DMatrix(frame, enable_categorical=True)

        best_it = getattr(self.booster, "best_iteration", None)
        if best_it is None:
            scores = self.booster.predict(dmat)
        else:
            try:
                scores = self.booster.predict(dmat, iteration_range=(0, int(best_it) + 1))
            except TypeError:
                # Fallback for predict() signatures without `iteration_range`.
                tree_limit = getattr(self.booster, "best_ntree_limit", int(best_it) + 1)
                scores = self.booster.predict(dmat, ntree_limit=int(tree_limit))
        return np.vstack((1 - scores, scores)).T

    def __getstate__(self):
        """Pickle only the booster's raw serialized form."""
        raw_model = self.booster.save_raw()
        return {'raw': raw_model}

    def __setstate__(self, state):
        """Rebuild the booster from the raw bytes stored by ``__getstate__``."""
        restored = xgb.Booster()
        restored.load_model(bytearray(state['raw']))
        self.booster = restored


def train_xgb_gpu(
    X_tr: pd.DataFrame, y_tr: np.ndarray,
    X_va: Optional[pd.DataFrame] = None, y_va: Optional[np.ndarray] = None,
    params: Optional[Dict] = None, n_estimators: Optional[int] = None,
    early_stopping_rounds: int = 200, scale_pos_weight: Optional[float] = None
) -> XGBBoosterWrapper:
    """
    Train an XGBoost booster via ``xgb.train`` and return it wrapped.

    Args:
        X_tr, y_tr: training features and labels.
        X_va, y_va: optional validation set; when both are given, the training
            and validation matrices are passed as ``evals`` and early stopping
            is enabled.
        params: overrides merged on top of ``get_xgb_default_params()``.
        n_estimators: number of boosting rounds; the default from
            ``get_xgb_default_params()`` is used when ``None``.
        early_stopping_rounds: patience for early stopping; ignored (passed as
            0) when there is no validation set.
        scale_pos_weight: when not ``None``, set as ``scale_pos_weight`` in the
            booster parameters (intended for unbalanced training sets).

    Returns:
        An ``XGBBoosterWrapper`` holding the trained booster.

    NOTE(review): object columns of the training and validation frames are
    converted to ``category`` independently of each other - confirm the
    resulting category codes line up between the two frames.
    """
    def _objects_to_category(frame):
        # Converts object-dtype columns of `frame` in place.
        object_cols = [col for col in frame.columns if str(frame[col].dtype) == 'object']
        for col in object_cols:
            frame[col] = frame[col].astype('category')
        return frame

    defaults, default_rounds = get_xgb_default_params()
    train_params = {**defaults, **params} if params else defaults
    num_rounds = default_rounds if n_estimators is None else n_estimators
    if scale_pos_weight is not None:
        train_params['scale_pos_weight'] = float(scale_pos_weight)

    # Copy before converting so the caller's frame keeps its dtypes.
    train_frame = X_tr.copy()
    if isinstance(train_frame, pd.DataFrame):
        _objects_to_category(train_frame)
    dtrain = xgb.DMatrix(train_frame, label=y_tr, enable_categorical=True)

    watchlist = []
    if X_va is not None and y_va is not None:
        valid_frame = _objects_to_category(X_va.copy())
        dvalid = xgb.DMatrix(valid_frame, label=y_va, enable_categorical=True)
        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

    # Early stopping is only meaningful when there is something to evaluate on.
    patience = early_stopping_rounds if watchlist else 0
    booster = xgb.train(
        params=train_params,
        dtrain=dtrain,
        num_boost_round=int(num_rounds),
        evals=watchlist,
        early_stopping_rounds=patience,
        verbose_eval=False
    )
    return XGBBoosterWrapper(booster)


# ===================== THRESHOLDING & EVAL =====================

def pick_threshold_precision_first(
    y_true: np.ndarray, proba: np.ndarray,
    min_precision: float = 0.90, min_recall: float = 0.03,
    top_k_rate: float = 1e-4, min_alerts: int = 5
):
    """
    Choose a decision threshold that favours precision.

    Selection rule:
      1) Among the points of the precision-recall curve that satisfy both
         ``min_precision`` and ``min_recall``, take the one with the highest
         threshold.
      2) If no point qualifies, fall back to a top-k rule: the threshold is the
         k-th largest score, with
         ``k = max(max(1, min_alerts), int(len(proba) * max(top_k_rate, 1e-6)))``
         capped at the number of scores.

    Args:
        y_true: binary ground-truth labels.
        proba: predicted scores for the positive class.
        min_precision: minimum precision a threshold must reach.
        min_recall: minimum recall a threshold must reach.
        top_k_rate: fraction of samples to alert on in the fallback.
        min_alerts: lower bound on the number of alerts in the fallback.

    Returns:
        ``(threshold, pr_auc)`` as floats, where ``pr_auc`` is the average
        precision score of ``proba`` against ``y_true``.
    """
    precision, recall, thr = precision_recall_curve(y_true, proba)
    pr_auc = average_precision_score(y_true, proba)

    valid = (precision >= min_precision) & (recall >= min_recall)
    if valid.any():
        idxs = np.where(valid)[0]
        # precision/recall have one more entry than thr; the final entry has no
        # threshold of its own, so step back onto the last real threshold.
        idx = idxs[-1] - 1 if idxs[-1] >= len(thr) else idxs[-1]
        idx = max(0, min(idx, len(thr)-1))
        chosen = thr[idx]
    else:
        n_scores = len(proba)
        k = max(max(1, min_alerts), int(n_scores * max(top_k_rate, 1e-6)))
        # np.partition raises ValueError when k exceeds the array length
        # (e.g. min_alerts=20 on a fold with fewer than 20 rows), so cap it.
        k = min(k, n_scores)
        chosen = float(np.partition(proba, -k)[-k])

    return float(chosen), float(pr_auc)


def metrics_at_threshold(y_true: np.ndarray, proba: np.ndarray, thr: float) -> Dict:
    """
    Compute binary classification metrics for ``proba >= thr``.

    Args:
        y_true: binary ground-truth labels (0/1).
        proba: predicted scores for the positive class.
        thr: decision threshold; scores greater than or equal to it are
            predicted positive.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` (all computed with
        ``zero_division=0``), ``confusion_matrix`` as ``[[tn, fp], [fn, tp]]``
        and ``fpr`` (0.0 when there are no negatives).
    """
    y_pred = (proba >= thr).astype(int)
    # Pin the label set: without it the matrix collapses to 1x1 when y_true and
    # y_pred contain a single class, and the 4-way unpack below would fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        'precision': float(precision_score(y_true, y_pred, zero_division=0)),
        'recall': float(recall_score(y_true, y_pred, zero_division=0)),
        'f1': float(f1_score(y_true, y_pred, zero_division=0)),
        'confusion_matrix': [[int(tn), int(fp)], [int(fn), int(tp)]],
        'fpr': float(fp / (fp + tn)) if (fp + tn) > 0 else 0.0
    }


# ===================== MAIN (TRAIN 20–23, DEV 24, TEST 25) =====================

def train_xgb_precision_pipeline_v72(
    train_parquet: str,
    test_parquet: str | None,
    dataset_type: str,
    train_years: List[int] = [2020, 2021, 2022, 2023],
    dev_years: List[int]   = [2024],
    test_years: List[int]  = [2025],
    lookahead_days: int = 7,
    n_splits: int = 5,

    # HARD-NEG sampling
    neg_pos_ratio: int = 3,        # SSD=3, HDD=5 recommended
    hard_window: int = 60,         # HDD=90
    hard_fraction: float = 0.7,

    # BALANCING (applied after the hard-negative subset)
    balancing: str = "under",      # 'none' | 'under' | 'smote_knn' | 'smote_enn'
    balancing_neg_pos_ratio: int = 3,   # final neg:pos target
    smote_k_neighbors: int = 5,
    enn_k_neighbors: int = 3,
    max_balanced_samples: Optional[int] = None,  # e.g. 40_000

    # XGBoost
    xgb_params: Optional[Dict] = None,  # optional override
    xgb_n_estimators: int = 1500,
    early_stopping_rounds: int = 200,

    # Deployment targets (threshold)
    min_precision: float = 0.90,
    min_recall: float = 0.03,
    top_k_rate: float = 1e-4,
    min_alerts: int = 20,

    output_dir: str = './models_xgb_precision_v72',
    random_state: int = 42,
):
    """
    Train XGBoost (GPU) with precision-controlled thresholding and a
    configurable balancing strategy, then persist the model and its metadata.

    Stages:
      1. Load and prepare TRAIN / DEV / TEST rows for the requested years.
      2. Derive labels from days-to-failure with ``lookahead_days``.
      3. Build the joined feature matrices.
      4. Grouped cross-validation on TRAIN producing out-of-fold (OOF)
         diagnostics; every fold goes through hard-negative sampling and the
         chosen balancing strategy before training.
      5. Fit one model on the resampled full TRAIN set.
      6. Calibrate the production threshold on DEV when DEV rows exist,
         otherwise fall back to the global OOF threshold.
      7. Evaluate on TEST when TEST rows exist.
      8. Write ``*_model.pkl``, ``*_model.json``, ``*_features.json`` and
         ``*_metadata.json`` under ``output_dir``, all sharing one prefix.

    The list defaults for the year arguments are only read, never mutated.

    Returns:
        The metadata dict that is also written to ``*_metadata.json``.

    Raises:
        ValueError: if the training data is empty or TRAIN has fewer than 50
            positive labels.
    """
    print("="*92)
    print(f"XGBOOST (GPU, PRECISION-CONTROLLED v7.2) - {dataset_type.upper()} "
          f"(Train={train_years} | Dev={dev_years} | Test={test_years})")
    print(f"Balancing: {balancing} (target neg:pos={balancing_neg_pos_ratio})")
    print("="*92)
    os.makedirs(output_dir, exist_ok=True)

    # ---------- Load ----------
    df_tr_raw = load_data(train_parquet, train_years)
    df_dev_raw = load_data(train_parquet, dev_years) if dev_years else pd.DataFrame()
    df_te_raw  = load_data(test_parquet, test_years) if (test_parquet and test_years) else pd.DataFrame()
    if df_tr_raw.empty:
        raise ValueError("Training data is empty!")

    # ---------- Prepare ----------
    df_tr  = prepare_df(df_tr_raw)
    df_dev = prepare_df(df_dev_raw) if not df_dev_raw.empty else pd.DataFrame()
    df_te  = prepare_df(df_te_raw) if not df_te_raw.empty else pd.DataFrame()

    # ---------- Labels ----------
    dtf_tr  = compute_days_to_failure(df_tr)
    y_tr    = create_labels_from_dtf(dtf_tr, lookahead_days)
    print(f"TRAIN labels: {int(y_tr.sum()):,} pos ({100*y_tr.mean():.4f}%)")
    if y_tr.sum() < 50:
        raise ValueError(f"Insufficient positive samples in TRAIN: {y_tr.sum()}")

    dtf_dev = compute_days_to_failure(df_dev) if not df_dev.empty else np.array([], dtype=np.int64)
    y_dev   = create_labels_from_dtf(dtf_dev, lookahead_days) if not df_dev.empty else np.array([], dtype=np.int8)

    # ---------- Features (temporal join) ----------
    X_tr, X_dev, X_te, ts_tr, ts_dev, ts_te, feature_names, cat_cols = create_features_joined_xgb_v72(
        df_tr, df_dev, df_te, dataset_type, add_rolling=True
    )

    # Non-SSD datasets use a hard-negative window of at least 90 days.
    effective_hard_window = hard_window if dataset_type.upper() == 'SSD' else max(hard_window, 90)

    def _resample_training_set(X_src, y_src, dtf_src, verbose=False):
        """Hard-negative sampling + balancing; returns (X, y, scale_pos_weight)."""
        Xb, yb, _ = sample_negatives_hard(
            X_src, y_src, dtf_src,
            lookahead=lookahead_days,
            neg_pos_ratio=neg_pos_ratio,
            hard_window=effective_hard_window,
            hard_fraction=hard_fraction,
            seed=random_state,
        )
        if verbose:
            print(f"  After hard-neg sampling: {len(yb):,} (pos={int(yb.sum()):,}, neg={len(yb)-int(yb.sum()):,})")

        X_bal, y_bal = balance_with_strategy(
            Xb, yb, cat_cols=cat_cols,
            strategy=balancing,
            neg_pos_ratio=balancing_neg_pos_ratio,
            smote_k_neighbors=smote_k_neighbors,
            enn_k_neighbors=enn_k_neighbors,
            random_state=random_state,
            max_total_samples=max_balanced_samples,
        )
        # Without balancing, hand the remaining class skew to XGBoost instead.
        spw = None
        if balancing == 'none':
            n_pos = max(1, int((y_bal == 1).sum()))
            n_neg = max(1, int((y_bal == 0).sum()))
            spw = float(n_neg / n_pos)
        if verbose:
            print(f"  After balancing [{balancing}]: {len(y_bal):,} (pos={int(y_bal.sum()):,}, neg={len(y_bal)-int(y_bal.sum()):,})")
        return X_bal, y_bal, spw

    # ---------- Grouped CV on TRAIN (OOF diagnostics) ----------
    serials_tr = df_tr['serial_number']
    oof_proba = np.zeros(len(y_tr), dtype=np.float32)
    fold_metrics = []

    for fold, (tr_idx, va_idx) in enumerate(make_group_folds(serials_tr, y_tr, n_splits=n_splits, random_state=random_state), start=1):
        X_tr_fold, y_tr_fold = X_tr.iloc[tr_idx].reset_index(drop=True), y_tr[tr_idx]
        dtf_tr_fold = dtf_tr[tr_idx]

        X_va_fold, y_va_fold = X_tr.iloc[va_idx].reset_index(drop=True), y_tr[va_idx]

        print(f"\nFold {fold}/{n_splits}: train={len(y_tr_fold):,} (pos={int(y_tr_fold.sum()):,}) | "
              f"val={len(y_va_fold):,} (pos={int(y_va_fold.sum()):,})")

        # 1-2) Hard negatives, then the selected balancing strategy
        X_bal, y_bal, spw = _resample_training_set(X_tr_fold, y_tr_fold, dtf_tr_fold, verbose=True)

        # 3) XGBoost with early stopping on the held-out fold
        model = train_xgb_gpu(
            X_bal, y_bal, X_va_fold, y_va_fold,
            params=xgb_params, n_estimators=xgb_n_estimators,
            early_stopping_rounds=early_stopping_rounds,
            scale_pos_weight=spw
        )

        proba_va = model.predict_proba(X_va_fold)[:, 1]
        oof_proba[va_idx] = proba_va

        thr_oof, pr_auc = pick_threshold_precision_first(
            y_va_fold, proba_va, min_precision=min_precision, min_recall=min_recall,
            top_k_rate=top_k_rate, min_alerts=min_alerts
        )
        m = metrics_at_threshold(y_va_fold, proba_va, thr_oof)
        m.update({'pr_auc': float(pr_auc), 'threshold': float(thr_oof), 'fold': int(fold)})
        fold_metrics.append(m)
        print(f"  Fold {fold} @thr={thr_oof:.4f} | P={m['precision']:.3f} R={m['recall']:.3f} F1={m['f1']:.3f} PR-AUC={pr_auc:.4f}")

        del X_tr_fold, y_tr_fold, X_va_fold, y_va_fold, X_bal, y_bal, model
        cleanup()

    # ---------- OOF (diagnostic; only used for prod when there is no DEV) ----------
    thr_oof_global, pr_auc_oof = pick_threshold_precision_first(
        y_tr, oof_proba, min_precision=min_precision, min_recall=min_recall,
        top_k_rate=top_k_rate, min_alerts=min_alerts
    )
    agg_metrics = metrics_at_threshold(y_tr, oof_proba, thr_oof_global)
    agg_metrics.update({'pr_auc': float(pr_auc_oof), 'threshold': float(thr_oof_global)})
    print("\nOOF (diag, not used for prod):")
    print(json.dumps(agg_metrics, indent=2))

    # ---------- Final model on the full TRAIN set ----------
    # Fitted once and reused for DEV calibration, TEST evaluation and
    # persistence, so the stored threshold belongs to the stored model.
    X_bal_full, y_bal_full, spw_full = _resample_training_set(X_tr, y_tr, dtf_tr)
    final_model = train_xgb_gpu(
        X_bal_full, y_bal_full, params=xgb_params, n_estimators=xgb_n_estimators,
        early_stopping_rounds=0, scale_pos_weight=spw_full
    )
    del X_bal_full, y_bal_full
    cleanup()

    # ---------- Threshold calibration on DEV ----------
    dev_metrics = None
    if not df_dev.empty:
        proba_dev = final_model.predict_proba(X_dev)[:, 1]
        thr_prod, pr_auc_dev = pick_threshold_precision_first(
            y_dev, proba_dev, min_precision=min_precision, min_recall=min_recall,
            top_k_rate=top_k_rate, min_alerts=min_alerts
        )
        dev_metrics = metrics_at_threshold(y_dev, proba_dev, thr_prod)
        dev_metrics.update({'pr_auc': float(pr_auc_dev), 'threshold': float(thr_prod)})
        print("\nDEV calibration metrics (used for PROD threshold):")
        print(json.dumps(dev_metrics, indent=2))
        calibration = 'dev_years'
    else:
        thr_prod = float(thr_oof_global)
        calibration = 'oof_global'
    thr_used = float(thr_prod)

    # ---------- TEST ----------
    test_metrics = None
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_prefix = os.path.join(output_dir, f"{dataset_type}_xgb_precv72_{balancing}_{timestamp}")

    has_test = not df_te.empty
    if has_test:
        y_test = create_labels_from_dtf(compute_days_to_failure(df_te), lookahead_days)
        proba_test = final_model.predict_proba(X_te)[:, 1]
        test_metrics = metrics_at_threshold(y_test, proba_test, thr_used)
        test_metrics.update({'pr_auc': float(average_precision_score(y_test, proba_test)), 'threshold_used': thr_used})

    # ---------- Persistence ----------
    balancing_info = {
        'strategy': balancing,
        'neg_pos_ratio': balancing_neg_pos_ratio,
        'smote_k_neighbors': smote_k_neighbors,
        'enn_k_neighbors': enn_k_neighbors,
        'max_balanced_samples': max_balanced_samples
    }
    joblib.dump(final_model, f"{model_prefix}_model.pkl")
    final_model.booster.save_model(f"{model_prefix}_model.json")
    with open(f"{model_prefix}_features.json", 'w') as f:
        json.dump({
            'feature_names': feature_names,
            'cat_cols': cat_cols,
            # Always the production threshold, labelled with its real origin.
            'threshold': thr_used,
            'calibration': calibration,
            'balancing': dict(balancing_info),
            'xgb_params': (xgb_params or get_xgb_default_params()[0])
        }, f, indent=2)
    if has_test:
        print(f"\n✓ Final model saved: {model_prefix}_model.pkl/.json")
    else:
        print(f"\n✓ Final model saved (train-only): {model_prefix}_model.pkl/.json")

    del final_model
    cleanup()

    # ---------- Metadata ----------
    metadata = {
        'dataset_type': dataset_type,
        'train_years': train_years,
        'dev_years': dev_years,
        'test_years': test_years,
        'lookahead_days': lookahead_days,
        'n_splits': n_splits,
        'hard_negative_sampling': {
            'neg_pos_ratio': neg_pos_ratio,
            'hard_window': hard_window,
            'hard_fraction': hard_fraction
        },
        'balancing': dict(balancing_info),
        'xgb_params': (xgb_params or get_xgb_default_params()[0]) | {'n_estimators': xgb_n_estimators},
        'min_precision': min_precision,
        'min_recall': min_recall,
        'top_k_rate': top_k_rate,
        'min_alerts': min_alerts,
        'oof_metrics': {
            'precision': agg_metrics['precision'],
            'recall': agg_metrics['recall'],
            'f1': agg_metrics['f1'],
            'confusion_matrix': agg_metrics['confusion_matrix'],
            'fpr': agg_metrics['fpr'],
            'pr_auc': agg_metrics['pr_auc'],
            'threshold': agg_metrics['threshold']
        },
        'dev_metrics': dev_metrics,
        'test_metrics': test_metrics,
        'feature_names': feature_names,
        'cat_cols': cat_cols,
        'normalization': 'per-model robust z + log1p on *_raw'
    }

    # Same prefix (and timestamp) as the model artefacts of this run.
    meta_path = f"{model_prefix}_metadata.json"
    with open(meta_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"\n✓ Metadata saved: {meta_path}")
    print("="*92)
    return metadata


# ===================== WRAPPERS =====================

def train_ssd_precision_v72_xgb():
    """
    Run the v72 precision-controlled XGBoost pipeline with the SSD preset.

    The preset uses a 3:1 negative:positive ratio, a 60-day hard-negative
    window and 'smote_knn' balancing capped at 40,000 samples, reading both
    train and test rows from the SSD parquet file.

    Returns:
        Whatever ``train_xgb_precision_pipeline_v72`` returns for this preset.
    """
    # Booster overrides (VRAM guardrails + performance)
    booster_overrides = {
        'learning_rate': 0.06,
        'max_depth': 8,
        'min_child_weight': 8.0,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'reg_lambda': 1.2,
        'reg_alpha': 0.0,
        'max_bin': 256,
        'enable_categorical': True,
        'max_cat_to_onehot': 16,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'eval_metric': 'aucpr',
        'random_state': 42,
        'verbosity': 0,
    }

    ssd_parquet = './Procesados/finales/SSD_FULL_CLEAN.parquet'
    pipeline_config = {
        # Data and splits
        'train_parquet': ssd_parquet,
        'test_parquet': ssd_parquet,
        'dataset_type': 'SSD',
        'train_years': [2020, 2021, 2022, 2023],
        'dev_years': [2024],
        'test_years': [2025],
        'lookahead_days': 7,
        'n_splits': 5,
        # Hard-negative sampling
        'neg_pos_ratio': 3,
        'hard_window': 60,
        'hard_fraction': 0.7,
        # Balancing; switch to 'smote_enn' if the positive class is < ~0.1%
        'balancing': 'smote_knn',
        'balancing_neg_pos_ratio': 3,
        'smote_k_neighbors': 5,
        'enn_k_neighbors': 3,
        'max_balanced_samples': 40_000,
        # XGBoost
        'xgb_params': booster_overrides,
        'xgb_n_estimators': 1500,
        'early_stopping_rounds': 200,
        # Threshold targets
        'min_precision': 0.90,
        'min_recall': 0.03,
        'top_k_rate': 1e-4,
        'min_alerts': 20,
        'output_dir': './models_xgb_ssd_precv72',
        'random_state': 42,
    }
    return train_xgb_precision_pipeline_v72(**pipeline_config)


def train_hdd_precision_v72_xgb():
    """
    Run the v72 precision-controlled XGBoost pipeline with the HDD preset.

    The preset uses a 5:1 negative:positive ratio, a 90-day hard-negative
    window and plain 'under' balancing capped at 60,000 samples, reading both
    train and test rows from the HDD parquet file.

    Returns:
        Whatever ``train_xgb_precision_pipeline_v72`` returns for this preset.
    """
    booster_overrides = {
        'learning_rate': 0.06,
        'max_depth': 8,
        'min_child_weight': 10.0,
        'subsample': 0.8,
        'colsample_bytree': 0.6,
        'reg_lambda': 1.0,
        'reg_alpha': 0.0,
        # 256-512; raise to 512 if PR-AUC improves steadily and VRAM allows
        'max_bin': 256,
        'enable_categorical': True,
        'max_cat_to_onehot': 16,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'eval_metric': 'aucpr',
        'random_state': 42,
        'verbosity': 0,
    }

    hdd_parquet = './Procesados/finales/HDD_FULL_CLEAN.parquet'
    pipeline_config = {
        # Data and splits
        'train_parquet': hdd_parquet,
        'test_parquet': hdd_parquet,
        'dataset_type': 'HDD',
        'train_years': [2020, 2021, 2022, 2023],
        'dev_years': [2024],
        'test_years': [2025],
        'lookahead_days': 7,
        'n_splits': 5,
        # Hard-negative sampling
        'neg_pos_ratio': 5,
        'hard_window': 90,
        'hard_fraction': 0.7,
        # Balancing; start with 'under' on HDD, try SMOTENC on subsets
        'balancing': 'under',
        'balancing_neg_pos_ratio': 5,
        'smote_k_neighbors': 7,
        'enn_k_neighbors': 3,
        'max_balanced_samples': 60_000,
        # XGBoost
        'xgb_params': booster_overrides,
        'xgb_n_estimators': 1500,
        'early_stopping_rounds': 200,
        # Threshold targets
        'min_precision': 0.90,
        'min_recall': 0.03,
        'top_k_rate': 7.5e-5,
        'min_alerts': 30,
        'output_dir': './models_xgb_hdd_precv72',
        'random_state': 42,
    }
    return train_xgb_precision_pipeline_v72(**pipeline_config)


In [12]:
train_ssd_precision_v72_xgb()

XGBOOST (GPU, PRECISION-CONTROLLED v7.2) - SSD (Train=[2020, 2021, 2022, 2023] | Dev=[2024] | Test=[2025])
Balancing: smote_knn (target neg:pos=3)
Loading [2020, 2021, 2022, 2023] from ./Procesados/finales/SSD_FULL_CLEAN.parquet...
Loaded 2,124,111 rows
Loading [2024] from ./Procesados/finales/SSD_FULL_CLEAN.parquet...
Loaded 1,220,745 rows
Loading [2025] from ./Procesados/finales/SSD_FULL_CLEAN.parquet...
Loaded 0 rows
TRAIN labels: 1,325 pos (0.0624%)
Creating features (temporal join) for SSD with TRAIN∪DEV∪TEST...
  Found 13 SMART attributes (raw)
  Final features: 97 (cat=2, num≈95)

Fold 1/5: train=1,709,653 (pos=1,079) | val=414,458 (pos=246)
  After hard-neg sampling: 4,316 (pos=1,079, neg=3,237)
  After balancing [smote_knn]: 4,316 (pos=1,079, neg=3,237)
  Fold 1 @thr=0.7276 | P=1.000 R=0.045 F1=0.086 PR-AUC=0.2743

Fold 2/5: train=1,707,701 (pos=1,054) | val=416,410 (pos=271)
  After hard-neg sampling: 4,216 (pos=1,054, neg=3,162)
  After balancing [smote_knn]: 4,216 (pos=1,05

{'dataset_type': 'SSD',
 'train_years': [2020, 2021, 2022, 2023],
 'dev_years': [2024],
 'test_years': [2025],
 'lookahead_days': 7,
 'n_splits': 5,
 'hard_negative_sampling': {'neg_pos_ratio': 3,
  'hard_window': 60,
  'hard_fraction': 0.7},
 'balancing': {'strategy': 'smote_knn',
  'neg_pos_ratio': 3,
  'smote_k_neighbors': 5,
  'enn_k_neighbors': 3,
  'max_balanced_samples': 40000},
 'xgb_params': {'learning_rate': 0.06,
  'max_depth': 8,
  'min_child_weight': 8.0,
  'subsample': 0.8,
  'colsample_bytree': 0.6,
  'reg_lambda': 1.2,
  'reg_alpha': 0.0,
  'max_bin': 256,
  'enable_categorical': True,
  'max_cat_to_onehot': 16,
  'tree_method': 'gpu_hist',
  'predictor': 'gpu_predictor',
  'eval_metric': 'aucpr',
  'random_state': 42,
  'verbosity': 0,
  'n_estimators': 1500},
 'min_precision': 0.9,
 'min_recall': 0.03,
 'top_k_rate': 0.0001,
 'min_alerts': 20,
 'oof_metrics': {'precision': 0.9302325581395349,
  'recall': 0.03018867924528302,
  'f1': 0.05847953216374269,
  'confusion_m