# 03 · Entrenamiento y Evaluación (Pipeline sklearn)
Usa `data/processed/dataset_obras.parquet` con `y_riesgo` como target.
Guarda `models/pipeline.pkl` y los metadatos en `models/pipeline_meta.json`.

In [None]:
# === Notebook 03: Entrenamiento, selección y guardado del pipeline ===
from pathlib import Path
import json, warnings
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, roc_auc_score, average_precision_score,
    precision_recall_curve, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import joblib

# Silence SimpleImputer's notice about columns that have no observed values
# (e.g. a column that is entirely NaN within one training fold).
# scikit-learn emits this warning in English and `message` is a regex matched
# against the start of the warning text, so the pattern must be the English
# wording; a translated message never matches and the filter has no effect.
warnings.filterwarnings("ignore", message="Skipping features without any observed values")

# --- Paths and constants ---
# The project root is taken as the parent of the current working directory
# (presumably the notebook runs from a sub-folder of the project — TODO confirm).
BASE = Path.cwd().parents[0]
OUT_DIR = BASE.joinpath("models")
DATA = BASE.joinpath("data", "processed", "dataset_obras.parquet")  # input dataset
OUT = OUT_DIR.joinpath("pipeline.pkl")          # serialized fitted pipeline
META = OUT_DIR.joinpath("pipeline_meta.json")   # JSON metadata written next to it
TARGET = "y_riesgo"                             # name of the target column

# == 1) Load dataset ==
print("Leyendo:", DATA)
df = pd.read_parquet(DATA)
# Explicit raise instead of `assert`: assertions are stripped when Python runs
# with -O, which would let a dataset without the target column slip through.
if TARGET not in df.columns:
    raise KeyError(f"No encuentra target: {TARGET}")
print("Shape:", df.shape)

# Quick look at the columns with the most missing values.
print("Nulos por columna (top 15):")
print(df.isna().sum().sort_values(ascending=False).head(15))

# Separate the target (cast to int) from the feature matrix.
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET])

print("\nDistribución target total:")
print(y.value_counts(dropna=False))

# == 2) Drop problematic columns ==
# 2.1 Columns in which every value is missing
all_nan = X.columns[X.isna().all(axis=0)].tolist()
if len(all_nan) > 0:
    print("\nEliminando columnas 100% NaN:", all_nan)
    X = X.drop(columns=all_nan)

# 2.2 Constant columns: at most one distinct non-missing value
n_distinct = X.nunique(dropna=True)
const_cols = n_distinct.index[n_distinct <= 1].tolist()
if len(const_cols) > 0:
    print("Eliminando columnas constantes:", const_cols)
    X = X.drop(columns=const_cols)

# 2.3 Re-detect column types on the cleaned frame
num_cols = list(X.select_dtypes(include=["number"]).columns)
cat_cols = list(X.select_dtypes(include=["object", "category", "bool"]).columns)
print(f"[Después de limpieza] Num cols: {len(num_cols)}  Cat cols: {len(cat_cols)}  Total: {X.shape[1]}")

# == 3) Row sanity check and robust stratified split ==
# Drop rows in which every numeric and every categorical feature is missing.
X_num = X[num_cols] if num_cols else pd.DataFrame(index=X.index)
X_cat = X[cat_cols] if cat_cols else pd.DataFrame(index=X.index)

# Without any usable feature column there is nothing to train on; fail with a
# clear message instead of an AttributeError on a plain bool further down.
feature_cols = num_cols + cat_cols
if not feature_cols:
    raise ValueError("No quedan columnas numéricas ni categóricas para entrenar")

# One boolean per row: True when all numeric AND all categorical features are NaN.
rows_all_nan = X[feature_cols].isna().all(axis=1)

n_drop = int(rows_all_nan.sum())
if n_drop > 0:
    print("Eliminando filas sin ninguna feature:", n_drop)
    X = X.loc[~rows_all_nan].copy()
    # Keep the target aligned with the surviving rows.
    y = y.loc[X.index].copy()

print("Filas post-limpieza:", len(X))
print("Distribución target post-limpieza:\n", y.value_counts(dropna=False))

# Split: make sure at least one minority-class row lands in the test set.
class_counts = y.value_counts()
minor = int(class_counts.min()) if len(class_counts) else 0

# A stratified split needs two classes with at least two rows each (one for
# train, one for test). Checking here also keeps test_size strictly below 1:
# with minor == 1 the formula below would yield 1.0, which is not a valid
# fraction for train_test_split.
if len(class_counts) < 2 or minor < 2:
    raise ValueError(
        "Se requieren al menos 2 clases con 2 o más filas cada una para el split "
        f"estratificado; distribución actual: {class_counts.to_dict()}"
    )

ts_min = max(0.1, 1 / minor)  # at least 10%, or enough for one minority row
test_size = max(0.2, ts_min) if len(y) >= 10 else 0.3

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=y, random_state=42
)
print(f"Split con test_size={test_size:.3f} -> train={len(X_train)}  test={len(X_test)}")

# == 4) Preprocessing ==
# Numeric columns: fill missing values with the median.
num_imputer = SimpleImputer(strategy="median")

# Categorical columns: fill with the most frequent value, then one-hot encode;
# categories not seen during fit are ignored at transform time.
cat_branch = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

pre = ColumnTransformer(transformers=[
    ("num", num_imputer, num_cols),
    ("cat", cat_branch, cat_cols),
])

# == 5) Model comparison by PR-AUC (stratified CV on the training split) ==
models = {
    "LogReg": LogisticRegression(max_iter=2000, class_weight="balanced"),
    "RF": RandomForestClassifier(
        n_estimators=400, max_depth=10, min_samples_leaf=3,
        class_weight="balanced", n_jobs=-1, random_state=42
    ),
    "GB": GradientBoostingClassifier(random_state=42)
}

# Up to 5 folds; fewer when the rarest training class cannot populate them all.
n_splits = max(2, min(5, int(y_train.value_counts().min())))
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
cv_scores = {}
print("\n== Validación cruzada (PR-AUC) ==")
# NOTE(review): "average_precision" scores the class labelled 1; confirm that
# this is the class of interest if label 1 turns out to be the majority class.
for name, m in models.items():
    pipe = Pipeline([("pre", pre), ("model", m)])
    # Cross-validate on the training split only. Scoring on the full X, y would
    # let the held-out test rows influence which model is selected, making the
    # later test-set evaluation optimistic.
    scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="average_precision")
    cv_scores[name] = float(scores.mean())
    print(f"{name:>6}  PR-AUC CV: {np.round(scores,4)}  → mean: {scores.mean():.4f}")

# Pick the model with the highest mean CV score.
best_name = max(cv_scores, key=cv_scores.get)
best_model = models[best_name]
print(f"\n▶ Mejor por PR-AUC CV: {best_name} (mean={cv_scores[best_name]:.4f})")

# == 6) Fit the selected model and evaluate it on the held-out test set ==
# Local import: only this section needs out-of-fold predictions.
from sklearn.model_selection import cross_val_predict

clf = Pipeline([("pre", pre), ("model", best_model)])
clf.fit(X_train, y_train)

# Hard predictions using the estimator's default decision rule.
pred = clf.predict(X_test)

# Probability of the class labelled 1, when the estimator exposes predict_proba.
proba = None
pos_idx = 1
try:
    est = clf.named_steps["model"]
    classes = list(est.classes_)
    pos_idx = classes.index(1) if 1 in classes else 1
    proba = clf.predict_proba(X_test)[:, pos_idx]
except (AttributeError, IndexError) as e:
    # AttributeError: no predict_proba; IndexError: fewer probability columns
    # than expected. Fall back to the default threshold in both cases.
    print("Predict_proba no disponible:", e)

print(f"\n== {best_name} (umbral 0.5) ==")
print(classification_report(y_test, pred, digits=4))
best_thr = 0.5
if proba is not None:
    roc = roc_auc_score(y_test, proba)
    prauc = average_precision_score(y_test, proba)
    print(f"ROC-AUC: {roc:.4f} | PR-AUC: {prauc:.4f}")

    # F1-optimal threshold, chosen from out-of-fold predictions on the training
    # split. Tuning it on the test set and then reporting metrics on that same
    # set would overstate performance and store an over-fitted threshold.
    oof_proba = cross_val_predict(
        clf, X_train, y_train, cv=cv, method="predict_proba"
    )[:, pos_idx]
    precisions, recalls, thresholds = precision_recall_curve(y_train, oof_proba)
    f1_scores = 2*precisions*recalls/(precisions+recalls+1e-9)
    # The last precision/recall point has no matching threshold, hence [:-1].
    best_idx = int(f1_scores[:-1].argmax()) if len(thresholds) else 0
    best_thr = float(thresholds[best_idx]) if len(thresholds) else 0.5

    # Apply the training-derived threshold to the untouched test set.
    pred_opt = (proba >= best_thr).astype(int)
    cm = confusion_matrix(y_test, pred_opt, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()
    precision_opt = tp/(tp+fp+1e-9)
    recall_opt = tp/(tp+fn+1e-9)
    f1_opt = 2*precision_opt*recall_opt/(precision_opt+recall_opt+1e-9)
    print(f"\nUmbral óptimo por F1: {best_thr:.3f}")
    print("Matriz de confusión (umbral óptimo por F1) [0/1]:")
    print(cm)
    print(f"Precision: {precision_opt:.4f} | Recall: {recall_opt:.4f} | F1: {f1_opt:.4f}")

# == 7) Save the fitted pipeline and its metadata ==
OUT_DIR.mkdir(parents=True, exist_ok=True)
# joblib files are pickle-based: only load them back from trusted locations.
joblib.dump(clf, OUT)
meta = {
    "target": TARGET,
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "best_model": best_name,
    "cv_pr_auc": cv_scores,
    "best_threshold_f1": best_thr
}
# ensure_ascii=False keeps non-ASCII characters verbatim, so the file must be
# written as UTF-8 explicitly; the platform default (e.g. cp1252 on Windows)
# can fail or produce a file that JSON readers expecting UTF-8 cannot decode.
META.write_text(json.dumps(meta, indent=2, ensure_ascii=False), encoding="utf-8")
print("\n✅ Pipeline guardado en:", OUT)
print("✅ Metadatos guardados en:", META)


Leyendo: c:\MaestriaUNI\Cursos\III-CICLO\TesisI\Solucion\Deteccion_Corrupcion\data\processed\dataset_obras.parquet
Shape: (171, 53)
Nulos por columna (top 15):
DNI_MIEMBRO_COMITE               171
CONVOCATORIA                     171
CODIGO_RUC_PARTICIPANTE          171
CODIGO_RUC_GANADOR               171
NOMBRE_MIEMBRO_COMITE            171
MES                              171
Planificado                      171
ANHO                             171
IND_Monto_Adelanto_Directo       171
IND_Monto_Adelanto_Materiales    171
IND_Residente                    171
IND_Intervension                 171
Real                             171
IND_Fecha_Adelanto_Directo       171
IND_Fecha_Adelanto_Materiales    171
dtype: int64

Distribución target total:
y_riesgo
1    153
0     18
Name: count, dtype: int64

Eliminando columnas 100% NaN: ['CONVOCATORIA', 'DNI_MIEMBRO_COMITE', 'NOMBRE_MIEMBRO_COMITE', 'CODIGO_RUC_GANADOR', 'CODIGO_RUC_PARTICIPANTE', 'NOMBRE_PARTICIPANTE', 'EMPRESA_EJECUTORA', 'EM