In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

BASE = Path.cwd().parents[0]
DATA = BASE / "data" / "processed" / "dataset_obras.parquet"
FIGS = BASE / "reports" / "figures"
FIGS.mkdir(parents=True, exist_ok=True)

df = pd.read_parquet(DATA)

# 1) Distribución target
ax = df["y_riesgo"].value_counts().sort_index().plot(kind="bar")
ax.set_title("Distribución de y_riesgo")
ax.set_xlabel("Clase")
ax.set_ylabel("Cuenta")
plt.tight_layout()
plt.savefig(FIGS / "01_target_dist.png")
plt.close()

# 2) Missing values (top 20)
na = df.isna().sum().sort_values(ascending=False)
na = na[na > 0].head(20)
ax = na.plot(kind="bar")
ax.set_title("Nulos por columna (Top 20)")
ax.set_ylabel("Nulos")
plt.tight_layout()
plt.savefig(FIGS / "02_missing_top20.png")
plt.close()

# 3) Importancias (Permutation) si tienes pipeline y modelo
import joblib
from sklearn.inspection import permutation_importance

try:
    pipe = joblib.load(BASE / "models" / "pipeline.pkl")
    y = df["y_riesgo"].astype(int)
    X = df.drop(columns=["y_riesgo"])
    # muestrar si es grande
    rs = np.random.RandomState(42)
    sample_idx = rs.choice(len(X), size=min(150, len(X)), replace=False)
    Xs, ys = X.iloc[sample_idx], y.iloc[sample_idx]
    r = permutation_importance(pipe, Xs, ys, n_repeats=5, random_state=42)
    # nombres de features post-OHE
    # (usamos nombres de columnas crudos; para OHE el mapping es largo, basta top-20 índices)
    imp = (
        pd.Series(r.importances_mean, index=range(len(r.importances_mean)))
        .sort_values(ascending=False)
        .head(20)
    )
    ax = imp.plot(kind="bar")
    ax.set_title("Importancia por permutación (Top 20, índices transformados)")
    ax.set_ylabel("Δ score")
    plt.tight_layout()
    plt.savefig(FIGS / "03_importance_perm.png")
    plt.close()
except Exception as e:
    print("No se pudo calcular permutation_importance:", e)

# 4) Correlación numérica
num_cols = df.select_dtypes(include=["number"]).drop(columns=["y_riesgo"]).columns
if len(num_cols) >= 2:
    corr = df[num_cols].corr(numeric_only=True)
    plt.figure()
    plt.imshow(corr, aspect="auto")
    plt.colorbar()
    plt.title("Matriz de correlación (numérica)")
    plt.xticks(range(len(num_cols)), num_cols, rotation=90, fontsize=7)
    plt.yticks(range(len(num_cols)), num_cols, fontsize=7)
    plt.tight_layout()
    plt.savefig(FIGS / "04_corr_heatmap.png")
    plt.close()

# 5) Top categorías (ejemplo: SECTOR si existe)
cat_col = next((c for c in ["SECTOR", "NIVEL_GOBIERNO", "ESTADO_OBRA"] if c in df.columns), None)
if cat_col:
    top = df[cat_col].value_counts().head(10)
    ax = top.plot(kind="bar")
    ax.set_title(f"Top categorías: {cat_col}")
    ax.set_ylabel("Cuenta")
    plt.tight_layout()
    plt.savefig(FIGS / "05_top_categorias.png")
    plt.close()

# 6) Curva PR del modelo guardado
try:
    from sklearn.metrics import average_precision_score, precision_recall_curve

    meta = json.load(open(BASE / "models" / "pipeline_meta.json"))
    thr = meta.get("best_threshold_f1", 0.5)

    # partimos 80/20 igual que en entrenamiento para reproducir proba
    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=["y_riesgo"]),
        df["y_riesgo"].astype(int),
        test_size=0.2,
        random_state=42,
        stratify=df["y_riesgo"].astype(int),
    )
    pos = list(pipe.named_steps["model"].classes_).index(1)
    proba = pipe.predict_proba(X_test)[:, pos]
    p, r, _ = precision_recall_curve(y_test, proba)
    ap = average_precision_score(y_test, proba)
    plt.plot(r, p)
    plt.title(f"Curva Precisión-Recall (AP={ap:.3f})")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.tight_layout()
    plt.savefig(FIGS / "06_pr_curve.png")
    plt.close()
except Exception as e:
    print("No se pudo generar PR curve:", e)

print("Figuras guardadas en:", FIGS)

No se pudo generar PR curve: name 'json' is not defined
Figuras guardadas en: c:\MaestriaUNI\Cursos\III-CICLO\TesisI\Solucion\Deteccion_Corrupcion\reports\figures
