# XAI – Interpretabilidad con SHAP (RandomForest)

Este notebook entrena un RandomForest con el mismo preprocesamiento del parcial y utiliza SHAP para analizar la importancia de las variables.


In [1]:

# ======================================================
# Patch rutas absolutas (compatible con papermill + jobs)
# ======================================================
import os

# Absolute path to the project root, taken as the parent of the current
# working directory.
# NOTE(review): assumes the kernel's working directory is a direct subfolder
# of the project -- confirm this also holds for papermill/job runs.
ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

def path(*args):
    """Return a path built by joining ``args`` onto the project root.

    Args:
        *args: Path components, forwarded unchanged to ``os.path.join``
            after ``ROOT``.

    Returns:
        str: ``ROOT`` joined with the given components.
    """
    return os.path.join(ROOT, *args)


# Pass each directory as its own component so os.path.join inserts the
# platform separator. A literal "\\" is only a separator on Windows; on
# POSIX it would become part of a single file name and the read would fail.
INPUT_PATH = path('data', 'processed', 'dataset_parcial_features.parquet')
print("[Patch] Rutas absolutas activadas. INPUT_PATH =", INPUT_PATH)


[Patch] Rutas absolutas activadas. INPUT_PATH = c:\IA_Investigacion\Deteccion_Corrupcion\data\processed\dataset_parcial_features.parquet


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
import shap

# Load the feature table from the location configured in INPUT_PATH.
df = pd.read_parquet(INPUT_PATH)

# Columns fed to the numeric preprocessing branch.
features_num = [
    "TIEMPO_ABSOLUCION_CONSULTAS", "TIEMPO_PRESENTACION_OFERTAS",
    "MONTO_CONTRACTUAL", "MONTO_REFERENCIAL",
    "MONTO_OFERTADO_PROMEDIO", "MONTO_OFERTADO",
    "TOTALPROCESOSPARTICIPANTES", "DIAS_PLAZO",
    "TOTAL_CONTROL_PREVIO", "TOTAL_CONTROL_SIMULTANEO",
    "TOTAL_CONTROL_POSTERIOR",
    "PLANIFICADO", "REAL",
    "ANHO", "MES",
]

# Columns fed to the categorical preprocessing branch.
features_cat = [
    "SECTOR", "DEPARTAMENTO", "NIVEL_GOBIERNO",
    "OBJETO_PROCESO", "METODO_CONTRATACION",
    "ESTADO_OBRA", "ETAPA",
    "IND_INTERVENSION", "IND_RESIDENTE",
    "IND_MONTO_ADELANTO_MATERIALES", "IND_MONTO_ADELANTO_DIRECTO",
]

# Design matrix keeps numeric columns first, then categorical ones;
# the target is the 'y_riesgo' column.
X = df[[*features_num, *features_cat]]
y = df['y_riesgo']

# Numeric branch: replace missing values with 0, then standardize.
num_trans = Pipeline([
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_trans = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
pre = ColumnTransformer(transformers=[
    ('num', num_trans, features_num),
    ('cat', cat_trans, features_cat),
])

# Class-weighted forest with a fixed seed, trained on all available cores.
model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',
    n_estimators=300,
)

# Preprocessing and estimator chained so they are fitted together.
pipe = Pipeline(steps=[
    ('pre', pre),
    ('model', model),
])

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Fit the preprocessor and the forest on the training portion only.
pipe.fit(X_train, y_train)

# Re-apply the fitted preprocessor so the matrix handed to SHAP has the same
# columns the forest was trained on.
pre_fitted = pipe.named_steps['pre']
X_train_trans = pre_fitted.transform(X_train)
if hasattr(X_train_trans, 'toarray'):
    # The transformer output may be a scipy sparse matrix; densify it.
    X_train_trans = X_train_trans.toarray()
X_train_trans = X_train_trans.astype('float64')

# Names of the transformed columns, so the plot shows real variable names
# instead of positional labels. Older scikit-learn releases cannot derive
# names through nested pipelines; in that case fall back to SHAP's defaults.
try:
    feature_names = list(pre_fitted.get_feature_names_out())
except AttributeError:
    feature_names = None

# NOTE(review): this explains every training row on a dense matrix, which can
# be slow with 300 trees -- consider a row sample if runtime is a problem.
rf = pipe.named_steps['model']
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_train_trans)

# The return layout of shap_values depends on the SHAP release: older ones
# give a list with one (samples, features) array per class, newer ones give a
# single (samples, features, classes) array. Indexing the latter with [1]
# would pick sample 1 rather than class 1, so select the class explicitly.
# NOTE(review): index 1 is assumed to be the positive/risk class -- confirm
# against rf.classes_.
if isinstance(shap_values, list):
    shap_values_pos = shap_values[1]
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_values_pos = shap_values[:, :, 1]
else:
    shap_values_pos = shap_values

# Pass the feature matrix and names so points are coloured by feature value
# and the axis is labelled with the transformed column names.
shap.summary_plot(shap_values_pos, X_train_trans, feature_names=feature_names)
