# Ablation Study – Examen Parcial

Se evalúa el impacto de diferentes subconjuntos de variables (full, solo numéricas, solo categóricas) sobre el desempeño del modelo RandomForest, medido con F1, recall y ROC AUC mediante validación cruzada estratificada de 5 particiones.


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Load the feature table from the processed-data parquet file; every
# ablation configuration below draws its columns from this frame.
dataset_path = 'data/processed/dataset_parcial_features.parquet'
df = pd.read_parquet(dataset_path)

# Columns handed to run_cfg as `num_cols`, i.e. routed through the numeric
# preprocessing pipeline. Order is significant: it fixes the column order of
# the design matrix.
features_num = [
    "TIEMPO_ABSOLUCION_CONSULTAS", "TIEMPO_PRESENTACION_OFERTAS",
    "MONTO_CONTRACTUAL", "MONTO_REFERENCIAL",
    "MONTO_OFERTADO_PROMEDIO", "MONTO_OFERTADO",
    "TOTALPROCESOSPARTICIPANTES", "DIAS_PLAZO",
    "TOTAL_CONTROL_PREVIO", "TOTAL_CONTROL_SIMULTANEO",
    "TOTAL_CONTROL_POSTERIOR",
    "PLANIFICADO", "REAL",
    "ANHO", "MES",
]

# Columns handed to run_cfg as `cat_cols`, i.e. routed through the
# categorical preprocessing pipeline (one-hot encoded).
features_cat = [
    "SECTOR", "DEPARTAMENTO", "NIVEL_GOBIERNO",
    "OBJETO_PROCESO", "METODO_CONTRATACION",
    "ESTADO_OBRA", "ETAPA",
    "IND_INTERVENSION", "IND_RESIDENTE",
    "IND_MONTO_ADELANTO_MATERIALES", "IND_MONTO_ADELANTO_DIRECTO",
]


# Design matrix: the declared numeric columns followed by the categorical
# ones, in that order. The target is the 'y_riesgo' column.
selected_columns = [*features_num, *features_cat]
X = df[selected_columns]
y = df['y_riesgo']

# Numeric branch: replace missing values with the constant 0, then
# standardize each column.
num_base = Pipeline([
    ('imputer', SimpleImputer(fill_value=0, strategy='constant')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent
# category, then one-hot encode. handle_unknown='ignore' keeps transform from
# raising on categories that were not present when the encoder was fitted.
cat_base = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

def run_cfg(name, num_cols, cat_cols, *, n_estimators=300, n_splits=5,
            random_state=42):
    """Cross-validate a RandomForest pipeline on one subset of features.

    Builds a ColumnTransformer that sends ``num_cols`` through ``num_base``
    and ``cat_cols`` through ``cat_base``, chains it with a class-weighted
    RandomForestClassifier, and evaluates the pipeline with shuffled
    stratified k-fold cross-validation on the module-level ``X`` and ``y``.

    Args:
        name: Label stored under the ``'config'`` key of the result.
        num_cols: Column names of ``X`` handled by the numeric pipeline.
            May be empty.
        cat_cols: Column names of ``X`` handled by the categorical pipeline.
            May be empty.
        n_estimators: Number of trees in the forest.
        n_splits: Number of stratified folds.
        random_state: Seed used for both the forest and the fold shuffling.

    Returns:
        A dict with the keys ``'config'``, ``'f1_mean'``, ``'recall_mean'``
        and ``'roc_auc_mean'``; each metric is the mean of the per-fold
        test scores.

    Raises:
        ValueError: If ``num_cols`` and ``cat_cols`` are both empty.
    """
    # Normalize to lists so tuples (or any iterable of names) can be mixed
    # freely and concatenated below.
    num_cols = list(num_cols)
    cat_cols = list(cat_cols)

    # Fail fast with a clear message instead of letting the estimator choke
    # on a zero-column matrix after the whole pipeline has been built.
    if not (num_cols or cat_cols):
        raise ValueError(
            f"run_cfg({name!r}): num_cols and cat_cols are both empty; "
            "at least one feature column is required"
        )

    preprocessor = ColumnTransformer([
        ('num', num_base, num_cols),
        ('cat', cat_base, cat_cols),
    ])
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        class_weight='balanced',
        random_state=random_state,
        n_jobs=-1,
    )
    pipe = Pipeline([('pre', preprocessor), ('model', forest)])
    folds = StratifiedKFold(
        n_splits=n_splits, shuffle=True, random_state=random_state
    )

    metrics = ('f1', 'recall', 'roc_auc')
    scores = cross_validate(
        pipe,
        X[num_cols + cat_cols],
        y,
        cv=folds,
        scoring={metric: metric for metric in metrics},
    )

    summary = {'config': name}
    for metric in metrics:
        summary[f'{metric}_mean'] = scores[f'test_{metric}'].mean()
    return summary

# One (label, numeric columns, categorical columns) triple per ablation
# variant; they are evaluated in this order.
configuraciones = [
    ('full', features_num, features_cat),
    ('solo_numericas', features_num, []),
    ('solo_categoricas', [], features_cat),
]
resultados = [
    run_cfg(etiqueta, cols_num, cols_cat)
    for etiqueta, cols_num, cols_cat in configuraciones
]

# One row per configuration, one column per averaged metric.
df_abla = pd.DataFrame(resultados)
display(df_abla)
