In [3]:
# ===========================================================
# PIPELINE GENERAL: evaluación múltiple de modelos (Base/Aumentado)
# ===========================================================

import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error,
    r2_score,
    root_mean_squared_error
)
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor





In [None]:
def interpretar_rmse_log(rmse_log, df, target_col="precio_venta"):
    """Translate a log-scale RMSE into a relative error and an amount in COP.

    Args:
        rmse_log: RMSE measured on the log scale of the target. It is passed
            through ``np.exp``, so a natural-log scale is presumably expected.
        df: DataFrame that contains the price column named by ``target_col``.
        target_col: Name of the price column whose mean is the reference value.

    Returns:
        Tuple ``(relative_error, error_in_pesos, mean_price)``:
        ``relative_error`` is ``exp(rmse_log) - 1`` as a fraction (not
        multiplied by 100), ``error_in_pesos`` is that fraction applied to
        the mean price, and ``mean_price`` is the mean of ``df[target_col]``.
    """
    relative_error = np.exp(rmse_log) - 1
    mean_price = df[target_col].mean()
    return relative_error, mean_price * relative_error, mean_price


def _construir_preprocesador(numeric_features, categorical_features):
    """Build a fresh, unfitted ColumnTransformer for the given column lists."""
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])
    return ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features)
        ]
    )


def evaluar_modelos(df, nombre_dataset="Dataset", n_splits=10, random_state=42):
    """
    Cross-validate several regression models on ``df`` and fit each on all rows.

    The target is the ``precio_venta_log`` column. Every other column except
    ``precio_venta``, ``sector`` and ``localidad_calculada`` is a candidate
    feature. Only int64/float64 columns (median-imputed and standardized) and
    object/bool columns (most-frequent-imputed and one-hot encoded) are used;
    any other dtype is reported and left out.

    Args:
        df: DataFrame containing ``precio_venta_log``, ``precio_venta`` and
            the feature columns.
        nombre_dataset: Label printed in the log and stored in the "Dataset"
            column of the results.
        n_splits: Number of shuffled K-fold splits used for cross-validation.
        random_state: Seed used for the K-fold shuffle and for the
            tree-based models.

    Returns:
        Tuple ``(resultados_df, modelos_entrenados)``. ``resultados_df`` has
        one row per model, sorted by ascending "RMSE log".
        ``modelos_entrenados`` maps each model name to its pipeline fitted on
        the full dataset.

    Raises:
        KeyError: If ``precio_venta_log`` or ``precio_venta`` is missing.
        ValueError: If no column with a supported dtype is left as a feature.
    """

    print(f"\n=== Evaluando {nombre_dataset} ===")

    # ------------------------------
    # 1) Prepare data
    # ------------------------------
    target = "precio_venta_log"
    price_col = "precio_venta"

    # Fail fast: the raw price column is only read after the first model has
    # been cross-validated, so check for it before any training starts.
    missing = [c for c in (target, price_col) if c not in df.columns]
    if missing:
        raise KeyError(f"Faltan columnas requeridas en el dataset: {missing}")

    ignore_cols = [price_col, target, "sector", "localidad_calculada"]
    features = [c for c in df.columns if c not in ignore_cols]

    X = df[features].copy()
    y = df[target].copy()

    # ------------------------------
    # 2) Preprocessing
    # ------------------------------
    numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical_features = X.select_dtypes(include=["object", "bool"]).columns.tolist()

    # ColumnTransformer drops columns it is not told about (remainder="drop"
    # is its default), so report them instead of losing them silently.
    selected = set(numeric_features) | set(categorical_features)
    dropped = [c for c in features if c not in selected]
    if dropped:
        print(f"Aviso: columnas omitidas por tipo de dato no soportado: {dropped}")
    if not selected:
        raise ValueError("No hay columnas de características con tipo de dato soportado.")

    # ------------------------------
    # 3) Define models
    # ------------------------------
    modelos = {
        "LinearRegression": LinearRegression(),
        "Lasso": Lasso(alpha=0.1, max_iter=10000),
        "Ridge": Ridge(alpha=1.0),
        "RandomForest": RandomForestRegressor(
            n_estimators=200, random_state=random_state, n_jobs=-1
        ),
        # SVR is currently disabled:
        # "SVR": SVR(kernel="rbf", C=10, epsilon=0.2),
        "XGBoost": XGBRegressor(
            n_estimators=500, learning_rate=0.05, max_depth=6,
            subsample=0.8, colsample_bytree=0.8, random_state=random_state, n_jobs=-1
        ),
        # LightGBM ignores `subsample` unless `subsample_freq` is > 0, so it
        # is set to 1 to make the 0.8 row subsampling take effect.
        "LightGBM": LGBMRegressor(
            n_estimators=500, learning_rate=0.05, max_depth=-1,
            subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
            random_state=random_state, n_jobs=-1
        )
    }

    # ------------------------------
    # 4) Cross-validation setup
    # ------------------------------
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Error metrics are registered with greater_is_better=False, so their
    # scores come back negated and are sign-flipped when aggregated below.
    scoring = {
        "RMSE": make_scorer(root_mean_squared_error, greater_is_better=False),
        "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
        "R2": make_scorer(r2_score)
    }

    resultados = []
    modelos_entrenados = {}

    # ------------------------------
    # 5) Train and evaluate
    # ------------------------------
    for nombre, modelo in modelos.items():
        print(f"Entrenando modelo: {nombre} ...")
        # Each pipeline gets its own preprocessor so the stored, fitted
        # pipelines do not share (and cannot overwrite) one transformer.
        pipeline = Pipeline(steps=[
            ("preprocessor", _construir_preprocesador(numeric_features, categorical_features)),
            ("model", modelo)
        ])

        # NOTE(review): folds run with n_jobs=-1 while some estimators also
        # request n_jobs=-1; this may oversubscribe CPU cores — confirm on
        # the target machine.
        cv_result = cross_validate(
            pipeline, X, y, cv=kf, scoring=scoring, n_jobs=-1, return_train_score=False
        )

        rmse_log = -np.mean(cv_result["test_RMSE"])
        mae_log = -np.mean(cv_result["test_MAE"])
        r2_mean = np.mean(cv_result["test_R2"])
        error_pct, error_pesos, precio_medio = interpretar_rmse_log(
            rmse_log, df, target_col=price_col
        )

        resultados.append({
            "Dataset": nombre_dataset,
            "Modelo": nombre,
            "RMSE log": rmse_log,
            "MAE log": mae_log,
            "R²": r2_mean,
            "Error %": error_pct * 100,
            "Error medio (COP)": error_pesos,
            "Precio medio (COP)": precio_medio
        })

        # Fit on the full dataset so the stored pipeline can be kept and reused.
        pipeline.fit(X, y)
        modelos_entrenados[nombre] = pipeline

    # ------------------------------
    # 6) Results
    # ------------------------------
    resultados_df = pd.DataFrame(resultados).sort_values(by="RMSE log")
    print("\n=== Resultados de evaluación ===")
    print(resultados_df[["Modelo", "RMSE log", "R²", "Error %", "Error medio (COP)"]].round(3))

    return resultados_df, modelos_entrenados


In [None]:
# ===========================================================
# EJEMPLO DE USO
# ===========================================================
# result_base, modelos_base = evaluar_modelos(df_base, "Modelo Base")
# result_enriched, modelos_enriched = evaluar_modelos(df_enriched, "Modelo Aumentado")

# comparacion = pd.concat([result_base, result_enriched])
# print("\n=== Comparación entre datasets ===")
# print(comparacion.round(3))