In [None]:
import pandas as pd
import numpy as np

# ===========================
# 1 Load and prepare the data
# ===========================
# Read the enriched listings, swap the original `estrato` column for the
# computed one, give the computed location columns their plain names, and
# keep only rows with a known barrio (the per-barrio imputation below
# groups on it).
df = (
    pd.read_csv("../data/aptos_bogota_enriched.csv")
    .drop(columns=["estrato"])
    .rename(columns={
        "estrato_calculado": "estrato",
        "barrio_calculado": "barrio",
        "upz_calculada": "upz"
    })
    .dropna(subset=["barrio"])
    .reset_index(drop=True)
)

print(f"Registros después de limpieza: {len(df)}")

# ===========================
# 2 Imputación jerárquica
# ===========================
def imputar_por_barrio(df, columna, metodo='media'):
    """Fill missing values of ``columna`` with a statistic of the row's barrio.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``barrio`` column and the column to impute.
    columna : hashable
        Name of the column whose missing values are filled.
    metodo : {'media', 'moda'}, default 'media'
        'media' fills with the barrio mean; 'moda' fills with the first mode
        of the barrio.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``columna`` imputed; the input frame is left
        untouched. Rows whose barrio has no observed value for ``columna``
        remain missing.

    Raises
    ------
    ValueError
        If ``metodo`` is not one of the supported strategies.
    """
    def _primera_moda(valores):
        # Compute the mode once; an all-missing group has no mode, so fall
        # back to its first element (which is itself missing).
        modas = valores.mode()
        return modas.iloc[0] if not modas.empty else valores.iloc[0]

    if metodo == 'media':
        agregador = 'mean'
    elif metodo == 'moda':
        agregador = _primera_moda
    else:
        # Previously an unknown method surfaced as an unbound-variable error.
        raise ValueError(
            f"metodo debe ser 'media' o 'moda', se recibió: {metodo!r}"
        )

    # transform() broadcasts the per-barrio statistic back to every row, so it
    # aligns with the original index and can be passed straight to fillna().
    imputacion = df.groupby('barrio')[columna].transform(agregador)

    # Work on a copy so the caller's frame is not modified behind its back.
    resultado = df.copy()
    resultado[columna] = resultado[columna].fillna(imputacion)
    return resultado

# Columns to impute, each mapped to the per-barrio statistic that fills its gaps
for col, metodo in {
    'administracion': 'media',
    'estado': 'moda',
    'upz': 'moda',
    'catastral': 'media',
    'comercial': 'media',
    'antiguedad': 'moda',
}.items():
    if col not in df.columns:
        continue  # column not present in this dataset: nothing to impute
    df = imputar_por_barrio(df, col, metodo)

print("Imputación por barrio completada.")

# ===========================
# 3 Logarithmic transformation
# ===========================
# log1p(x) = log(1 + x); both new columns are appended after the existing ones.
df = df.assign(
    precio_venta_log=np.log1p(df['precio_venta']),
    area_log=np.log1p(df['area']),
)

# Confirm which columns were added
print("\nColumnas transformadas añadidas:")
print(['precio_venta_log', 'area_log'])

df.head()


Registros después de limpieza: 27214
Imputación por barrio completada.

Columnas transformadas añadidas:
['precio_venta_log', 'area_log']


Unnamed: 0,precio_venta,area,habitaciones,banos,administracion,parqueaderos,sector,antiguedad,latitud,longitud,...,parks_and_recreation_2000,infrastructure_services_2000,cultural_amenities_2000,catastral,comercial,upz,barrio,localidad_calculada,precio_venta_log,area_log
0,339000000.0,76.0,3.0,2.0,300000.0,1.0,BRITALIA,ENTRE 10 Y 20 ANOS,4.746592,-74.057571,...,35,165,7,3227303.0,3848904.0,USAQUEN,SANTA ANA OCCIDENTAL,USAQUEN,19.641511,4.343805
1,440898168.0,54.0,3.0,2.0,305000.0,0.0,LA SABANA,ENTRE 0 Y 5 ANOS,4.607378,-74.082648,...,89,86,1,1718168.0,2144399.0,EL PRADO,EL PLAN,SUBA,19.904324,4.007333
2,158000000.0,43.0,2.0,2.0,106600.0,0.0,TIBABUYES,ENTRE 10 Y 20 ANOS,4.740109,-74.113675,...,144,85,1,1627592.0,2322134.0,EL PRADO,MAZUREN,SUBA,18.878106,3.78419
3,222800000.0,48.0,3.0,2.0,151000.0,0.0,VERBENAL,MAS DE 20 ANOS,4.7639,-74.02528,...,40,178,6,3550046.0,4278709.0,SANTA BARBARA,MOLINOS NORTE,USAQUEN,19.221785,3.89182
4,128900000.0,47.0,2.0,1.0,86500.0,0.0,TINTAL SUR,ENTRE 10 Y 20 ANOS,4.632698,-74.198111,...,39,190,20,3340397.0,4172320.0,EL REFUGIO,LOS ROSALES,CHAPINERO,18.674547,3.871201


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# ===========================
# 4 Define target and features
# ===========================
target = "precio_venta_log"

# Columns that will not be used as predictors: the target in both scales
# (using either would leak the answer), plus the columns listed below.
# NOTE(review): `area_log` is excluded while raw `area` stays in the feature
# set — confirm this is intended.
ignore_cols = [
    "precio_venta", "precio_venta_log", "sector",
    "localidad_calculada", "area_log"
]

# Predictor variables: every remaining column
features = [col for col in df.columns if col not in ignore_cols]

X = df[features].copy()
y = df[target].copy()

# ===========================
# 5 Classify variables
# ===========================
# Select by dtype family rather than by the exact names "int64"/"float64":
# the ColumnTransformer below uses remainder="drop", so any column matched by
# neither list would be discarded without notice.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(
    include=["object", "bool", "category"]
).columns.tolist()

# Make silently dropped columns visible instead of losing them quietly.
clasificadas = set(numeric_features) | set(categorical_features)
sin_clasificar = [col for col in features if col not in clasificadas]
if sin_clasificar:
    print(f"Columnas sin clasificar (serán descartadas): {sin_clasificar}")

print(f"Variables numéricas: {len(numeric_features)}")
print(f"Variables categóricas: {len(categorical_features)}")

# ===========================
# 6 Preprocessing
# ===========================
# Numeric columns: fill remaining gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" encodes categories unseen during fit as
# all zeros instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    remainder="drop"
)

print("Preprocesamiento configurado correctamente")


Variables numéricas: 55
Variables categóricas: 4
✅ Preprocesamiento configurado correctamente


In [3]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import pandas as pd
import numpy as np

# ===========================
# 7 Define the models
# ===========================
modelos = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(alpha=0.1, max_iter=10000),
    "Ridge": Ridge(alpha=1.0),
    "RandomForest": RandomForestRegressor(
        n_estimators=200, max_depth=None, random_state=42, n_jobs=-1),
    "SVR": SVR(kernel="rbf", C=10, epsilon=0.2),
    "XGBoost": XGBRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1),
    # LightGBM only applies `subsample` when `subsample_freq` > 0; without it
    # the 0.8 row subsampling is silently ignored. verbose=-1 silences the
    # per-fit training log.
    "LightGBM": LGBMRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=-1,
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, verbose=-1)
}

# ===========================
# 8 Configure cross-validation
# ===========================
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Error metrics are negated by make_scorer (greater_is_better=False) so that
# "higher is better" holds for every scorer; the sign is flipped back below.
# All metrics are computed on the log-transformed target.
scoring = {
    "RMSE": make_scorer(root_mean_squared_error, greater_is_better=False),
    "MAE": make_scorer(mean_absolute_error, greater_is_better=False),
    "R2": make_scorer(r2_score)
}

resultados = []

# ===========================
# 9 Evaluate each model
# ===========================
for nombre, modelo in modelos.items():
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", modelo)])

    # Avoid nested parallelism: when the estimator already uses several cores
    # (n_jobs set to something other than None/1), run the folds sequentially;
    # otherwise parallelize across folds. Doing both oversubscribes the CPU.
    paraleliza_internamente = getattr(modelo, "n_jobs", None) not in (None, 1)
    cv_result = cross_validate(
        pipeline, X, y, cv=kf, scoring=scoring,
        n_jobs=1 if paraleliza_internamente else -1,
        return_train_score=False
    )
    resultados.append({
        "Modelo": nombre,
        "RMSE medio": -np.mean(cv_result["test_RMSE"]),
        "RMSE std": np.std(cv_result["test_RMSE"]),
        "MAE medio": -np.mean(cv_result["test_MAE"]),
        "R² medio": np.mean(cv_result["test_R2"])
    })
    # Report progress per model so finished results are visible even if the
    # run is interrupted before the summary table is printed.
    print(f"{nombre}: RMSE medio = {resultados[-1]['RMSE medio']:.3f}")

# ===========================
# 10 Show results, best (lowest RMSE) first
# ===========================
resultados_df = pd.DataFrame(resultados).sort_values(by="RMSE medio")
print(resultados_df.round(3))


KeyboardInterrupt: 