In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [3]:


# Load the enriched dataset.
df = pd.read_csv("../data/aptos_bogota_enriched.csv")

# Target column.
# NOTE(review): the diagnostic cell at the end of the notebook prints hold-out
# targets below 50,000,000 (e.g. 1,300,000) — confirm those rows are valid
# values for this column before trusting percentage-based metrics.
y = df["precio_venta"]

# Markers identifying the enrichment (nearby-amenity) columns.
enrichment_markers = (
    "education_",
    "healthcare_",
    "retail_access_",
    "dining_and_entertainment_",
    "accommodation_",
    "parks_and_recreation_",
    "infrastructure_services_",
    "cultural_amenities_",
)

# A column qualifies when any marker occurs anywhere in its name: this is a
# substring test, not a strict "starts with" test.
enrichment_features = []
for col in df.columns:
    if any(marker in col for marker in enrichment_markers):
        enrichment_features.append(col)

# Numeric inputs: the base listing attributes followed by the enrichment columns.
# NOTE(review): presumably "catastral" / "comercial" are appraisal-type values —
# verify they do not leak the target.
base_num_features = [
    "area",
    "habitaciones",
    "banos",
    "administracion",
    "parqueaderos",
    "estrato",
    "latitud",
    "longitud",
    "estrato_calculado",
    "catastral",
    "comercial",
]
num_features = base_num_features + enrichment_features

# Columns that get one-hot encoded.
cat_features = ["sector", "estado", "antiguedad"]

# Columns forwarded to the model without any transformation.
binary_features = [
    "alarma",
    "ascensor",
    "conjunto_cerrado",
    "gimnasio",
    "piscina",
    "zona_de_bbq",
]

# Feature matrix, in the column order num -> cat -> binary.
X = df[num_features + cat_features + binary_features]

# Hold out 15% of the rows for the final evaluation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)



In [4]:
# Preprocessing, one transformer per feature group:
#   * categorical -> fill missing values with the literal "desconocido", then
#     one-hot encode; handle_unknown="ignore" keeps transform from failing on
#     categories that were not present when the encoder was fitted
#   * numeric     -> fill missing values with the column median
#   * binary      -> passed through untouched
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="desconocido")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
num_transformer = SimpleImputer(strategy="median")

column_steps = [
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
    ("bin", "passthrough", binary_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Full pipeline: preprocessing followed by a seeded XGBoost regressor that
# uses every available core (n_jobs=-1).
xgb_regressor = XGBRegressor(random_state=42, n_jobs=-1)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", xgb_regressor),
    ]
)

# Last expression of the cell: renders the pipeline diagram.
model

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'desconocido'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [5]:
# 5-fold cross-validation on the training split only; the hold-out set is not
# touched here.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate all metrics in a single pass. cross_validate fits the pipeline once
# per fold (5 fits) and scores every metric on that same fitted model, whereas
# three separate cross_val_score calls refit the pipeline 15 times.
cv_results = cross_validate(
    model,
    X_train,
    y_train,
    cv=kf,
    scoring={
        "rmse": "neg_root_mean_squared_error",
        "mae": "neg_mean_absolute_error",
        "r2": "r2",
    },
)

# scikit-learn reports error metrics negated (so that higher is always
# better); flip the sign back to get plain RMSE / MAE values.
rmse_scores = -cv_results["test_rmse"]
mae_scores = -cv_results["test_mae"]
r2_scores = cv_results["test_r2"]

print("\n===== Validación cruzada (k=5) =====")
print(f"RMSE promedio: {rmse_scores.mean():,.0f}")
print(f"MAE promedio: {mae_scores.mean():,.0f}")
print(f"R² promedio: {r2_scores.mean():.3f}")


===== Validación cruzada (k=5) =====
RMSE promedio: 263,878,583
MAE promedio: 147,533,570
R² promedio: 0.907


In [6]:

# Train the final model on the whole training split and predict the hold-out
# rows. Pipeline.fit returns the fitted pipeline itself, so `model` is left
# fitted and the prediction call can be chained onto it.
y_pred = model.fit(X_train, y_train).predict(X_val)

In [7]:
# Hold-out evaluation.
#
# The metric functions used here are imported in the notebook's top import
# cell rather than re-imported mid-notebook.
#
# The inputs are copied into separately named arrays so that `y_pred` (set by
# the prediction cell) is never overwritten. Reassigning `y_pred` to its masked
# version made this cell non-idempotent: on a second run, the already-filtered
# predictions would be paired with the unfiltered targets.
y_val_arr = np.asarray(y_val)
y_pred_arr = np.asarray(y_pred)

# Keep only the pairs where both the target and the prediction are finite
# (drops NaN as before, and also +/-inf, which the metric functions reject).
mask = np.isfinite(y_val_arr) & np.isfinite(y_pred_arr)
y_true = y_val_arr[mask]
y_pred_valid = y_pred_arr[mask]

rmse_final = root_mean_squared_error(y_true, y_pred_valid)
mae_final = mean_absolute_error(y_true, y_pred_valid)
r2_final = r2_score(y_true, y_pred_valid)

# MAPE is undefined for a zero target: leave those rows out instead of letting
# a division by zero turn the whole average into inf/nan.
nonzero = y_true != 0
if nonzero.any():
    abs_pct_error = np.abs(
        (y_true[nonzero] - y_pred_valid[nonzero]) / y_true[nonzero]
    )
    mape_final = np.mean(abs_pct_error) * 100
else:
    mape_final = float("nan")

print(f"✅ RMSE: {rmse_final:,.0f}")
print(f"✅ MAE: {mae_final:,.0f}")
print(f"✅ R²: {r2_final:.3f}")
print(f"✅ MAPE: {mape_final:.2f}%")


✅ RMSE: 258,549,664
✅ MAE: 145,706,022
✅ R²: 0.908
✅ MAPE: 14.23%


In [26]:
# Summary statistics of the hold-out targets, then of the predictions.
for series in (y_val, pd.Series(y_pred)):
    print(series.describe())

count    4.114000e+03
mean     9.920474e+08
std      8.583844e+08
min      1.300000e+06
25%      4.100000e+08
50%      7.000000e+08
75%      1.300000e+09
max      5.325000e+09
Name: precio_venta, dtype: float64
count    4.114000e+03
mean     9.942244e+08
std      8.221304e+08
min      7.588037e+07
25%      4.188935e+08
50%      7.177876e+08
75%      1.314576e+09
max      4.928606e+09
dtype: float64


In [27]:
# Show the evaluated target values that fall below 50,000,000.
low_price_threshold = 50_000_000
print(y_true[y_true < low_price_threshold])

[1590000. 1300000. 1500000. 1530000. 5000000. 1590000.]
