## Modelo con características reducidas

Este modelo se entrena a partir de las conclusiones del primer modelo, en el que se identificaron las características más importantes. El objetivo aquí es obtener un modelo más ligero.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [2]:


# ===========================
# 1️⃣ Load and prepare data
# ===========================
# Computed columns that take over the plain column names.
nombres_calculados = {
    "estrato_calculado": "estrato",
    "barrio_calculado": "barrio",
    "upz_calculada": "upz",
}

df = pd.read_csv("../data/aptos_bogota_enriched.csv")

# The raw "estrato" column is dropped before renaming so that
# "estrato_calculado" becomes the only "estrato" column.
df = df.drop(columns="estrato").rename(columns=nombres_calculados)

# Keep only the rows that have a barrio and renumber the index from 0.
df = df[df['barrio'].notnull()].reset_index(drop=True)

def _moda_de_grupo(grupo):
    """Return the first mode of ``grupo``, or its first value if it has no mode."""
    modas = grupo.mode()
    # ``mode()`` is empty when every value in the group is missing; fall back
    # to the first element (itself missing) so the group stays unfilled.
    return modas.iloc[0] if not modas.empty else grupo.iloc[0]


def imputar_por_barrio(df: pd.DataFrame, columna: str, metodo: str = 'media') -> pd.DataFrame:
    """Fill missing values of ``columna`` with a statistic computed per ``barrio``.

    Args:
        df: Frame containing a ``'barrio'`` column and ``columna``. It is
            modified in place.
        columna: Name of the column whose missing values are filled.
        metodo: ``'media'`` to use the group mean, ``'moda'`` to use the
            group's first mode.

    Returns:
        The same ``df`` object, with ``columna`` updated.

    Raises:
        ValueError: If ``metodo`` is neither ``'media'`` nor ``'moda'``.
    """
    grupos = df.groupby('barrio')[columna]
    if metodo == 'media':
        imputacion = grupos.transform('mean')
    elif metodo == 'moda':
        imputacion = grupos.transform(_moda_de_grupo)
    else:
        # Fail early with a clear message instead of passing None to fillna.
        raise ValueError(
            f"metodo debe ser 'media' o 'moda', se recibió {metodo!r}"
        )
    # Only missing entries are replaced; existing values are kept as they are.
    df[columna] = df[columna].fillna(imputacion)
    return df

# Per-barrio imputation plan: column -> statistic used to fill its gaps.
# NOTE(review): this runs on the full dataset before the train/hold-out split
# below, so hold-out rows contribute to the fill statistics — confirm that
# this is acceptable.
imputaciones = {
    'administracion': 'media',
    'estado': 'moda',
    'upz': 'moda',
    'catastral': 'media',
    'comercial': 'media',
    'antiguedad': 'moda',
}
for col, metodo in imputaciones.items():
    df = imputar_por_barrio(df, col, metodo)

ignore_cols = ['precio_venta', 'sector', 'localidad_calculada']

target = 'precio_venta'

# ===========================
# 2️⃣ Feature selection
# ===========================
# Most relevant barrios according to the top 20.
# NOTE(review): 'SAN' looks like a truncated name — verify against the data.
barrios_top = [
    'VILLAS DE ARANJUEZ', 'SAN', 'SAN PATRICIO', 'CHICO RESERVADO'
]

# Categorical barrio column: the barrio name when it is in the top list,
# 'OTROS' for every other row.
es_barrio_top = df['barrio'].isin(barrios_top)
df['barrio_top'] = df['barrio'].where(es_barrio_top, 'OTROS')

# Important original variables plus the collapsed barrio column.
features_reducidas = [
    'area', 'parqueaderos', 'administracion', 'banos',
    'antiguedad', 'habitaciones', 'estado', 'gimnasio',
    'ascensor', 'piscina', 'zona_de_bbq', 'latitud', 'longitud',
    'barrio_top',
]

train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)


In [3]:


# Feature matrices for both splits.
X_train, X_holdout = train_df[features_reducidas], holdout_df[features_reducidas]

# The model is trained on the natural log of the target; the hold-out target
# stays on its original scale.
y_holdout = holdout_df[target]
y_train_log = np.log(train_df[target])

# ===========================
# 3️⃣ Pipeline
# ===========================
# Columns are routed by dtype; a column of any other dtype ends up in
# neither list.
numerical_features = list(X_train.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X_train.select_dtypes(include='object').columns)

def log_transform(x):
    """Return the natural log of ``x``, with NaN wherever ``x`` is not positive."""
    # Zero, negative and already-missing entries all become NaN, so np.log
    # only ever sees positive values or NaN.
    solo_positivos = np.where(x > 0, x, np.nan)
    return np.log(solo_positivos)

# Gradient-boosted regressor with fixed baseline hyperparameters.
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)

# 'area': median imputation followed by the log transform. The log step runs
# after imputation, so any NaN it produces is passed on as-is.
log_area_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(log_transform, validate=False)),
])

# Every other numeric column only gets median imputation.
num_cols_sin_area = [nombre for nombre in numerical_features if nombre != 'area']

numeric_transformer = ColumnTransformer([
    ('log_area', log_area_transformer, ['area']),
    ('num', SimpleImputer(strategy='median'), num_cols_sin_area),
])

# Categorical columns: fill with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Step names are referenced later by the "model__*" search parameters.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# ===========================
# 4️⃣ Train and evaluate
# ===========================
pipeline.fit(X_train, y_train_log)



0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function log...00250DA9A2AC0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [4]:
# Predictions come out in log space; exponentiate to return to the target's
# original scale before scoring.
y_pred_log = pipeline.predict(X_holdout)
y_pred = np.exp(y_pred_log)

rmse, mae, r2 = (
    metrica(y_holdout, y_pred)
    for metrica in (root_mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "\n=== Métricas en hold-out con modelo reducido ===\n"
    f"RMSE: {rmse:.2f}\n"
    f"MAE : {mae:.2f}\n"
    f"R²  : {r2:.4f}"
)


=== Métricas en hold-out con modelo reducido ===
RMSE: 254660375.93
MAE : 136544393.48
R²  : 0.9139


## Búsqueda de hiperparámetros

Se busca la mejor combinación de hiperparámetros para XGBRegressor mediante una búsqueda aleatoria (RandomizedSearchCV) con validación cruzada de 3 particiones sobre el conjunto de entrenamiento del modelo reducido.

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, root_mean_squared_error

# RMSE as the optimisation metric. greater_is_better=False tells the scorer
# that lower values are better. The search is fitted on y_train_log, so the
# error is measured in log space.
rmse_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)

# Search space, expressed with plain XGBRegressor argument names.
espacio_xgb = {
    "n_estimators": [100, 300, 500, 700],
    "max_depth": [3, 5, 7, 9],
    "learning_rate": [0.01, 0.05, 0.1, 0.3],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_alpha": [0, 0.1, 1, 10],
    "reg_lambda": [1, 5, 10],
}
# The "model__" prefix routes each parameter to the pipeline's 'model' step.
param_distributions = {
    f"model__{nombre}": valores for nombre, valores in espacio_xgb.items()
}

# Configure the search.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    scoring=rmse_scorer,
    n_iter=30,  # can be raised to 50–100 for more coverage
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Run the search.
random_search.fit(X_train, y_train_log)

# Best parameters found.
print("\nMejores hiperparámetros encontrados:", random_search.best_params_, sep="\n")


Fitting 3 folds for each of 30 candidates, totalling 90 fits

Mejores hiperparámetros encontrados:
{'model__subsample': 0.8, 'model__reg_lambda': 1, 'model__reg_alpha': 0, 'model__n_estimators': 500, 'model__max_depth': 9, 'model__learning_rate': 0.05, 'model__colsample_bytree': 0.8}


In [6]:

# Evaluate the best estimator found by the search on the hold-out set.
# Predictions are in log space and are exponentiated before scoring.
mejor_pipeline = random_search.best_estimator_
y_pred_best_log = mejor_pipeline.predict(X_holdout)
y_pred_best = np.exp(y_pred_best_log)

r2_best = r2_score(y_holdout, y_pred_best)
mae_best = mean_absolute_error(y_holdout, y_pred_best)
rmse_best = root_mean_squared_error(y_holdout, y_pred_best)

lineas = [
    "\n=== Métricas en hold-out con modelo optimizado ===",
    f"RMSE: {rmse_best:.2f}",
    f"MAE : {mae_best:.2f}",
    f"R²  : {r2_best:.4f}",
]
print("\n".join(lineas))


=== Métricas en hold-out con modelo optimizado ===
RMSE: 233494104.13
MAE : 121955187.67
R²  : 0.9276


## Conclusiones
* El modelo reducido + optimizado supera al modelo original en todas las métricas.
* Se mejoró el RMSE en ~22 millones frente al modelo original (~8.5%).
* El MAE bajó en ~17 millones, lo cual significa errores promedio más bajos.
* El R² subió de 0.9083 a 0.9276 (valor obtenido en el hold-out con el modelo optimizado), mostrando mejor capacidad explicativa.

In [7]:
import cloudpickle
import os

# Export the tuned pipeline: it is the model the conclusions identify as the
# best one. `pipeline` itself is never modified by the search, so exporting it
# would ship the un-tuned baseline together with its metrics.
# NOTE(review): if the lighter baseline is what should be deployed, switch
# `modelo_final` back to `pipeline`.
modelo_final = random_search.best_estimator_

# Recompute the hold-out predictions from the exported model so the stored
# metrics always describe the pickled object.
y_pred_final = np.exp(modelo_final.predict(X_holdout))

errores = y_pred_final - y_holdout
abs_errores = np.abs(errores)

# Summary stored next to the model: hold-out metrics, absolute-error
# percentiles and the range of predicted values.
info_modelo = {
    "rmse": float(root_mean_squared_error(y_holdout, y_pred_final)),
    "mae": float(mean_absolute_error(y_holdout, y_pred_final)),
    "r2": float(r2_score(y_holdout, y_pred_final)),
    "error_80_percentil": float(np.percentile(abs_errores, 80)),
    "error_95_percentil": float(np.percentile(abs_errores, 95)),
    "min_predicho": float(y_pred_final.min()),
    "max_predicho": float(y_pred_final.max())
}


# Input column names, in training order, stored alongside the model.
columns = X_train.columns.tolist()
# Save the pipeline.
os.makedirs("../data/models", exist_ok=True)
file_name = "xgboost_model_2.2.pkl"
model_path = f"../data/models/{file_name}"
with open(model_path, "wb") as f:
    cloudpickle.dump({
        "model": modelo_final,
        "info": info_modelo,
        "columns": columns
    }, f)
print(f"Pipeline exportado a '{file_name}'")

Pipeline exportado a 'xgboost_model_2.2.pkl'
