# Modelo de regresión lineal

En este notebook vamos a probar y optimizar un modelo de regresión lineal para predecir el precio de venta (`precio_venta`) de apartamentos en Bogotá.

In [14]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [8]:
# Model inputs, grouped by the kind of preprocessing each group receives
numeric_features = ['area', 'habitaciones', 'banos', 'administracion', 'parqueaderos', 'estrato']
categorical_features = ['sector', 'estado']
feature_columns = numeric_features + categorical_features

# Start from the imputed dataset (point this path at your full file)
listings = pd.read_csv("../data/aptos_bogota_imputed.csv")

# Keep only the selected inputs plus the target, and drop any row that still
# has a missing value in one of those columns
df = listings[feature_columns + ['precio_venta']].dropna()

# Feature matrix and target vector
X = df[feature_columns]
y = df['precio_venta']

X.head()

Unnamed: 0,area,habitaciones,banos,administracion,parqueaderos,estrato,sector,estado
434,48.0,2.0,1.0,105000.0,0.0,3.0,ALTOS DE SUBA Y CERROS DE SAN JORGE,USADO
435,109.0,3.0,2.0,247000.0,1.0,4.0,NORMANDIA,USADO
436,51.0,2.0,1.0,307000.0,0.0,6.0,SANTA BARBARA,USADO
437,230.0,4.0,3.0,530000.0,2.0,4.0,NICOLAS DE FEDERMAN,USADO
438,90.0,3.0,3.0,600000.0,1.0,5.0,COLINA Y ALREDEDORES,USADO


In [9]:
# Preprocessor: scale the numeric columns and one-hot encode the categorical
# ones (handle_unknown='ignore' lets transform accept categories not seen in fit)
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full pipeline: preprocessing followed by the linear regressor
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [10]:
# === Cross-validation (k=5) ===
# Shuffled 5-fold split; the fixed seed makes the folds reproducible
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate all three metrics in one cross_validate pass, so the pipeline is
# fitted once per fold rather than once per fold for every metric
cv_results = cross_validate(
    model, X, y,
    cv=cv,
    scoring={
        'r2': 'r2',
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
    },
)

# Per-fold scores; the error metrics come back negated (scorers follow a
# "higher is better" convention), so flip the sign to get positive errors
scores = cv_results['test_r2']
rmse_scores = -cv_results['test_rmse']
mae_scores = -cv_results['test_mae']

print("===== Validación Cruzada (k=5) =====")
print(f"R² promedio      : {scores.mean():.4f}")
print(f"RMSE promedio    : {rmse_scores.mean():,.2f}")
print(f"MAE promedio     : {mae_scores.mean():,.2f}")



===== Validación Cruzada (k=5) =====
R² promedio      : 0.8116
RMSE promedio    : 371,659,795.77
MAE promedio     : 227,903,659.25


In [15]:
# === Final evaluation on a hold-out set ===
# Reserve 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the pipeline on the training rows, then predict the held-out rows
y_pred = model.fit(X_train, y_train).predict(X_test)

# Hold-out metrics
r2_holdout = r2_score(y_test, y_pred)
mae_holdout = mean_absolute_error(y_test, y_pred)
rmse_holdout = root_mean_squared_error(y_test, y_pred)

print(
    "\n===== Evaluación en Hold-Out =====",
    f"R²  : {r2_holdout:.4f}",
    f"RMSE: {rmse_holdout:,.2f}",
    f"MAE : {mae_holdout:,.2f}",
    sep="\n",
)


===== Evaluación en Hold-Out =====
R²  : 0.8191
RMSE: 363,392,998.32
MAE : 225,901,561.80
