In [1]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
df = pd.read_csv('../data/matriz_features_lags.csv')

In [3]:
df_train = df[df['date_block_num'] < 33]
df_val = df[df['date_block_num'] == 33]

In [4]:
features = [
    'item_id', 'shop_id', 'item_category_id',
    'month', 'year',
    'item_avg_monthly_sales', 'shop_avg_monthly_sales',
    'item_cnt_month_lag_1', 'item_cnt_month_lag_3'
]
target = 'item_cnt_month'

X_train = df_train[features]
y_train = df_train[target]

In [5]:
#Modelo base
xgb = XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42)

#Definir búsqueda de hiperparámetros
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'subsample': [0.5, 0.7, 0.9, 1.0],
}

In [6]:
#RandomizedSearch con validación cruzada
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    scoring='neg_root_mean_squared_error',
    n_iter=10,  #Número combinaciones que probará
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

#Entrenar con train
search.fit(X_train, y_train)

#Mejor combinación de parámetros
print("Mejores parámetros encontrados:")
print(search.best_params_)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Mejores parámetros encontrados:
{'subsample': 0.5, 'n_estimators': 100, 'max_depth': 8, 'learning_rate': 0.05}


In [7]:
#Modelo con mejores parámetros
best_model = search.best_estimator_

#Predecir en validación
X_val = df_val[features]
y_val = df_val[target]
y_pred = best_model.predict(X_val)

#Calcular RMSE
rmse = sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE en validación (tuned): {rmse:.4f}')

RMSE en validación (tuned): 0.8958
