PARTE VI

A.- El reto aquí es transformar un formato "ancho" (días como columnas) a "largo" (tidy data) y hacer los joins correctos sin explotar la memoria.

In [1]:
import pandas as pd
import numpy as np

# Build the long-format (tidy) training table for the FOODS_3 department in the
# California stores, enriched with calendar and price features, and split it
# into train/test frames in the MLForecast column convention
# (unique_id, ds, y, <features>).

# Load the raw tables. Unneeded columns/rows are trimmed after loading to keep
# the walkthrough simple.
print("Cargando datos...")
df_sales = pd.read_csv('sales_train_evaluation.csv')
df_calendar = pd.read_csv('calendar.csv')
df_prices = pd.read_csv('sell_prices.csv')

# --- 1. Subset selection (FOODS_3 & CA stores) ---
# Filter BEFORE melting: melt multiplies the row count by the number of day
# columns, so shrinking the wide frame first keeps the long frame small.
ca_stores = ['CA_1', 'CA_2', 'CA_3', 'CA_4']
subset_sales = df_sales[
    (df_sales['dept_id'] == 'FOODS_3') &
    (df_sales['store_id'].isin(ca_stores))
].copy()

# --- 2. Wide -> long (melt) ---
# Every column whose name starts with 'd_' is a day column. The file may hold
# more days than d_1913; the split at the end bounds the windows explicitly.
id_vars = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
val_vars = [c for c in subset_sales.columns if c.startswith('d_')]

df_melted = pd.melt(subset_sales, id_vars=id_vars, value_vars=val_vars,
                    var_name='d', value_name='y')

# --- 3. Calendar features ---
df_calendar['date'] = pd.to_datetime(df_calendar['date'])
# A day counts as an event day when a primary event name is present.
df_calendar['is_event'] = df_calendar['event_name_1'].notna().astype(int)
# In the M5 calendar wday 1 = Saturday and 2 = Sunday.
df_calendar['is_weekend'] = df_calendar['wday'].isin([1, 2]).astype(int)

# Only the calendar columns needed downstream take part in the join.
cols_cal = ['date', 'd', 'wm_yr_wk', 'wday', 'snap_CA', 'is_event', 'is_weekend']
# validate='many_to_one' raises if a day label is duplicated in the calendar,
# which would otherwise silently multiply the sales rows.
df_merged = df_melted.merge(df_calendar[cols_cal], on='d', how='left',
                            validate='many_to_one')

# --- 4. Price features ---
# Restrict the price table to the stores/items present in the subset before
# joining: the left-join result is identical, but the join runs on a much
# smaller right-hand table.
prices_subset = df_prices[
    df_prices['store_id'].isin(ca_stores) &
    df_prices['item_id'].isin(subset_sales['item_id'].unique())
]
df_final = df_merged.merge(prices_subset, on=['store_id', 'item_id', 'wm_yr_wk'],
                           how='left', validate='many_to_one')

# --- 5. Final MLForecast layout ---
# 'wday' is exposed as 'day_of_week' with its original encoding left untouched.
df_final = df_final.rename(
    columns={'date': 'ds', 'id': 'unique_id', 'wday': 'day_of_week'}
)

# 'd' is kept as a day label for reference; it is a string column, so it must
# be dropped before the frame is handed to a model.
final_cols = ['unique_id', 'ds', 'y', 'is_event', 'snap_CA', 'sell_price', 'day_of_week', 'is_weekend', 'd']
df_final = df_final[final_cols]

# --- 6. Train / test split ---
# Train: everything up to and including d_1900. Test: d_1901 .. d_1913.
# The cutoff dates are looked up once instead of inside every filter.
train_end = df_calendar.loc[df_calendar['d'] == 'd_1900', 'date'].iloc[0]
test_end = df_calendar.loc[df_calendar['d'] == 'd_1913', 'date'].iloc[0]

train_df = df_final[df_final['ds'] <= train_end].reset_index(drop=True)
test_df = df_final[
    (df_final['ds'] > train_end) & (df_final['ds'] <= test_end)
].reset_index(drop=True)

print("Entregable 4a: Head del DataFrame:")
print(train_df.head())

ModuleNotFoundError: No module named 'pandas'

B.- Aquí usamos la potencia de Nixtla. Configuramos los lags y las transformaciones de ventana móvil sobre esos lags.

In [5]:
from mlforecast import MLForecast
from mlforecast.lag_transforms import RollingMean, RollingStd
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Configure and train the MLForecast pipeline (LightGBM + XGBoost) on train_df.

# 1. Models
models = [
    LGBMRegressor(random_state=42, verbose=-1),  # verbose=-1 silences training logs
    XGBRegressor(random_state=42)
]

# 2. Pipeline configuration
fcst = MLForecast(
    models=models,
    freq='D',  # daily series
    lags=[1, 7, 14, 28],
    lag_transforms={
        # Rolling statistics computed over the lag-1 series, so each window
        # only uses values strictly before the row being predicted.
        1: [
            RollingMean(window_size=7),
            RollingMean(window_size=14),
            RollingStd(window_size=7)
        ]
    },
    date_features=['day', 'dayofyear', 'week'],  # features derived from 'ds'
    num_threads=4  # tune to the available CPU cores
)

# 3. Training
# The 'd' day label is a string column; it is dropped here so it does not
# reach the models as an object-dtype feature.
fit_df = train_df.drop(columns=['d'], errors='ignore')

print("Entrenando modelos (esto puede tardar unos minutos)...")
# static_features=[] is required: with the default (None) MLForecast treats
# every column other than id/time/target as STATIC. The calendar and price
# columns change over time, so they must be declared dynamic exogenous
# features (and later supplied for the forecast horizon via X_df).
fcst.fit(
    fit_df,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
    static_features=[],
)

print("¡Entrenamiento exitoso!")

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ['dlopen(/opt/anaconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Symbol not found: ___kmpc_dispatch_deinit\n  Referenced from: <2B45A273-7403-3BBA-8DBD-A90F576E2F02> /opt/anaconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib\n  Expected in:     <8AE74FFC-75EE-374B-BA46-A70944EA81BD> /opt/anaconda3/lib/libomp.dylib', 'dlopen(/opt/anaconda3/lib/libxgboost.dylib, 0x0006): Symbol not found: ___kmpc_dispatch_deinit\n  Referenced from: <0F4027F9-A1CA-38CB-9F8E-AFC46906DEDB> /opt/anaconda3/lib/libxgboost.dylib\n  Expected in:     <8AE74FFC-75EE-374B-BA46-A70944EA81BD> /opt/anaconda3/lib/libomp.dylib']
