# In [None]:
"""
forecast_pipeline.py

Skeleton pipeline to forecast weekly sell‑out units for TV SKUs across multiple retail chains
using LightGBM + skforecast.  Every step is annotated so you understand *what* happens and *why*.

──────────────────────────────────────────────────────────────────────────────
🔧  WHAT YOU NEED TO FILL IN BEFORE RUNNING
──────────────────────────────────────────────────────────────────────────────
1.  DATA SOURCES
    • Path(s) to the raw CSV / Parquet files   OR   a SQLAlchemy connection URL.
    • Column mapping – rename here if your names differ:
        ─ date:              "fecha"
        ─ product id:        "sku"
        ─ retail chain id:   "cadena"
        ─ units sold:        "sellout_u"
        ─ revenue:           "sellout_m"
        ─ inventory (Sun):   "inventario"
        ─ sell‑in units:     "sellin_u"
        ─ promo flags / weight(s): one column per promo or a single numeric weight.
        ─ technology:        "tecnologia"   (LCD, OLED…)
        ─ size inches:       "tamano"

2.  FORECAST SETTINGS
    • FORECAST_H = 16            # horizon in *weeks*
    • LEVEL_KEYS  = ["cadena", "tecnologia", "tamano"]  # hierarchy level: one model is trained per unique combination of these keys
                                                        # (the code below groups by CONFIG.ID_COLS + CONFIG.CAT_COLS)

3.  VALIDATION WINDOWS
    • SPLIT_DATE = "2024-10-06"   # last date used for training (YYYY-MM-DD, plain ASCII hyphens)

4.  OPTIONAL
    • GRANULARITY = "W"  # "W" for weekly (recommended), "D" if you stay at daily level.

Replace the values inside CONFIG below or load them from a YAML/ENV.
"""

# ===========================================================================
# 1 ─ Imports & Config
# ===========================================================================
from __future__ import annotations

import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass, field

# ML libs
from lightgbm import LGBMRegressor
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import grid_search_forecaster
from sklearn.metrics import mean_absolute_error

###############################################################################
# Configuration dataclass keeps all tunables in one place
###############################################################################
@dataclass
class CONFIG:
    # ── paths ────────────────────────────────────────────────────────────────
    DATA_DIR: Path = Path("./data")
    SALES_FILE: str = "sellout.csv"       # adjust
    INVENTORY_FILE: str | None = None     # join later if separate
    PROMO_FILE: str | None = None

    # ── columns ──────────────────────────────────────────────────────────────
    DATE_COL: str = "fecha"
    UNIT_COL: str = "sellout_u"
    REV_COL: str = "sellout_m"
    INV_COL: str = "inventario"
    SELLIN_COL: str = "sellin_u"

    ID_COLS:   list[str] = field(default_factory=lambda: ["cadena", "sku"])
    CAT_COLS:  list[str] = field(default_factory=lambda: ["tecnologia", "tamano"])
    PROMO_COLS:list[str] = field(default_factory=lambda: ["promo_hot_sale", "promo_bf"])

    # ── modelling ────────────────────────────────────────────────────────────
    GRANULARITY: str = "W"      # resample freq
    FORECAST_H: int = 16        # horizon (weeks)
    LAGS: int = 14              # how many past observations as features
    SPLIT_DATE: str = "2024-10-06"

CFG = CONFIG()

# ===========================================================================
# 2 ─ Data Loading
# ===========================================================================

def load_sales() -> pd.DataFrame:
    """Read the raw sell-out CSV named in ``CFG`` into a DataFrame.

    The file is looked up at ``CFG.DATA_DIR / CFG.SALES_FILE`` and the column
    named by ``CFG.DATE_COL`` is parsed as dates.  Swap the body for a SQL
    query if the data lives in a database instead.

    Returns:
        The file contents as loaded by ``pandas.read_csv``.
    """
    return pd.read_csv(
        CFG.DATA_DIR / CFG.SALES_FILE,
        parse_dates=[CFG.DATE_COL],
    )

# ===========================================================================
# 3 ─ Pre‑processing & Feature Engineering
# ===========================================================================

def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate sell-out to the configured granularity and add features.

    Steps, in order:
      1. Sort rows chronologically.
      2. When ``CFG.GRANULARITY == "W"``, aggregate each
         ``CFG.ID_COLS + CFG.CAT_COLS`` group into Sunday-ending weeks
         (``W-SUN``): units, revenue and sell-in are summed, inventory keeps
         the last observation of the week, promo columns take the weekly max.
      3. Derive ``precio`` (average selling price) as revenue / units.
      4. Replace every remaining missing value with 0.
      5. Add ``weekofyear`` (ISO week) plus its sine / cosine encoding.

    Parameters:
        df: Sell-out rows containing the columns named in ``CFG``; the
            ``CFG.DATE_COL`` column must already be datetime-typed.

    Returns:
        A new DataFrame; the caller's frame is never modified.
    """
    date_col = CFG.DATE_COL

    # sort_values returns a new frame, so the input stays untouched.  The
    # chronological order is also what makes the "last" aggregation below
    # pick the latest inventory reading of each week rather than whichever
    # row happened to come last in the file.
    df = df.sort_values(date_col, kind="stable")

    # ░░ Resample / aggregate ░░
    if CFG.GRANULARITY == "W":
        # Sunday as week-ending, using pandas' W-SUN label.
        agg_dict = {
            CFG.UNIT_COL: "sum",
            CFG.REV_COL: "sum",
            CFG.INV_COL: "last",     # latest inventory reading of the week
            CFG.SELLIN_COL: "sum",
            **{col: "max" for col in CFG.PROMO_COLS},
        }
        df = (
            df
            .set_index(date_col)
            .groupby(CFG.ID_COLS + CFG.CAT_COLS)
            .resample("W-SUN")
            .agg(agg_dict)
            .reset_index()
        )

    # ░░ Derive average selling price ░░
    # Computed after aggregation so the weekly price is total revenue over
    # total units (unit-weighted) instead of an unweighted mean of daily
    # ratios.  Division by zero units yields inf/NaN, normalised to NaN here
    # and then to 0 by the fill step below.
    df["precio"] = (df[CFG.REV_COL] / df[CFG.UNIT_COL]).replace([np.inf, -np.inf], np.nan)

    # ░░ Fill missing values ░░
    df = df.fillna(0)

    # ░░ Cyclical date features ░░
    df["weekofyear"] = df[date_col].dt.isocalendar().week.astype(int)
    angle = 2 * np.pi * df["weekofyear"] / 52
    df["sin_woy"] = np.sin(angle)
    df["cos_woy"] = np.cos(angle)

    return df

# ===========================================================================
# 4 ─ Train / Validation split & Model setup
# ===========================================================================

def make_train_test(df: pd.DataFrame):
    """Split *df* chronologically at ``CFG.SPLIT_DATE``.

    Rows dated on or before the split date form the training set; rows dated
    strictly after it form the test set.

    Returns:
        ``(train, test)`` — two independent copies of the matching rows.
    """
    dates = df[CFG.DATE_COL]
    cutoff = CFG.SPLIT_DATE

    # Two explicit comparisons (rather than one mask and its negation) so the
    # two halves are defined independently of each other.
    in_train = dates <= cutoff
    in_test = dates > cutoff
    return df[in_train].copy(), df[in_test].copy()


def build_forecaster(lags: int = CFG.LAGS) -> ForecasterAutoreg:
    """Return an unfitted autoregressive forecaster backed by LightGBM.

    Parameters:
        lags: Number of past observations used as predictors.  The default is
            bound to ``CFG.LAGS`` once, when this function is defined.

    Returns:
        A ``ForecasterAutoreg`` wrapping an ``LGBMRegressor`` configured with
        the Poisson objective and a fixed random seed.
    """
    return ForecasterAutoreg(
        regressor=LGBMRegressor(objective="poisson", random_state=42),
        lags=lags,
    )

# ===========================================================================
# 5 ─ Hyper‑parameter search per hierarchy combo
# ===========================================================================

def train_per_combo(df: pd.DataFrame):
    """Tune, fit and evaluate one forecaster per ``ID_COLS + CAT_COLS`` combo.

    For every group the rows are split at ``CFG.SPLIT_DATE``, LightGBM
    hyper-parameters are grid-searched by backtesting on the last
    ``CFG.FORECAST_H`` training periods, and the best model (refitted on the
    whole training span by skforecast) is scored on the test rows.

    Combos with no test rows, or with too little history to build the lag
    features and still hold out one validation horizon, are skipped with a
    printed notice instead of aborting the whole run.

    Parameters:
        df: Output of ``preprocess`` — one row per combo and period.

    Returns:
        DataFrame with one row per trained combo and the columns ``combo``,
        ``mae`` (test-set mean absolute error) and ``best_params``.
    """
    group_cols = CFG.ID_COLS + CFG.CAT_COLS

    # Same-period revenue is units × price, i.e. a direct function of the
    # target, and it is unknown when forecasting, so it must not be a feature.
    # NOTE(review): the remaining exogenous columns (inventory, sell-in,
    # price, promos) also need known or planned values for the horizon when
    # this is used for real forecasts — confirm with the data owners.
    non_feature_cols = set(group_cols) | {CFG.DATE_COL, CFG.UNIT_COL, CFG.REV_COL}
    exog_cols = [c for c in df.columns if c not in non_feature_cols]

    # skforecast hands these keys straight to the regressor's set_params(),
    # so they are plain LightGBM names (no sklearn-Pipeline "regressor__").
    param_grid = {
        "max_depth": [3, 5, 7],
        "learning_rate": [0.05, 0.1, 0.2],
        "n_estimators": [200, 500],
    }

    all_results = []
    for keys, gdf in df.groupby(group_cols):
        print(f"\n🛠  Training model for combo: {keys}")
        train, test = make_train_test(gdf.sort_values(CFG.DATE_COL))

        if test.empty:
            print(f"⚠️  Skipping {keys}: no rows after {CFG.SPLIT_DATE} to evaluate on")
            continue

        # Hold out the last horizon of the training span for the grid search;
        # what is left must be long enough to build the lag features.
        initial_train_size = len(train) - CFG.FORECAST_H
        if initial_train_size <= CFG.LAGS:
            print(
                f"⚠️  Skipping {keys}: only {len(train)} training rows "
                f"(need more than {CFG.LAGS + CFG.FORECAST_H})"
            )
            continue

        # Give each group a clean positional index: train is 0..n-1 and test
        # continues at n, so the test exog starts exactly one step after the
        # training window, as skforecast's predict() requires.
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)
        test.index = pd.RangeIndex(len(train), len(train) + len(test))

        y_train = train[CFG.UNIT_COL]
        y_test = test[CFG.UNIT_COL]
        exog_train = train[exog_cols] if exog_cols else None
        exog_test = test[exog_cols] if exog_cols else None

        forecaster = build_forecaster()

        # Returns only the results table; with return_best=True the
        # forecaster itself is updated in place with the best parameters and
        # refitted on all of y_train.
        grid_search_forecaster(
            forecaster=forecaster,
            y=y_train,
            exog=exog_train,
            param_grid=param_grid,
            steps=CFG.FORECAST_H,
            metric="mean_absolute_error",
            initial_train_size=initial_train_size,
            refit=True,
            return_best=True,
            verbose=False,
        )

        # ─ Evaluation ─
        preds = forecaster.predict(steps=len(y_test), exog=exog_test)
        mae = mean_absolute_error(y_test, preds)
        print(f"MAE: {mae:.3f}")

        all_results.append(
            {
                "combo": keys,
                "mae": mae,
                "best_params": forecaster.regressor.get_params(),
            }
        )

    # Explicit columns keep the schema stable even when every combo is skipped.
    return pd.DataFrame(all_results, columns=["combo", "mae", "best_params"])

# ===========================================================================
# 6 ─ Main execution
# ===========================================================================
if __name__ == "__main__":
    # Run the pipeline end to end: load → preprocess → tune/evaluate per combo.
    # Each stage consumes the previous stage's output, so the order matters.
    raw  = load_sales()
    data = preprocess(raw)
    perf = train_per_combo(data)

    # Persist the per-combo results table (relative path, so it lands in the
    # current working directory) without writing the DataFrame index.
    perf.to_csv("model_performance.csv", index=False)
    print("\n✅ Training finished.  Results saved to model_performance.csv")
