In [1]:
# ==============================
# 1. Importaciones
# ==============================
import pandas as pd
import numpy as np
from joblib import load
from sklearn.metrics import mean_squared_error, mean_absolute_error


In [2]:
from pathlib import Path

def _infer_y_col_from_name(artifact_path: str) -> str:
    name = Path(artifact_path).name.lower()
    for cand in ["price", "consumption", "profit"]:
        if cand in name:
            return cand
    # por defecto
    return "price"

def _infer_group_cols(df: pd.DataFrame) -> list:
    # intenta columnas típicas disponibles
    candidates = ["country", "type", "variety", "region", "continent"]
    return [c for c in candidates if c in df.columns]

def predict_with_intervals(df_new: pd.DataFrame, artifact_path: str):
    """Load a saved model artifact, predict on ``df_new`` and attach intervals.

    The artifact may be either a dict-like object with the keys ``"model"``,
    ``"y_col"``, ``"group_cols"``, ``"PI80_abs"`` and ``"PI95_abs"`` (all
    optional), or a bare estimator exposing ``predict``. Missing metadata is
    inferred: the target from the artifact file name and the grouping columns
    from the columns of ``df_new``; a notice is printed for each inference.

    Parameters
    ----------
    df_new : pd.DataFrame
        Raw data; features are rebuilt with ``build_xy(df_new, y_col)``, which
        must be defined before this function is called.
    artifact_path : str
        Path of the joblib artifact.

    Returns
    -------
    (out, y_col, group_cols)
        ``out`` holds the key columns (``year`` plus the grouping columns), the
        observed target, ``pred_<y_col>`` and, when the artifact provides the
        absolute half-widths, ``_lo80/_hi80`` and ``_lo95/_hi95`` columns.
        ``group_cols`` is always returned as a list.

    Raises
    ------
    KeyError
        If a key column is not present in ``df_new``.
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only load
    # artifacts that come from a trusted source.
    art = load(artifact_path)

    # 1) Get the model (supports a dict-like artifact or a bare pipeline).
    if isinstance(art, dict) or hasattr(art, "get"):
        mdl = art.get("model", art)  # no "model" key: treat the object itself as the model
        y_col = art.get("y_col", None)
        group_cols = art.get("group_cols", None)
        q80 = art.get("PI80_abs", None)
        q95 = art.get("PI95_abs", None)
    else:
        mdl = art
        y_col = group_cols = q80 = q95 = None

    # 2) Infer whatever metadata is missing.
    if y_col is None:
        y_col = _infer_y_col_from_name(artifact_path)
        print(f"[AVISO] 'y_col' no venía en el artifact. Inferido desde el nombre: y_col='{y_col}'")

    if group_cols is None:
        group_cols = _infer_group_cols(df_new)
        if group_cols:
            print(f"[AVISO] 'group_cols' no venía en el artifact. Inferido: {group_cols}")
        else:
            print("[AVISO] 'group_cols' no disponible. Se usará solo 'year' como clave.")
    elif isinstance(group_cols, str):
        # A single column name stored as a bare string.
        group_cols = [group_cols]
    else:
        # Tuples / pandas Index objects cannot be concatenated to a list below.
        group_cols = list(group_cols)

    # 3) Rebuild the features with the same function used for training.
    X_new, _, y_true = build_xy(df_new, y_col)
    X_new = X_new.dropna()
    y_true = y_true.loc[X_new.index]

    # Validate the key columns before predicting so a bad artifact/data
    # combination fails early and with an explicit message.
    base_cols = ["year"] + [c for c in group_cols if c != "year"]
    missing = [c for c in base_cols if c not in df_new.columns]
    if missing:
        raise KeyError(f"Columnas clave ausentes en df_new: {missing}")

    # 4) Prediction.
    preds = mdl.predict(X_new)

    out = df_new.loc[X_new.index, base_cols].copy()
    out[y_col] = y_true
    out[f"pred_{y_col}"] = preds

    # 5) Intervals (only when the artifact carries the half-widths).
    if q80 is not None:
        out[f"pred_{y_col}_lo80"] = preds - q80
        out[f"pred_{y_col}_hi80"] = preds + q80
    if q95 is not None:
        out[f"pred_{y_col}_lo95"] = preds - q95
        out[f"pred_{y_col}_hi95"] = preds + q95

    return out, y_col, group_cols


In [3]:
# ==============================
# build_xy que replica columnas esperadas por el Pipeline
# ==============================
def build_xy(df: pd.DataFrame, y_col: str, group_cols: list | None = None):
    """
    Construye las columnas que tu ColumnTransformer espera:
      - consumption: lag1..3, ma2, ma3
      - price      : lag1..3, ma2, ma3  (>> añadido lag1..3 y ma3 <<)
      - profit     : lag1..3, ma2, ma3
    Además, pasa columnas crudas si existen:
      country, type, margin, revenue, market_share
    Retorna: X, feat_cols, y (alineados y sin NAs en features/target)
    """
    df = df.copy()
    if "year" not in df.columns:
        raise ValueError("Se requiere columna 'year' en el DataFrame.")

    # Grupos por defecto
    if group_cols is None or len(group_cols) == 0:
        group_cols = [c for c in ["country", "type"] if c in df.columns]

    # Orden temporal estable
    sort_cols = (group_cols + ["year"]) if group_cols else ["year"]
    df = df.sort_values(sort_cols)

    # Helper para crear lags y MAs por grupo (sin fuga de futuro)
    def _add_lags_ma(g, col, lags=(1,2,3), mas=(2,3)):
        if col not in g.columns:
            g[col] = np.nan
        for k in lags:
            g[f"{col}_lag{k}"] = g[col].shift(k)
        for w in mas:
            g[f"{col}_ma{w}"] = g[col].shift(1).rolling(w, min_periods=1).mean()
        return g

    # Aplicar por grupo (o global)
    if group_cols:
        df = df.groupby(group_cols, group_keys=False).apply(
            lambda g: _add_lags_ma(
                _add_lags_ma(
                    _add_lags_ma(g, "consumption"),   # cons: lags+ma2+ma3
                    "price"                          # price: lags+ma2+ma3 (ahora sí)
                ),
                "profit"                             # profit: lags+ma2+ma3
            )
        )
    else:
        df = _add_lags_ma(_add_lags_ma(_add_lags_ma(df, "consumption"), "price"), "profit")

    # Columnas crudas que el pipeline podría pasar
    passthrough_cols = [c for c in ["country", "type", "margin", "revenue", "market_share"] if c in df.columns]

    # Features de cada target
    cons_feats = [f"consumption_lag{k}" for k in (1,2,3)] + [f"consumption_ma{k}" for k in (2,3)]
    price_feats = [f"price_lag{k}" for k in (1,2,3)] + [f"price_ma{k}" for k in (2,3)]   # << actualizado
    prof_feats  = [f"profit_lag{k}" for k in (1,2,3)] + [f"profit_ma{k}" for k in (2,3)]

    required_feats = cons_feats + price_feats + prof_feats + passthrough_cols

    # Asegura existencia de todas
    for c in required_feats:
        if c not in df.columns:
            df[c] = np.nan

    # Construir X e y y alinear
    X = df[required_feats].copy()
    y = df[y_col].copy()

    mask = X.notna().all(axis=1) & y.notna()
    X = X.loc[mask]
    y = y.loc[mask]

    feat_cols = required_feats
    return X, feat_cols, y

In [6]:
def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (sMAPE), expressed in percent.

    Lower is better. Each pair contributes |pred - true| divided by the
    average of |true| and |pred|; pairs whose denominator is exactly zero
    are left out of the mean.
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)
    half_sum = (np.abs(actual) + np.abs(forecast)) / 2
    # Skip pairs that would divide by zero.
    valid = half_sum != 0
    abs_err = np.abs(forecast[valid] - actual[valid])
    return 100 * np.mean(abs_err / half_sum[valid])


In [7]:
# Holdout evaluation: score a saved artifact on a single validation year.
artifact_path = "models/lasso_price.joblib"
df_data = pd.read_csv("data/coffee_clean.csv")

out, y_col, group_cols = predict_with_intervals(df_data, artifact_path)

year_val = 2020
df_val = out[out["year"] == year_val].copy()

# Fail with an explicit message instead of an opaque scikit-learn error when
# the validation year has no usable rows.
if df_val.empty:
    raise ValueError(
        f"No hay filas de validación para year == {year_val}; "
        "revisa 'year_val' o los datos de entrada."
    )

y_true = df_val[y_col].values
y_pred = df_val[f"pred_{y_col}"].values

# RMSE is computed as the square root of the MSE.
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
smape_val = smape(y_true, y_pred)
n_val = len(df_val)

print(f"Artifact elegido: {artifact_path}")
print(f"▶ EVAL HOLDOUT — target={y_col}, año_val={year_val}")
print(f"  RMSE     : {rmse:.6f}")
print(f"  MAE      : {mae:.6f}")
print(f"  sMAPE(%) : {smape_val:.6f}")
print(f"  n_val    : {n_val}")



[AVISO] 'y_col' no venía en el artifact. Inferido desde el nombre: y_col='price'
[AVISO] 'group_cols' no venía en el artifact. Inferido: ['country', 'type']
Artifact elegido: models/lasso_price.joblib
▶ EVAL HOLDOUT — target=price, año_val=2020
  RMSE     : 5.491270
  MAE      : 3.931593
  sMAPE(%) : 3.578794
  n_val    : 52


  df = df.groupby(group_cols, group_keys=False).apply(
