
# Inferencia (Simple) — High Garden Coffee

Cuaderno **mínimo** para:
1) Cargar datos limpios (`data/coffee_clean.csv`).
2) Encontrar un *artifact* de modelo (archivo `.joblib`) en `models/`, priorizando los que contengan el nombre del objetivo.
3) Evaluar por **holdout temporal** (entrena con años `< max_year` y valida en `max_year`).
4) Predecir (incluye opción sencilla para **extender** a 2021–2022).

> No requiere gráficos ni notebooks previos. Funciona igual si tu artifact es un `dict` (con la clave `model`) o directamente un `Pipeline`.


In [1]:

import os, glob, warnings
import numpy as np
import pandas as pd
from joblib import load
from sklearn.base import clone
from IPython.display import display

# Silence every warning for the whole session so cell output stays short.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")
# Widen the pandas text display to 120 characters.
pd.set_option("display.width", 120)


## 1) Cargar datos

In [2]:

# Adjust the path if needed
DF_PATH = "data/coffee_clean.csv"

# Load the CSV and normalise column names (trimmed, lower case).
df = pd.read_csv(DF_PATH)
df.columns = [c.strip().lower() for c in df.columns]
# Coerce 'year' to a nullable integer; unparseable values become <NA>.
df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
# Order rows by country, type and year, then renumber the index.
df = df.sort_values(["country","type","year"]).reset_index(drop=True)

# Quick summary: column list, year range and the first three rows.
print("Columnas:", df.columns.tolist())
print("Rango de años:", int(df["year"].dropna().min()), "→", int(df["year"].dropna().max()))
display(df.head(3))


Columnas: ['country', 'type', 'consumption', 'year', 'price', 'revenue', 'profit', 'margin', 'market_share']
Rango de años: 1991 → 2020


Unnamed: 0,country,type,consumption,year,price,revenue,profit,margin,market_share
0,Angola,Robusta/Arabica,1200000,1991,87.686363,105223600.0,105130300.0,0.999113,0.001025
1,Angola,Robusta/Arabica,1800000,1992,87.686363,157835500.0,157712100.0,0.999219,0.001483
2,Angola,Robusta/Arabica,2100000,1993,87.686363,184141400.0,184003000.0,0.999249,0.001671


## 2) Helpers (artifact y columnas)

In [3]:

def find_artifact(models_dir="models", target="price"):
    """Return the path of the ``.joblib`` artifact to use from ``models_dir``.

    Files whose name contains ``target`` are preferred; if none exists, any
    ``.joblib`` file is accepted. Among the candidates, the first path in
    sorted order wins. The chosen file's base name is printed.

    Args:
        models_dir: Directory to search (not recursive).
        target: Target name used to prioritise matching file names.

    Returns:
        Path of the selected artifact.

    Raises:
        FileNotFoundError: If ``models_dir`` contains no ``.joblib`` file.
    """
    # Escape glob metacharacters so that a directory or target containing
    # '[', ']', '*' or '?' is matched literally instead of as a pattern.
    safe_dir = glob.escape(os.fspath(models_dir))
    safe_target = glob.escape(str(target))
    for pat in (f"*{safe_target}*.joblib", "*.joblib"):
        files = sorted(glob.glob(os.path.join(safe_dir, pat)))
        if files:
            print("Artifact elegido:", os.path.basename(files[0]))
            return files[0]
    # Report the directory that was actually searched, not a fixed name.
    raise FileNotFoundError(f"No encontré .joblib en '{models_dir}'.")

def expected_input_columns_from_pipeline(mdl):
    """Collect the column names configured on the steps of ``mdl``.

    Every step exposing a non-empty ``transformers`` attribute is scanned;
    each entry is unpacked as a 3-tuple whose last item is the column
    selector. String selectors are taken as-is, list/tuple selectors are
    stringified item by item, and any other selector type is ignored.

    Returns:
        The column names, de-duplicated in first-seen order. An empty list
        is returned when nothing is found or when inspection fails for any
        reason (best effort by design).
    """
    try:
        collected = []
        for _step_name, stage in getattr(mdl, "steps", []):
            specs = getattr(stage, "transformers", None)
            if not specs:
                continue
            for _label, _transformer, selector in specs:
                if isinstance(selector, str):
                    collected.append(selector)
                elif isinstance(selector, (list, tuple)):
                    collected.extend(str(item) for item in selector)
        # dict keys keep insertion order, giving an order-preserving dedupe.
        return list(dict.fromkeys(collected))
    except Exception:
        return []

def ensure_compat_columns(dfin, expected_cols):
    """Return ``dfin`` restricted to ``expected_cols``, adding missing ones.

    When ``expected_cols`` is empty or ``None`` the input frame itself is
    returned unchanged. Otherwise a copy is made, every expected column
    that is absent is added filled with NaN, and only the expected columns
    are returned, in the order given. The input frame is never modified.
    """
    if expected_cols:
        frame = dfin.copy()
        absent = [name for name in expected_cols if name not in frame.columns]
        for name in absent:
            frame[name] = np.nan
        return frame[expected_cols]
    return dfin


## 3) `build_xy` mínima (para artifacts tipo dict)

In [4]:

def build_xy(df_in: pd.DataFrame, target: str):
    """Build the feature matrix and target vector for dict-style artifacts.

    Column names are trimmed and lower-cased on a copy of ``df_in``.

    Returns:
        A tuple ``(X, y, meta)`` where
        * ``X`` holds the numeric ``year`` (if present) followed by one-hot
          columns for ``country`` and ``type`` (if present); when none of
          those columns exist, a single ``row_ix`` positional column is
          used instead;
        * ``y`` is the target coerced to numeric, or an all-NaN float
          series when the target column is absent;
        * ``meta`` is always an empty dict.
    """
    frame = df_in.copy()
    frame.columns = [name.strip().lower() for name in frame.columns]

    # Target vector
    y = (
        pd.to_numeric(frame[target], errors="coerce")
        if target in frame.columns
        else pd.Series(np.nan, index=frame.index, dtype=float)
    )

    # Feature blocks, concatenated side by side at the end
    blocks = []
    if "year" in frame.columns:
        blocks.append(pd.to_numeric(frame["year"], errors="coerce").to_frame("year"))
    blocks.extend(
        pd.get_dummies(frame[col].astype(str), prefix=col, drop_first=False)
        for col in ("country", "type")
        if col in frame.columns
    )
    if not blocks:
        # No usable feature column: fall back to the row position.
        blocks.append(
            pd.DataFrame({"row_ix": np.arange(len(frame), dtype=float)}, index=frame.index)
        )

    return pd.concat(blocks, axis=1), y, {}


## 4) Evaluación por holdout (simple y robusta)

In [5]:

def _holdout_metrics(y_true, y_pred):
    """Return RMSE, MAE and sMAPE (in percent) for two equal-length sequences."""
    yt = np.asarray(y_true, dtype=float)
    yp = np.asarray(y_pred, dtype=float)
    err = yp - yt
    # Avoid 0/0 when both the true and the predicted value are zero.
    denom = np.abs(yt) + np.abs(yp)
    denom = np.where(denom == 0, 1e-9, denom)
    return {
        "RMSE": float(np.sqrt(np.mean(err ** 2))),
        "MAE": float(np.mean(np.abs(err))),
        "sMAPE(%)": float(np.mean(2 * np.abs(err) / denom) * 100),
    }


def evaluate_holdout(df_in: pd.DataFrame, target="price", models_dir="models"):
    """Refit the stored model on past years and score it on the latest year.

    The artifact found in ``models_dir`` is loaded, an unfitted clone of its
    model is trained on rows with ``year < max_year`` and evaluated on rows
    with ``year == max_year``. Rows whose target is missing are excluded from
    both splits, whatever the artifact type.

    Args:
        df_in: Input data; must contain ``year`` and ``target`` columns
            (column names are matched after trimming and lower-casing).
        target: Name of the column to predict.
        models_dir: Directory searched for the ``.joblib`` artifact.

    Returns:
        Dict with keys ``artifact``, ``RMSE``, ``MAE``, ``sMAPE(%)`` and
        ``n_val``.

    Raises:
        ValueError: If ``year`` or ``target`` is missing, ``year`` has no
            numeric value, or either split ends up empty.
        FileNotFoundError: Propagated from ``find_artifact``.
    """
    art_path = find_artifact(models_dir=models_dir, target=target)
    # NOTE(review): joblib.load unpickles arbitrary objects; only point this
    # at artifacts from a trusted source.
    art = load(art_path)

    d = df_in.copy()
    d.columns = [c.strip().lower() for c in d.columns]
    if "year" not in d.columns:
        raise ValueError("Se requiere columna 'year' para holdout temporal.")
    if target not in d.columns:
        raise ValueError(f"No existe la columna objetivo '{target}' en los datos.")

    # Build the masks from the same coerced series that defines max_year, so
    # string or nullable year columns split consistently.
    years = pd.to_numeric(d["year"], errors="coerce")
    if not years.notna().any():
        raise ValueError("La columna 'year' no tiene valores numéricos válidos.")
    max_year = int(years.max())
    tr_mask = (years < max_year).fillna(False).astype(bool)
    va_mask = (years == max_year).fillna(False).astype(bool)

    if isinstance(art, dict) and "model" in art:
        # Case 1: dict artifact => features come from build_xy.
        mdl = art["model"]
        X_all, y_all, _ = build_xy(d, target)
        X_tr = X_all.loc[tr_mask].dropna()
        X_va = X_all.loc[va_mask].dropna()
    else:
        # Case 2: Pipeline => pass the raw columns its transformers declare.
        mdl = art
        expected = expected_input_columns_from_pipeline(mdl)
        d_use = ensure_compat_columns(d, expected)
        y_all = pd.to_numeric(d[target], errors="coerce")
        X_tr = d_use.loc[tr_mask].copy()
        X_va = d_use.loc[va_mask].copy()

    # Drop rows without a usable target in both branches.
    y_tr = y_all.loc[X_tr.index]
    y_va = y_all.loc[X_va.index]
    keep_tr = y_tr.notna()
    X_tr, y_tr = X_tr.loc[keep_tr], y_tr.loc[keep_tr]
    keep_va = y_va.notna()
    X_va, y_va = X_va.loc[keep_va], y_va.loc[keep_va]

    if len(X_tr) == 0 or len(X_va) == 0:
        raise ValueError(
            f"Holdout vacío: n_train={len(X_tr)}, n_val={len(X_va)} (año_val={max_year})."
        )

    # Fit a fresh clone so the stored model is left untouched.
    model_fit = clone(mdl).fit(X_tr, y_tr)
    pred = model_fit.predict(X_va)

    metrics = _holdout_metrics(y_va, pred)
    n_val = int(len(y_va))

    print(f"▶ EVAL HOLDOUT — target={target}, año_val={max_year}")
    print(f"  RMSE     : {metrics['RMSE']:,.6f}")
    print(f"  MAE      : {metrics['MAE']:,.6f}")
    print(f"  sMAPE(%) : {metrics['sMAPE(%)']:,.6f}")
    print(f"  n_val    : {n_val}")
    return {"artifact": art_path, **metrics, "n_val": n_val}


## 5) Predicción (incluye opción de años futuros)

In [6]:

def predict_simple(df_in: pd.DataFrame, target="price", models_dir="models", future_years=None):
    """Predict ``target`` with the stored artifact, optionally for extra years.

    Args:
        df_in: Input data (column names are trimmed and lower-cased on a
            copy; the caller's frame is not modified).
        target: Target name; used to pick the artifact and to name the
            prediction column.
        models_dir: Directory searched for the ``.joblib`` artifact.
        future_years: Optional iterable of years to append. When both
            ``country`` and ``type`` columns exist, one row per distinct
            (country, type) pair is added for each year; otherwise a single
            row per year. All other columns of the added rows are NaN.

    Returns:
        DataFrame with whichever of ``year``, ``country`` and ``type`` exist,
        plus ``pred_<target>``. For dict artifacts the prediction column is
        named after the artifact's ``y_col`` (when set), and
        ``_lo80/_hi80/_lo95/_hi95`` columns are added when the artifact
        stores positive ``PI80_abs`` / ``PI95_abs`` half-widths.

    Raises:
        FileNotFoundError: Propagated from ``find_artifact``.
    """
    art_path = find_artifact(models_dir=models_dir, target=target)
    # NOTE(review): joblib.load unpickles arbitrary objects; only point this
    # at artifacts from a trusted source.
    art = load(art_path)

    d = df_in.copy()
    d.columns = [c.strip().lower() for c in d.columns]
    has_pairs = {"country", "type"}.issubset(d.columns)

    # Append future rows if requested.
    if future_years:
        combos = d[["country", "type"]].drop_duplicates() if has_pairs else None
        fut = []
        for yr in future_years:
            if combos is not None and not combos.empty:
                fut.append(combos.assign(year=yr))
            else:
                fut.append(pd.DataFrame({"year": [yr]}))
        sort_cols = ["country", "type", "year"] if has_pairs else ["year"]
        d = (pd.concat([d] + fut, ignore_index=True)
               .sort_values(sort_cols)
               .reset_index(drop=True))

    # Identifier columns echoed in the output. 'year' is optional here, just
    # as it is in build_xy, so a frame without it no longer raises KeyError.
    base_cols = [c for c in ("year", "country", "type") if c in d.columns]

    if isinstance(art, dict) and "model" in art:
        mdl = art["model"]
        y_use = art.get("y_col") or target
        X_all, _, _ = build_xy(d, y_use)
        X_use = X_all.dropna()
        preds = mdl.predict(X_use)
        out = d.loc[X_use.index, base_cols].copy()
        out[f"pred_{y_use}"] = preds
        # Prediction intervals, if the artifact carries them. `or 0.0` keeps
        # a stored None from crashing float().
        for level in ("80", "95"):
            half_width = float(art.get(f"PI{level}_abs") or 0.0)
            if half_width > 0:
                out[f"pred_{y_use}_lo{level}"] = preds - half_width
                out[f"pred_{y_use}_hi{level}"] = preds + half_width
        return out.reset_index(drop=True)

    mdl = art
    expected = expected_input_columns_from_pipeline(mdl)
    d_use = ensure_compat_columns(d, expected)
    preds = mdl.predict(d_use)
    out = d.loc[:, base_cols].copy()
    out[f"pred_{target}"] = preds
    return out.reset_index(drop=True)


## 6) Ejecutar: evaluar y predecir

In [7]:

# Change the target if you like ('price', 'consumption', 'profit')
TARGET = "price"

# 6.1 Evaluation (holdout on the last year)
eval_out = evaluate_holdout(df, target=TARGET)
print("\nArtifact usado:", os.path.basename(eval_out["artifact"]))

# 6.2 Prediction (optional: future years)
FUTURE_YEARS = [2021, 2022]   # leave it as [] if you do not want future rows
preds = predict_simple(df, target=TARGET, future_years=FUTURE_YEARS)

print("\nPredicciones:", preds.shape)
display(preds.head(10))
# Quick example: Brazil + Arab* in the latest year present in the predictions
# (this includes any future years appended above).
try:
    last_year = int(pd.to_numeric(preds["year"], errors="coerce").dropna().max())
    ex = preds.query("country.str.contains('brazil', case=False) and type.str.contains('arab', case=False) and year == @last_year", engine="python")
    print(f"Filas ejemplo (Brazil, Arab*, año={last_year}):", len(ex))
    display(ex.head(10))
except Exception as e:
    # Best effort: the example filter is illustrative, so report and move on.
    print("Filtro ejemplo no aplicó:", e)


Artifact elegido: lasso_price.joblib
▶ EVAL HOLDOUT — target=price, año_val=2020
  RMSE     : 11.862949
  MAE      : 7.892790
  sMAPE(%) : 7.072279
  n_val    : 55

Artifact usado: lasso_price.joblib
Artifact elegido: lasso_price.joblib

Predicciones: (1760, 4)


Unnamed: 0,year,country,type,pred_price
0,1991,Angola,Robusta/Arabica,108.324169
1,1992,Angola,Robusta/Arabica,108.616499
2,1993,Angola,Robusta/Arabica,108.711437
3,1994,Angola,Robusta/Arabica,108.335012
4,1995,Angola,Robusta/Arabica,108.508721
5,1996,Angola,Robusta/Arabica,107.516818
6,1997,Angola,Robusta/Arabica,108.348686
7,1998,Angola,Robusta/Arabica,108.816841
8,1999,Angola,Robusta/Arabica,108.657044
9,2000,Angola,Robusta/Arabica,108.359031


Filas ejemplo (Brazil, Arab*, año=2022): 1


Unnamed: 0,year,country,type,pred_price
95,2022,Brazil,Arabica/Robusta,137.513189
