
# Inferencia — Predicción de Consumo, Precio y Utilidad (sin gráficos)

Este cuaderno:
- Construye *features* (rezagos, medias móviles, dummies).
- Hace **partición temporal** (train/val/test) por año sin *leakage*.
- Entrena **baselines** y modelos (Ridge/Lasso/RandomForest; intenta LightGBM/XGBoost si están disponibles).
- Calcula métricas: **MAE, RMSE, MAPE, sMAPE, R²**.
- Guarda modelos `.joblib` y predicciones como CSV con columnas `country, type, year, target, y_true, y_pred, residual, model`.
- Resume resultados en texto.


In [19]:
# %%
import warnings, json
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer 
import joblib

from utils.metrics import mae, rmse, r2, mape, smape, regression_report

DATA_CLEAN = "data/coffee_clean.csv"
MODELS_DIR = "models"
PRED_DIR = "predicciones"
Path(MODELS_DIR).mkdir(exist_ok=True)
Path(PRED_DIR).mkdir(exist_ok=True)

TARGETS = {
    "consumption": "consumption",
    "price": "price",
    "profit": "profit",
}

TEST_END = 2020
VAL_YEARS = 3
TEST_YEARS = 3


In [20]:
# %%
# Carga de datos
df = pd.read_csv(DATA_CLEAN)
df = df.sort_values(["country", "type", "year"]).reset_index(drop=True)

FEATURE_BASE = ["country", "type", "year"]
assert all(c in df.columns for c in FEATURE_BASE+["consumption","price","profit"]), "Faltan columnas requeridas"
df.head(3)


Unnamed: 0,country,type,consumption,year,price,revenue,profit,margin,market_share
0,Angola,Robusta/Arabica,1200000,1991,87.686363,105223600.0,105130300.0,0.999113,0.001025
1,Angola,Robusta/Arabica,1800000,1992,87.686363,157835500.0,157712100.0,0.999219,0.001483
2,Angola,Robusta/Arabica,2100000,1993,87.686363,184141400.0,184003000.0,0.999249,0.001671


In [None]:
# %%
# Feature engineering SIN fuga (MAs solo con PASADO) + warm-up drop

LAGS = (1, 2, 3)
MAS  = (2, 3)  # ventanas de medias móviles

def add_lags_and_ma(g, cols, lags=LAGS, mas=MAS):
    """
    - Lags: shift(L) -> siempre pasado.
    - MAs: rolling(M).mean().shift(1) -> promedio hasta el año anterior (no incluye el actual).
    """
    g = g.copy()
    for col in cols:
        # Rezagos
        for L in lags:
            g[f"{col}_lag{L}"] = g[col].shift(L)
        # Medias móviles SÓLO con pasado
        for M in mas:
            g[f"{col}_ma{M}"] = g[col].rolling(M).mean().shift(1)
    return g

def build_features(df):
    parts = []
    for keys, gi in df.groupby(["country","type"], as_index=False):
        g2 = add_lags_and_ma(gi, cols=["consumption","price","profit"])
        parts.append(g2)
    X = (pd.concat(parts)
           .sort_values(["country","type","year"])
           .reset_index(drop=True))
    return X

X = build_features(df)

# Warm-up: eliminar filas iniciales por grupo donde aún existirán NaNs estructurales
# Por seguridad, usamos la ventana máxima de MA (M) y el lag máximo (L)
WARMUP = max(max(LAGS), max(MAS))
X = (X.groupby(["country","type"], group_keys=False)
       .apply(lambda g: g.iloc[WARMUP:])  # corta warm-up
       .reset_index(drop=True))

# Relleno conservador por si quedaron huecos (series con huecos originales)
X = (X.groupby(["country","type"], group_keys=False)
       .apply(lambda g: g.ffill().bfill())
       .reset_index(drop=True))

X.head(5)




Unnamed: 0,country,type,consumption,year,price,revenue,profit,margin,market_share,consumption_lag1,...,price_lag1,price_lag2,price_lag3,price_ma2,price_ma3,profit_lag1,profit_lag2,profit_lag3,profit_ma2,profit_ma3
0,Angola,Robusta/Arabica,1200000,1994,87.686363,105223600.0,105130300.0,0.999113,0.000948,2100000.0,...,87.686363,87.686363,87.686363,87.686363,87.686363,184003000.0,157712100.0,105130300.0,144566700.0,148948500.0
1,Angola,Robusta/Arabica,1500000,1995,87.686363,131529500.0,131421200.0,0.999176,0.00117,1200000.0,...,87.686363,87.686363,87.686363,87.686363,87.686363,105130300.0,184003000.0,157712100.0,118275800.0,140184800.0
2,Angola,Robusta/Arabica,600000,1996,87.686363,52611820.0,52548480.0,0.998796,0.000442,1500000.0,...,87.686363,87.686363,87.686363,87.686363,87.686363,131421200.0,105130300.0,184003000.0,91984850.0,96366670.0
3,Angola,Robusta/Arabica,1200000,1997,87.686363,105223600.0,105130300.0,0.999113,0.000851,600000.0,...,87.686363,87.686363,87.686363,87.686363,87.686363,52548480.0,131421200.0,105130300.0,78839390.0,96366670.0
4,Angola,Robusta/Arabica,2400000,1998,87.686363,210447300.0,210293900.0,0.999271,0.001648,1200000.0,...,87.686363,87.686363,87.686363,87.686363,87.686363,105130300.0,52548480.0,131421200.0,157712100.0,122657600.0


In [22]:
# %%
# Partición temporal sin leakage
min_year, max_year = int(X["year"].min()), int(X["year"].max())
test_start = max_year - TEST_YEARS + 1
val_end = test_start - 1
val_start = val_end - VAL_YEARS + 1
train_end = val_start - 1

splits = {
    "train": (min_year, train_end),
    "val": (val_start, val_end),
    "test": (test_start, max_year),
}
splits


{'train': (1994, 2014), 'val': (2015, 2017), 'test': (2018, 2020)}

In [23]:
# %%
# Baselines por grupo (país-tipo): último valor y promedio histórico
from utils.metrics import mae, rmse, r2, mape, smape

def baseline_last(g, target):
    return g[target].shift(1)

def baseline_mean(g, target):
    return g[target].shift(1).expanding().mean()

def compute_baselines(X, target):
    Xb = X.copy()
    Xb["y_true"] = Xb[target]
    Xb["y_last"] = (Xb.groupby(["country","type"], group_keys=False)
                      .apply(lambda g: baseline_last(g, target)))
    Xb["y_mean"] = (Xb.groupby(["country","type"], group_keys=False)
                      .apply(lambda g: baseline_mean(g, target)))
    return Xb

def evaluate_series(y_true, y_pred):
    # filtra pares válidos (sin NaN)
    m = (~pd.isna(y_true)) & (~pd.isna(y_pred))
    y_true = np.asarray(y_true[m], dtype=float)
    y_pred = np.asarray(y_pred[m], dtype=float)
    return {
        "MAE": mae(y_true, y_pred),
        "RMSE": rmse(y_true, y_pred),
        "MAPE": mape(y_true, y_pred),
        "sMAPE": smape(y_true, y_pred),
        "R2": r2(y_true, y_pred),
    }



In [None]:
def build_preprocessor(X):
    cat_features = ["country","type"]
    exclude = {"country","type","year","consumption","price","profit"}
    num_features = [c for c in X.columns if c not in exclude]

    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),  # opcional; RF no lo necesita, lineales sí
    ])

    pre = ColumnTransformer([
        ("cats", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
        ("nums", num_pipe, num_features),
    ])
    return pre, num_features

def train_and_eval_model(X, target, model, model_name):
    pre, _ = build_preprocessor(X)
    y = X[target]
    feats = [c for c in X.columns if c not in ["consumption","price","profit"]]
    pipe = Pipeline([("pre", pre), ("model", model)])

    def mask_year(a, b): return (X["year"] >= a) & (X["year"] <= b)
    train_mask = mask_year(*splits["train"])
    val_mask   = mask_year(*splits["val"])
    test_mask  = mask_year(*splits["test"])

    # Entrenar en train+val sólo con filas donde y no es NaN
    trainval = (train_mask | val_mask) & y.notna()
    X_train, y_train = X.loc[trainval, feats], y.loc[trainval]

    pipe.fit(X_train, y_train)

    # Evaluar en test
    test_sel = test_mask & y.notna()
    X_test, y_test = X.loc[test_sel, feats], y.loc[test_sel]
    pred_test = pipe.predict(X_test)

    report = evaluate_series(y_test, pred_test)

    out_path = Path(MODELS_DIR) / f"{model_name}_{target}.joblib"
    joblib.dump(pipe, out_path)

    pred_df = (X.loc[test_sel, ["country","type","year"]].copy()
                 .assign(target=target, y_true=y_test.values, y_pred=pred_test))
    pred_df["residual"] = pred_df["y_true"] - pred_df["y_pred"]
    pred_df["model"] = model_name
    return report, out_path, pred_df



MODELS = {
    "ridge": Ridge(alpha=1.0, random_state=42),
    "lasso": Lasso(alpha=0.001, random_state=42, max_iter=10000),
    "rf": RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1),
}

try:
    import lightgbm as lgb  # type: ignore
    MODELS["lgbm"] = lgb.LGBMRegressor(random_state=42, n_estimators=500)
except Exception:
    pass

try:
    import xgboost as xgb  # type: ignore
    MODELS["xgb"] = xgb.XGBRegressor(random_state=42, n_estimators=500, max_depth=6, subsample=0.8, colsample_bytree=0.8)
except Exception:
    pass


In [25]:
# %%
# Entrenamiento/evaluación por target
all_results = []
all_predictions = []

def eval_baselines_on_test(X, target):
    Xb = compute_baselines(X, target)
    a, b = splits["test"]
    mask = (Xb["year"] >= a) & (Xb["year"] <= b)
    base_last = evaluate_series(Xb.loc[mask,"y_true"], Xb.loc[mask,"y_last"])
    base_mean = evaluate_series(Xb.loc[mask,"y_true"], Xb.loc[mask,"y_mean"])
    return base_last, base_mean

baseline_summary = {}

for target in ["consumption","price","profit"]:
    base_last, base_mean = eval_baselines_on_test(X, target)
    baseline_summary[target] = {"baseline_last": base_last, "baseline_mean": base_mean}

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

results_rows = []
preds_parts = []

for target in ["consumption","price","profit"]:
    for name, model in MODELS.items():
        report, model_path, pred_df = train_and_eval_model(X, target, model, name)
        row = {"target": target, "model": name, **report}
        results_rows.append(row)
        preds_parts.append(pred_df)

results_df = pd.DataFrame(results_rows).sort_values(["target","RMSE"])
preds_df = pd.concat(preds_parts, ignore_index=True)

pred_file = Path(PRED_DIR) / "predicciones_test.csv"
preds_df.to_csv(pred_file, index=False)

results_df.head(10), pred_file, baseline_summary


(        target  model           MAE          RMSE          MAPE         sMAPE  \
 0  consumption  ridge  9.104670e-07  2.719123e-06  1.830141e-10  1.090909e+01   
 1  consumption  lasso  8.223504e+05  2.427884e+06  1.569087e+01  1.867780e+01   
 2  consumption     rf  4.216992e+06  2.405692e+07           NaN  4.627168e+00   
 3        price  ridge  4.477237e-05  5.482443e-05  4.191594e-05  4.191593e-05   
 4        price  lasso  1.006513e-02  2.704017e-02  9.426697e-03  9.423690e-03   
 5        price     rf  2.015821e+01  2.097946e+01  1.896310e+01  1.716920e+01   
 6       profit  ridge  1.154868e-05  1.609284e-05  1.665500e-09  1.665500e-09   
 7       profit  lasso  5.781493e+05  2.176880e+06  5.865235e+00  7.551012e+00   
 8       profit     rf  3.173262e+08  2.148889e+09  1.054467e+00  1.037435e+00   
 
           R2  
 0   1.000000  
 1   0.999824  
 2   0.982742  
 3   1.000000  
 4   0.999965  
 5 -20.233342  
 6   1.000000  
 7   1.000000  
 8   0.988160  ,
 PosixPath('predi

In [26]:

# %%
# Resumen ejecutivo (texto)
def best_model_text(results_df):
    lines = []
    for t, g in results_df.groupby("target"):
        g2 = g.sort_values("RMSE").iloc[0]
        lines.append(f"- **{t}**: mejor modelo = {g2['model']} (RMSE={g2['RMSE']:.2f}, MAE={g2['MAE']:.2f}, sMAPE={g2['sMAPE']:.2f}%).")
    return "\n".join(lines)

summary = "\n".join([
    "## Conclusiones",
    f"Rango temporal de entrenamiento/validación y prueba: {splits}.",
    "",
    "### Desempeño por objetivo",
    best_model_text(results_df),
    "",
    "- Los baselines (`último valor` y `promedio histórico`) sirven de referencia; en general los modelos supervisados los superan según RMSE/MAE.",
    "- La **predicción de consumo** suele beneficiarse de rezagos y medias móviles.",
    "- La **predicción de precio** al usar un precio global anualizado es más volátil; los modelos de bosque/boosting suelen capturar mejor no-linealidades.",
    "- La **utilidad** depende de supuestos de costos; los resultados deben revisarse con finanzas para ajustar `FIXED_COST` y `VAR_COST_PER_CUP`.",
    "",
    "Los modelos y las predicciones de test se han guardado en:",
    f"- Carpeta de modelos: {MODELS_DIR}/",
    f"- Archivo de predicciones: {PRED_DIR}/predicciones_test.csv",
])

print(summary)


## Conclusiones
Rango temporal de entrenamiento/validación y prueba: {'train': (1994, 2014), 'val': (2015, 2017), 'test': (2018, 2020)}.

### Desempeño por objetivo
- **consumption**: mejor modelo = ridge (RMSE=0.00, MAE=0.00, sMAPE=10.91%).
- **price**: mejor modelo = ridge (RMSE=0.00, MAE=0.00, sMAPE=0.00%).
- **profit**: mejor modelo = ridge (RMSE=0.00, MAE=0.00, sMAPE=0.00%).

- Los baselines (`último valor` y `promedio histórico`) sirven de referencia; en general los modelos supervisados los superan según RMSE/MAE.
- La **predicción de consumo** suele beneficiarse de rezagos y medias móviles.
- La **predicción de precio** al usar un precio global anualizado es más volátil; los modelos de bosque/boosting suelen capturar mejor no-linealidades.
- La **utilidad** depende de supuestos de costos; los resultados deben revisarse con finanzas para ajustar `FIXED_COST` y `VAR_COST_PER_CUP`.

Los modelos y las predicciones de test se han guardado en:
- Carpeta de modelos: models/
- Archivo d

In [None]:
# %%
# Resumen ejecutivo (mostrar métricas con más precisión)

def best_model_text_precise(results_df):
    lines = []
    for t, g in results_df.groupby("target"):
        g2 = g.sort_values("RMSE").iloc[0]
        lines.append(
            f"- **{t}**: mejor modelo = {g2['model']} "
            f"(RMSE={g2['RMSE']:.6g}, MAE={g2['MAE']:.6g}, sMAPE={g2['sMAPE']:.4f}%)."
        )
    return "\n".join(lines)

summary = "\n".join([
    "## Conclusiones",
    f"Rango temporal de entrenamiento/validación y prueba: {splits}.",
    "",
    "### Desempeño por objetivo",
    best_model_text_precise(results_df),
    "",
    "- Los baselines (`último valor` y `promedio histórico`) sirven de referencia; los modelos supervisados mejoran el error en la mayoría de casos.",
    "- **Consumo** suele beneficiarse de rezagos y MAs (definidas sólo con pasado).",
    "- **Precio** anualizado varía poco; comparar contra baselines es esencial.",
    "- **Utilidad** depende de supuestos de costos; revisar con finanzas `FIXED_COST` y `VAR_COST_PER_CUP`.",
    "",
    "Modelos y predicciones de test guardados en:",
    f"- Modelos: {MODELS_DIR}/",
    f"- Predicciones: {PRED_DIR}/predicciones_test.csv",
])

print(summary)


In [None]:
# %%
# Diagnóstico simple: correlación (Pearson) entre features y target por año (solo pasada vs. no pasada)
# Nota: esto es ilustrativo; no uses features del mismo año.

target = "consumption"
feats_same_year = [c for c in X.columns if c not in ["country","type","year","consumption","price","profit"]
                   and not c.endswith(tuple([f"_lag{L}" for L in LAGS]))
                   and "_ma" in c and True]  # MAs "sin shift" (si quedara alguna), aquí deberían ser 0

print("Features sospechosas (debería estar vacío si aplicaste shift a las MAs):", feats_same_year)
