
# Inferencia — Predicción de Consumo, Precio y Utilidad (sin gráficos)

Este cuaderno:
- Construye *features* (rezagos, medias móviles, dummies).
- Hace **partición temporal** (train/val/test) por año sin *leakage*.
- Entrena **baselines** y modelos (Ridge/Lasso/RandomForest; intenta LightGBM/XGBoost si están disponibles).
- Calcula métricas: **MAE, RMSE, MAPE, sMAPE, R²**.
- Guarda modelos `.joblib` y predicciones como CSV con columnas `country, type, year, target, y_true, y_pred, residual, model`.
- Resume resultados en texto.


In [1]:

# %%
import warnings, json
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import joblib

from utils.metrics import mae, rmse, r2, mape, smape, regression_report

# Input dataset and output folders, all relative to the working directory.
DATA_CLEAN = "data/coffee_clean.csv"
MODELS_DIR, PRED_DIR = "models", "predicciones"

# Create both output folders up front; an already existing folder is fine.
for _out_dir in (MODELS_DIR, PRED_DIR):
    Path(_out_dir).mkdir(exist_ok=True)

# Each target label maps to the dataframe column of the same name.
TARGETS = {label: label for label in ("consumption", "price", "profit")}

# Temporal split settings. VAL_YEARS / TEST_YEARS are window lengths in years.
# NOTE(review): TEST_END is not referenced in the visible cells — confirm use.
TEST_END, VAL_YEARS, TEST_YEARS = 2020, 3, 3


ModuleNotFoundError: No module named 'sklearn'

In [None]:

# %%
# Data loading
df = pd.read_csv(DATA_CLEAN)

FEATURE_BASE = ["country", "type", "year"]

# Validate the schema BEFORE sorting: sorting by a missing key column would
# raise an opaque KeyError first. An explicit exception (instead of `assert`)
# also survives `python -O` and names the columns that are missing.
_required = FEATURE_BASE + ["consumption", "price", "profit"]
_missing = [c for c in _required if c not in df.columns]
if _missing:
    raise ValueError(f"Faltan columnas requeridas: {_missing}")

# Order rows by series and year so the row-based shift/rolling operations
# applied in later cells follow chronological order inside each group.
df = df.sort_values(["country", "type", "year"]).reset_index(drop=True)
df.head(3)


In [None]:

# %%
# Feature engineering
def add_lags_and_ma(g, cols, lags=(1,2,3), mas=(2,3)):
    """Return a copy of `g` with lag and trailing moving-average columns.

    For every column in `cols` two families of columns are added:

    * ``<col>_lag<L>``: the value of ``col`` L rows earlier, for each L in `lags`.
    * ``<col>_ma<M>``: the mean of the M rows *preceding* the current row,
      for each M in `mas`.

    The moving average is computed on the series shifted by one row, so it
    never contains the current row's own value. Without that shift the
    feature would include the very value being predicted whenever ``col``
    is the target (information leakage).

    Rows are processed in their existing order; `g` is expected to hold a
    single series already sorted chronologically. `g` itself is not modified.
    """
    out = g.copy()
    for col in cols:
        series = out[col]
        for lag in lags:
            out[f"{col}_lag{lag}"] = series.shift(lag)
        # Only past observations may feed the rolling window.
        previous = series.shift(1)
        for window in mas:
            out[f"{col}_ma{window}"] = previous.rolling(window).mean()
    return out

def build_features(df):
    """Build the feature table for every (country, type) series in `df`.

    Each group is passed to `add_lags_and_ma` for the consumption, price and
    profit columns; the enriched groups are stacked, sorted by country, type
    and year, and returned with a fresh 0..n-1 index.
    """
    group_keys = ["country", "type"]
    value_cols = ["consumption", "price", "profit"]
    enriched = [
        add_lags_and_ma(group, cols=value_cols)
        for _, group in df.groupby(group_keys, as_index=False)
    ]
    stacked = pd.concat(enriched)
    ordered = stacked.sort_values(group_keys + ["year"])
    return ordered.reset_index(drop=True)

X = build_features(df)

# Forward-fill gaps inside each (country, type) series. GroupBy.ffill returns
# only the non-key columns, aligned on the original index, so the key columns
# of X are left untouched. This avoids `groupby.apply` acting on the grouping
# columns, which recent pandas versions deprecate. Leading gaps of a series
# (e.g. the first lag rows) have no earlier value and therefore stay NaN.
X_ffill = X.copy()
_filled = X.groupby(["country", "type"], sort=False).ffill()
X_ffill[_filled.columns] = _filled
X = X_ffill
X.head(5)


In [None]:

# %%
# Year-based partition: three consecutive, non-overlapping windows
# (train, then validation, then test), each given as inclusive (first, last).
min_year = int(X["year"].min())
max_year = int(X["year"].max())

# Work backwards from the most recent year: the last TEST_YEARS years form
# the test window and the VAL_YEARS years right before it the validation one.
test_start = max_year - (TEST_YEARS - 1)
val_start = test_start - VAL_YEARS
val_end = test_start - 1
train_end = val_start - 1

splits = dict(
    train=(min_year, train_end),
    val=(val_start, val_end),
    test=(test_start, max_year),
)
splits


In [None]:

# %%
# Baselines por grupo (país-tipo): último valor y promedio histórico
from utils.metrics import mae, rmse, r2, mape, smape

def baseline_last(g, target):
    """Naive forecast: each row is predicted with the previous row's `target`.

    The first row of `g` has no predecessor and gets NaN.
    """
    previous_value = g[target].shift(periods=1)
    return previous_value

def baseline_mean(g, target):
    """Historical-mean forecast: mean of all `target` values before each row.

    The series is shifted by one row first, so the running (expanding) mean
    at a row only covers earlier rows; the first row gets NaN.
    """
    history = g[target].shift(periods=1)
    running_mean = history.expanding().mean()
    return running_mean

def compute_baselines(X, target):
    """Return a copy of `X` with the truth and both baseline forecasts added.

    Added columns:

    * ``y_true``: copy of the `target` column.
    * ``y_last``: `baseline_last` evaluated per (country, type) group.
    * ``y_mean``: `baseline_mean` evaluated per (country, type) group.

    The groups are iterated explicitly and the per-group results are
    concatenated and aligned on the index. `groupby.apply` is avoided because
    the shape of its result depends on the data (for example when there is a
    single group), which can make the column assignment fail.
    Rows whose group key is missing receive NaN baselines.
    """
    keys = ["country", "type"]
    Xb = X.copy()
    Xb["y_true"] = Xb[target]

    def per_group(baseline):
        # One result Series per group, each carrying the group's row labels.
        pieces = [baseline(g, target) for _, g in Xb.groupby(keys, sort=False)]
        if not pieces:
            # No groups at all (empty input): produce an all-NaN column.
            return pd.Series(dtype="float64")
        return pd.concat(pieces)

    # Column assignment aligns on the index, so row order is preserved.
    Xb["y_last"] = per_group(baseline_last)
    Xb["y_mean"] = per_group(baseline_mean)
    return Xb

def evaluate_series(y_true, y_pred):
    """Score predictions against the truth with every metric of the report.

    Returns a dict with the keys "MAE", "RMSE", "MAPE", "sMAPE" and "R2", in
    that order, each holding the result of the imported metric function of
    the same name applied to ``(y_true, y_pred)``.
    """
    scorers = (
        ("MAE", mae),
        ("RMSE", rmse),
        ("MAPE", mape),
        ("sMAPE", smape),
        ("R2", r2),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}


In [None]:

# %%
# Modelos supervisados
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

def build_preprocessor():
    """Create the column preprocessor and list the numeric feature names.

    Categorical columns ("country", "type") are one-hot encoded (categories
    unseen during fit are ignored, dense output). Every other column of the
    module-level `X`, except "year" and the three target columns, is passed
    through unchanged.

    Returns
    -------
    (ColumnTransformer, list)
        The unfitted transformer and the numeric feature names it selects.
    """
    categorical = ["country", "type"]
    not_numeric = {"country", "type", "year", "consumption", "price", "profit"}
    numeric = [name for name in X.columns if name not in not_numeric]

    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    transformer = ColumnTransformer([
        ("cats", encoder, categorical),
        ("nums", "passthrough", numeric),
    ])
    return transformer, numeric

def train_and_eval_model(X, target, model, model_name):
    """Fit `model` on the train+val years, score it on the test years, save it.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table containing "country", "type", "year", the three target
        columns and the engineered feature columns.
    target : str
        Name of the column to predict.
    model : estimator
        Unfitted scikit-learn compatible regressor. It is cloned before
        fitting, so the object passed in is never modified and can be reused
        for another target.
    model_name : str
        Label used in the saved file name and in the "model" column.

    Returns
    -------
    (dict, pathlib.Path, pandas.DataFrame)
        The metric report from `evaluate_series` on the test rows, the path of
        the saved ``.joblib`` pipeline, and a frame with the columns country,
        type, year, target, y_true, y_pred, residual, model.

    Raises
    ------
    ValueError
        If no complete rows fall inside the train+val window or the test
        window.

    Notes
    -----
    Reads the module-level `splits` and `MODELS_DIR`. Rows with a missing
    feature or target value are excluded from both fitting and scoring: lag
    features are undefined at the start of every series, and the linear
    models cannot be fitted on NaN input. All models are therefore scored on
    the same rows.
    """
    import joblib
    from sklearn.base import clone

    pre, _ = build_preprocessor()
    y = X[target]
    feats = [c for c in X.columns if c not in ["consumption", "price", "profit"]]
    pipe = Pipeline([("pre", pre), ("model", clone(model))])

    def mask_year(a, b):
        return (X["year"] >= a) & (X["year"] <= b)

    # Rows usable for modelling: every feature and the target are present.
    usable = X[feats].notna().all(axis=1) & y.notna()
    # The validation years are deliberately merged into the fitting window.
    fit_mask = (mask_year(*splits["train"]) | mask_year(*splits["val"])) & usable
    test_mask = mask_year(*splits["test"]) & usable

    if not fit_mask.any():
        raise ValueError(f"No complete rows to fit '{model_name}' on target '{target}'")
    if not test_mask.any():
        raise ValueError(f"No complete test rows for '{model_name}' on target '{target}'")

    pipe.fit(X.loc[fit_mask, feats], y.loc[fit_mask])
    pred_test = pipe.predict(X.loc[test_mask, feats])

    report = evaluate_series(y.loc[test_mask], pred_test)

    # Persist the whole fitted pipeline (preprocessing + estimator).
    out_path = Path(MODELS_DIR) / f"{model_name}_{target}.joblib"
    joblib.dump(pipe, out_path)

    pred_df = (X.loc[test_mask, ["country", "type", "year"]].copy()
                 .assign(target=target, y_true=y.loc[test_mask].values, y_pred=pred_test))
    pred_df["residual"] = pred_df["y_true"] - pred_df["y_pred"]
    pred_df["model"] = model_name
    return report, out_path, pred_df

# Candidate estimators, keyed by the short name used in file names and reports.
MODELS = {
    "ridge": Ridge(alpha=1.0, random_state=42),
    "lasso": Lasso(alpha=0.001, random_state=42, max_iter=10000),
    "rf": RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1),
}

# The gradient-boosting libraries are optional. The catch stays broad, as in
# the original best-effort behaviour, but the reason for skipping a model is
# now printed instead of being silently discarded (warnings are globally
# filtered in this notebook, hence `print`).
try:
    import lightgbm as lgb  # type: ignore
    MODELS["lgbm"] = lgb.LGBMRegressor(random_state=42, n_estimators=500)
except Exception as exc:
    print(f"LightGBM no disponible; se omite 'lgbm' ({type(exc).__name__}: {exc})")

try:
    import xgboost as xgb  # type: ignore
    MODELS["xgb"] = xgb.XGBRegressor(random_state=42, n_estimators=500, max_depth=6, subsample=0.8, colsample_bytree=0.8)
except Exception as exc:
    print(f"XGBoost no disponible; se omite 'xgb' ({type(exc).__name__}: {exc})")


In [None]:

# %%
# Training/evaluation per target
# Two independent, initially empty accumulators.
# NOTE(review): neither list is filled in the visible cells — confirm use.
all_results, all_predictions = [], []

def eval_baselines_on_test(X, target):
    """Score both naive baselines for `target` on the test years only.

    The baselines are computed with `compute_baselines` on the full table;
    only rows whose year lies inside ``splits["test"]`` (inclusive bounds)
    are scored.

    Returns
    -------
    (dict, dict)
        Metric reports for the last-value and the historical-mean baseline,
        in that order.
    """
    with_baselines = compute_baselines(X, target)
    first_year, last_year = splits["test"]
    in_test = with_baselines["year"].between(first_year, last_year)
    test_rows = with_baselines.loc[in_test]

    last_report = evaluate_series(test_rows["y_true"], test_rows["y_last"])
    mean_report = evaluate_series(test_rows["y_true"], test_rows["y_mean"])
    return last_report, mean_report

# Baseline metrics on the test window: target -> {"baseline_last", "baseline_mean"}.
baseline_summary = {}

for target in ["consumption","price","profit"]:
    base_last, base_mean = eval_baselines_on_test(X, target)
    baseline_summary[target] = {"baseline_last": base_last, "baseline_mean": base_mean}

# These names are already imported in earlier cells; repeated here so the
# cell can be run on its own.
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

# One metrics row and one prediction frame per (target, model) pair.
results_rows = []
preds_parts = []

for target in ["consumption","price","profit"]:
    for name, model in MODELS.items():
        # Fits the model, saves it under MODELS_DIR and returns test metrics/predictions.
        report, model_path, pred_df = train_and_eval_model(X, target, model, name)
        row = {"target": target, "model": name, **report}
        results_rows.append(row)
        preds_parts.append(pred_df)

# Rank the models inside each target by ascending test RMSE.
results_df = pd.DataFrame(results_rows).sort_values(["target","RMSE"])
preds_df = pd.concat(preds_parts, ignore_index=True)

# Persist every test prediction (all targets and models) in a single CSV.
pred_file = Path(PRED_DIR) / "predicciones_test.csv"
preds_df.to_csv(pred_file, index=False)

# Cell output: top rows of the ranking, the CSV path and the baseline metrics.
# NOTE(review): baseline_summary is displayed but not merged into results_df.
results_df.head(10), pred_file, baseline_summary


In [None]:

# %%
# Resumen ejecutivo (texto)
def best_model_text(results_df):
    """Render one markdown bullet per target naming its lowest-RMSE model.

    `results_df` needs the columns "target", "model", "RMSE", "MAE" and
    "sMAPE". Targets appear in the order produced by grouping on "target";
    the bullets are joined with newlines. An empty frame yields "".
    """
    def describe(target_name, rows):
        # The row with the smallest RMSE wins for this target.
        winner = rows.sort_values("RMSE").iloc[0]
        return f"- **{target_name}**: mejor modelo = {winner['model']} (RMSE={winner['RMSE']:.2f}, MAE={winner['MAE']:.2f}, sMAPE={winner['sMAPE']:.2f}%)."

    return "\n".join(
        describe(target_name, rows)
        for target_name, rows in results_df.groupby("target")
    )

# Assemble the final markdown report: split ranges, the best model per target
# (from results_df) and the output locations, one entry per line.
# NOTE(review): the bullet claiming the supervised models beat the baselines
# is static text; it is not derived from baseline_summary — verify before use.
# NOTE(review): FIXED_COST and VAR_COST_PER_CUP are mentioned in the text but
# not defined in the visible cells — confirm where they live.
summary = "\n".join([
    "## Conclusiones",
    f"Rango temporal de entrenamiento/validación y prueba: {splits}.",
    "",
    "### Desempeño por objetivo",
    best_model_text(results_df),
    "",
    "- Los baselines (`último valor` y `promedio histórico`) sirven de referencia; en general los modelos supervisados los superan según RMSE/MAE.",
    "- La **predicción de consumo** suele beneficiarse de rezagos y medias móviles.",
    "- La **predicción de precio** al usar un precio global anualizado es más volátil; los modelos de bosque/boosting suelen capturar mejor no-linealidades.",
    "- La **utilidad** depende de supuestos de costos; los resultados deben revisarse con finanzas para ajustar `FIXED_COST` y `VAR_COST_PER_CUP`.",
    "",
    "Los modelos y las predicciones de test se han guardado en:",
    f"- Carpeta de modelos: {MODELS_DIR}/",
    f"- Archivo de predicciones: {PRED_DIR}/predicciones_test.csv",
])

print(summary)
