# Inferencia - High Garden Coffee
Este notebook contiene únicamente la parte **inferencial**:
- Modelado supervisado
- Validación temporal
- Métricas de evaluación
- Intervalos de predicción

👉 No hay gráficas (quedan en el EDA).


## 1. Imports y configuración

In [1]:

import os
import sys
import math
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Tuple, Dict

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin, clone

np.random.seed(42)

# XGBoost opcional
try:
    from xgboost import XGBRegressor
    XGB_AVAILABLE = True
except Exception:
    XGB_AVAILABLE = False

pd.set_option("display.max_columns", 120)


## 2. Carga de datos

In [None]:

PATH_COFFEE = "coffee_db.csv"
PATH_PRICES = "precios.csv"ss

def _try_read(path: str) -> pd.DataFrame:
    if os.path.exists(path):
        return pd.read_csv(path)
    alt = os.path.join("/mnt/data", os.path.basename(path))
    if os.path.exists(alt):
        return pd.read_csv(alt)
    raise FileNotFoundError(f"No encontré {path} (ni {alt}).")

def _norm_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns.str.lower()
        .str.replace("á","a").str.replace("é","e").str.replace("í","i").str.replace("ó","o").str.replace("ú","u").str.replace("ñ","n")
        .str.replace(r"[^a-z0-9]+", "_", regex=True)
        .str.strip("_")
    )
    return df

coffee = _norm_cols(_try_read(PATH_COFFEE))

# precios externo (opcional)
prices_ext = None
try:
    prices_ext = _norm_cols(_try_read(PATH_PRICES))
except FileNotFoundError:
    pass

coffee.head(3)


Unnamed: 0,country,coffee_type,1990_91,1991_92,1992_93,1993_94,1994_95,1995_96,1996_97,1997_98,1998_99,1999_00,2000_01,2001_02,2002_03,2003_04,2004_05,2005_06,2006_07,2007_08,2008_09,2009_10,2010_11,2011_12,2012_13,2013_14,2014_15,2015_16,2016_17,2017_18,2018_19,2019_20,total_domestic_consumption
0,Angola,Robusta/Arabica,1200000,1800000,2100000,1200000,1500000,600000,1200000,2400000,1800000,1200000,1200000,1200000,1200000,900000,900000,900000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,1800000,46500000
1,Bolivia (Plurinational State of),Arabica,1500000,1620000,1650000,1710000,1770000,1830000,1890000,1950000,1980000,2040000,2100000,2190000,2250000,2310000,2700000,2460000,2520000,2610000,2700000,2760000,2850000,2940000,3030000,3120000,3210000,3300000,3420000,3510000,3600000,3660000,75180000
2,Brazil,Arabica/Robusta,492000000,510000000,534000000,546000000,558000000,606000000,660000000,690000000,732000000,762000000,792000000,815400000,825000000,852000000,896760000,932280000,979860000,1026600000,1059600000,1103400000,1147920000,1183200000,1219800000,1205100000,1219980000,1230480000,1273500000,1319820000,1332000000,1320000000,27824700000


## 3. Detección de columnas y objetivos

In [4]:
print(coffee.columns.tolist())

['country', 'coffee_type', '1990_91', '1991_92', '1992_93', '1993_94', '1994_95', '1995_96', '1996_97', '1997_98', '1998_99', '1999_00', '2000_01', '2001_02', '2002_03', '2003_04', '2004_05', '2005_06', '2006_07', '2007_08', '2008_09', '2009_10', '2010_11', '2011_12', '2012_13', '2013_14', '2014_15', '2015_16', '2016_17', '2017_18', '2018_19', '2019_20', 'total_domestic_consumption']


In [3]:

YEAR_CANDIDATES = ["year", "anio", "ano"]
COUNTRY_CANDIDATES = ["country", "pais"]
TYPE_CANDIDATES = ["type", "tipo", "variety", "categoria"]

TARGET_ALIASES = {
    "consumption": ["consumption", "consumo"],
    "price": ["price", "precio"],
    "profit": ["profit", "utilidad", "ganancia"]
}

def find_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

year_col = find_col(coffee, YEAR_CANDIDATES)
country_col = find_col(coffee, COUNTRY_CANDIDATES) or "country"
type_col = find_col(coffee, TYPE_CANDIDATES)

def find_target(df, names):
    for n in names:
        if n in df.columns:
            return n
    return None

targets = {}
for k, aliases in TARGET_ALIASES.items():
    col = find_target(coffee, aliases)
    if col: targets[k] = col

if not year_col:
    raise ValueError("No pude detectar la columna de año.")

if country_col not in coffee.columns:
    coffee[country_col] = "ALL"
if type_col and type_col not in coffee.columns:
    type_col = None

print("year_col:", year_col, "| country_col:", country_col, "| type_col:", type_col)
print("targets detectados:", targets)


ValueError: No pude detectar la columna de año.

## 4. Merge con precios externos (opcional)

In [None]:

def safe_merge_externals(df: pd.DataFrame, ext: pd.DataFrame) -> pd.DataFrame:
    if ext is None:
        return df
    keys = [k for k in [year_col, country_col] if k in ext.columns]
    if not keys:
        keys = [year_col] if year_col in ext.columns else []
    if not keys:
        return df
    rename_map = {c: f"ext_{c}" for c in ext.columns if c not in keys}
    ext2 = ext.rename(columns=rename_map)
    merged = df.merge(ext2, on=keys, how="left")
    return merged

data = safe_merge_externals(coffee, prices_ext)
data[year_col] = pd.to_numeric(data[year_col], errors="coerce").astype("Int64")
data = data.sort_values([year_col, country_col] + ([type_col] if type_col else []))
data.head(3)


## 5. Splits temporales por año

In [None]:

def year_based_cv(df, year_col, initial_train_years=20, val_years=2, step=1):
    years = sorted([int(y) for y in df[year_col].dropna().unique()])
    if len(years) < (initial_train_years + val_years):
        initial_train_years = max(3, len(years) - val_years - 1)
    for start in range(0, max(1, len(years) - initial_train_years - val_years + 1), step):
        train_years = years[:initial_train_years + start]
        val_years_ = years[initial_train_years + start: initial_train_years + start + val_years]
        tr_idx = df.index[df[year_col].isin(train_years)]
        va_idx = df.index[df[year_col].isin(val_years_)]
        yield tr_idx, va_idx, train_years, val_years_

CV_GENERATOR = list(year_based_cv(data, year_col, initial_train_years=21, val_years=3, step=1))
len(CV_GENERATOR), CV_GENERATOR[0][2][-1:], CV_GENERATOR[0][3]


## 6. Features con lags y rolling

In [None]:

NUMERIC_EXCLUDE = set([year_col])
TARGET_SET = set(targets.values())
CAT_COLS = [c for c in [country_col, type_col] if c is not None]

def add_group_safe_lags(df: pd.DataFrame, group_cols: List[str], lag_cols: List[str], lags=(1,2), add_roll=True):
    df = df.copy()
    sort_cols = [year_col] + group_cols
    df = df.sort_values(sort_cols)
    for col in lag_cols:
        base = df.groupby(group_cols, group_keys=False)[col]
        for L in lags:
            df[f"{col}_lag{L}"] = base.shift(L)
        if add_roll:
            s = base.shift(1)
            for win in [2,3,5]:
                df[f"{col}_ma{win}"] = s.rolling(win, min_periods=1).mean()
                df[f"{col}_std{win}"] = s.rolling(win, min_periods=2).std()
    return df

def build_xy(df: pd.DataFrame, y_col: str) -> Tuple[pd.DataFrame, pd.Series, List[str], List[str]]:
    numeric_cols = [c for c in df.select_dtypes(include=[np.number]).columns 
                    if c not in TARGET_SET and c != year_col]
    lag_pool = sorted(set(numeric_cols + [y_col]))
    df_feat = add_group_safe_lags(df, CAT_COLS, lag_pool, lags=(1,2), add_roll=True)
    feat_cols = [c for c in df_feat.columns if any(s in c for s in ["_lag", "_ma", "_std"])]
    df_feat["t_index"] = df_feat.groupby(CAT_COLS).cumcount()
    feat_cols += ["t_index"]
    X = df_feat[feat_cols + CAT_COLS]
    y = df_feat[y_col]
    return X, y, feat_cols, CAT_COLS


## 7. Métricas

In [None]:

def rmse(y_true, y_pred): 
    return float(np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2)))

def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred))
    denom = np.where(denom == 0, 1e-9, denom)
    return float(np.mean(2.0 * np.abs(y_pred - y_true) / denom) * 100)

def mdape(y_true, y_pred):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = np.where(y_true == 0, 1e-9, np.abs(y_true))
    return float(np.median(np.abs((y_true - y_pred) / denom)) * 100)

def mase(y_true, y_pred, y_train, m=1):
    y_true, y_pred = np.array(y_true, dtype=float), np.array(y_pred, dtype=float)
    y_train = np.array(y_train, dtype=float)
    if len(y_train) <= m:
        return np.nan
    denom = np.mean(np.abs(y_train[m:] - y_train[:-m]))
    denom = denom if denom != 0 else 1e-9
    return float(np.mean(np.abs(y_true - y_pred)) / denom)

def metrics_dict(y_true, y_pred, y_train_for_mase):
    return {
        "RMSE": rmse(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "sMAPE(%)": smape(y_true, y_pred),
        "MdAPE(%)": mdape(y_true, y_pred),
        "MASE": mase(y_true, y_pred, y_train_for_mase, m=1)
    }


## 8. Baselines Naive y SNaive

In [None]:

def baseline_naive(train_df, val_df, y_col):
    tr = train_df.sort_values([year_col, *CAT_COLS])
    va = val_df.sort_values([year_col, *CAT_COLS])
    last_train = tr.groupby(CAT_COLS)[y_col].last().to_dict()
    preds = []
    for _, row in va.iterrows():
        key = tuple(row[c] for c in CAT_COLS)
        preds.append(last_train.get(key, tr[y_col].mean()))
    return np.array(preds)

def baseline_snaive(train_df, val_df, y_col, season_m=1):
    if season_m == 1:
        return baseline_naive(train_df, val_df, y_col)
    return baseline_naive(train_df, val_df, y_col)


## 9. Modelos supervisados

In [None]:

def make_preprocessor(cat_cols: List[str], scale_numeric: bool):
    transformers = []
    if cat_cols:
        transformers.append(("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols))
    if scale_numeric:
        pre = ColumnTransformer(transformers, remainder="passthrough")
        return Pipeline([("ct", pre), ("scaler", StandardScaler(with_mean=False))])
    else:
        return ColumnTransformer(transformers, remainder="passthrough")

def model_registry():
    models = {
        "ridge": Pipeline([
            ("pre", make_preprocessor(CAT_COLS, scale_numeric=True)),
            ("mdl", RidgeCV(alphas=np.logspace(-3, 3, 15), cv=5))
        ]),
        "lasso": Pipeline([
            ("pre", make_preprocessor(CAT_COLS, scale_numeric=True)),
            ("mdl", LassoCV(alphas=np.logspace(-3, 1, 15), cv=5, random_state=42, n_jobs=-1, max_iter=5000))
        ]),
        "rf": Pipeline([
            ("pre", make_preprocessor(CAT_COLS, scale_numeric=False)),
            ("mdl", RandomForestRegressor(
                n_estimators=400, max_depth=None, min_samples_leaf=2, random_state=42, n_jobs=-1
            ))
        ]),
    }
    if XGB_AVAILABLE:
        models["xgb"] = Pipeline([
            ("pre", make_preprocessor(CAT_COLS, scale_numeric=False)),
            ("mdl", XGBRegressor(
                n_estimators=800, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8,
                random_state=42, tree_method="hist", n_jobs=-1
            ))
        ])
    return models


## 10. Validación cruzada temporal

In [None]:

def eval_one_target(df: pd.DataFrame, y_col: str) -> pd.DataFrame:
    X_all, y_all, feat_cols, cat_cols = build_xy(df, y_col)
    models = model_registry()
    rows = []
    X_all = X_all.copy()
    X_all.index = df.index
    y_all = y_all.loc[X_all.index]
    y_train_global = y_all.dropna()
    for fold_id, (tr_idx, va_idx, tr_years, va_years) in enumerate(CV_GENERATOR, start=1):
        X_tr = X_all.loc[tr_idx].dropna()
        y_tr = y_all.loc[X_tr.index]
        X_va = X_all.loc[va_idx].dropna()
        y_va = y_all.loc[X_va.index]
        train_df = df.loc[X_tr.index]
        val_df   = df.loc[X_va.index]
        yhat_naive  = baseline_naive(train_df, val_df, y_col)
        yhat_snaive = baseline_snaive(train_df, val_df, y_col)
        rows.append({"target": y_col, "model": "naive", "fold": fold_id,
                     **metrics_dict(y_va.values, yhat_naive, y_tr.values)})
        rows.append({"target": y_col, "model": "snaive", "fold": fold_id,
                     **metrics_dict(y_va.values, yhat_snaive, y_tr.values)})
        for name, pipe in models.items():
            mdl = clone(pipe)
            mdl.fit(X_tr, y_tr)
            pred = mdl.predict(X_va)
            rows.append({"target": y_col, "model": name, "fold": fold_id,
                         **metrics_dict(y_va.values, pred, y_tr.values)})
    return pd.DataFrame(rows)

all_results = []
for logical_name, y_col in targets.items():
    print(f"↳ Evaluando: {logical_name} [{y_col}]")
    res = eval_one_target(data, y_col)
    res["logical_target"] = logical_name
    all_results.append(res)

results_df = pd.concat(all_results, ignore_index=True)
results_df.head()


## 11. Resumen de métricas

In [None]:

def summarize_results(df: pd.DataFrame) -> pd.DataFrame:
    agg = df.groupby(["logical_target", "model"]).agg(
        RMSE_mean=("RMSE","mean"), MAE_mean=("MAE","mean"),
        sMAPE_mean=("sMAPE(%)","mean"), MdAPE_mean=("MdAPE(%)","mean"),
        MASE_mean=("MASE","mean"), folds=("fold","nunique")
    ).reset_index()
    agg["rank_sMAPE"] = agg.groupby("logical_target")["sMAPE_mean"].rank(method="first")
    return agg.sort_values(["logical_target","rank_sMAPE"])

summary_df = summarize_results(results_df)
summary_df


## 12. Entrenamiento final y guardado

In [None]:

from joblib import dump
import pathlib

MODELS_OUT = pathlib.Path("models"); MODELS_OUT.mkdir(exist_ok=True)

BEST_BY_TARGET = (
    summary_df.loc[~summary_df["model"].isin(["naive","snaive"])]
    .sort_values(["logical_target","sMAPE_mean"])
    .groupby("logical_target").first().reset_index()
)[["logical_target","model"]]

print(BEST_BY_TARGET)

final_artifacts = []
for _, row in BEST_BY_TARGET.iterrows():
    logical_target = row["logical_target"]
    y_col = targets[logical_target]
    X_all, y_all, feat_cols, cat_cols = build_xy(data, y_col)
    X_fit = X_all.dropna(); y_fit = y_all.loc[X_fit.index]
    mdl = clone(model_registry()[row["model"]])
    mdl.fit(X_fit, y_fit)
    path_m = MODELS_OUT / f"{logical_target}_{row['model']}.joblib"
    dump(mdl, path_m)
    final_artifacts.append((logical_target, row["model"], str(path_m)))

pd.DataFrame(final_artifacts, columns=["target","model","path"])
