# 📘 Inferencia

Cuaderno para entrenar y exportar modelos de `price`, `consumption` y `profit`.

Incluye parches mínimos:
- `positive_wrapper` con `TransformedTargetRegressor` (sin lambdas) para evitar valores negativos en *price/consumption*.
- Imputación simple de NaN en `X` (mediana por columna y, si aún faltan valores, 0) y recorte de `y` a valores `>= 0` para *price/consumption*.
- Guardado de *artifacts* “ricos” en `models/{target}_model.joblib`, compatibles con la app/bot.


In [1]:
# === Imports y configuración ===
import os, warnings
import numpy as np
import pandas as pd
from pathlib import Path
from joblib import dump

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

warnings.filterwarnings('ignore')

DATA_CLEAN = os.getenv('DATA_CLEAN', 'data/coffee_clean.csv')
OUT_MODELS = Path('models'); OUT_MODELS.mkdir(exist_ok=True)
OUT_PREDS  = Path('predicciones'); OUT_PREDS.mkdir(exist_ok=True)
RANDOM_STATE = 42
print('Usando DATA_CLEAN =', DATA_CLEAN)

Usando DATA_CLEAN = data/coffee_clean.csv


In [2]:
# === Utilidades ===
def print_header(title: str):
    """Print ``title`` framed above and below by a rule of ``=`` characters.

    The rule is exactly as wide as the title, and a blank line is emitted
    first so that consecutive sections are visually separated.
    """
    rule = '=' * len(title)
    print(f'\n{rule}\n{title}\n{rule}')

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|y_pred - y_true| / (|y_true| + |y_pred|)`` (no factor
    of 2 in the denominator). Pairs where both values are zero use a
    denominator of 1, so they contribute 0 instead of dividing by zero.
    Both inputs are coerced to float arrays before the computation.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast)
    # A zero scale means both values are zero; the error is zero as well.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return 100.0 * np.mean(abs_error / safe_scale)

def positive_wrapper(regressor):
    """Wrap ``regressor`` in a ``TransformedTargetRegressor`` using log1p/expm1.

    Intended for non-negative targets. The transforms are plain NumPy
    functions rather than lambdas so that the wrapped model stays picklable.
    """
    log_target_kwargs = {'func': np.log1p, 'inverse_func': np.expm1}
    return TransformedTargetRegressor(regressor=regressor, **log_target_kwargs)

def ensure_basic_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with canonical ``country``/``type``/``year`` columns.

    The aliases ``Country``, ``Coffee type`` and ``año`` are renamed to
    their canonical names, but only when the canonical column is not
    already present. If a ``year`` column exists afterwards, it is coerced
    to the nullable ``Int64`` dtype; unparseable values become ``<NA>``.
    The input frame is never modified.
    """
    aliases = {'Country': 'country', 'Coffee type': 'type', 'año': 'year'}
    # The three renames never touch each other's names, so they can be
    # decided up front and applied in a single pass.
    renames = {
        old: new
        for old, new in aliases.items()
        if old in df.columns and new not in df.columns
    }
    out = df.copy().rename(columns=renames)
    if 'year' in out.columns:
        numeric_year = pd.to_numeric(out['year'], errors='coerce')
        out['year'] = numeric_year.astype('Int64')
    return out

def load_master_df():
    """Load the first dataset CSV that exists and normalize its basic columns.

    Candidate paths are tried in order: the path in ``DATA_CLEAN``, then
    ``coffee_clean.csv`` and ``coffee_db.csv``. The loaded frame is passed
    through ``ensure_basic_cols`` before being returned.

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    candidates = (DATA_CLEAN, 'coffee_clean.csv', 'coffee_db.csv')
    found = next((path for path in candidates if os.path.exists(path)), None)
    if found is None:
        raise FileNotFoundError('No encontré data. Revisa DATA_CLEAN o coloca coffee_clean.csv.')
    return ensure_basic_cols(pd.read_csv(found))

def try_import_build_xy():
    """Return ``build_xy`` from ``utils.features`` if it can be imported, else ``None``.

    Any exception raised during the import is swallowed on purpose: the
    module is optional, and ``None`` signals that it is unavailable.
    """
    try:
        from utils.features import build_xy  # optional, project-local module
    except Exception:
        project_builder = None
    else:
        project_builder = build_xy
    return project_builder

def build_xy_fallback(df_all: pd.DataFrame, y_col: str):
    """Build a minimal feature matrix: ``year`` plus one-hot ``country``/``type``.

    No lag features are created, so this step introduces no NaN values of
    its own. Only the columns actually present in ``df_all`` are used.

    Returns:
        A tuple ``(X, y, meta)`` where ``X`` is the dummy-encoded feature
        frame, ``y`` is the ``y_col`` column (or ``None`` when it is
        missing) and ``meta`` is ``{'group_cols': [...]}`` listing the
        categorical columns that were found.
    """
    df = df_all.copy()
    categorical = [col for col in ('country', 'type') if col in df.columns]
    feature_cols = (['year'] if 'year' in df.columns else []) + categorical
    X = pd.get_dummies(df[feature_cols], columns=categorical, drop_first=False)
    y = df[y_col] if y_col in df.columns else None
    return X, y, {'group_cols': list(categorical)}

In [3]:
# === Load dataset ===
master_df = load_master_df()
# Report the year span covered by the data as a (min, max) tuple.
print('Rango de años:', tuple(int(master_df['year'].agg(stat)) for stat in ('min', 'max')))
print('Columnas:', master_df.columns.tolist())
# Keep only the targets that are actually present in the dataset.
TARGETS = [col for col in ('price', 'consumption', 'profit') if col in master_df.columns]
assert TARGETS, 'No hay columnas de target (price/consumption/profit).'
print('Targets:', TARGETS)

Rango de años: (1991, 2020)
Columnas: ['country', 'type', 'consumption', 'year', 'price', 'revenue', 'profit', 'margin', 'market_share']
Targets: ['price', 'consumption', 'profit']


In [4]:
# === Per-target training (minimal patch) ===
# For every target: build features, split by year, pick the best candidate
# model on the validation year, derive absolute-residual bands, evaluate on
# the test year, persist the artifact and collect the test predictions.
results = []
preds_all = []

# The optional project feature builder does not depend on the target,
# so it is resolved once instead of on every iteration.
build_xy = try_import_build_xy()

for TARGET in TARGETS:
    print_header(f'Entrenando target = {TARGET}')

    if build_xy is None:
        X, y, meta = build_xy_fallback(master_df, TARGET)
    else:
        X, y, meta = build_xy(master_df.copy(), TARGET)

    # price/consumption are clipped at zero and trained through the
    # log1p/expm1 wrapper; profit is modelled as-is.
    non_negative = TARGET in ('price', 'consumption')

    # Minimal cleaning: drop rows without a target value, clip where it
    # applies, then impute features with the column median (0 as last resort).
    mask_ok = y.notna()
    X = X.loc[mask_ok].copy()
    y = y.loc[mask_ok].copy()
    if non_negative:
        y = y.clip(lower=0)
    # NOTE(review): the medians are computed over all rows, including the
    # validation/test years — confirm this is acceptable.
    X = X.fillna(X.median(numeric_only=True)).fillna(0)

    # Temporal split: last year -> test, second to last -> validation,
    # everything else -> train. Assumes X keeps master_df's index labels.
    years = pd.to_numeric(master_df.loc[X.index, 'year'], errors='coerce')
    uniq_years = sorted(int(v) for v in years.dropna().unique())
    if not uniq_years:
        raise ValueError(f'Target {TARGET}: no hay años válidos para hacer el split temporal.')
    test_year = uniq_years[-1]
    val_year = uniq_years[-2] if len(uniq_years) >= 2 else uniq_years[-1]
    mask_test = (years == test_year)
    mask_val = (years == val_year) & (~mask_test)
    mask_train = ~(mask_val | mask_test)
    if mask_train.sum() == 0:
        # Too few years for a separate validation set: fold the validation
        # rows back into training and leave validation empty.
        mask_train = ~mask_test
        mask_val = (mask_train & False)
    if mask_train.sum() == 0:
        # Single-year dataset: fail here with a clear message instead of
        # letting the estimator fail on an empty training set.
        raise ValueError(
            f'Target {TARGET}: no quedan filas de entrenamiento '
            '(se necesitan al menos dos años distintos).'
        )

    X_train, y_train = X.loc[mask_train], y.loc[mask_train]
    X_val,   y_val   = X.loc[mask_val],   y.loc[mask_val]
    X_test,  y_test  = X.loc[mask_test],  y.loc[mask_test]

    # Candidate models
    candidates = {
        'ridge': Pipeline([('scaler', StandardScaler(with_mean=False)), ('model', Ridge(alpha=1.0, random_state=RANDOM_STATE))]),
        'lasso': Pipeline([('scaler', StandardScaler(with_mean=False)), ('model', Lasso(alpha=0.01, random_state=RANDOM_STATE, max_iter=10000))]),
        'rf':    Pipeline([('model', RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1))])
    }

    # Model selection uses the validation set when it exists and falls
    # back to the training set otherwise.
    has_val = len(X_val) > 0
    X_sel, y_sel = (X_val, y_val) if has_val else (X_train, y_train)

    best_name, best_model, best_mae, yhat_val = None, None, float('inf'), None
    for name, pipe in candidates.items():
        model = positive_wrapper(pipe) if non_negative else pipe
        model.fit(X_train, y_train)
        y_hat = model.predict(X_sel)
        mae = mean_absolute_error(y_sel, y_hat)
        if mae < best_mae:
            # Keep the winner's predictions so they need not be recomputed.
            best_name, best_model, best_mae, yhat_val = name, model, mae, y_hat
    print(f'Mejor modelo en val: {best_name} (MAE={best_mae:.4f})')

    # Bands: quantiles of the absolute residuals on the selection set.
    resid = np.asarray(y_sel) - np.asarray(yhat_val)
    abs_resid = np.abs(resid)
    PI80_abs = float(np.quantile(abs_resid, 0.80)) if len(resid)>0 else 0.0
    PI95_abs = float(np.quantile(abs_resid, 0.95)) if len(resid)>0 else 0.0

    # Test metrics (NaN when the test split is empty).
    yhat_test = best_model.predict(X_test) if len(X_test) else np.array([])
    rmse = float(np.sqrt(mean_squared_error(y_test, yhat_test))) if len(y_test)>0 else np.nan
    mae  = float(mean_absolute_error(y_test, yhat_test)) if len(y_test)>0 else np.nan
    sm   = float(smape(y_test, yhat_test)) if len(y_test)>0 else np.nan
    print(f'Test {TARGET}: RMSE={rmse:.4f} | MAE={mae:.4f} | sMAPE={sm:.3f}% | n_test={int(mask_test.sum())}')

    # Save the "rich" artifact: the model plus the metadata needed to score.
    feat_cols = list(X.columns) if hasattr(X, 'columns') else None
    artifact = {
        'model': best_model,
        'y_col': TARGET,
        'feat_cols': feat_cols,
        'group_cols': [c for c in ['country','type'] if c in master_df.columns],
        'PI80_abs': PI80_abs,
        'PI95_abs': PI95_abs,
    }
    out_path = OUT_MODELS / f'{TARGET}_model.joblib'
    dump(artifact, out_path)
    print('✅ Artifact guardado ->', out_path)

    # Test predictions with symmetric bands around the point forecast.
    if len(yhat_test):
        base_cols = ['year'] + [c for c in ['country','type'] if c in master_df.columns]
        base = master_df.loc[X_test.index, base_cols].copy()
        base[f'pred_{TARGET}'] = yhat_test
        if PI80_abs>0:
            base[f'pred_{TARGET}_lo80'] = yhat_test - PI80_abs
            base[f'pred_{TARGET}_hi80'] = yhat_test + PI80_abs
        if PI95_abs>0:
            base[f'pred_{TARGET}_lo95'] = yhat_test - PI95_abs
            base[f'pred_{TARGET}_hi95'] = yhat_test + PI95_abs
        preds_all.append(base)

    results.append({'target': TARGET, 'best_model': best_name,
                    'RMSE': rmse, 'MAE': mae, 'sMAPE%': sm, 'n_test': int(mask_test.sum()),
                    'artifact': str(out_path), 'PI80_abs': PI80_abs, 'PI95_abs': PI95_abs,
                    'test_year': test_year, 'val_year': val_year})

results_df = pd.DataFrame(results)
results_df


Entrenando target = price
Mejor modelo en val: rf (MAE=10.0550)
Test price: RMSE=0.8500 | MAE=0.8500 | sMAPE=0.383% | n_test=55
✅ Artifact guardado -> models/price_model.joblib

Entrenando target = consumption
Mejor modelo en val: rf (MAE=1545649.7304)
Test consumption: RMSE=5270065.0822 | MAE=2021107.0803 | sMAPE=1.547% | n_test=55
✅ Artifact guardado -> models/consumption_model.joblib

Entrenando target = profit
Mejor modelo en val: rf (MAE=819023721.3023)
Test profit: RMSE=1236075648.5734 | MAE=376286684.7858 | sMAPE=4.984% | n_test=55
✅ Artifact guardado -> models/profit_model.joblib


Unnamed: 0,target,best_model,RMSE,MAE,sMAPE%,n_test,artifact,PI80_abs,PI95_abs,test_year,val_year
0,price,rf,0.8499985,0.8499985,0.382762,55,models/price_model.joblib,10.055,10.055,2020,2019
1,consumption,rf,5270065.0,2021107.0,1.547106,55,models/consumption_model.joblib,588040.8,7660839.0,2020,2019
2,profit,rf,1236076000.0,376286700.0,4.983959,55,models/profit_model.joblib,410555400.0,2720336000.0,2020,2019


In [5]:
# === Export test predictions ===
# Outer-join the per-target prediction frames on whichever of the key
# columns both sides share, then write a single CSV.
if not preds_all:
    print('No se generaron predicciones de test.')
else:
    df_preds = preds_all[0]
    for extra in preds_all[1:]:
        join_keys = [
            key for key in ('year', 'country', 'type')
            if key in df_preds.columns and key in extra.columns
        ]
        df_preds = df_preds.merge(extra, how='outer', on=join_keys)
    out_csv = OUT_PREDS / 'predicciones_test.csv'
    df_preds.to_csv(out_csv, index=False)
    print('📦 Predicciones de test ->', out_csv)

📦 Predicciones de test -> predicciones/predicciones_test.csv
