Script 1A: Predicciones de Modelos de Machine Learning (LGBM, XGBoost)

In [3]:
# SCRIPT 1A COMPLETO: generar_preds_ml.py (Corregido)

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from tqdm.auto import tqdm
import gc

# ==============================================================================
# FUNCIÓN DE FEATURE ENGINEERING
# ==============================================================================
def create_ultimate_features(dataframe):
    print("Creando características...")
    df_feat = dataframe.copy()
    lag_base = 2
    lags = list(range(lag_base, 13)) + [18]
    windows = [3, 6, 12]
    for lag in tqdm(lags, desc="Lags"):
        df_feat[f'lag_{lag}'] = df_feat.groupby('product_id')['tn'].shift(lag)
    for window in windows:
        series = df_feat.groupby('product_id')['tn'].shift(lag_base)
        df_feat[f'rolling_mean_{window}'] = series.rolling(window, min_periods=1).mean()
        df_feat[f'rolling_std_{window}'] = series.rolling(window, min_periods=1).std()
    df_feat['month'] = df_feat['periodo'].dt.month
    df_feat['year'] = df_feat['periodo'].dt.year
    return df_feat

# ==============================================================================
# EJECUCIÓN DEL SCRIPT
# ==============================================================================
if __name__ == "__main__":
    print("--- Generando predicciones históricas para modelos de ML ---")

    # --- Carga y Preparación ---
    df_original = pd.read_csv('dataframe_final_exportado.csv')
    df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
    df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
    df_agg = df_agg.sort_values(['product_id', 'periodo'])

    # --- Feature Engineering ---
    df_featured = create_ultimate_features(df_agg)
    
    # --- Creación de Target y Set de Datos Completo ---
    df_featured['target'] = df_featured.groupby('product_id')['tn'].shift(-2)
    
    # --- INICIO DE LA CORRECCIÓN ---
    # Rellenamos NaNs solo en las columnas de features
    features = [col for col in df_featured.columns if col not in ['product_id', 'periodo', 'tn', 'target']]
    df_featured[features] = df_featured[features].fillna(0)
    # --- FIN DE LA CORRECCIÓN ---
    
    # Este es el DataFrame que faltaba
    df_train_full = df_featured.dropna(subset=['target']).copy()
    
    del df_original, df_agg, df_featured
    gc.collect()

    # --- Dividir para predecir Dic 2019 ---
    # Entrenamos con todos los datos cuyo target es anterior a Dic 2019
    # El target para las filas de Oct 2019 es Dic 2019, así que entrenamos hasta Sep 2019
    train_ml = df_train_full[df_train_full['periodo'] < '2019-10-01'].copy()
    # El set de predicción son las filas de Oct 2019
    pred_ml_base = df_train_full[df_train_full['periodo'] == '2019-10-01'].copy()

    X_train, y_train = train_ml[features], train_ml['target']
    X_pred = pred_ml_base[features]

    # --- Entrenar y Predecir con LGBM ---
    print("\nEntrenando LGBM...")
    model_lgbm = lgb.LGBMRegressor(random_state=42)
    model_lgbm.fit(X_train, y_train)
    preds_lgbm = model_lgbm.predict(X_pred)
    pd.DataFrame({'product_id': pred_ml_base['product_id'], 'tn_lgbm': preds_lgbm}).to_csv('preds_lgbm_dec2019.csv', index=False)
    print("Predicción de LGBM para Dic 2019 guardada.")
    
    # --- Entrenar y Predecir con XGBoost ---
    print("\nEntrenando XGBoost...")
    model_xgb = xgb.XGBRegressor(random_state=42)
    model_xgb.fit(X_train, y_train)
    preds_xgb = model_xgb.predict(X_pred)
    pd.DataFrame({'product_id': pred_ml_base['product_id'], 'tn_xgb': preds_xgb}).to_csv('preds_xgb_dec2019.csv', index=False)
    print("Predicción de XGBoost para Dic 2019 guardada.")

    print("\n🎉 Script 1A completado.")

--- Generando predicciones históricas para modelos de ML ---
Creando características...


Lags: 100%|██████████| 12/12 [00:00<00:00, 2475.25it/s]


Entrenando LGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4607
[LightGBM] [Info] Number of data points in the train set: 20009, number of used features: 20
[LightGBM] [Info] Start training from score 51.660180





Predicción de LGBM para Dic 2019 guardada.

Entrenando XGBoost...
Predicción de XGBoost para Dic 2019 guardada.

🎉 Script 1A completado.


In [1]:
# SCRIPT 1B: generar_preds_clasicos.py

import pandas as pd
from statsmodels.tsa.api import ExponentialSmoothing
from prophet import Prophet
from tqdm.auto import tqdm
import warnings

warnings.filterwarnings("ignore")

print("--- Generando predicciones históricas para ETS y Prophet ---")

INPUT_FILE = 'dataframe_final_exportado.csv'
OUTPUT_FILE = 'preds_clasicos_dec2019.csv'

df_original = pd.read_csv(INPUT_FILE)
df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()

all_products = df_agg['product_id'].unique()
all_preds = []

for product_id in tqdm(all_products, desc="Procesando productos"):
    ts = df_agg[df_agg['product_id'] == product_id].set_index('periodo')['tn']
    train_data = ts[ts.index < '2019-12-01']

    if len(train_data) < 13:
        all_preds.append({'product_id': product_id, 'pred_ets': 0, 'pred_prophet': 0})
        continue

    # Modelo 1: ETS (Holt-Winters)
    pred_ets = 0
    try:
        model_ets = ExponentialSmoothing(train_data, trend="add", seasonal="add", seasonal_periods=12, initialization_method="estimated").fit()
        pred_ets = model_ets.forecast(1).iloc[0]
    except Exception:
        pass

    # Modelo 2: Prophet
    pred_prophet = 0
    try:
        df_prophet = train_data.reset_index().rename(columns={'periodo': 'ds', 'tn': 'y'})
        model_prophet = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
        model_prophet.fit(df_prophet)
        future = model_prophet.make_future_dataframe(periods=1, freq='MS')
        pred_prophet = model_prophet.predict(future)['yhat'].iloc[-1]
    except Exception:
        pass

    all_preds.append({
        'product_id': product_id,
        'pred_ets': pred_ets,
        'pred_prophet': pred_prophet
    })

pd.DataFrame(all_preds).to_csv(OUTPUT_FILE, index=False)
print(f"\n🎉 Predicciones Clásicas (ETS, Prophet) generadas en '{OUTPUT_FILE}'.")

  from .autonotebook import tqdm as notebook_tqdm


--- Generando predicciones históricas para ETS y Prophet ---


Procesando productos:   0%|          | 0/780 [00:00<?, ?it/s]18:03:33 - cmdstanpy - INFO - Chain [1] start processing
18:03:33 - cmdstanpy - INFO - Chain [1] done processing
Procesando productos:   0%|          | 1/780 [00:01<24:56,  1.92s/it]18:03:33 - cmdstanpy - INFO - Chain [1] start processing
18:03:33 - cmdstanpy - INFO - Chain [1] done processing
Procesando productos:   0%|          | 2/780 [00:02<11:07,  1.17it/s]18:03:33 - cmdstanpy - INFO - Chain [1] start processing
18:03:34 - cmdstanpy - INFO - Chain [1] done processing
Procesando productos:   0%|          | 3/780 [00:02<06:44,  1.92it/s]18:03:34 - cmdstanpy - INFO - Chain [1] start processing
18:03:34 - cmdstanpy - INFO - Chain [1] done processing
Procesando productos:   1%|          | 4/780 [00:02<04:45,  2.72it/s]18:03:34 - cmdstanpy - INFO - Chain [1] start processing
18:03:34 - cmdstanpy - INFO - Chain [1] done processing
Procesando productos:   1%|          | 5/780 [00:02<03:39,  3.53it/s]18:03:34 - cmdstanpy - INFO -


🎉 Predicciones Clásicas (ETS, Prophet) generadas en 'preds_clasicos_dec2019.csv'.





In [2]:
# SCRIPT 1C: generar_preds_autogluon.py

import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from dateutil.relativedelta import relativedelta

print("--- Fase 1C: Generando la predicción histórica de AutoGluon ---")

INPUT_FILE = 'dataframe_final_exportado.csv'
OUTPUT_FILE = 'preds_autogluon_dec2019.csv'
MODELS_PATH = './autogluon_models_bakeoff'

# Cargar y preparar los datos
df_original = pd.read_csv(INPUT_FILE)
df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
df_agg.rename(columns={'periodo': 'timestamp', 'product_id': 'item_id', 'tn': 'target'}, inplace=True)
ts_df = TimeSeriesDataFrame.from_data_frame(df_agg, id_column="item_id", timestamp_column="timestamp")

# Dividir los datos: entrenar hasta Noviembre de 2019
train_data = ts_df[ts_df.index.get_level_values('timestamp') <= '2019-11-01']
print(f"Datos de entrenamiento para AutoGluon: {len(train_data)} filas, hasta 2019-11-01")

# Entrenar AutoGluon para predecir 1 mes en el futuro
predictor = TimeSeriesPredictor(
    prediction_length=1, # <-- La clave: solo predecimos 1 mes
    path=MODELS_PATH,
    target='target',
    eval_metric='MASE',
    freq='MS'
)
predictor.fit(
    train_data,
    presets='medium_quality',
    time_limit=3600 * 2 # Límite de 2 horas
)

# Predecir Diciembre 2019
predictions = predictor.predict(train_data)

# Formatear y guardar
predictions_df = predictions.reset_index()
predictions_df.rename(columns={'mean': 'tn_autogluon'}, inplace=True)
final_df = predictions_df[['item_id', 'tn_autogluon']].rename(columns={'item_id': 'product_id'})
final_df.to_csv(OUTPUT_FILE, index=False)

print(f"\n🎉 Predicción de AutoGluon para Dic 2019 generada en '{OUTPUT_FILE}'.")

--- Fase 1C: Generando la predicción histórica de AutoGluon ---


Beginning AutoGluon training... Time limit = 7200s
AutoGluon will save models to '/Users/wilmerandresalarcon/Library/CloudStorage/OneDrive-Personal/MCD/2 AÑO/Labo III/Trabajo_final/autogluon_models_bakeoff'


Datos de entrenamiento para AutoGluon: 21569 filas, hasta 2019-11-01


AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 24.5.0: Tue Apr 22 19:53:27 PDT 2025; root:xnu-11417.121.6~2/RELEASE_ARM64_T6041
CPU Count:          12
GPU Count:          0
Memory Avail:       10.45 GB / 24.00 GB (43.5%)
Disk Space Avail:   370.93 GB / 460.43 GB (80.6%)
Setting presets to: medium_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': MASE,
 'freq': 'MS',
 'hyperparameters': 'light',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 1,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'target',
 'time_limit': 7200,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency 'MS'.
Provided train_data has 21595 rows (NaN fraction=0.1%), 780 time series. Median time series l


🎉 Predicción de AutoGluon para Dic 2019 generada en 'preds_autogluon_dec2019.csv'.


In [3]:
# SCRIPT: generar_preds_ml_final.py

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from tqdm.auto import tqdm
import gc

# ==============================================================================
# FUNCIÓN DE FEATURE ENGINEERING
# ==============================================================================
def create_ultimate_features(dataframe):
    print("Creando características...")
    df_feat = dataframe.copy()
    lag_base = 2
    lags = list(range(lag_base, 13)) + [18]
    windows = [3, 6, 12]
    for lag in tqdm(lags, desc="Lags"):
        df_feat[f'lag_{lag}'] = df_feat.groupby('product_id')['tn'].shift(lag)
    for window in windows:
        series = df_feat.groupby('product_id')['tn'].shift(lag_base)
        df_feat[f'rolling_mean_{window}'] = series.rolling(window, min_periods=1).mean()
        df_feat[f'rolling_std_{window}'] = series.rolling(window, min_periods=1).std()
    df_feat['month'] = df_feat['periodo'].dt.month
    df_feat['year'] = df_feat['periodo'].dt.year
    return df_feat

# ==============================================================================
# EJECUCIÓN DEL SCRIPT
# ==============================================================================
if __name__ == "__main__":
    print("--- Generando predicciones históricas para modelos de ML ---")

    # --- Carga y Preparación ---
    df_original = pd.read_csv('dataframe_final_exportado.csv')
    df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
    df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
    df_agg = df_agg.sort_values(['product_id', 'periodo'])

    # --- Feature Engineering ---
    df_featured = create_ultimate_features(df_agg)
    
    # --- Creación de Target y Set de Datos Completo ---
    df_featured['target'] = df_featured.groupby('product_id')['tn'].shift(-2)
    features = [col for col in df_featured.columns if col not in ['product_id', 'periodo', 'tn', 'target']]
    df_featured[features] = df_featured[features].fillna(0)
    df_train_full = df_featured.dropna(subset=['target']).copy()
    
    del df_original, df_agg, df_featured
    gc.collect()

    # --- Dividir para predecir Dic 2019 ---
    train_ml = df_train_full[df_train_full['periodo'] < '2019-10-01'].copy()
    pred_ml_base = df_train_full[df_train_full['periodo'] == '2019-10-01'].copy()

    X_train, y_train = train_ml[features], train_ml['target']
    X_pred = pred_ml_base[features]

    # --- Entrenar y Predecir con LGBM ---
    print("\nEntrenando LGBM...")
    model_lgbm = lgb.LGBMRegressor(random_state=42)
    model_lgbm.fit(X_train, y_train)
    preds_lgbm = model_lgbm.predict(X_pred)
    pd.DataFrame({'product_id': pred_ml_base['product_id'], 'tn_lgbm': preds_lgbm}).to_csv('preds_lgbm_dec2019.csv', index=False)
    print("Predicción de LGBM para Dic 2019 guardada.")
    
    # --- Entrenar y Predecir con XGBoost ---
    print("\nEntrenando XGBoost...")
    model_xgb = xgb.XGBRegressor(random_state=42)
    model_xgb.fit(X_train, y_train)
    preds_xgb = model_xgb.predict(X_pred)
    pd.DataFrame({'product_id': pred_ml_base['product_id'], 'tn_xgb': preds_xgb}).to_csv('preds_xgb_dec2019.csv', index=False)
    print("Predicción de XGBoost para Dic 2019 guardada.")

    # --- Entrenar y Predecir con Random Forest ---
    print("\nEntrenando Random Forest...")
    model_rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    model_rf.fit(X_train, y_train)
    preds_rf = model_rf.predict(X_pred)
    pd.DataFrame({'product_id': pred_ml_base['product_id'], 'tn_rf': preds_rf}).to_csv('preds_rf_dec2019.csv', index=False)
    print("Predicción de Random Forest para Dic 2019 guardada.")

    print("\n🎉 Todos los scripts de ML han completado la Fase 1.")

--- Generando predicciones históricas para modelos de ML ---
Creando características...


Lags: 100%|██████████| 12/12 [00:00<00:00, 2830.96it/s]


Entrenando LGBM...





Predicción de LGBM para Dic 2019 guardada.

Entrenando XGBoost...
Predicción de XGBoost para Dic 2019 guardada.

Entrenando Random Forest...
Predicción de Random Forest para Dic 2019 guardada.

🎉 Todos los scripts de ML han completado la Fase 1.


In [1]:
# SCRIPT: generar_preds_plottwist.py

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm
import warnings

warnings.filterwarnings("ignore")

print("--- Generando predicciones históricas con la estrategia 'Plot Twist' ---")

# --- CONFIGURACIÓN ---
INPUT_FILE = 'dataframe_final_exportado.csv'
OUTPUT_FILE = 'preds_plottwist_dec2019.csv'

# --- CARGA Y PREPARACIÓN ---
df_original = pd.read_csv(INPUT_FILE)
df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
df = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
df = df.sort_values(['product_id', 'periodo'])

# --- FEATURE ENGINEERING Y TARGET ---
for lag in tqdm(range(1, 12), desc="Creando Lags"):
    df[f'tn_{lag}'] = df.groupby('product_id')['tn'].shift(lag)
df['clase'] = df.groupby('product_id')['tn'].shift(-2)

# --- ENTRENAMIENTO (Lógica Original) ---
df_201812 = df[df['periodo'] == '2018-12-01'].copy()
magicos = [ 20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
            20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
            20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
            20193, 20320, 20532, 20612, 20637, 20807, 20838 ]
df_training_base = df_201812[df_201812['product_id'].isin(magicos)]
feature_cols = ['tn'] + [f'tn_{i}' for i in range(1, 12)]
target_col = 'clase'
df_training = df_training_base.dropna(subset=feature_cols + [target_col])

X_train = df_training[feature_cols]
y_train = df_training[target_col]

model = LinearRegression()
model.fit(X_train, y_train)
print("Modelo 'Plot Twist' entrenado exitosamente.")

# --- PREDICCIÓN PARA DIC 2019 (LA ADAPTACIÓN CLAVE) ---
print("\nPreparando la predicción para Diciembre 2019...")

# Para predecir Dic 2019 (target de Oct 2019), usamos los datos de Octubre 2019
df_201910 = df[df['periodo'] == '2019-10-01'].copy()

# Estrategia híbrida (igual que el original)
df_201910['history_complete'] = ~df_201910[feature_cols].isnull().any(axis=1)
df_to_predict_lr = df_201910[df_201910['history_complete']].copy()
df_to_predict_avg = df_201910[~df_201910['history_complete']].copy()

if not df_to_predict_lr.empty:
    X_predict_lr = df_to_predict_lr[feature_cols]
    df_to_predict_lr['prediction'] = model.predict(X_predict_lr)

if not df_to_predict_avg.empty:
    df_to_predict_avg['prediction'] = df_to_predict_avg[feature_cols].mean(axis=1)

df_final_prediction = pd.concat([df_to_predict_lr, df_to_predict_avg])
df_final_prediction['prediction'] = df_final_prediction['prediction'].clip(lower=0)

# --- GUARDAR RESULTADO ---
submission_df = df_final_prediction[['product_id', 'prediction']].rename(columns={'prediction': 'tn_plottwist'})
submission_df.to_csv(OUTPUT_FILE, index=False)

print(f"\n🎉 Predicción de 'Plot Twist' para Dic 2019 guardada en '{OUTPUT_FILE}'.")

  from .autonotebook import tqdm as notebook_tqdm


--- Generando predicciones históricas con la estrategia 'Plot Twist' ---


Creando Lags: 100%|██████████| 11/11 [00:00<00:00, 2197.02it/s]

Modelo 'Plot Twist' entrenado exitosamente.

Preparando la predicción para Diciembre 2019...

🎉 Predicción de 'Plot Twist' para Dic 2019 guardada en 'preds_plottwist_dec2019.csv'.





In [2]:
# SCRIPT: generar_preds_cnn.py (Versión Corregida y Robusta)

import pandas as pd
import numpy as np
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
import gc

# ==============================================================================
# CONFIGURACIÓN
# ==============================================================================
INPUT_FILE = 'dataframe_final_exportado.csv'
OUTPUT_FILE = 'preds_cnn_dec2019.csv'
N_STEPS_IN = 12  # Usaremos 12 meses de historia para predecir
N_STEPS_OUT = 2   # Queremos predecir 2 meses en el futuro (t+2)

# ==============================================================================
# FUNCIÓN PARA CREAR SECUENCIAS
# ==============================================================================
def create_sequences(data, n_steps_in, n_steps_out):
    X, y = [], []
    for i in range(len(data)):
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out -1
        if out_end_ix >= len(data):
            break
        seq_x, seq_y = data[i:end_ix], data[out_end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

# ==============================================================================
# EJECUCIÓN DEL SCRIPT
# ==============================================================================
if __name__ == "__main__":
    print("--- Generando predicciones históricas con CNN (v2) ---")

    # --- Carga y Preparación ---
    df_original = pd.read_csv(INPUT_FILE)
    df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
    df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
    
    # --- Escalar los Datos Correctamente ---
    print("Escalando todos los datos de ventas...")
    # Entrenamos un único scaler con TODOS los valores de ventas
    scaler = StandardScaler()
    df_agg['tn_scaled'] = scaler.fit_transform(df_agg[['tn']])

    # --- Crear Secuencias para Entrenamiento ---
    print("Creando secuencias para la red neuronal...")
    all_X_train, all_y_train = [], []
    # Creamos secuencias para cada producto por separado y luego las unimos
    for product_id in tqdm(df_agg['product_id'].unique(), desc="Procesando secuencias"):
        product_series = df_agg[df_agg['product_id'] == product_id]['tn_scaled'].values
        # Usamos datos hasta Oct 2019 para que el target sea Dic 2019
        series_for_training = product_series[:-2] 
        
        X_seq, y_seq = create_sequences(series_for_training, N_STEPS_IN, N_STEPS_OUT)
        if len(X_seq) > 0:
            all_X_train.append(X_seq)
            all_y_train.append(y_seq)
    
    X_train_final = np.concatenate(all_X_train)
    y_train_final = np.concatenate(all_y_train)
    X_train_final = X_train_final.reshape((X_train_final.shape[0], X_train_final.shape[1], 1))
    
    print(f"Set de entrenamiento creado con {len(X_train_final)} secuencias.")

    # --- Construir y Entrenar el Modelo CNN ---
    print("\nConstruyendo y entrenando el modelo CNN...")
    model = Sequential([
        Input(shape=(N_STEPS_IN, 1)),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(50, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mae')
    model.fit(X_train_final, y_train_final, epochs=50, batch_size=64, verbose=1)
    print("Modelo CNN entrenado.")
    
    # --- Generar Predicciones para Dic 2019 ---
    print("\nGenerando predicciones para Diciembre 2019...")
    final_preds = []
    
    for product_id in tqdm(df_agg['product_id'].unique(), desc="Prediciendo con CNN"):
        # Tomar los últimos 12 meses de datos (Dic 2018 a Nov 2019) para predecir Dic 2019
        product_series_scaled = df_agg[df_agg['product_id'] == product_id]['tn_scaled'].values
        input_sequence = product_series_scaled[-N_STEPS_IN-2:-2]
        
        if len(input_sequence) == N_STEPS_IN:
            input_seq_reshaped = input_sequence.reshape((1, N_STEPS_IN, 1))
            prediction_scaled = model.predict(input_seq_reshaped, verbose=0)
            
            # Revertir la escala para obtener el valor real
            prediction_unscaled = scaler.inverse_transform(prediction_scaled)
            final_preds.append({'product_id': product_id, 'tn_cnn': prediction_unscaled[0][0]})
        else:
            # Si no hay suficiente historia, predecir 0
            final_preds.append({'product_id': product_id, 'tn_cnn': 0})
            
    # Guardar resultados
    pd.DataFrame(final_preds).to_csv(OUTPUT_FILE, index=False)
    print(f"\n🎉 Predicción de CNN para Dic 2019 guardada en '{OUTPUT_FILE}'.")

--- Generando predicciones históricas con CNN (v2) ---
Escalando todos los datos de ventas...
Creando secuencias para la red neuronal...


Procesando secuencias: 100%|██████████| 780/780 [00:00<00:00, 11372.72it/s]

Set de entrenamiento creado con 11596 secuencias.

Construyendo y entrenando el modelo CNN...



2025-07-13 19:02:46.334864: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4 Pro
2025-07-13 19:02:46.334899: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 24.00 GB
2025-07-13 19:02:46.334905: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 8.00 GB
I0000 00:00:1752447766.334915  245417 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1752447766.334931  245417 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/50


2025-07-13 19:02:47.085632: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.1483
Epoch 2/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.1044
Epoch 3/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.1028
Epoch 4/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0966
Epoch 5/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.1017
Epoch 6/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.1039
Epoch 7/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0989
Epoch 8/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.1008
Epoch 9/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0955
Epoch 10/50
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0957


Prediciendo con CNN: 100%|██████████| 780/780 [00:14<00:00, 52.04it/s] 


🎉 Predicción de CNN para Dic 2019 guardada en 'preds_cnn_dec2019.csv'.





Script A Completo: Bake-off y Creación de Pendientes

In [7]:
# SCRIPT A: bakeoff_maestro_modificado.py (Corrección Final)

import pandas as pd
import numpy as np
import json
import os
from tqdm.auto import tqdm
import warnings

warnings.filterwarnings("ignore")

# ==============================================================================
# CONFIGURACIÓN
# ==============================================================================
INPUT_FILE = 'dataframe_final_exportado.csv'
CHAMPIONS_FILE = 'bakeoff_champions_map.json'
PENDING_ARIMA_FILE = 'productos_para_arima.json'
PARTIAL_SUBMISSION_FILE = 'submission_parcial.csv'

# ==============================================================================
# SCRIPT MAESTRO
# ==============================================================================
if __name__ == "__main__":
    # --- PASO 1: CARGAR TODO Y PREPARAR LA COMPETENCIA ---
    print("--- PASO 1: Cargando todas las predicciones históricas ---")
    
    df_original = pd.read_csv(INPUT_FILE)
    df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
    df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
    df_reales_dec2019 = df_agg[df_agg['periodo'] == '2019-12-01'][['product_id', 'tn']].rename(columns={'tn': 'actual'})

    df_bakeoff = df_reales_dec2019
    pred_files = {
        'LGBM': 'preds_lgbm_dec2019.csv', 'XGBoost': 'preds_xgb_dec2019.csv',
        'RandomForest': 'preds_rf_dec2019.csv', 'ARIMA': 'preds_arima_dec2019.csv',
        'AutoGluon': 'preds_autogluon_dec2019.csv', 'CNN': 'preds_cnn_dec2019.csv',
        'PlotTwist': 'preds_plottwist_dec2019.csv'
    }
    
    for model_name, file_name in pred_files.items():
        try:
            df_pred = pd.read_csv(file_name)
            df_pred.rename(columns={df_pred.columns[1]: model_name}, inplace=True)
            df_bakeoff = pd.merge(df_bakeoff, df_pred, on='product_id', how='left')
        except FileNotFoundError:
            print(f"Advertencia: No se encontró {file_name}. Se omitirá {model_name}.")
            df_bakeoff[model_name] = 0

    df_clasicos = pd.read_csv('preds_clasicos_dec2019.csv').rename(columns={'pred_ets': 'ETS', 'pred_prophet': 'Prophet'})
    df_bakeoff = pd.merge(df_bakeoff, df_clasicos, on='product_id', how='left')
    df_bakeoff.fillna(0, inplace=True)
    
    # --- PASO 2: ELEGIR AL CAMPEÓN PARA CADA PRODUCTO ---
    print("\n--- PASO 2: Realizando Bake-off para encontrar el mejor modelo por producto ---")
    
    if os.path.exists(CHAMPIONS_FILE):
        with open(CHAMPIONS_FILE, 'r') as f: champions_map = json.load(f)
        champions_map = {int(k): v for k, v in champions_map.items()}
        print(f"Mapa de campeones cargado. {len(champions_map)} productos ya procesados.")
    else:
        champions_map = {}

    model_cols = [col for col in df_bakeoff.columns if col not in ['product_id', 'actual']]
    
    for _, row in tqdm(df_bakeoff.iterrows(), total=len(df_bakeoff), desc="Eligiendo campeones"):
        product_id = int(row['product_id'])
        if product_id in champions_map: continue
            
        errors = {model_name: abs(row['actual'] - row[model_name]) for model_name in model_cols if model_name in row}
        if not errors: continue
        best_model = min(errors, key=errors.get)
        champions_map[product_id] = best_model
        
    with open(CHAMPIONS_FILE, 'w') as f: json.dump(champions_map, f)
        
    print("Bake-off completado. Mapa de campeones guardado.")
    print("\nDistribución de modelos campeones:")
    print(pd.Series(champions_map).value_counts())

    # --- PASO 3: GENERAR PREDICCIÓN FINAL (PARCIAL) ---
    print("\n--- PASO 3: Generando predicción parcial y lista de pendientes para ARIMA ---")
    
    # --- INICIO DE LA CORRECCIÓN ---
    # Cargar y renombrar las columnas CORRECTAMENTE antes de unir
    df_ag_final = pd.read_csv('predicciones_febrero2020_fecha_01_07_v2.csv').rename(columns={'tn': 'tn_autogluon'})
    df_pt_final = pd.read_csv('submission_plot_twist.csv').rename(columns={'tn': 'tn_plottwist'})
    # --- FIN DE LA CORRECCIÓN ---
    
    df_ensemble = pd.merge(df_ag_final, df_pt_final, on='product_id')
    df_ensemble['Ensemble'] = df_ensemble['tn_autogluon'] * 0.6 + df_ensemble['tn_plottwist'] * 0.4
    ensemble_map = df_ensemble.set_index('product_id')['Ensemble']

    final_predictions = []
    products_for_arima = []

    for product_id, best_model_name in tqdm(champions_map.items(), desc="Generando predicción parcial"):
        if best_model_name == 'ARIMA':
            products_for_arima.append(product_id)
            final_predictions.append({'product_id': product_id, 'tn': np.nan})
            continue

        final_pred = ensemble_map.get(product_id, 0)
        final_predictions.append({'product_id': product_id, 'tn': final_pred})

    with open(PENDING_ARIMA_FILE, 'w') as f:
        json.dump(products_for_arima, f)
    print(f"Se guardó la lista de {len(products_for_arima)} productos para ARIMA en '{PENDING_ARIMA_FILE}'.")

    submission_df = pd.DataFrame(final_predictions).sort_values('product_id')
    submission_df.to_csv(PARTIAL_SUBMISSION_FILE, index=False)
    print(f"Sumisión parcial guardada en '{PARTIAL_SUBMISSION_FILE}'.")

--- PASO 1: Cargando todas las predicciones históricas ---

--- PASO 2: Realizando Bake-off para encontrar el mejor modelo por producto ---
Mapa de campeones cargado. 780 productos ya procesados.


Eligiendo campeones: 100%|██████████| 780/780 [00:00<00:00, 127825.16it/s]


Bake-off completado. Mapa de campeones guardado.

Distribución de modelos campeones:
AutoGluon       156
ARIMA            92
RandomForest     91
ETS              91
PlotTwist        83
LGBM             77
Prophet          74
CNN              58
XGBoost          58
Name: count, dtype: int64

--- PASO 3: Generando predicción parcial y lista de pendientes para ARIMA ---


Generando predicción parcial: 100%|██████████| 780/780 [00:00<00:00, 900759.12it/s]

Se guardó la lista de 92 productos para ARIMA en 'productos_para_arima.json'.
Sumisión parcial guardada en 'submission_parcial.csv'.





In [1]:
# SCRIPT C: ensamblar_final.py

import pandas as pd

print("--- Ensamblando resultados finales ---")

PARTIAL_SUBMISSION_FILE = 'submission_parcial.csv'
ARIMA_SUBMISSION_FILE = 'submission_only_arima.csv'
FINAL_OUTPUT_FILE = 'submission_GRAND_BAKEOFF_FINAL.csv'

# Cargar la sumisión parcial y las predicciones de ARIMA
df_parcial = pd.read_csv(PARTIAL_SUBMISSION_FILE)
df_arima = pd.read_csv(ARIMA_SUBMISSION_FILE)

# Crear un mapa de las predicciones de ARIMA
arima_map = df_arima.set_index('product_id')['tn_arima']

# Rellenar los valores faltantes en la sumisión parcial usando el mapa de ARIMA
# .fillna() solo rellenará los valores que son NaN
df_parcial['tn'] = df_parcial['tn'].fillna(df_parcial['product_id'].map(arima_map))

# Asegurarse de que no queden nulos por si acaso
df_parcial.fillna(0, inplace=True)

# Guardar la sumisión final y completa
df_parcial.to_csv(FINAL_OUTPUT_FILE, index=False)

print(f"\n🎉 ¡SUMISIÓN FINAL CREADA! Revisa el archivo '{FINAL_OUTPUT_FILE}'.")

--- Ensamblando resultados finales ---

🎉 ¡SUMISIÓN FINAL CREADA! Revisa el archivo 'submission_GRAND_BAKEOFF_FINAL.csv'.


Ensamble Ponderado Dinámico Seguerido por Ariel

In [2]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# ==============================================================================
# CONFIGURACIÓN
# ==============================================================================
INPUT_FILE = 'dataframe_final_exportado.csv'
OUTPUT_FILE = 'submission_DYNAMIC_ENSEMBLE_v2.csv'
EPSILON = 1.0 

# Archivos de predicciones FINALES (para Feb 2020)
AG_FINAL_PREDS_FILE = 'predicciones_febrero2020_fecha_01_07_v2.csv'
PT_FINAL_PREDS_FILE = 'submission_plot_twist.csv'

# ==============================================================================
# SCRIPT DE ENSAMBLE DINÁMICO
# ==============================================================================
if __name__ == "__main__":
    # --- PASO 1: GENERAR Y CARGAR PREDICCIONES HISTÓRICAS (Dic 2019) ---
    print("--- PASO 1: Preparando el Bake-off histórico ---")
    
    # Cargar datos reales y predicciones de los modelos principales
    df_original = pd.read_csv(INPUT_FILE)
    df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
    df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
    df_reales_dec2019 = df_agg[df_agg['periodo'] == '2019-12-01'][['product_id', 'tn']].rename(columns={'tn': 'actual'})
    
    df_ag_hist = pd.read_csv('preds_autogluon_dec2019.csv').rename(columns={'tn_autogluon': 'AutoGluon'})
    df_pt_hist = pd.read_csv('preds_plottwist_dec2019.csv').rename(columns={'tn_plottwist': 'PlotTwist'})
    
    # Generar predicción del baseline "Promedio 3 Meses"
    df_baseline_hist = df_agg[df_agg['periodo'].between('2019-09-01', '2019-11-01')]
    df_baseline_hist = df_baseline_hist.groupby('product_id')['tn'].mean().reset_index().rename(columns={'tn': 'Baseline'})
    
    # Unir todo para la competencia
    df_bakeoff = pd.merge(df_reales_dec2019, df_ag_hist, on='product_id', how='left')
    df_bakeoff = pd.merge(df_bakeoff, df_pt_hist, on='product_id', how='left')
    df_bakeoff = pd.merge(df_bakeoff, df_baseline_hist, on='product_id', how='left')
    df_bakeoff.fillna(0, inplace=True)

    # --- PASO 2: CALCULAR LOS PESOS DINÁMICOS ---
    print("\n--- PASO 2: Calculando pesos dinámicos para cada producto ---")
    
    model_cols = ['AutoGluon', 'PlotTwist', 'Baseline']
    weight_cols = []

    for model in model_cols:
        df_bakeoff[f'error_{model}'] = (df_bakeoff['actual'] - df_bakeoff[model]).abs()
        weight_col_name = f'w_{model}'
        weight_cols.append(weight_col_name)
        df_bakeoff[weight_col_name] = 1 / (df_bakeoff[f'error_{model}'] + EPSILON)
    
    df_bakeoff['total_weight'] = df_bakeoff[weight_cols].sum(axis=1)
    for w_col in weight_cols:
        df_bakeoff[w_col] = df_bakeoff[w_col] / df_bakeoff['total_weight']
        
    df_weights = df_bakeoff[['product_id'] + weight_cols]
    print("Pesos dinámicos calculados.")
    
    # --- PASO 3: APLICAR PESOS A LAS PREDICCIONES FINALES ---
    print("\n--- PASO 3: Aplicando pesos dinámicos a las predicciones de Feb 2020 ---")
    
    df_ag_final = pd.read_csv(AG_FINAL_PREDS_FILE).rename(columns={'tn': 'AutoGluon'})
    df_pt_final = pd.read_csv(PT_FINAL_PREDS_FILE).rename(columns={'tn': 'PlotTwist'})
    
    # Generar predicción final del baseline (promedio de los últimos 3 meses reales)
    df_baseline_final = df_agg[df_agg['periodo'].between('2019-10-01', '2019-12-01')]
    df_baseline_final = df_baseline_final.groupby('product_id')['tn'].mean().reset_index().rename(columns={'tn': 'Baseline'})

    # Unir todas las predicciones finales
    df_final_preds = pd.merge(df_weights, df_ag_final, on='product_id', how='left')
    df_final_preds = pd.merge(df_final_preds, df_pt_final, on='product_id', how='left')
    df_final_preds = pd.merge(df_final_preds, df_baseline_final, on='product_id', how='left')
    df_final_preds.fillna(0, inplace=True)
    
    # Calcular la predicción final ponderada
    df_final_preds['tn'] = (df_final_preds['AutoGluon'] * df_final_preds['w_AutoGluon'] +
                            df_final_preds['PlotTwist'] * df_final_preds['w_PlotTwist'] +
                            df_final_preds['Baseline'] * df_final_preds['w_Baseline'])

    # --- PASO 4: GUARDAR SUMISIÓN ---
    submission_df = df_final_preds[['product_id', 'tn']]
    submission_df['tn'] = submission_df['tn'].clip(lower=0)
    submission_df.to_csv(OUTPUT_FILE, index=False)
    
    print(f"\n🎉 ¡Pipeline de Ensamble Dinámico completado! Revisa el archivo '{OUTPUT_FILE}'.")

--- PASO 1: Preparando el Bake-off histórico ---

--- PASO 2: Calculando pesos dinámicos para cada producto ---
Pesos dinámicos calculados.

--- PASO 3: Aplicando pesos dinámicos a las predicciones de Feb 2020 ---

🎉 ¡Pipeline de Ensamble Dinámico completado! Revisa el archivo 'submission_DYNAMIC_ENSEMBLE_v2.csv'.


Ensamble Ponderado Dinámico Seguerido por Ariel v2

In [3]:
# SCRIPT: ENSAMBLE DINÁMICO DEFINITIVO

import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

# ==============================================================================
# CONFIGURACIÓN
# ==============================================================================
INPUT_FILE = 'dataframe_final_exportado.csv'
OUTPUT_FILE = 'submission_ULTIMATE_DYNAMIC_ENSEMBLE.csv'
EPSILON = 1.0 

# ==============================================================================
# SCRIPT DE ENSAMBLE DINÁMICO
# ==============================================================================
if __name__ == "__main__":
    # --- PASO 1: CALCULAR PESOS DINÁMICOS BASADOS EN EL BAKE-OFF ---
    print("--- PASO 1: Calculando pesos dinámicos a partir del Bake-off ---")
    
    # Cargar datos reales de Dic 2019
    df_original = pd.read_csv(INPUT_FILE)
    df_original['periodo'] = pd.to_datetime(df_original['periodo'], format='%Y%m')
    df_agg = df_original.groupby(['product_id', 'periodo'])['tn'].sum().reset_index()
    df_reales_dec2019 = df_agg[df_agg['periodo'] == '2019-12-01'][['product_id', 'tn']].rename(columns={'tn': 'actual'})

    # Cargar y unir todas las predicciones históricas
    df_bakeoff = df_reales_dec2019
    model_cols = ['AutoGluon', 'ARIMA', 'RandomForest', 'ETS', 'PlotTwist', 'LGBM', 'Prophet', 'CNN', 'XGBoost']
    hist_pred_files = {
        'LGBM': 'preds_lgbm_dec2019.csv', 'XGBoost': 'preds_xgb_dec2019.csv',
        'RandomForest': 'preds_rf_dec2019.csv', 'ARIMA': 'preds_arima_dec2019.csv',
        'AutoGluon': 'preds_autogluon_dec2019.csv', 'CNN': 'preds_cnn_dec2019.csv',
        'PlotTwist': 'preds_plottwist_dec2019.csv'
    }
    for model_name, file_name in hist_pred_files.items():
        df_pred = pd.read_csv(file_name); df_pred.rename(columns={df_pred.columns[1]: model_name}, inplace=True)
        df_bakeoff = pd.merge(df_bakeoff, df_pred, on='product_id', how='left')
    df_clasicos = pd.read_csv('preds_clasicos_dec2019.csv').rename(columns={'pred_ets': 'ETS', 'pred_prophet': 'Prophet'})
    df_bakeoff = pd.merge(df_bakeoff, df_clasicos, on='product_id', how='left')
    df_bakeoff.fillna(0, inplace=True)

    # Calcular pesos
    weight_cols = []
    for model in model_cols:
        if model in df_bakeoff.columns:
            df_bakeoff[f'error_{model}'] = (df_bakeoff['actual'] - df_bakeoff[model]).abs()
            weight_col_name = f'w_{model}'; weight_cols.append(weight_col_name)
            df_bakeoff[weight_col_name] = 1 / (df_bakeoff[f'error_{model}'] + EPSILON)
    
    df_bakeoff['total_weight'] = df_bakeoff[weight_cols].sum(axis=1)
    for w_col in weight_cols: df_bakeoff[w_col] = df_bakeoff[w_col] / df_bakeoff['total_weight']
    df_weights = df_bakeoff[['product_id'] + weight_cols]
    print("Pesos dinámicos calculados.")

    # --- PASO 2: APLICAR PESOS A LAS PREDICCIONES FINALES ---
    print("\n--- PASO 2: Aplicando pesos a las predicciones de Feb 2020 ---")
    
    # Cargar todas las predicciones FINALES (sumisiones)
    # NOTA: Debes generar un archivo de sumisión final para cada modelo.
    # Aquí se asume que los tienes nombrados como 'submission_MODELO.csv'
    df_final_preds = df_weights
    for model in model_cols:
        try:
            # Asumimos que tienes archivos como submission_LGBM.csv, submission_ARIMA.csv, etc.
            # Aquí usamos tus dos archivos conocidos como ejemplo
            if model == 'AutoGluon':
                df_model_final = pd.read_csv('predicciones_febrero2020_fecha_01_07_v2.csv').rename(columns={'tn': model})
            elif model == 'PlotTwist':
                df_model_final = pd.read_csv('submission_plot_twist.csv').rename(columns={'tn': model})
            else:
                 # DEBES REEMPLAZAR ESTO CON LA CARGA DE TUS OTROS ARCHIVOS DE SUMISIÓN
                print(f"Cargando predicción proxy para {model}")
                df_model_final = pd.read_csv('predicciones_febrero2020_fecha_01_07_v2.csv').rename(columns={'tn': model})

            df_final_preds = pd.merge(df_final_preds, df_model_final, on='product_id', how='left')
        except FileNotFoundError:
            print(f"Advertencia: No se encontró el archivo de sumisión final para {model}. Se usará 0.")
            df_final_preds[model] = 0

    df_final_preds.fillna(0, inplace=True)
    
    # Calcular la predicción final ponderada
    df_final_preds['tn'] = 0
    for model in model_cols:
        if f'w_{model}' in df_final_preds.columns and model in df_final_preds.columns:
            df_final_preds['tn'] += df_final_preds[model] * df_final_preds[f'w_{model}']

    # --- PASO 3: GUARDAR SUMISIÓN ---
    submission_df = df_final_preds[['product_id', 'tn']]
    submission_df['tn'] = submission_df['tn'].clip(lower=0)
    submission_df.to_csv(OUTPUT_FILE, index=False)
    
    print(f"\n🎉 ¡Pipeline de Ensamble Dinámico Definitivo completado! Revisa el archivo '{OUTPUT_FILE}'.")

--- PASO 1: Calculando pesos dinámicos a partir del Bake-off ---
Pesos dinámicos calculados.

--- PASO 2: Aplicando pesos a las predicciones de Feb 2020 ---
Cargando predicción proxy para ARIMA
Cargando predicción proxy para RandomForest
Cargando predicción proxy para ETS
Cargando predicción proxy para LGBM
Cargando predicción proxy para Prophet
Cargando predicción proxy para CNN
Cargando predicción proxy para XGBoost

🎉 ¡Pipeline de Ensamble Dinámico Definitivo completado! Revisa el archivo 'submission_ULTIMATE_DYNAMIC_ENSEMBLE.csv'.
