In [1]:
import os
import pandas as pd
import pickle
import torch
import wandb
from utils.modeling import make_model
# Configure Weights & Biases through environment variables before any run starts.
os.environ.update({
    "WANDB_QUIET": "true",
    "WANDB_NOTEBOOK_NAME": "dynamic_opt_data_prep.ipynb",
})

# Load the raw training table and preview its first rows.
all_data = pd.read_csv('./data/bimbo/train.csv')
all_data.head()

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3301,15766,1212,3,25.14,0,0.0,3
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0,4
3,3,1110,7,3301,15766,1240,4,33.52,0,0.0,4
4,3,1110,7,3301,15766,1242,3,22.92,0,0.0,3


In [2]:
# Number of rows per week (Semana), ordered by week number.
all_data["Semana"].value_counts().sort_index()

3    11165207
4    11009593
5    10615397
6    10191837
7    10382849
8    10406868
9    10408713
Name: Semana, dtype: int64

In [3]:
# Week (Semana) bounds for the two data splits; both ends are treated as
# inclusive where the splits are built.
# Weeks 3-4 go to the predictive-model split, weeks 5-9 to the decision split.
MIN_ML_MODEL_WEEK, MAX_ML_MODEL_WEEK = 3, 4
MIN_DECISION_MODEL_WEEK, MAX_DECISION_MODEL_WEEK = 5, 9

In [4]:
# Columns that together identify one store/product combination.
store_product_group_cols = [
    'Agencia_ID',
    'Canal_ID',
    'Ruta_SAK',
    'Cliente_ID',
    'Producto_ID',
]
# Number of rows recorded for each combination.
store_product_value_counts = all_data.groupby(by=store_product_group_cols).size()

In [5]:
# Summary statistics of the per-combination row counts computed above.
store_product_value_counts.describe()

count    2.639665e+07
mean     2.810223e+00
std      1.964561e+00
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      4.000000e+00
max      7.000000e+00
dtype: float64

In [6]:
# Keep only the store/product combinations that have 7 rows, i.e. the maximum
# per-combination count (the data spans 7 weeks).
full_filled_cases = (store_product_value_counts == 7)

# `full_filled_cases` is indexed by the unique group keys, whereas the indexed
# frame has one row per observation. Indexing the frame with the Series directly
# makes pandas reindex the mask implicitly and emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
# Align the mask explicitly so each row gets the flag of its own group.
indexed_data = all_data.set_index(store_product_group_cols)
row_mask = full_filled_cases.reindex(indexed_data.index).to_numpy()

# Non-inplace reset_index keeps the cell idempotent when it is re-run.
full_filled_data = indexed_data[row_mask].reset_index()

  full_filled_data = all_data.set_index(store_product_group_cols)[full_filled_cases]


In [7]:
# (rows, columns) of the filtered frame that keeps only fully observed combinations.
full_filled_data.shape

(17606645, 11)

In [8]:
# Split the fully observed rows by week. `between` is inclusive on both ends,
# matching the `>=` / `<=` bounds of each week range.
prediction_data = full_filled_data[
    full_filled_data["Semana"].between(MIN_ML_MODEL_WEEK, MAX_ML_MODEL_WEEK)
]
decision_data = full_filled_data[
    full_filled_data["Semana"].between(MIN_DECISION_MODEL_WEEK, MAX_DECISION_MODEL_WEEK)
]

# Build the model and its encoder from the prediction split only.
model, encoder = make_model(prediction_data, run_name="prediction_model")

# Persist both artifacts next to the notebook.
torch.save(model, 'predictive_model.pt')
# NOTE(review): the file name is misspelled ("catgeorical"); it is kept as-is
# because other code may load this exact name - confirm before renaming.
with open('catgeorical_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)


In [9]:
# Write both splits to parquet; each path is named once so the files written
# and the files attached to the artifact cannot drift apart.
prediction_data_path = './data/bimbo/prediction_data.parquet'
decision_data_path = './data/bimbo/decision_data.parquet'
prediction_data.to_parquet(prediction_data_path)
decision_data.to_parquet(decision_data_path)

wandb_project = "decision_opt_bimbo"
# `project` must be passed by keyword: the first positional parameter of
# wandb.init is not the project name, so a positional call starts the run
# with the wrong setting instead of in the intended project.
with wandb.init(project=wandb_project) as run:
    # Bundle both parquet files into a single dataset artifact.
    dynamic_optimization_artifact = wandb.Artifact('dynamic_optimization_data', type='dataset')
    dynamic_optimization_artifact.add_file(prediction_data_path, name='prediction_data.parquet')
    dynamic_optimization_artifact.add_file(decision_data_path, name='decision_data.parquet')
    run.log_artifact(dynamic_optimization_artifact)

