In [2]:

import pandas as pd
import numpy as np
import pickle
from datetime import datetime
from pathlib import Path
import sys
# Prepend the parent directory to the module search path so the `src` package
# imported on the next lines can be resolved. Note that ".." is interpreted
# relative to the kernel's current working directory, not this notebook's file.
sys.path.insert(0, "..")   
from src.utils import load_X_y_data, build_feature_and_label_cols
from src.model_utils import generate_loaders



In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [4]:
# Current local date rendered as YYYY-MM-DD; shown as the cell output.
today_str = f"{datetime.today():%Y-%m-%d}"
today_str

'2025-07-17'

In [5]:
# Read the training parquet through load_X_y_data (window_size=1) and preview
# the first rows of the resulting frame.
train_parquet_path = Path(
    "../output/data/train_top_store_15_item_clusters_sales_cyclical_features_X_1_day_y.parquet"
)
df = load_X_y_data(train_parquet_path, window_size=1, log_level="DEBUG")
df.head()

2025-07-17 10:09:44,071 - INFO - Loading data from ../output/data/train_top_store_15_item_clusters_sales_cyclical_features_X_1_day_y.parquet
2025-07-17 10:09:44,432 - INFO - Loaded data with shape (3948, 22)


Unnamed: 0,start_date,store_item,store_cluster,item_cluster,sales_day_1,store_med_day_1,item_med_day_1,store_med_change_1,item_med_change_1,store_cluster_logpct_change_1,...,dayofweek_cos_1,weekofmonth_sin_1,weekofmonth_cos_1,monthofyear_sin_1,monthofyear_cos_1,paycycle_sin_1,paycycle_cos_1,season_sin_1,season_cos_1,y_sales_day_1
0,2014-01-01,10_1038962,0,0,0.0,0.0,0.0,,,,...,-0.222521,0.951057,0.309017,0.5,0.866025,0.406737,0.913545,-0.9741,0.226116,0.0
1,2014-01-02,10_1038962,0,0,0.0,0.0,0.0,,,,...,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285,0.0
2,2014-01-03,10_1038962,0,0,0.0,0.0,0.0,,,,...,-0.900969,0.951057,0.309017,0.5,0.866025,0.951057,0.309017,-0.96574,0.259512,0.0
3,2014-01-04,10_1038962,0,0,0.0,0.0,0.0,,,,...,-0.222521,0.951057,0.309017,0.5,0.866025,0.994522,-0.104529,-0.96113,0.276097,0.0
4,2014-01-05,10_1038962,0,0,0.0,0.0,0.0,,,,...,0.62349,0.951057,0.309017,0.5,0.866025,0.866025,-0.5,-0.956235,0.2926,0.0


In [11]:
# Inspect the dtype of every column in the loaded frame.
df.dtypes


start_date                       datetime64[ns]
store_item                               object
store_cluster                            object
item_cluster                             object
sales_day_1                             float32
store_med_day_1                         float32
item_med_day_1                          float32
store_med_change_1                      float64
item_med_change_1                       float64
store_cluster_logpct_change_1           float64
item_cluster_logpct_change_1            float64
dayofweek_sin_1                         float32
dayofweek_cos_1                         float32
weekofmonth_sin_1                       float32
weekofmonth_cos_1                       float32
monthofyear_sin_1                       float32
monthofyear_cos_1                       float32
paycycle_sin_1                          float32
paycycle_cos_1                          float32
season_sin_1                            float32
season_cos_1                            

In [18]:
# Write the loaded frame to CSV, omitting the index column.
train_csv_path = Path(
    "../output/data/train_top_store_15_item_clusters_sales_cyclical_features_X_1_day_y.csv"
)
df.to_csv(train_csv_path, index=False)

In [6]:
# Unpack the five values returned by build_feature_and_label_cols for window_size=1.
(
    meta_cols,
    x_sales_features,
    x_cyclical_features,
    x_feature_cols,
    label_cols,
) = build_feature_and_label_cols(window_size=1)

In [7]:
# Display the sales feature names unpacked from build_feature_and_label_cols.
x_sales_features

['sales_day_1',
 'store_med_day_1',
 'item_med_day_1',
 'store_med_change_1',
 'item_med_change_1',
 'store_cluster_logpct_change_1',
 'item_cluster_logpct_change_1']

In [8]:
# Show only the sales feature columns of the loaded frame.
df[x_sales_features]

Unnamed: 0,sales_day_1,store_med_day_1,item_med_day_1,store_med_change_1,item_med_change_1,store_cluster_logpct_change_1,item_cluster_logpct_change_1
0,0.0,0.0,0.0,,,,
1,0.0,0.0,0.0,,,,
2,0.0,0.0,0.0,,,,
3,0.0,0.0,0.0,,,,
4,0.0,0.0,0.0,,,,
...,...,...,...,...,...,...,...
3943,0.0,0.0,0.0,,,,
3944,0.0,0.0,0.0,,,,
3945,0.0,0.0,0.0,,,,
3946,0.0,0.0,0.0,,,,


In [9]:
# Display the cyclical feature names unpacked from build_feature_and_label_cols.
x_cyclical_features

['dayofweek_sin_1',
 'dayofweek_cos_1',
 'weekofmonth_sin_1',
 'weekofmonth_cos_1',
 'monthofyear_sin_1',
 'monthofyear_cos_1',
 'paycycle_sin_1',
 'paycycle_cos_1',
 'season_sin_1',
 'season_cos_1']

In [16]:
# Output locations handed to generate_loaders: one directory for the loader
# artifacts, one for the scaler artifacts.
output_data_dir = Path("../output/data")
dataloader_dir = output_data_dir / "dataloader"
scalers_dir = output_data_dir / "scalers"

# The first eight arguments are passed positionally, so their order must be
# preserved. The call is left as the last expression so its return value is
# displayed as the cell output.
generate_loaders(
    df,
    meta_cols,
    x_feature_cols,
    x_sales_features,
    x_cyclical_features,
    label_cols,
    scalers_dir,
    dataloader_dir,
    window_size=1, batch_size=32, num_workers=5, log_level="DEBUG",
)
    

2025-07-17 10:35:29,847 - INFO - Preparing global loaders from 3948 samples
2025-07-17 10:35:29,847 - INFO - Processing 3947 windows
Processing windows: 100%|██████████| 3947/3947 [00:14<00:00, 276.26window/s]
2025-07-17 10:35:44,517 - INFO - Saving train loader to ../output/data/dataloader/2025-07-17_train_loader.pt
2025-07-17 10:35:44,519 - INFO - Saving val loader to ../output/data/dataloader/2025-07-17_val_loader.pt
2025-07-17 10:35:44,522 - INFO - Saving x_sales_scaler to ../output/data/scalers/2025-07-17_x_sales_scaler.pkl
2025-07-17 10:35:44,524 - INFO - Saving x_cyc_scaler to ../output/data/scalers/2025-07-17_x_cyc_scaler.pkl
2025-07-17 10:35:44,525 - INFO - Saving y_scaler to ../output/data/scalers/2025-07-17_y_scaler.pkl
2025-07-17 10:35:44,526 - INFO - Saving train meta to ../output/data/dataloader/2025-07-17_train_meta.parquet
2025-07-17 10:35:44,534 - INFO - Saving val meta to ../output/data/dataloader/2025-07-17_val_meta.parquet
2025-07-17 10:35:44,541 - INFO - Saved load

(<torch.utils.data.dataloader.DataLoader at 0x1509ec7a0>,
 <torch.utils.data.dataloader.DataLoader at 0x14fe51e80>)