In [1]:
import numpy as np
import pandas as pd

In [3]:
import sys
sys.path.insert(0, "..")   
from src.utils import (
generate_cyclical_features, 
generate_sales_features, 
add_y_targets_from_shift,
prepare_training_data_from_raw_df,
build_feature_and_label_cols,
)

In [5]:
%load_ext autoreload
%autoreload 2

In [4]:
dtype_dict={"id":np.uint32,
            "store_nbr":np.uint8,
            "item_nbr":np.uint32,
            "unit_sales":np.float32
           }
df = pd.read_csv("../output/data/20250627_train_top_store_500_item.csv", dtype=dtype_dict)
df.rename(columns={"store_nbr": "store", "item_nbr": "item"}, inplace=True)
df["store_item"] = df["store"].astype(str) + "_" + df["item"].astype(str)
cols = ['date', 'store_item', 'store', 'item'] + [c for c in df.columns 
                                  if c not in ('date', 'store_item', 'store', 'item')]
df = df[cols]
df["date"] = pd.to_datetime(df["date"])
df.sort_values("date", inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(5)

  df = pd.read_csv("../output/data/20250627_train_top_store_500_item.csv", dtype=dtype_dict)


Unnamed: 0,date,store_item,store,item,unit_sales,onpromotion
0,2013-01-01,25_108786,25,108786,3.0,
1,2013-01-01,25_769314,25,769314,3.0,
2,2013-01-01,25_771156,25,771156,1.0,
3,2013-01-01,25_783243,25,783243,5.0,
4,2013-01-01,25_801217,25,801217,3.0,


In [6]:
cluster_df = pd.read_csv("../output/data/20250629_cluster_df.csv")
cluster_df


Unnamed: 0,store,item,store_cluster,item_cluster
0,16,215352,0,0
1,16,220435,0,0
2,16,222879,0,0
3,16,265559,0,0
4,16,273528,0,0
...,...,...,...,...
26995,48,1149579,2,7
26996,48,1157561,2,7
26997,48,1165988,2,7
26998,48,1165989,2,7


In [7]:
sales_df = generate_sales_features(df.query("store_item == '3_114790'"),
                                   window_size=16,
                                   cluster_df=cluster_df,
                                   calendar_aligned=True,
                                   debug=True,
                                   debug_fn="../output/data/20250629_debug_cluster_mappings.csv")   
sales_df.head()

Unnamed: 0,start_date,store_item,store,item,storeClusterId,itemClusterId,sales_day_1,sales_day_2,sales_day_3,sales_day_4,...,item_med_day_7,item_med_day_8,item_med_day_9,item_med_day_10,item_med_day_11,item_med_day_12,item_med_day_13,item_med_day_14,item_med_day_15,item_med_day_16
0,2013-01-09,3_114790,3,114790,1,3,13.0,16.0,15.0,20.0,...,14.0,22.0,19.0,23.0,27.0,35.0,21.0,14.0,17.0,14.0
1,2013-01-25,3_114790,3,114790,1,3,17.0,18.0,37.0,13.0,...,18.0,18.0,33.0,36.0,31.0,18.0,25.0,17.0,20.0,23.0
2,2013-02-10,3_114790,3,114790,1,3,15.0,15.0,17.0,10.0,...,30.0,26.0,22.0,28.0,29.0,19.0,18.0,28.0,19.0,11.0
3,2013-02-26,3_114790,3,114790,1,3,16.0,16.0,14.0,22.0,...,24.0,18.0,24.0,17.0,19.0,38.0,35.0,23.0,20.0,20.0
4,2013-03-14,3_114790,3,114790,1,3,11.0,29.0,33.0,63.0,...,36.0,17.0,21.0,27.0,33.0,21.0,18.0,26.0,21.0,20.0


In [8]:
sales_df.columns.to_list()

['start_date',
 'store_item',
 'store',
 'item',
 'storeClusterId',
 'itemClusterId',
 'sales_day_1',
 'sales_day_2',
 'sales_day_3',
 'sales_day_4',
 'sales_day_5',
 'sales_day_6',
 'sales_day_7',
 'sales_day_8',
 'sales_day_9',
 'sales_day_10',
 'sales_day_11',
 'sales_day_12',
 'sales_day_13',
 'sales_day_14',
 'sales_day_15',
 'sales_day_16',
 'store_med_day_1',
 'store_med_day_2',
 'store_med_day_3',
 'store_med_day_4',
 'store_med_day_5',
 'store_med_day_6',
 'store_med_day_7',
 'store_med_day_8',
 'store_med_day_9',
 'store_med_day_10',
 'store_med_day_11',
 'store_med_day_12',
 'store_med_day_13',
 'store_med_day_14',
 'store_med_day_15',
 'store_med_day_16',
 'item_med_day_1',
 'item_med_day_2',
 'item_med_day_3',
 'item_med_day_4',
 'item_med_day_5',
 'item_med_day_6',
 'item_med_day_7',
 'item_med_day_8',
 'item_med_day_9',
 'item_med_day_10',
 'item_med_day_11',
 'item_med_day_12',
 'item_med_day_13',
 'item_med_day_14',
 'item_med_day_15',
 'item_med_day_16']

In [9]:
sales_df.shape

(105, 54)

In [10]:
cyc_df = generate_cyclical_features(df.query("store_item == '3_114790'"), 
                                    window_size=16,
                                    calendar_aligned=True,
                                    cluster_df=cluster_df,
                                    debug=True,
                                    debug_fn="../output/data/20250629_debug_cluster_mappings.csv")
cyc_df.head()

Unnamed: 0,start_date,store_item,store,item,storeClusterId,itemClusterId,dayofweek_sin_1,dayofweek_cos_1,weekofmonth_sin_1,weekofmonth_cos_1,...,dayofweek_sin_16,dayofweek_cos_16,weekofmonth_sin_16,weekofmonth_cos_16,monthofyear_sin_16,monthofyear_cos_16,paycycle_sin_16,paycycle_cos_16,season_sin_16,season_cos_16
0,2013-01-09,3_114790,3,114790,1,3,0.974928,-0.222521,0.587785,-0.809017,...,0.433884,-0.900969,-0.9510565,0.309017,0.5,0.8660254,-0.382683,-0.92388,-0.811539,0.584298
1,2013-01-25,3_114790,3,114790,1,3,-0.433884,-0.900969,-0.951057,0.309017,...,-0.974928,-0.222521,0.5877853,-0.809017,0.866025,0.5,-0.587785,-0.809017,-0.622047,0.78298
2,2013-02-10,3_114790,3,114790,1,3,-0.781831,0.62349,0.587785,-0.809017,...,0.0,1.0,-0.9510565,0.309017,0.866025,0.5,-0.992709,0.120537,-0.385663,0.92264
3,2013-02-26,3_114790,3,114790,1,3,0.781831,0.62349,-0.951057,0.309017,...,0.974928,-0.222521,0.5877853,-0.809017,1.0,6.123234000000001e-17,-0.743145,0.669131,-0.120208,0.992749
4,2013-03-14,3_114790,3,114790,1,3,0.433884,-0.900969,0.587785,-0.809017,...,-0.433884,-0.900969,-2.449294e-16,1.0,1.0,6.123234000000001e-17,-0.707107,0.707107,0.154309,0.988023


In [11]:
cyc_df.shape

(105, 166)

In [12]:
# Define the keys used for merging
merge_keys = ["start_date", "store_item", "store", "item"]

# Identify missing rows from non_overlap_df2 that are not in non_overlap_df
missing_from_df = sales_df.merge(
    cyc_df[merge_keys],
    on=merge_keys,
    how="left",
    indicator=True
).query('_merge == "left_only"')

# Optional: drop the merge indicator column
missing_from_df = missing_from_df.drop(columns="_merge")

# Show how many are missing
print(f"Missing rows in non_overlap_df: {missing_from_df.shape[0]}")


Missing rows in non_overlap_df: 0


In [13]:
merged_df = pd.merge(sales_df, cyc_df, on=["store_item", "start_date", "store", "item","storeClusterId","itemClusterId" ])
merged_df.head()

Unnamed: 0,start_date,store_item,store,item,storeClusterId,itemClusterId,sales_day_1,sales_day_2,sales_day_3,sales_day_4,...,dayofweek_sin_16,dayofweek_cos_16,weekofmonth_sin_16,weekofmonth_cos_16,monthofyear_sin_16,monthofyear_cos_16,paycycle_sin_16,paycycle_cos_16,season_sin_16,season_cos_16
0,2013-01-09,3_114790,3,114790,1,3,13.0,16.0,15.0,20.0,...,0.433884,-0.900969,-0.9510565,0.309017,0.5,0.8660254,-0.382683,-0.92388,-0.811539,0.584298
1,2013-01-25,3_114790,3,114790,1,3,17.0,18.0,37.0,13.0,...,-0.974928,-0.222521,0.5877853,-0.809017,0.866025,0.5,-0.587785,-0.809017,-0.622047,0.78298
2,2013-02-10,3_114790,3,114790,1,3,15.0,15.0,17.0,10.0,...,0.0,1.0,-0.9510565,0.309017,0.866025,0.5,-0.992709,0.120537,-0.385663,0.92264
3,2013-02-26,3_114790,3,114790,1,3,16.0,16.0,14.0,22.0,...,0.974928,-0.222521,0.5877853,-0.809017,1.0,6.123234000000001e-17,-0.743145,0.669131,-0.120208,0.992749
4,2013-03-14,3_114790,3,114790,1,3,11.0,29.0,33.0,63.0,...,-0.433884,-0.900969,-2.449294e-16,1.0,1.0,6.123234000000001e-17,-0.707107,0.707107,0.154309,0.988023


In [14]:
merged_df.shape

(105, 214)

In [15]:
final_df = add_y_targets_from_shift(merged_df, window_size=16)
y_cols = [col for col in final_df.columns if col.startswith("y_")]
final_df = final_df.dropna(subset=y_cols)
final_df.shape


(104, 422)

In [16]:
final_df.head()

Unnamed: 0,start_date,store_item,store,item,storeClusterId,itemClusterId,sales_day_1,sales_day_2,sales_day_3,sales_day_4,...,y_dayofweek_sin_16,y_dayofweek_cos_16,y_weekofmonth_sin_16,y_weekofmonth_cos_16,y_monthofyear_sin_16,y_monthofyear_cos_16,y_paycycle_sin_16,y_paycycle_cos_16,y_season_sin_16,y_season_cos_16
0,2013-01-09,3_114790,3,114790,1,3,13.0,16.0,15.0,20.0,...,-0.974928,-0.222521,0.5877853,-0.809017,0.866025,0.5,-0.587785,-0.809017,-0.622047,0.78298
1,2013-01-25,3_114790,3,114790,1,3,17.0,18.0,37.0,13.0,...,0.0,1.0,-0.9510565,0.309017,0.866025,0.5,-0.992709,0.120537,-0.385663,0.92264
2,2013-02-10,3_114790,3,114790,1,3,15.0,15.0,17.0,10.0,...,0.974928,-0.222521,0.5877853,-0.809017,1.0,6.123234000000001e-17,-0.743145,0.669131,-0.120208,0.992749
3,2013-02-26,3_114790,3,114790,1,3,16.0,16.0,14.0,22.0,...,-0.433884,-0.900969,-2.449294e-16,1.0,1.0,6.123234000000001e-17,-0.707107,0.707107,0.154309,0.988023
4,2013-03-14,3_114790,3,114790,1,3,11.0,29.0,33.0,63.0,...,-0.781831,0.62349,0.5877853,-0.809017,0.866025,-0.5,-0.406737,0.913545,0.417194,0.908818


In [26]:
store_item_ids = [3_114790]
df.query("store_item == '3_114790'")

Unnamed: 0,date,store_item,store,item,unit_sales,onpromotion
11544,2013-01-02,3_114790,3,114790,27.0,
28350,2013-01-03,3_114790,3,114790,19.0,
45392,2013-01-04,3_114790,3,114790,25.0,
62820,2013-01-05,3_114790,3,114790,34.0,
80657,2013-01-06,3_114790,3,114790,38.0,
...,...,...,...,...,...,...
34730878,2017-08-11,3_114790,3,114790,17.0,True
34752592,2017-08-12,3_114790,3,114790,25.0,True
34775129,2017-08-13,3_114790,3,114790,23.0,True
34795781,2017-08-14,3_114790,3,114790,24.0,True


In [25]:
df.head()

Unnamed: 0,date,store_item,store,item,unit_sales,onpromotion
0,2013-01-01,25_108786,25,108786,3.0,
1,2013-01-01,25_769314,25,769314,3.0,
2,2013-01-01,25_771156,25,771156,1.0,
3,2013-01-01,25_783243,25,783243,5.0,
4,2013-01-01,25_801217,25,801217,3.0,


In [28]:
prepare_training_data_from_raw_df(
    df.query("store_item == '3_114790'"),
    window_size=16,
    cluster_df=cluster_df,
    calendar_aligned=True,
    debug=True,
    debug_cyc_fn="../output/data/20250629_cyc_debug.csv",
    debug_sales_fn="../output/data/20250629_sales_debug.csv",
)

sales_df.shape: (105, 54)
cyc_df.shape: (105, 166)
merged_df.shape: (105, 214)
merged_df.shape: (104, 422)


Unnamed: 0,start_date,store_item,store,item,storeClusterId,itemClusterId,sales_day_1,sales_day_2,sales_day_3,sales_day_4,...,y_dayofweek_sin_16,y_dayofweek_cos_16,y_weekofmonth_sin_16,y_weekofmonth_cos_16,y_monthofyear_sin_16,y_monthofyear_cos_16,y_paycycle_sin_16,y_paycycle_cos_16,y_season_sin_16,y_season_cos_16
0,2013-01-09,3_114790,3,114790,1,3,13.0,16.0,15.0,20.0,...,-0.974928,-0.222521,5.877853e-01,-0.809017,8.660254e-01,5.000000e-01,-0.587785,-0.809017,-0.622047,0.782980
1,2013-01-25,3_114790,3,114790,1,3,17.0,18.0,37.0,13.0,...,0.000000,1.000000,-9.510565e-01,0.309017,8.660254e-01,5.000000e-01,-0.992709,0.120537,-0.385663,0.922640
2,2013-02-10,3_114790,3,114790,1,3,15.0,15.0,17.0,10.0,...,0.974928,-0.222521,5.877853e-01,-0.809017,1.000000e+00,6.123234e-17,-0.743145,0.669131,-0.120208,0.992749
3,2013-02-26,3_114790,3,114790,1,3,16.0,16.0,14.0,22.0,...,-0.433884,-0.900969,-2.449294e-16,1.000000,1.000000e+00,6.123234e-17,-0.707107,0.707107,0.154309,0.988023
4,2013-03-14,3_114790,3,114790,1,3,11.0,29.0,33.0,63.0,...,-0.781831,0.623490,5.877853e-01,-0.809017,8.660254e-01,-5.000000e-01,-0.406737,0.913545,0.417194,0.908818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,2017-05-12,3_114790,3,114790,1,3,11.0,31.0,18.0,18.0,...,0.000000,1.000000,5.877853e-01,-0.809017,1.224647e-16,-1.000000e+00,-0.951057,0.309017,0.992222,0.124479
100,2017-05-28,3_114790,3,114790,1,3,22.0,13.0,15.0,17.0,...,0.974928,-0.222521,-9.510565e-01,0.309017,1.224647e-16,-1.000000e+00,-0.743145,0.669131,0.988678,-0.150055
101,2017-06-13,3_114790,3,114790,1,3,34.0,24.0,21.0,20.0,...,-0.433884,-0.900969,5.877853e-01,-0.809017,-5.000000e-01,-8.660254e-01,-0.406737,0.913545,0.910605,-0.413279
102,2017-06-29,3_114790,3,114790,1,3,17.0,15.0,23.0,37.0,...,-0.781831,0.623490,-2.449294e-16,1.000000,-5.000000e-01,-8.660254e-01,-0.382683,0.923880,0.763889,-0.645348


In [17]:
final_df = prepare_training_data_from_raw_df(
    df,
    window_size=16,
    cluster_df=cluster_df,
    calendar_aligned=True,
    debug=True,
    debug_cyc_fn="../output/data/20250629_cyc_debug.csv",
    debug_sales_fn="../output/data/20250629_sales_debug.csv",
)

IndexError: single positional indexer is out-of-bounds

In [None]:
final_df.head(5)

In [None]:
meta_cols,\
x_sales_features,\
x_cyclical_features,\
x_feature_cols,\
label_cols,\
y_sales_features,\
y_cyclical_features = build_feature_and_label_cols(window_size=16)
print(meta_cols)
print(x_feature_cols)
print(label_cols)

In [None]:
final_df[meta_cols + x_feature_cols + label_cols].to_excel("../output/data/20250629_train_store_500_item_sales_cyclical_features_16_days_X_y.xlsx",
            index=False)