In [1]:
import numpy as np
import pandas as pd
from pathlib import Path


In [12]:
import sys
sys.path.insert(0, "..")   # search the parent directory first when importing; presumably that is where the `src` package lives — TODO confirm
# Project helpers used by the cells below.
from src.utils import (
generate_cyclical_features, 
generate_sales_features, 
add_y_targets_from_shift,
prepare_training_data_from_raw_df,
build_feature_and_label_cols,
)

In [3]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [4]:
# Load the clustered training extract with explicit dtypes.
# Nullable pandas dtypes ("string", "Int64", "boolean") are used at read time so
# that empty cells are accepted; they are converted to plain object/NaN below.
dtype_dict = dict(
    store="uint16",
    item="uint32",
    store_item="string",      # allow NaNs as <NA>
    unit_sales="float32",
    id="Int64",               # nullable integer
    onpromotion="boolean",    # True/False with nulls
)
df = pd.read_csv(
    "../output/data/20250710_train_top_51_store_100_item_clusters.csv",
    dtype=dtype_dict,
    parse_dates=["date"],
    keep_default_na=True,
    na_values=[""],
)

# Put the key columns first; every other column keeps its order from the file.
front_cols = ['date', 'store_item', 'store', 'item']
cols = front_cols + [c for c in df.columns if c not in front_cols]
df = df[cols]

# Nullable Int64 -> float64, so <NA> becomes np.nan.
df["id"] = df["id"].astype("float64")

# Cast the nullable columns to object dtype and put np.nan wherever the value
# is missing.
# NOTE(review): `id` was already float64 with NaN at this point, so the object
# cast may be unnecessary for it — confirm what the downstream helpers expect.
for col in ("id", "store_item", "onpromotion"):
    df[col] = df[col].astype(object).where(df[col].notna(), np.nan)

# NOTE(review): `date` is already requested via parse_dates above; this second
# conversion looks like a safety net — confirm whether it is still needed.
df["date"] = pd.to_datetime(df["date"])


In [5]:
# Preview the first rows of the loaded frame to check column order and dtypes.
df.head()

Unnamed: 0,date,store_item,store,item,unit_sales,id,onpromotion,store_cluster,item_cluster,cluster
0,2014-01-02,44_1503844,44,1503844,966.148987,16377457.0,,2,1,2_1
1,2014-01-03,44_1503844,44,1503844,663.405029,16442863.0,,2,1,2_1
2,2014-01-04,44_1503844,44,1503844,1075.0,16511061.0,,2,1,2_1
3,2014-01-05,44_1503844,44,1503844,892.607971,16578789.0,,2,1,2_1
4,2014-01-06,44_1503844,44,1503844,261.553986,16642520.0,,2,1,2_1


In [56]:
# Build sales features (window_size=1, calendar-aligned) for a single
# store/item pair so the result is small enough to inspect by hand.
# NOTE(review): the recorded run produced 1,032,256 rows (= 1016 x 1016) for
# this one pair, and the preview repeats the same start_date across different
# ids. That looks like a start_date x id cross product — confirm it is intended
# in generate_sales_features.
single_pair_df = df.query("store_item == '44_1503844'")
sales_df = generate_sales_features(
    single_pair_df,
    window_size=1,
    calendar_aligned=True,
    log_level="DEBUG",
)
sales_df.head()


Window 2014-01-02: 100%|██████████| 1/1 [00:00<00:00, 2033.11it/s]
Window 2014-01-03: 100%|██████████| 1/1 [00:00<00:00, 1264.11it/s]
Window 2014-01-04: 100%|██████████| 1/1 [00:00<00:00, 1196.32it/s]
Window 2014-01-05: 100%|██████████| 1/1 [00:00<00:00, 1510.37it/s]
Window 2014-01-06: 100%|██████████| 1/1 [00:00<00:00, 1039.74it/s]
Window 2014-01-07: 100%|██████████| 1/1 [00:00<00:00, 1550.00it/s]
Window 2014-01-08: 100%|██████████| 1/1 [00:00<00:00, 1520.23it/s]
Window 2014-01-09: 100%|██████████| 1/1 [00:00<00:00, 1462.45it/s]
Window 2014-01-10: 100%|██████████| 1/1 [00:00<00:00, 1579.78it/s]
Window 2014-01-11: 100%|██████████| 1/1 [00:00<00:00, 1597.83it/s]
Window 2014-01-12: 100%|██████████| 1/1 [00:00<00:00, 1462.45it/s]
Window 2014-01-13: 100%|██████████| 1/1 [00:00<00:00, 1305.01it/s]
Window 2014-01-14: 100%|██████████| 1/1 [00:00<00:00, 1324.38it/s]
Window 2014-01-15: 100%|██████████| 1/1 [00:00<00:00, 1176.52it/s]
Window 2014-01-16: 100%|██████████| 1/1 [00:00<00:00, 1488.40i

Unnamed: 0,start_date,id,store_item,store,item,store_cluster,item_cluster,sales_day_1,store_med_day_1,item_med_day_1,store_cluster_logpct_change_1,item_cluster_logpct_change_1
0,2014-01-02,16377457.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,,
1,2014-01-02,16442863.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,,
2,2014-01-02,16511061.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,,
3,2014-01-02,16578789.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,,
4,2014-01-02,16642520.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,,


In [57]:
# Inspect one derived column. In the recorded output the leading rows are NaN
# (presumably there is no earlier window to compare against — TODO confirm).
sales_df["item_cluster_logpct_change_1"]

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
             ...   
1032251    0.632222
1032252    0.632222
1032253    0.632222
1032254    0.632222
1032255    0.632222
Name: item_cluster_logpct_change_1, Length: 1032256, dtype: float64

In [58]:
# List the columns produced by generate_sales_features.
sales_df.columns.to_list()

['start_date',
 'id',
 'store_item',
 'store',
 'item',
 'store_cluster',
 'item_cluster',
 'sales_day_1',
 'store_med_day_1',
 'item_med_day_1',
 'store_cluster_logpct_change_1',
 'item_cluster_logpct_change_1']

In [59]:
# Row/column count of the sales feature frame.
sales_df.shape

(1032256, 12)

In [60]:
# Build the cyclical (sin/cos calendar) features for the same single
# store/item pair, using the same window settings as the sales features.
single_pair_df = df.query("store_item == '44_1503844'")
cyc_df = generate_cyclical_features(
    single_pair_df,
    window_size=1,
    calendar_aligned=True,
    log_level="DEBUG",
)
cyc_df.head()

Generating cyclical features: 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]


Unnamed: 0,start_date,id,store_item,store,item,dayofweek_sin_1,dayofweek_cos_1,weekofmonth_sin_1,weekofmonth_cos_1,monthofyear_sin_1,monthofyear_cos_1,paycycle_sin_1,paycycle_cos_1,season_sin_1,season_cos_1
0,2014-01-02,16377457.0,44_1503844,44,1503844,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
1,2014-01-02,16442863.0,44_1503844,44,1503844,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
2,2014-01-02,16511061.0,44_1503844,44,1503844,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
3,2014-01-02,16578789.0,44_1503844,44,1503844,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
4,2014-01-02,16642520.0,44_1503844,44,1503844,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285


In [61]:
# Shape of the sales features, shown next to the cyclical frame's shape to
# compare row counts before merging.
sales_df.shape

(1032256, 12)

In [62]:
# Shape of the cyclical features; the row count should match sales_df.
cyc_df.shape

(1032256, 15)

In [63]:
# Sanity check before merging: every row of sales_df should have a matching
# row in cyc_df on the shared key columns.
merge_keys = ["start_date", "id", "store_item", "store", "item"]

# Left anti-join: keep only the sales_df rows whose key combination has no
# counterpart in cyc_df. `indicator=True` adds the `_merge` column used for
# the filter; it is dropped again once the filter has been applied.
missing_from_df = (
    sales_df.merge(
        cyc_df[merge_keys],
        on=merge_keys,
        how="left",
        indicator=True,
    )
    .query('_merge == "left_only"')
    .drop(columns="_merge")
)

# Report how many sales_df rows are unmatched (0 means the frames line up).
print(f"Rows in sales_df with no match in cyc_df: {missing_from_df.shape[0]}")


Missing rows in non_overlap_df: 0


In [64]:
# Make sure the date key has a datetime dtype on both sides before joining.
for frame in (sales_df, cyc_df):
    frame["start_date"] = pd.to_datetime(frame["start_date"])

# Inner join of sales and cyclical features on the shared identifier columns.
merged_df = sales_df.merge(
    cyc_df,
    on=["store_item", "id", "start_date", "store", "item"],
)

merged_df.head()

Unnamed: 0,start_date,id,store_item,store,item,store_cluster,item_cluster,sales_day_1,store_med_day_1,item_med_day_1,...,dayofweek_sin_1,dayofweek_cos_1,weekofmonth_sin_1,weekofmonth_cos_1,monthofyear_sin_1,monthofyear_cos_1,paycycle_sin_1,paycycle_cos_1,season_sin_1,season_cos_1
0,2014-01-02,16377457.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,...,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
1,2014-01-02,16442863.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,...,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
2,2014-01-02,16511061.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,...,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
3,2014-01-02,16578789.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,...,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285
4,2014-01-02,16642520.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,...,0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.743145,0.669131,-0.970064,0.24285


In [66]:
# Columns after the merge: the sales features followed by the cyclical features.
merged_df.columns.to_list()

['start_date',
 'id',
 'store_item',
 'store',
 'item',
 'store_cluster',
 'item_cluster',
 'sales_day_1',
 'store_med_day_1',
 'item_med_day_1',
 'store_cluster_logpct_change_1',
 'item_cluster_logpct_change_1',
 'dayofweek_sin_1',
 'dayofweek_cos_1',
 'weekofmonth_sin_1',
 'weekofmonth_cos_1',
 'monthofyear_sin_1',
 'monthofyear_cos_1',
 'paycycle_sin_1',
 'paycycle_cos_1',
 'season_sin_1',
 'season_cos_1']

In [67]:
# Row count should equal that of sales_df/cyc_df if the join matched one-to-one.
merged_df.shape

(1032256, 22)

In [68]:
# Attach the prediction targets (the `y_`-prefixed columns) to the merged
# features.
final_df = add_y_targets_from_shift(merged_df, window_size=1)

# Keep only rows where every target column is present.
# NOTE(review): in the recorded run this reduced 1,032,256 rows to 1,006 —
# confirm that such a large drop is expected.
y_cols = [name for name in final_df.columns if name.startswith("y_")]
final_df = final_df.dropna(subset=y_cols)

final_df.shape


(1006, 37)

In [69]:
# Preview the training rows that survived the target-NaN filter.
final_df.head()

Unnamed: 0,start_date,id,store_item,store,item,store_cluster,item_cluster,sales_day_1,store_med_day_1,item_med_day_1,...,y_dayofweek_sin_1,y_dayofweek_cos_1,y_weekofmonth_sin_1,y_weekofmonth_cos_1,y_monthofyear_sin_1,y_monthofyear_cos_1,y_paycycle_sin_1,y_paycycle_cos_1,y_season_sin_1,y_season_cos_1
1015,2014-01-02,58524463.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,...,-0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.951057,0.309017,-0.96574,0.259512
2031,2014-01-03,58524463.0,44_1503844,44,1503844,2,1,663.405029,663.405029,663.405029,...,-0.974928,-0.222521,0.951057,0.309017,0.5,0.866025,0.994522,-0.104528,-0.96113,0.276097
3047,2014-01-04,58434636.0,44_1503844,44,1503844,2,1,1075.0,1075.0,1075.0,...,-0.781831,0.62349,0.951057,0.309017,0.5,0.866025,0.866025,-0.5,-0.956235,0.2926
4063,2014-01-05,58243290.0,44_1503844,44,1503844,2,1,892.607971,892.607971,892.607971,...,0.0,1.0,0.951057,0.309017,0.5,0.866025,0.587785,-0.809017,-0.951057,0.309017
5079,2014-01-06,58434636.0,44_1503844,44,1503844,2,1,261.553986,261.553986,261.553986,...,0.781831,0.62349,0.951057,0.309017,0.5,0.866025,0.207912,-0.978148,-0.945596,0.325342


In [72]:
# Run the end-to-end helper on the same single store/item pair. Its output is
# expected to match the frame built step by step in the cells above.
# NOTE(review): the recorded log lines mention "20250710_500_train_..." files,
# which differ from the paths passed here — the stored output may be stale.
single_pair_df = df.query("store_item == '44_1503844'")
sales_csv = Path("../output/data/20250710_train_top_51_store_100_item_cluster_sales.csv")
cyc_csv = Path("../output/data/20250710_train_top_51_store_100_item_cluster_cyc.csv")

final_df = prepare_training_data_from_raw_df(
    single_pair_df,
    window_size=1,
    calendar_aligned=True,
    sales_fn=sales_csv,
    cyc_fn=cyc_csv,
)

2025-07-10 11:12:40,316 - INFO - Generating sales features to ../output/data/20250710_500_train_top_51_store_100_item_cluster_sales.csv
2025-07-10 11:12:56,890 - INFO - Generating cyclical features to ../output/data/20250710_500_train_top_51_store_100_item_cluster_cyc.csv
2025-07-10 11:12:59,069 - INFO - Saving cyclical features to ../output/data/20250710_500_train_top_51_store_100_item_cluster_cyc.csv
2025-07-10 11:13:15,172 - INFO - Merging sales and cyclical features
2025-07-10 11:13:15,872 - INFO - merged_df.shape: (1032256, 22)
2025-07-10 11:16:28,456 - INFO - merged_df.shape: (1006, 37)


In [73]:
# Preview the helper's result; compare with the manually built final_df preview.
final_df.head()

Unnamed: 0,start_date,id,store_item,store,item,store_cluster,item_cluster,sales_day_1,store_med_day_1,item_med_day_1,...,y_dayofweek_sin_1,y_dayofweek_cos_1,y_weekofmonth_sin_1,y_weekofmonth_cos_1,y_monthofyear_sin_1,y_monthofyear_cos_1,y_paycycle_sin_1,y_paycycle_cos_1,y_season_sin_1,y_season_cos_1
1015,2014-01-02,58524463.0,44_1503844,44,1503844,2,1,966.148987,966.148987,966.148987,...,-0.433884,-0.900969,0.951057,0.309017,0.5,0.866025,0.951057,0.309017,-0.96574,0.259512
2031,2014-01-03,58524463.0,44_1503844,44,1503844,2,1,663.405029,663.405029,663.405029,...,-0.974928,-0.222521,0.951057,0.309017,0.5,0.866025,0.994522,-0.104528,-0.96113,0.276097
3047,2014-01-04,58434636.0,44_1503844,44,1503844,2,1,1075.0,1075.0,1075.0,...,-0.781831,0.62349,0.951057,0.309017,0.5,0.866025,0.866025,-0.5,-0.956235,0.2926
4063,2014-01-05,58243290.0,44_1503844,44,1503844,2,1,892.607971,892.607971,892.607971,...,0.0,1.0,0.951057,0.309017,0.5,0.866025,0.587785,-0.809017,-0.951057,0.309017
5079,2014-01-06,58434636.0,44_1503844,44,1503844,2,1,261.553986,261.553986,261.553986,...,0.781831,0.62349,0.951057,0.309017,0.5,0.866025,0.207912,-0.978148,-0.945596,0.325342


In [75]:
# Full column list: identifiers, feature columns, then the `y_`-prefixed targets.
final_df.columns.to_list()


['start_date',
 'id',
 'store_item',
 'store',
 'item',
 'store_cluster',
 'item_cluster',
 'sales_day_1',
 'store_med_day_1',
 'item_med_day_1',
 'store_cluster_logpct_change_1',
 'item_cluster_logpct_change_1',
 'dayofweek_sin_1',
 'dayofweek_cos_1',
 'weekofmonth_sin_1',
 'weekofmonth_cos_1',
 'monthofyear_sin_1',
 'monthofyear_cos_1',
 'paycycle_sin_1',
 'paycycle_cos_1',
 'season_sin_1',
 'season_cos_1',
 'y_sales_day_1',
 'y_store_med_day_1',
 'y_item_med_day_1',
 'y_store_cluster_logpct_change_1',
 'y_item_cluster_logpct_change_1',
 'y_dayofweek_sin_1',
 'y_dayofweek_cos_1',
 'y_weekofmonth_sin_1',
 'y_weekofmonth_cos_1',
 'y_monthofyear_sin_1',
 'y_monthofyear_cos_1',
 'y_paycycle_sin_1',
 'y_paycycle_cos_1',
 'y_season_sin_1',
 'y_season_cos_1']

In [74]:
# build_feature_and_label_cols returns seven column groups; only the metadata,
# input-feature and label groups are needed here, the rest are discarded.
meta_cols, _, _, x_feature_cols, label_cols, _, _ = build_feature_and_label_cols(
    window_size=1
)

# Show the training frame restricted to those groups, in model-input order.
final_df[meta_cols + x_feature_cols + label_cols]


Unnamed: 0,start_date,id,store_item,store_cluster,item_cluster,sales_day_1,store_med_day_1,item_med_day_1,store_cluster_logpct_change_1,item_cluster_logpct_change_1,...,y_dayofweek_sin_1,y_dayofweek_cos_1,y_weekofmonth_sin_1,y_weekofmonth_cos_1,y_monthofyear_sin_1,y_monthofyear_cos_1,y_paycycle_sin_1,y_paycycle_cos_1,y_season_sin_1,y_season_cos_1
1015,2014-01-02,58524463.0,44_1503844,2,1,966.148987,966.148987,966.148987,,,...,-0.433884,-0.900969,0.951057,0.309017,0.500000,0.866025,0.951057,0.309017,-0.965740,0.259512
2031,2014-01-03,58524463.0,44_1503844,2,1,663.405029,663.405029,663.405029,0.591646,0.591646,...,-0.974928,-0.222521,0.951057,0.309017,0.500000,0.866025,0.994522,-0.104528,-0.961130,0.276097
3047,2014-01-04,58434636.0,44_1503844,2,1,1075.000000,1075.000000,1075.000000,0.907811,0.907811,...,-0.781831,0.623490,0.951057,0.309017,0.500000,0.866025,0.866025,-0.500000,-0.956235,0.292600
4063,2014-01-05,58243290.0,44_1503844,2,1,892.607971,892.607971,892.607971,0.484802,0.484802,...,0.000000,1.000000,0.951057,0.309017,0.500000,0.866025,0.587785,-0.809017,-0.951057,0.309017
5079,2014-01-06,58434636.0,44_1503844,2,1,261.553986,261.553986,261.553986,0.722487,0.722487,...,0.781831,0.623490,0.951057,0.309017,0.500000,0.866025,0.207912,-0.978148,-0.945596,0.325342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027175,2017-08-10,58613249.0,44_1503844,2,1,408.032013,408.032013,408.032013,0.679622,0.679622,...,-0.433884,-0.900969,0.587785,-0.809017,-0.866025,-0.500000,-0.994522,-0.104528,0.615285,-0.788305
1028191,2017-08-11,58524463.0,44_1503844,2,1,682.445007,682.445007,682.445007,1.376919,1.376919,...,-0.974928,-0.222521,0.587785,-0.809017,-0.866025,-0.500000,-0.951057,0.309017,0.601624,-0.798779
1029207,2017-08-12,58524463.0,44_1503844,2,1,720.913025,720.913025,720.913025,0.538577,0.538577,...,-0.781831,0.623490,0.587785,-0.809017,-0.866025,-0.500000,-0.743145,0.669131,0.587785,-0.809017
1030223,2017-08-13,58524463.0,44_1503844,2,1,960.043030,960.043030,960.043030,0.760311,0.760311,...,0.000000,1.000000,0.587785,-0.809017,-0.866025,-0.500000,-0.406737,0.913545,0.573772,-0.819015
