In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [2]:
import sys
sys.path.insert(0, "..")   
from src.utils import (
generate_cyclical_features, 
generate_nonoverlap_window_features, 
add_next_window_targets,
prepare_training_data_from_raw_df
)


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
dtype_dict={"id":np.uint32,
            "store_nbr":np.uint8,
            "item_nbr":np.uint32,
            "unit_sales":np.float32
           }
df = pd.read_csv("../output/data/train_top_10_store_10_item.csv", dtype=dtype_dict)
cols = ['date', 'store_item', 'store_nbr', 'item_nbr'] + [c for c in df.columns 
                                  if c not in ('date', 'store_item', 'store_nbr', 'item_nbr')]
df = df[cols]
df.rename(columns={"store_nbr": "store", "item_nbr": "item"}, inplace=True)
df["date"] = pd.to_datetime(df["date"])
df.sort_values("date", inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(5)



  df = pd.read_csv("../output/data/train_top_10_store_10_item.csv", dtype=dtype_dict)


Unnamed: 0,date,store_item,store,item,unit_sales,onpromotion
0,2013-01-07,48_502331,48,502331,70.0,
1,2013-01-07,46_584028,46,584028,304.27301,
2,2013-01-07,3_114790,3,114790,28.0,
3,2013-01-07,3_305229,3,305229,89.0,
4,2013-01-07,3_314384,3,314384,106.0,


In [5]:
df.head(5)

Unnamed: 0,date,store_item,store,item,unit_sales,onpromotion
0,2013-01-07,48_502331,48,502331,70.0,
1,2013-01-07,46_584028,46,584028,304.27301,
2,2013-01-07,3_114790,3,114790,28.0,
3,2013-01-07,3_305229,3,305229,89.0,
4,2013-01-07,3_314384,3,314384,106.0,


In [6]:
df.dtypes

date           datetime64[ns]
store_item             object
store                   uint8
item                   uint32
unit_sales            float32
onpromotion            object
dtype: object

In [7]:
sales_df = generate_nonoverlap_window_features(df.query("store_item == '3_114790'"), window_size=16)
sales_df.head()

Unnamed: 0,start_date,store_item,store,item,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,sales_day_6,...,item_med_day_7,item_med_day_8,item_med_day_9,item_med_day_10,item_med_day_11,item_med_day_12,item_med_day_13,item_med_day_14,item_med_day_15,item_med_day_16
0,2013-01-07,3_114790,3,114790,28.0,11.0,13.0,16.0,15.0,20.0,...,28.0,16.0,14.0,22.0,19.0,23.0,27.0,35.0,21.0,14.0
1,2013-01-08,3_114790,3,114790,11.0,13.0,16.0,15.0,20.0,28.0,...,16.0,14.0,22.0,19.0,23.0,27.0,35.0,21.0,14.0,17.0
2,2013-01-09,3_114790,3,114790,13.0,16.0,15.0,20.0,28.0,16.0,...,14.0,22.0,19.0,23.0,27.0,35.0,21.0,14.0,17.0,14.0
3,2013-01-10,3_114790,3,114790,16.0,15.0,20.0,28.0,16.0,14.0,...,22.0,19.0,23.0,27.0,35.0,21.0,14.0,17.0,14.0,17.0
4,2013-01-11,3_114790,3,114790,15.0,20.0,28.0,16.0,14.0,22.0,...,19.0,23.0,27.0,35.0,21.0,14.0,17.0,14.0,17.0,18.0


In [8]:
sales_df.columns.to_list()

['start_date',
 'store_item',
 'store',
 'item',
 'sales_day_1',
 'sales_day_2',
 'sales_day_3',
 'sales_day_4',
 'sales_day_5',
 'sales_day_6',
 'sales_day_7',
 'sales_day_8',
 'sales_day_9',
 'sales_day_10',
 'sales_day_11',
 'sales_day_12',
 'sales_day_13',
 'sales_day_14',
 'sales_day_15',
 'sales_day_16',
 'store_med_day_1',
 'store_med_day_2',
 'store_med_day_3',
 'store_med_day_4',
 'store_med_day_5',
 'store_med_day_6',
 'store_med_day_7',
 'store_med_day_8',
 'store_med_day_9',
 'store_med_day_10',
 'store_med_day_11',
 'store_med_day_12',
 'store_med_day_13',
 'store_med_day_14',
 'store_med_day_15',
 'store_med_day_16',
 'item_med_day_1',
 'item_med_day_2',
 'item_med_day_3',
 'item_med_day_4',
 'item_med_day_5',
 'item_med_day_6',
 'item_med_day_7',
 'item_med_day_8',
 'item_med_day_9',
 'item_med_day_10',
 'item_med_day_11',
 'item_med_day_12',
 'item_med_day_13',
 'item_med_day_14',
 'item_med_day_15',
 'item_med_day_16']

In [9]:
sales_df.shape

(1659, 52)

In [10]:
sales_df.to_excel("../output/data/train_3_114790_sales_16_days.xlsx",
            index=False)

In [11]:
cyc_df = generate_cyclical_features(df.query("store_item == '3_114790'"), window_size=16)
cyc_df.head()

Unnamed: 0,start_date,store_item,store,item,dayofweek_sin_1,dayofweek_cos_1,weekofmonth_sin_1,weekofmonth_cos_1,monthofyear_sin_1,monthofyear_cos_1,...,weekofmonth_sin_15,weekofmonth_cos_15,monthofyear_sin_15,monthofyear_cos_15,dayofweek_sin_16,dayofweek_cos_16,weekofmonth_sin_16,weekofmonth_cos_16,monthofyear_sin_16,monthofyear_cos_16
0,2013-01-07,3_114790,3,114790,0.0,1.0,0.951057,0.309017,0.5,0.866025,...,-0.587785,-0.809017,0.5,0.866025,0.781831,0.62349,-0.951057,0.309017,0.5,0.866025
1,2013-01-08,3_114790,3,114790,0.781831,0.62349,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,0.5,0.866025,0.974928,-0.222521,-0.951057,0.309017,0.5,0.866025
2,2013-01-09,3_114790,3,114790,0.974928,-0.222521,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,0.5,0.866025,0.433884,-0.900969,-0.951057,0.309017,0.5,0.866025
3,2013-01-10,3_114790,3,114790,0.433884,-0.900969,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,0.5,0.866025,-0.433884,-0.900969,-0.951057,0.309017,0.5,0.866025
4,2013-01-11,3_114790,3,114790,-0.433884,-0.900969,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,0.5,0.866025,-0.974928,-0.222521,-0.951057,0.309017,0.5,0.866025


In [12]:
cyc_df.columns.to_list()

['start_date',
 'store_item',
 'store',
 'item',
 'dayofweek_sin_1',
 'dayofweek_cos_1',
 'weekofmonth_sin_1',
 'weekofmonth_cos_1',
 'monthofyear_sin_1',
 'monthofyear_cos_1',
 'dayofweek_sin_2',
 'dayofweek_cos_2',
 'weekofmonth_sin_2',
 'weekofmonth_cos_2',
 'monthofyear_sin_2',
 'monthofyear_cos_2',
 'dayofweek_sin_3',
 'dayofweek_cos_3',
 'weekofmonth_sin_3',
 'weekofmonth_cos_3',
 'monthofyear_sin_3',
 'monthofyear_cos_3',
 'dayofweek_sin_4',
 'dayofweek_cos_4',
 'weekofmonth_sin_4',
 'weekofmonth_cos_4',
 'monthofyear_sin_4',
 'monthofyear_cos_4',
 'dayofweek_sin_5',
 'dayofweek_cos_5',
 'weekofmonth_sin_5',
 'weekofmonth_cos_5',
 'monthofyear_sin_5',
 'monthofyear_cos_5',
 'dayofweek_sin_6',
 'dayofweek_cos_6',
 'weekofmonth_sin_6',
 'weekofmonth_cos_6',
 'monthofyear_sin_6',
 'monthofyear_cos_6',
 'dayofweek_sin_7',
 'dayofweek_cos_7',
 'weekofmonth_sin_7',
 'weekofmonth_cos_7',
 'monthofyear_sin_7',
 'monthofyear_cos_7',
 'dayofweek_sin_8',
 'dayofweek_cos_8',
 'weekofmonth_s

In [13]:
cyc_df.shape

(1659, 100)

In [14]:
# Define the keys used for merging
merge_keys = ["start_date", "store_item", "store", "item"]

# Identify missing rows from non_overlap_df2 that are not in non_overlap_df
missing_from_df = sales_df.merge(
    cyc_df[merge_keys],
    on=merge_keys,
    how="left",
    indicator=True
).query('_merge == "left_only"')

# Optional: drop the merge indicator column
missing_from_df = missing_from_df.drop(columns="_merge")

# Show how many are missing
print(f"Missing rows in non_overlap_df: {missing_from_df.shape[0]}")


Missing rows in non_overlap_df: 0


In [16]:
cyc_df.to_excel("../output/data/train_3_114790_cyclical_features_16_days.xlsx",
            index=False)

In [17]:
merged_df = pd.merge(sales_df, cyc_df, on=["store_item", "start_date", "store", "item" ])
merged_df.head()

Unnamed: 0,start_date,store_item,store,item,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,sales_day_6,...,weekofmonth_sin_15,weekofmonth_cos_15,monthofyear_sin_15,monthofyear_cos_15,dayofweek_sin_16,dayofweek_cos_16,weekofmonth_sin_16,weekofmonth_cos_16,monthofyear_sin_16,monthofyear_cos_16
0,2013-01-07,3_114790,3,114790,28.0,11.0,13.0,16.0,15.0,20.0,...,-0.587785,-0.809017,0.5,0.866025,0.781831,0.62349,-0.951057,0.309017,0.5,0.866025
1,2013-01-08,3_114790,3,114790,11.0,13.0,16.0,15.0,20.0,28.0,...,-0.951057,0.309017,0.5,0.866025,0.974928,-0.222521,-0.951057,0.309017,0.5,0.866025
2,2013-01-09,3_114790,3,114790,13.0,16.0,15.0,20.0,28.0,16.0,...,-0.951057,0.309017,0.5,0.866025,0.433884,-0.900969,-0.951057,0.309017,0.5,0.866025
3,2013-01-10,3_114790,3,114790,16.0,15.0,20.0,28.0,16.0,14.0,...,-0.951057,0.309017,0.5,0.866025,-0.433884,-0.900969,-0.951057,0.309017,0.5,0.866025
4,2013-01-11,3_114790,3,114790,15.0,20.0,28.0,16.0,14.0,22.0,...,-0.951057,0.309017,0.5,0.866025,-0.974928,-0.222521,-0.951057,0.309017,0.5,0.866025


In [18]:
merged_df.shape


(1659, 148)

In [19]:
merged_df.to_excel("../output/data/train_3_114790_sales_cyclical_features_16_days.xlsx",
            index=False)

In [20]:
final_df = add_next_window_targets(merged_df, window_size=16)
final_df.to_excel("../output/data/train_3_114790_sales_cyclical_features_16_days_X_y.xlsx",
            index=False)


In [23]:
final_df = prepare_training_data_from_raw_df(df, window_size=16)

(165804, 100)
(165804, 52)
(165704, 228)


In [24]:
final_df.head(5)

Unnamed: 0,start_date,store_item,store,item,dayofweek_sin_1,dayofweek_cos_1,weekofmonth_sin_1,weekofmonth_cos_1,monthofyear_sin_1,monthofyear_cos_1,...,y_weekofmonth_sin_14,y_weekofmonth_cos_14,y_dayofweek_sin_15,y_dayofweek_cos_15,y_weekofmonth_sin_15,y_weekofmonth_cos_15,y_dayofweek_sin_16,y_dayofweek_cos_16,y_weekofmonth_sin_16,y_weekofmonth_cos_16
0,2013-01-07,3_1047679,3,1047679,0.0,1.0,0.951057,0.309017,0.5,0.866025,...,-0.587785,-0.809017,0.781831,0.62349,-0.951057,0.309017,0.974928,-0.222521,-0.951057,0.309017
1,2013-01-08,3_1047679,3,1047679,0.781831,0.62349,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,0.974928,-0.222521,-0.951057,0.309017,0.433884,-0.900969,-0.951057,0.309017
2,2013-01-09,3_1047679,3,1047679,0.974928,-0.222521,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,0.433884,-0.900969,-0.951057,0.309017,-0.433884,-0.900969,-0.951057,0.309017
3,2013-01-10,3_1047679,3,1047679,0.433884,-0.900969,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,-0.433884,-0.900969,-0.951057,0.309017,-0.974928,-0.222521,-0.951057,0.309017
4,2013-01-11,3_1047679,3,1047679,-0.433884,-0.900969,0.587785,-0.809017,0.5,0.866025,...,-0.951057,0.309017,-0.974928,-0.222521,-0.951057,0.309017,-0.781831,0.62349,-0.951057,0.309017


In [25]:
final_df.to_excel("../output/data/train_top_10_store_10_item_sales_cyclical_features_16_days.xlsx",
            index=False)

In [None]:
# # Define the keys used for merging
# merge_keys = ["start_date", "store_item", "store", "item"]

# # Identify missing rows from non_overlap_df3 that are not in non_overlap_df
# missing_from_df = non_overlap_df2.merge(
#     non_overlap_df3[merge_keys],
#     on=merge_keys,
#     how="left",
#     indicator=True
# ).query('_merge == "left_only"')

# # Optional: drop the merge indicator column
# missing_from_df = missing_from_df.drop(columns="_merge")

# # Show how many are missing
# print(f"Missing rows in non_overlap_df: {missing_from_df.shape[0]}")

Missing rows in non_overlap_df: 0


In [26]:
# --- Apply MinMax Scaling ---
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(final_df.drop(columns=["start_date","store_item", "store", "item"]))
scaled_df = pd.DataFrame(scaled_values, columns=final_df.columns[4:])
scaled_df.insert(0, "start_date", final_df["start_date"])
scaled_df.insert(1, "store_item", final_df["store_item"])
scaled_df.insert(2, "store", final_df["store"])
scaled_df.insert(3, "item", final_df["item"])
scaled_df.head()

Unnamed: 0,start_date,store_item,store,item,dayofweek_sin_1,dayofweek_cos_1,weekofmonth_sin_1,weekofmonth_cos_1,monthofyear_sin_1,monthofyear_cos_1,...,y_weekofmonth_sin_14,y_weekofmonth_cos_14,y_dayofweek_sin_15,y_dayofweek_cos_15,y_weekofmonth_sin_15,y_weekofmonth_cos_15,y_dayofweek_sin_16,y_dayofweek_cos_16,y_weekofmonth_sin_16,y_weekofmonth_cos_16
0,2013-01-07,3_1047679,3.0,1047679.0,0.5,1.0,1.0,0.618034,0.75,0.933013,...,0.190983,0.0,0.900969,0.8019377,0.0,0.618034,1.0,0.3568959,0.0,0.618034
1,2013-01-08,3_1047679,3.0,1047679.0,0.900969,0.8019377,0.809017,5.5511150000000004e-17,0.75,0.933013,...,0.0,0.618034,1.0,0.3568959,0.0,0.618034,0.722521,5.5511150000000004e-17,0.0,0.618034
2,2013-01-09,3_1047679,3.0,1047679.0,1.0,0.3568959,0.809017,5.5511150000000004e-17,0.75,0.933013,...,0.0,0.618034,0.722521,5.5511150000000004e-17,0.0,0.618034,0.277479,0.0,0.0,0.618034
3,2013-01-10,3_1047679,3.0,1047679.0,0.722521,5.5511150000000004e-17,0.809017,5.5511150000000004e-17,0.75,0.933013,...,0.0,0.618034,0.277479,0.0,0.0,0.618034,0.0,0.3568959,0.0,0.618034
4,2013-01-11,3_1047679,3.0,1047679.0,0.277479,0.0,0.809017,5.5511150000000004e-17,0.75,0.933013,...,0.0,0.618034,0.0,0.3568959,0.0,0.618034,0.099031,0.8019377,0.0,0.618034


In [27]:
# Save to dataframe
import pickle
with open('../output/data/minmax_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
    
scaled_df.to_excel("../output/data/scaled_train_top_10_store_10_item_sales_cyclical_features_16_days.xlsx", index=False)

In [28]:
print("Unique (store, item) pairs:", scaled_df["store_item"].nunique())


Unique (store, item) pairs: 100
