In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


In [32]:
# Explicit narrow dtypes for the columns whose types are known up front.
dtype_dict = dict(
    id=np.uint32,
    store_nbr=np.uint8,
    item_nbr=np.uint32,
    unit_sales=np.float32,
)
df = pd.read_csv(
    "../output/data/train_top_10_store_10_item.csv",
    dtype=dtype_dict,
)

  df = pd.read_csv("../output/data/train_top_10_store_10_item.csv", dtype=dtype_dict)


In [33]:

# Parse the date column, then make unit_sales non-missing and non-negative
# (missing values become 0, negative values are raised to 0).
df["date"] = pd.to_datetime(df["date"])
df["unit_sales"] = df["unit_sales"].fillna(0).clip(lower=0)

In [34]:
# Inspect the distinct dates present in the loaded frame.
df["date"].unique()

<DatetimeArray>
['2013-01-02 00:00:00', '2013-01-03 00:00:00', '2013-01-04 00:00:00',
 '2013-01-05 00:00:00', '2013-01-06 00:00:00', '2013-01-07 00:00:00',
 '2013-01-08 00:00:00', '2013-01-09 00:00:00', '2013-01-10 00:00:00',
 '2013-01-11 00:00:00',
 ...
 '2017-08-06 00:00:00', '2017-08-07 00:00:00', '2017-08-08 00:00:00',
 '2017-08-09 00:00:00', '2017-08-10 00:00:00', '2017-08-11 00:00:00',
 '2017-08-12 00:00:00', '2017-08-13 00:00:00', '2017-08-14 00:00:00',
 '2017-08-15 00:00:00']
Length: 1679, dtype: datetime64[ns]

In [35]:

# df["date"] is expected to be datetime64[ns] at this point.
min_date = df["date"].min()

# weekday() is 0 for Monday, so stepping back by that many days lands on the
# Monday of the week that contains min_date.
days_since_monday = min_date.weekday()
week_start = min_date - pd.Timedelta(days=days_since_monday)
week_start

Timestamp('2012-12-31 00:00:00')

In [36]:

# First Monday on or after min_date: the week's own Monday when min_date is a
# Monday, otherwise the Monday of the following week.
first_monday = (
    week_start
    if min_date.weekday() == 0
    else week_start + pd.Timedelta(days=7)
)

print("min_date:", min_date)
print("first day of the first full week:", first_monday)


min_date: 2013-01-02 00:00:00
first day of the first full week: 2013-01-07 00:00:00


In [37]:
# Drop the leading partial week so the data starts on first_monday.
df = df.loc[df["date"] >= first_monday]


In [38]:
# Summary statistics of the trimmed frame (sanity check after filtering).
df.describe()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales
count,167304,167304.0,167304.0,167304.0
mean,2015-04-27 04:57:39.015923200,39.099155,496479.9,77.878761
min,2013-01-07 00:00:00,3.0,114790.0,0.0
25%,2014-03-03 00:00:00,44.0,314384.0,29.42325
50%,2015-04-28 00:00:00,47.0,502331.0,53.0
75%,2016-06-21 00:00:00,49.0,582864.0,91.321499
max,2017-08-15 00:00:00,51.0,1047679.0,4635.0
std,,16.962276,237477.8,88.393265


In [39]:
# Persist the cleaned, trimmed frame without the index.
# NOTE(review): this is the same path the notebook loads from, so the source
# file is overwritten in place — confirm that is intended.
df.to_csv(
    "../output/data/train_top_10_store_10_item.csv",
    index=False
)

In [7]:
def _daily_pivot(window_df, keys, how, day_index):
    """Aggregate `unit_sales` by `keys` + date and pivot the dates into columns.

    Days of `day_index` that have no rows for a key are filled with 0, so the
    result always has exactly one column per window day, in window order.
    """
    wide = (
        window_df.groupby(list(keys) + ["date"])["unit_sales"]
        .agg(how)
        .unstack(fill_value=0)
    )
    return wide.reindex(columns=day_index, fill_value=0)


def _window_feature_frame(window_df, day_index):
    """Build the feature rows (one per observed store/item pair) for one window."""
    sales = _daily_pivot(window_df, ["store_nbr", "item_nbr"], "sum", day_index)
    store_med = _daily_pivot(window_df, ["store_nbr"], "median", day_index)
    item_med = _daily_pivot(window_df, ["item_nbr"], "median", day_index)

    stores = sales.index.get_level_values(0)
    items = sales.index.get_level_values(1)
    window_str = day_index[0].strftime("%Y-%m-%d")

    columns = {
        "id": [f"{store}_{item}_{window_str}" for store, item in sales.index]
    }
    # Broadcast the per-store / per-item medians onto the (store, item) rows.
    blocks = (
        ("sales_day", sales.to_numpy()),
        ("store_med_day", store_med.reindex(stores, fill_value=0).to_numpy()),
        ("item_med_day", item_med.reindex(items, fill_value=0).to_numpy()),
    )
    for prefix, values in blocks:
        for day_number in range(1, len(day_index) + 1):
            columns[f"{prefix}_{day_number}"] = values[:, day_number - 1]
    return pd.DataFrame(columns)


def generate_nonoverlap_window_features(
    df: pd.DataFrame,
    window_size: int = 5,
    calendar_days: bool = False,
) -> pd.DataFrame:
    """
    Split the dates in `df` into consecutive, non-overlapping windows of
    `window_size` days and build one feature row per (store_nbr, item_nbr)
    pair observed in each window.

    Parameters
    ----------
    df : DataFrame with columns `date`, `store_nbr`, `item_nbr`, `unit_sales`.
        `date` is parsed with `pd.to_datetime`; rows whose date is missing are
        ignored. The input frame is not modified.
    window_size : number of days per window (must be >= 1). A trailing group
        of fewer than `window_size` days is dropped.
    calendar_days : if False (default), windows are cut from the sorted
        *observed* dates, so a date that is absent from `df` is skipped and
        later windows shift accordingly. If True, windows are cut from every
        calendar day between the first and last observed date, which keeps
        e.g. 7-day windows aligned to the same weekday; days without data
        contribute 0. Calendar mode steps in whole days from the first
        observed timestamp, so it expects dates without a time-of-day part.

    Returns
    -------
    DataFrame with columns, in this order:
      - id = '{store}_{item}_{window_start:%Y-%m-%d}'
      - sales_day_1 ... sales_day_{window_size}: summed unit_sales of the pair
      - store_med_day_1 ... store_med_day_{window_size}: median unit_sales
        over the store's rows on that day
      - item_med_day_1 ... item_med_day_{window_size}: median unit_sales
        over the item's rows on that day
    Missing (key, day) combinations are 0. When no complete window exists the
    result is an empty frame that still carries these columns.

    Raises
    ------
    ValueError if `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    # 1) Ensure datetime (on a copy, so the caller's frame is left alone)
    df = df.copy()
    df["date"] = pd.to_datetime(df["date"])

    feature_columns = ["id"] + [
        f"{prefix}_{day_number}"
        for prefix in ("sales_day", "store_med_day", "item_med_day")
        for day_number in range(1, window_size + 1)
    ]

    # 2) Build non-overlapping windows; NaT must not count as a window day
    observed_days = pd.DatetimeIndex(df["date"].dropna().unique()).sort_values()
    if len(observed_days) == 0:
        return pd.DataFrame(columns=feature_columns)
    if calendar_days:
        all_days = pd.date_range(observed_days[0], observed_days[-1], freq="D")
    else:
        all_days = observed_days

    # 3) One vectorized feature frame per complete window
    frames = []
    for start in range(0, len(all_days) - window_size + 1, window_size):
        day_index = all_days[start : start + window_size]
        window_df = df[df["date"].isin(day_index)]
        if window_df.empty:
            # only possible in calendar mode, when a whole window has no data
            continue
        frames.append(_window_feature_frame(window_df, day_index))

    if not frames:
        return pd.DataFrame(columns=feature_columns)
    return pd.concat(frames, ignore_index=True)


In [None]:
# Build the per-window feature table using 7-day windows.
non_overlap_df = generate_nonoverlap_window_features(df, window_size=7)

In [15]:
# --- Non-overlapping 5-day windows ---
train_df_reset = train_df.copy()
unique_dates = train_df_reset["date"].sort_values().unique()

# Keep only complete 5-day chunks; a trailing partial chunk is discarded.
full_chunk_count = len(unique_dates) // 5
chunked_windows = [
    unique_dates[5 * k : 5 * (k + 1)]
    for k in range(full_chunk_count)
]

In [16]:
# Display the list of 5-day date windows built in the previous cell.
chunked_windows

[<DatetimeArray>
 ['2013-01-01 00:00:00', '2013-01-02 00:00:00', '2013-01-03 00:00:00',
  '2013-01-04 00:00:00', '2013-01-05 00:00:00']
 Length: 5, dtype: datetime64[ns],
 <DatetimeArray>
 ['2013-01-06 00:00:00', '2013-01-07 00:00:00', '2013-01-08 00:00:00',
  '2013-01-09 00:00:00', '2013-01-10 00:00:00']
 Length: 5, dtype: datetime64[ns],
 <DatetimeArray>
 ['2013-01-11 00:00:00', '2013-01-12 00:00:00', '2013-01-13 00:00:00',
  '2013-01-14 00:00:00', '2013-01-15 00:00:00']
 Length: 5, dtype: datetime64[ns],
 <DatetimeArray>
 ['2013-01-16 00:00:00', '2013-01-17 00:00:00', '2013-01-18 00:00:00',
  '2013-01-19 00:00:00', '2013-01-20 00:00:00']
 Length: 5, dtype: datetime64[ns],
 <DatetimeArray>
 ['2013-01-21 00:00:00', '2013-01-22 00:00:00', '2013-01-23 00:00:00',
  '2013-01-24 00:00:00', '2013-01-25 00:00:00']
 Length: 5, dtype: datetime64[ns],
 <DatetimeArray>
 ['2013-01-26 00:00:00', '2013-01-27 00:00:00', '2013-01-28 00:00:00',
  '2013-01-29 00:00:00', '2013-01-30 00:00:00']
 Length: 

In [None]:

records = []

# Generate one feature row per (store, item) pair for every window
for window_dates in chunked_windows:
    window_start = pd.to_datetime(window_dates[0])
    window_str = window_start.strftime('%Y-%m-%d')

    window_df = train_df_reset[train_df_reset["date"].isin(window_dates)]

    # Per-day aggregates pivoted so that each date of the window is a column
    store_medians = window_df.groupby(["store_nbr", "date"])["unit_sales"].median().unstack().fillna(0)
    item_medians = window_df.groupby(["item_nbr", "date"])["unit_sales"].median().unstack().fillna(0)
    sales = window_df.groupby(["store_nbr", "item_nbr", "date"])["unit_sales"].sum().unstack().fillna(0)

    # Zero-filled fallback for a key without medians. It has to be a Series
    # (a plain list has no .get) and is sized by the window rather than a
    # hard-coded 5.
    zero_days = pd.Series(0, index=window_dates)

    for (store, item), sales_values in sales.iterrows():
        row = {
            "id": f"{store}_{item}_{window_str}"
        }
        store_meds = store_medians.loc[store] if store in store_medians.index else zero_days
        item_meds = item_medians.loc[item] if item in item_medians.index else zero_days

        # Three separate passes keep the column order: sales, store, item
        for i, d in enumerate(window_dates, start=1):
            row[f"sales_day_{i}"] = sales_values.get(d, 0)
        for i, d in enumerate(window_dates, start=1):
            row[f"store_med_day_{i}"] = store_meds.get(d, 0)
        for i, d in enumerate(window_dates, start=1):
            row[f"item_med_day_{i}"] = item_meds.get(d, 0)
        records.append(row)

# --- Assemble final DataFrame ---
non_overlap_df = pd.DataFrame.from_records(records)

In [22]:
# Display the assembled per-window feature table.
non_overlap_df

Unnamed: 0,id,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,store_med_day_1,store_med_day_2,store_med_day_3,store_med_day_4,store_med_day_5,item_med_day_1,item_med_day_2,item_med_day_3,item_med_day_4,item_med_day_5
0,1_103520_2013-01-01,0.0,0.0,0.0,2.0,3.0,0.0,4.5755,4.0,4.0,4.0,0.0,3.0,3.0,1.5,4.0
1,1_103665_2013-01-01,0.0,2.0,3.0,2.0,4.0,0.0,4.5755,4.0,4.0,4.0,7.0,6.0,3.5,5.5,7.0
2,1_105574_2013-01-01,0.0,8.0,4.0,8.0,4.0,0.0,4.5755,4.0,4.0,4.0,1.0,16.5,12.5,10.5,12.0
3,1_105575_2013-01-01,0.0,15.0,6.0,8.0,7.0,0.0,4.5755,4.0,4.0,4.0,2.0,23.5,12.0,13.5,17.5
4,1_105577_2013-01-01,0.0,2.0,2.0,3.0,2.0,0.0,4.5755,4.0,4.0,4.0,0.0,6.0,4.0,6.0,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921412,54_1146801_2013-03-27,6.0,12.0,15.0,19.0,3.0,3.0,4.0000,3.0,3.0,4.0,7.5,5.0,7.5,13.0,11.0
921413,54_1146802_2013-03-27,0.0,9.0,2.0,16.0,2.0,3.0,4.0000,3.0,3.0,4.0,6.0,6.0,6.0,8.0,10.0
921414,54_1147731_2013-03-27,1.0,0.0,1.0,0.0,0.0,3.0,4.0000,3.0,3.0,4.0,1.0,1.0,1.0,1.0,1.0
921415,54_1148972_2013-03-27,3.0,2.0,2.0,3.0,2.0,3.0,4.0000,3.0,3.0,4.0,3.0,3.0,2.0,4.0,4.0


In [27]:
# Derive the "<store>_<item>" key: the first two "_"-separated parts of each id.
non_overlap_df["store_item"] = non_overlap_df["id"].map(
    lambda full_id: "_".join(full_id.split("_", 2)[:2])
)

# Number of distinct store-item pairs
unique_pairs = non_overlap_df["store_item"].nunique()

print("Unique (store, item) pairs:", unique_pairs)


Unique (store, item) pairs: 56932


In [25]:
# --- Apply MinMax Scaling ---
# Scale the numeric feature columns only. Dropping just "id" would pass any
# other text column (e.g. "store_item") to the scaler and fail, and slicing
# columns[1:] would silently mislabel the result if "id" were not first.
feature_cols = non_overlap_df.select_dtypes(include="number").columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(non_overlap_df[feature_cols])
scaled_df = pd.DataFrame(
    scaled_values,
    columns=feature_cols,
    index=non_overlap_df.index,  # keep rows aligned with the ids inserted below
)
scaled_df.insert(0, "id", non_overlap_df["id"])

# --- Save to CSV (optional) ---
scaled_df.to_csv("./data/non_overlapping_5day_features_90days.csv", index=False)


In [26]:
# Display the min-max scaled feature table.
scaled_df

Unnamed: 0,id,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,store_med_day_1,store_med_day_2,store_med_day_3,store_med_day_4,store_med_day_5,item_med_day_1,item_med_day_2,item_med_day_3,item_med_day_4,item_med_day_5
0,1_103520_2013-01-01,0.108759,0.012300,0.048959,0.014780,0.027679,0.000000,0.183964,0.153846,0.166667,0.153846,0.000000,0.006897,0.016212,0.001406,0.004040
1,1_103665_2013-01-01,0.108759,0.012792,0.049669,0.014780,0.027841,0.000000,0.183964,0.153846,0.166667,0.153846,0.010493,0.013793,0.016665,0.005155,0.006244
2,1_105574_2013-01-01,0.108759,0.014268,0.049905,0.015625,0.027841,0.000000,0.183964,0.153846,0.166667,0.153846,0.001499,0.037931,0.024814,0.009841,0.009917
3,1_105575_2013-01-01,0.108759,0.015990,0.050378,0.015625,0.028326,0.000000,0.183964,0.153846,0.166667,0.153846,0.002998,0.054023,0.024362,0.012652,0.013957
4,1_105577_2013-01-01,0.108759,0.012792,0.049432,0.014921,0.027517,0.000000,0.183964,0.153846,0.166667,0.153846,0.000000,0.013793,0.017118,0.005623,0.003673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921412,54_1146801_2013-03-27,0.113139,0.015252,0.052507,0.017173,0.027679,0.176471,0.142857,0.076923,0.083333,0.153846,0.011243,0.011494,0.020287,0.012184,0.009182
921413,54_1146802_2013-03-27,0.108759,0.014514,0.049432,0.016751,0.027517,0.176471,0.142857,0.076923,0.083333,0.153846,0.008994,0.013793,0.018929,0.007498,0.008448
921414,54_1147731_2013-03-27,0.109489,0.012300,0.049196,0.014499,0.027193,0.176471,0.142857,0.076923,0.083333,0.153846,0.001499,0.002299,0.014402,0.000937,0.001836
921415,54_1148972_2013-03-27,0.110949,0.012792,0.049432,0.014921,0.027517,0.176471,0.142857,0.076923,0.083333,0.153846,0.004497,0.006897,0.015307,0.003749,0.004040


In [103]:
# 30-day unit_sales totals per store. The Grouper is given no key or level;
# presumably train_df is indexed by date here — TODO confirm.
# NOTE(review): in the output below the last bin is far smaller than the
# others, which looks like a partial period; confirm before scaling on it.
store_sales = (
    train_df
    .groupby(["store_nbr", pd.Grouper(freq="30D")])["unit_sales"]
    .sum()
    .reset_index()
)

# Wide layout: one row per store, one column per 30-day bin start.
store_pivot = store_sales.pivot_table(
    values="unit_sales",
    index="store_nbr",
    columns="date",
    fill_value=0,
)
store_pivot

date,2013-01-01,2013-01-31,2013-03-02,2013-04-01
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,152151.625,140467.46875,145725.671875,6144.880859
2,213163.421875,209133.03125,220587.65625,7672.456055
3,484769.125,459022.78125,514087.25,19298.017578
4,204715.90625,196006.375,214795.390625,8200.248047
5,200969.859375,202873.3125,203944.921875,8434.214844
6,270377.6875,256965.6875,287892.6875,11062.974609
7,234074.28125,235999.5,256348.90625,11345.307617
8,307475.375,293309.125,319529.34375,11246.198242
9,267736.4375,289296.65625,324653.34375,13351.862305
10,112651.617188,118470.734375,127698.03125,5434.756836


In [104]:
# 30-day unit_sales totals per item. The Grouper is given no key or level;
# presumably train_df is indexed by date here — TODO confirm.
item_sales = (
    train_df
    .groupby(["item_nbr", pd.Grouper(freq="30D")])["unit_sales"]
    .sum()
    .reset_index()
)

# Wide layout: one row per item, one column per 30-day bin start.
item_pivot = item_sales.pivot_table(
    values="unit_sales",
    index="item_nbr",
    columns="date",
    fill_value=0,
)
item_pivot

date,2013-01-01,2013-01-31,2013-03-02,2013-04-01
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
96995,239.0,318.0,510.0,24.0
103501,3318.0,3280.0,3325.0,146.0
103520,2578.0,3098.0,3747.0,91.0
103665,4060.0,4359.0,3938.0,103.0
105574,8133.0,7914.0,8572.0,344.0
...,...,...,...,...
1147495,0.0,0.0,886.0,50.0
1147731,0.0,0.0,441.0,34.0
1148972,0.0,0.0,2306.0,168.0
1149069,0.0,0.0,4037.0,64.0


In [105]:
# MinMaxScaler comes from the notebook's top import cell, so it is not
# re-imported here.

# Min-max scale every 30-day column independently, keeping the pivot labels.
scaler_store = MinMaxScaler()
store_scaled = pd.DataFrame(
    scaler_store.fit_transform(store_pivot),
    index=store_pivot.index,
    columns=store_pivot.columns,
)

scaler_item = MinMaxScaler()
item_scaled = pd.DataFrame(
    scaler_item.fit_transform(item_pivot),
    index=item_pivot.index,
    columns=item_pivot.columns,
)

# Only the final expression of a cell is rendered, so the store table has to
# be displayed explicitly to appear next to the item table.
display(store_scaled)
item_scaled

date,2013-01-01,2013-01-31,2013-03-02,2013-04-01
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
96995,0.002274,0.003121,0.004597,0.005721
103501,0.031576,0.032192,0.029973,0.034803
103520,0.024533,0.030406,0.033777,0.021692
103665,0.038637,0.042782,0.035498,0.024553
105574,0.077397,0.077674,0.077270,0.082002
...,...,...,...,...
1147495,0.000000,0.000000,0.007987,0.011919
1147731,0.000000,0.000000,0.003975,0.008105
1148972,0.000000,0.000000,0.020787,0.040048
1149069,0.000000,0.000000,0.036391,0.015256


In [106]:
# Column-wise medians of the scaled pivots: one value per 30-day bin.
store_centroid, item_centroid = (
    scaled.median(axis=0) for scaled in (store_scaled, item_scaled)
)


In [107]:
# Display the per-bin median profile computed for stores.
store_centroid

date
2013-01-01    0.161456
2013-01-31    0.187900
2013-03-02    0.198503
2013-04-01    0.222326
dtype: float32

In [68]:
# 30-day totals of unit_sales_scaled per store, binned on the "date" index
# level and flattened back into ordinary columns.
grouped_days = (
    train_df
    .groupby(["store_nbr", pd.Grouper(freq="30D", level="date")])["unit_sales_scaled"]
    .sum()
    .reset_index()
)
grouped_days

Unnamed: 0,store_nbr,date,unit_sales_scaled
0,1,2013-01-01,800.374146
1,1,2013-01-31,812.767578
2,1,2013-03-02,833.714600
3,1,2013-04-01,30.087526
4,2,2013-01-01,909.007263
...,...,...,...
179,51,2013-04-01,34.557972
180,54,2013-01-01,492.750275
181,54,2013-01-31,533.633057
182,54,2013-03-02,561.001038


In [69]:
# List the 30-day bin start dates produced by the grouping.
grouped_days.date.unique()

<DatetimeArray>
['2013-01-01 00:00:00', '2013-01-31 00:00:00', '2013-03-02 00:00:00',
 '2013-04-01 00:00:00']
Length: 4, dtype: datetime64[ns]

In [85]:
# 30-day totals of unit_sales_scaled per store, as a long table ...
store_agg = (
    train_df
    .groupby(["store_nbr", pd.Grouper(freq="30D", level="date")])["unit_sales_scaled"]
    .sum()
    .reset_index()
)
# ... and as a wide table: one row per store, one column per bin start.
store_pivot = store_agg.pivot_table(
    values="unit_sales_scaled",
    index="store_nbr",
    columns="date",
    fill_value=0,
)

In [86]:
# List the 30-day bin start dates present in store_agg.
store_agg["date"].unique()

<DatetimeArray>
['2013-01-01 00:00:00', '2013-01-31 00:00:00', '2013-03-02 00:00:00',
 '2013-04-01 00:00:00']
Length: 4, dtype: datetime64[ns]

In [87]:
# Preview the first five stores of the wide 30-day table.
store_pivot.head(5)

date,2013-01-01,2013-01-31,2013-03-02,2013-04-01
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,800.374146,812.767578,833.7146,30.087526
2,909.007263,945.437012,976.548401,33.199982
3,1059.378174,1105.718384,1136.877808,38.948254
4,877.710022,917.396301,953.201111,33.732693
5,870.068237,918.521301,937.566467,32.415401


In [88]:
# 30-day totals of unit_sales_scaled per item, binned on the "date" index level.
item_agg = (
    train_df.groupby(["item_nbr", pd.Grouper(freq="30D", level="date")])["unit_sales_scaled"]
    .sum()
    .reset_index()
)

# Wide layout: one row per item, one column per 30-day bin start.
item_pivot = item_agg.pivot_table(
    index="item_nbr", columns="date", values="unit_sales_scaled", fill_value=0
)

# A bare expression in the middle of a cell is evaluated and thrown away, so
# the bin dates are displayed explicitly; the preview stays the cell result.
display(item_agg["date"].unique())
item_pivot.head(5)


date,2013-01-01,2013-01-31,2013-03-02,2013-04-01
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
96995,4.685489,6.390816,8.600028,0.347947
103501,18.007076,18.288984,18.611128,0.652053
103520,21.23571,25.328386,26.595867,0.902886
103665,23.135683,24.584351,24.612099,0.847114
105574,20.455605,21.459074,21.378052,0.708241


In [91]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardize every 30-day column.
# NOTE: fit_transform returns NumPy arrays, so from here on store_scaled and
# item_scaled are arrays, not the DataFrames of the same name built in the
# MinMax cell.
store_scaled = StandardScaler().fit_transform(store_pivot)
item_scaled = StandardScaler().fit_transform(item_pivot)

# KMeans clustering on the standardized store matrix. Previously the model was
# fitted on the raw store_pivot, which left store_scaled unused.
# n_init is set explicitly so the result does not depend on the installed
# scikit-learn version's default.
store_kmeans = KMeans(n_clusters=1, n_init=10, random_state=42)
store_labels = store_kmeans.fit_predict(store_scaled)

# TODO: item clustering (KMeans on item_scaled) and attaching the labels to the
# pivots as a "cluster" column are not enabled yet.


In [92]:
# Display the cluster label assigned to each store.
store_labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int32)