In [1]:
import numpy as np 
import pandas as pd
from sklearn.cluster import (
    SpectralBiclustering,
    SpectralClustering,
    SpectralCoclustering,
    HDBSCAN,
)

import sys
sys.path.insert(0, "..")   
from src.data_utils import mav, add_mav_column, normalize_data, mav_by_cluster, median_mean_transform
from src.cluster_util import compute_biclustering_scores, cluster_data, compute_biclustering_scores

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
%pwd

'/Users/yvesgreatti/github/kaggle_favorita_grocery_sales_forecasting/notebook'

In [4]:
df = pd.read_parquet("../output/data/train_2014_January_12_store_20_item_cluster.parquet")
cols = [
        "date",
        "store_item",
        "store",
        "item",
        "store_cluster",
        "item_cluster",
        "weight",
        "onpromotion",
        "unit_sales"
    ]
df = df[cols]
df['store_item'] = df['store_item'].astype(str)
df.sort_values(["date", "store_item"], inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,date,store_item,store,item,store_cluster,item_cluster,weight,onpromotion,unit_sales
0,2014-01-01,1_1047679,1,1047679,8,1,1,False,0.0
1,2014-01-01,1_1168718,1,1168718,8,0,1,False,0.0
2,2014-01-01,1_1463591,1,1463591,8,0,1,False,0.0
3,2014-01-01,1_1463992,1,1463992,8,4,1,False,0.0
4,2014-01-01,1_1464092,1,1464092,8,5,1,False,0.0


In [23]:
df2 = cluster_data(
    df,
    freq = "W",
    model_class=SpectralBiclustering,
    row_range = range(2, 5),
    col_range = range(2, 5),
    mav_df_fn = "../output/data/train_2014_January_12_store_20_item_cluster_mav.csv",
)

2025-08-19 14:20:39,432 - INFO - Number of items: 20
2025-08-19 14:20:39,433 - INFO - Number of stores: 12
2025-08-19 14:20:39,434 - INFO - Evaluating n_row=2, n_col=2
2025-08-19 14:20:39,660 - INFO - Evaluating n_row=2, n_col=3
2025-08-19 14:20:39,886 - INFO - Evaluating n_row=2, n_col=4
2025-08-19 14:20:40,120 - INFO - Evaluating n_row=3, n_col=2
2025-08-19 14:20:40,340 - INFO - Evaluating n_row=3, n_col=3
2025-08-19 14:20:40,580 - INFO - Evaluating n_row=3, n_col=4
2025-08-19 14:20:40,833 - INFO - Evaluating n_row=4, n_col=2
2025-08-19 14:20:41,073 - INFO - Evaluating n_row=4, n_col=3
2025-08-19 14:20:41,327 - INFO - Evaluating n_row=4, n_col=4
2025-08-19 14:20:41,607 - INFO - Saving mav_df to ../output/data/train_2014_January_12_store_20_item_cluster_mav.csv
2025-08-19 14:20:41,919 - INFO - Best clustering result: Model                                  SpectralBiclustering(n_clusters=(3, 4), random...
n_row                                                                            

In [24]:
df2.head()

Unnamed: 0,date,store_item,store,item,weight,onpromotion,unit_sales,store_cluster,item_cluster,cluster
0,2014-01-01,1_1047679,1,1047679,1,False,0.0,0,0,0_0
1,2014-01-01,1_1168718,1,1168718,1,False,0.0,0,2,0_2
2,2014-01-01,1_1463591,1,1463591,1,False,0.0,0,0,0_0
3,2014-01-01,1_1463992,1,1463992,1,False,0.0,0,2,0_2
4,2014-01-01,1_1464092,1,1464092,1,False,0.0,0,2,0_2


In [26]:
df2["store_cluster"].nunique()

3

In [27]:
df2["item_cluster"].nunique()

4

In [34]:
df2 = add_mav_column(df2, "store", "item", "unit_sales", is_log1p=False, include_zeros=True)

In [35]:
df2.to_csv("../output/data/train_2014_January_12_store_20_item_cluster.csv", index=False)

In [38]:
norm_data = median_mean_transform(df, freq="W", median_transform=False, mean_transform=True)

In [39]:
norm_data

item,401911,557286,567781,587069,671076,807493,850542,864508,864510,1047679,1168718,1463591,1463992,1464092,1473474,1503117,1503844,1639937,1695835,1695978
store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.331429,1.354286,1.074286,0.0,0.685714,11.679999,0.0,0.0,0.0,30.114285,0.691429,2.817143,83.919998,0.0,121.208153,0.0,156.522354,0.0,66.90007,0.0
3,1.314286,9.091429,4.605714,0.0,3.84,74.994286,0.0,0.0,0.0,499.079987,0.742857,6.531428,199.662857,6.84,400.646393,0.0,537.30011,0.0,223.884811,17.344542
26,1.617143,0.428571,1.8,0.0,1.891429,22.639999,0.0,0.0,0.0,5.177143,2.388572,0.531429,7.445714,9.457143,23.977423,0.0,14.550044,0.0,5.711448,2.038674
28,1.565714,1.428571,2.811429,0.0,1.474286,74.371429,0.0,0.0,0.0,7.462857,2.96,1.24,27.022858,5.851429,43.086201,0.0,0.0,0.0,20.538925,0.0
30,0.108571,0.0,0.0,0.0,1.822857,19.405714,0.0,0.0,0.0,5.697143,1.274286,0.0,5.325714,0.2,23.446419,0.0,0.0,0.0,0.0,0.0
31,3.12,0.714286,1.011429,0.0,1.554286,74.51429,0.0,0.0,0.0,7.44,1.28,0.0,55.708569,1.548571,22.213949,0.0,7.243728,0.0,0.0,0.0
32,0.291429,0.0,0.0,0.0,0.828571,9.742857,0.0,0.0,0.0,2.114286,2.085714,0.0,26.828571,0.262857,0.0,0.0,0.0,0.0,0.0,0.0
35,0.56,0.0,0.0,0.0,0.691429,38.005714,0.0,0.0,0.0,13.662857,1.251429,0.0,6.845715,0.914286,7.894524,0.0,0.0,0.0,0.0,0.0
40,1.982857,0.0,0.0,0.0,5.268571,125.897141,0.0,0.0,0.0,11.4,4.382857,0.0,61.0,19.554287,34.234875,0.0,0.0,0.0,0.0,0.0
44,4.611429,15.422857,6.754286,0.0,5.634286,86.422859,0.0,0.0,0.0,502.845703,3.794286,6.142857,153.617142,3.308571,575.543579,0.0,752.029846,0.028571,276.321564,24.047077


In [36]:
mav_by_cluster(df, norm_data)

Unnamed: 0,store_cluster,item_cluster,store_item_mav
0,0,0,0.090909
1,0,1,3.000000
2,0,2,0.000000
3,0,3,24.382999
4,0,4,4.000000
...,...,...,...
67,8,3,124.102997
68,8,4,80.000000
69,8,5,0.000000
70,8,6,11.000000


In [5]:
df.head()

Unnamed: 0,date,store_item,store,item,store_cluster,item_cluster,weight,onpromotion,unit_sales
0,2014-01-01,1_1047679,1,1047679,8,1,1,False,0.0
1,2014-01-01,1_1168718,1,1168718,8,0,1,False,0.0
2,2014-01-01,1_1463591,1,1463591,8,0,1,False,0.0
3,2014-01-01,1_1463992,1,1463992,8,4,1,False,0.0
4,2014-01-01,1_1464092,1,1464092,8,5,1,False,0.0


In [7]:
df["item"].nunique()

20

In [8]:
df = add_mav_column(df, "store", "item", "unit_sales", is_log1p=False, include_zeros=True)

In [9]:
df.head()

Unnamed: 0,date,store_item,store,item,store_cluster,item_cluster,weight,onpromotion,unit_sales,store_item_mav
0,2014-01-01,1_1047679,1,1047679,8,1,1,False,0.0,30.064516
1,2014-01-01,1_1168718,1,1168718,8,0,1,False,0.0,0.677419
2,2014-01-01,1_1463591,1,1463591,8,0,1,False,0.0,2.806452
3,2014-01-01,1_1463992,1,1463992,8,4,1,False,0.0,82.774193
4,2014-01-01,1_1464092,1,1464092,8,5,1,False,0.0,0.0


In [10]:
df2 =df.groupby([pd.Grouper(key="date", freq='W'), "store", "item"])[
                "unit_sales"\
            ].median() .reset_index()
df2 = df2.groupby(["store", "item"])["unit_sales"].median().unstack(fill_value=0)
df2

item,401911,557286,567781,587069,671076,807493,850542,864508,864510,1047679,1168718,1463591,1463992,1464092,1473474,1503117,1503844,1639937,1695835,1695978
store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,1.0,1.0,0.0,1.0,11.0,0.0,0.0,0.0,29.0,1.0,1.0,80.0,0.0,124.102997,0.0,157.649002,0.0,61.458,0.0
3,1.0,9.0,5.0,0.0,4.0,67.0,0.0,0.0,0.0,461.0,0.0,7.0,206.0,5.0,406.141998,0.0,534.202026,0.0,232.259995,15.907
26,2.0,0.0,1.0,0.0,2.0,17.0,0.0,0.0,0.0,2.0,2.0,0.0,4.0,5.0,21.117001,0.0,13.569,0.0,3.0762,1.578
28,1.0,1.0,3.0,0.0,1.0,66.0,0.0,0.0,0.0,5.0,3.0,0.0,26.0,5.0,38.901001,0.0,0.0,0.0,21.615,0.0
30,0.0,0.0,0.0,0.0,2.0,16.0,0.0,0.0,0.0,4.0,1.0,0.0,4.0,0.0,24.382999,0.0,0.0,0.0,0.0,0.0
31,3.0,1.0,1.0,0.0,1.0,61.0,0.0,0.0,0.0,3.0,1.0,0.0,45.0,2.0,24.413,0.0,6.723,0.0,0.0,0.0
32,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,1.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,0.0,0.0,0.0,37.0,0.0,0.0,0.0,12.0,1.0,0.0,7.0,0.0,7.364,0.0,0.0,0.0,0.0,0.0
40,1.0,0.0,0.0,0.0,5.0,134.0,0.0,0.0,0.0,8.0,3.0,0.0,67.0,10.0,34.141998,0.0,0.0,0.0,0.0,0.0
44,4.0,16.0,5.0,0.0,6.0,72.0,0.0,0.0,0.0,498.0,3.0,6.0,131.0,4.0,500.980988,0.0,666.377991,0.0,244.968994,22.504999


In [None]:
 matrix = matrix.loc[store_clusters.index, item_clusters.index]
    store_labels = store_clusters.loc[matrix.index]
    item_labels = item_clusters.loc[matrix.columns]

    # Long form
    long_df = matrix.stack().rename("value").reset_index()
    long_df.columns = ["store", "item", "value"]
    long_df["store_cluster"] = store_labels.loc[long_df["store"]].values
    long_df["item_cluster"] = item_labels.loc[long_df["item"]].values


In [None]:
df = df.groupby(["store", "item"])["unit_sales"].mean().unstack(fill_value=0)


In [9]:
norm_df = normalize_data(df, freq="W")

In [23]:
norm_df

item,401911,557286,567781,587069,671076,807493,850542,864508,864510,1047679,1168718,1463591,1463992,1464092,1473474,1503117,1503844,1639937,1695835,1695978
store,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,-0.758996,-0.380191,-0.380191,-0.758996,-0.380191,0.599005,-0.758996,-0.758996,-0.758996,1.099758,-0.380191,-0.380191,1.64257,-0.758996,1.880127,-0.758996,2.009951,-0.758996,1.500504,-0.758996
3,-0.702612,-0.010237,-0.229992,-1.000802,-0.308426,0.814418,-1.000802,-1.000802,-1.000802,1.638699,-1.000802,-0.106232,1.293318,-0.229992,1.584321,-1.000802,1.701973,-1.000802,1.344698,0.215678
26,0.123059,-0.973554,-0.281669,-0.973554,0.123059,1.911556,-0.973554,-0.973554,-0.973554,0.123059,0.123059,-0.973554,0.632954,0.814944,2.117156,-0.973554,1.700465,-0.973554,0.429053,-0.028264
28,-0.323301,-0.323301,0.181523,-0.828125,-0.323301,2.234182,-0.828125,-0.828125,-0.828125,0.476826,0.181523,-0.828125,1.572256,0.476826,1.856706,-0.828125,-0.828125,-0.828125,1.443183,-0.828125
30,-0.565049,-0.565049,-0.565049,-0.565049,0.555683,2.325209,-0.565049,-0.565049,-0.565049,1.076794,0.142054,-0.565049,1.076794,-0.565049,2.734146,-0.565049,-0.565049,-0.565049,-0.565049,-0.565049
31,0.30292,-0.232245,-0.232245,-0.76741,-0.232245,2.419067,-0.76741,-0.76741,-0.76741,0.30292,-0.232245,-0.76741,2.188607,0.080807,1.73047,-0.76741,0.810878,-0.76741,-0.76741,-0.76741
32,-0.365647,-0.365647,-0.365647,-0.365647,-0.365647,2.155929,-0.365647,-0.365647,-0.365647,-0.365647,0.474878,-0.365647,3.585192,-0.365647,-0.365647,-0.365647,-0.365647,-0.365647,-0.365647,-0.365647
35,-0.517866,-0.517866,-0.517866,-0.517866,-0.517866,2.876623,-0.517866,-0.517866,-0.517866,1.87567,0.128959,-0.517866,1.422608,-0.517866,1.46413,-0.517866,-0.517866,-0.517866,-0.517866,-0.517866
40,-0.234332,-0.680095,-0.680095,-0.680095,0.472186,2.47449,-0.680095,-0.680095,-0.680095,0.732941,0.211432,-0.680095,2.033472,0.861993,1.608955,-0.680095,-0.680095,-0.680095,-0.680095,-0.680095
44,-0.389039,0.148819,-0.308908,-1.0964,-0.241157,0.789291,-1.0964,-1.0964,-1.0964,1.634089,-0.487113,-0.241157,1.04963,-0.389039,1.636707,-1.0964,1.761876,-1.0964,1.323181,0.29122


In [None]:
norm_df = add_mav_column(norm_df, "store_cluster", "item_cluster", "unit_sales", is_log1p=False, include_zeros=True)

In [7]:

def top_n_by_m(df, n_col="unit_sales", group_column="store_nbr", top_n=10):
    """
    Returns the top N stores by total unit sales.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        n_col (str): Column representing sales values.
        group_column (str): Column to group by (e.g., store number).
        top_n (int): Number of top results to return.

    Returns:
        pd.DataFrame: DataFrame of top N stores by total sales.
    """
    return (
        df.groupby(group_column)
        .agg({n_col: "sum"})
        .sort_values(n_col, ascending=False)
        .head(top_n)
    )


In [9]:
# Select top-M items globally
df_top_items = top_n_by_m(
    df, n_col="unit_sales", group_column="item", top_n=9000
)
valid_items = df_top_items.reset_index()["item"].tolist()
print(len(valid_items))
# Select top-N stores globally
df_top_stores = top_n_by_m(
    df, n_col="unit_sales", group_column="store", top_n=51
)
valid_stores = df_top_stores.reset_index()["store"].tolist()
print(len(valid_stores))
unique_dates = df["date"].dropna().unique()
grid = pd.MultiIndex.from_product(
    [valid_stores, valid_items, sorted(unique_dates)],
    names=["store", "item", "date"],
    ).to_frame(index=False)

4036
51


In [4]:
df.to_csv("../data/20250707_train.csv", index=False)

In [4]:
train_df["onpromotion"].unique()

array([nan, False, True], dtype=object)

In [5]:
def top_n_by_m(df, n_col="unit_sales", group_column="store_nbr", top_n=10):
    """
    Returns the top N stores by total unit sales.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        n_col (str): Column representing sales values.
        group_column (str): Column to group by (e.g., store number).
        top_n (int): Number of top results to return.

    Returns:
        pd.DataFrame: DataFrame of top N stores by total sales.
    """
    return df.groupby(group_column).agg({n_col: "sum"}).sort_values(n_col, ascending=False).head(top_n)

def top_values_with_percentage(df, group_column, value_column, n=5):
    """
    Returns the top N values with percentages for each group in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.
        group_column (str): The column to group by.
        value_column (str): The column to calculate percentages from.
        n (int): The number of top values to return.

    Returns:
        pd.DataFrame: A DataFrame containing the top N values and their percentages for each group.
    """
    grouped = df.groupby(group_column)[value_column].value_counts(normalize=True) * 100
    grouped = grouped.rename('percentage').reset_index()
    top_n = grouped.groupby(group_column).apply(lambda x: x.nlargest(n, 'percentage')).reset_index(drop=True)
    return top_n


def value_counts_with_percentage(df, column_name, top_n = 10):
    """
    Computes value counts and percentage distribution of a column.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        column_name (str): Name of the column to analyze.

    Returns:
        pd.DataFrame: DataFrame with counts and percentages.
    """
    counts = df[column_name].value_counts()
    percentages = df[column_name].value_counts(normalize=True) * 100
    df = pd.DataFrame({column_name + '_count': counts, column_name + '_percentage': percentages})
    return df.sort_values(column_name, ascending=False).head(top_n)

In [6]:
df = top_n_by_m(train_df, n_col="unit_sales", group_column="store_nbr", top_n=100)

In [7]:
df

Unnamed: 0_level_0,unit_sales
store_nbr,Unnamed: 1_level_1
44,62087544.0
45,54498012.0
47,50948308.0
3,50481900.0
49,43420088.0
46,41896052.0
48,35933132.0
51,32911484.0
8,30491336.0
50,28653018.0


In [8]:
# df.reset_index().to_excel(
#     "../output/data/20250627_top_100_store_sale.xlsx",
#     index=False
# )


In [9]:
# assuming df.reset_index()["store_nbr"] gives you the list of valid stores:
valid_stores = df.reset_index()["store_nbr"].tolist()

# Filter train_df to only rows whose store_nbr is in that list:
train_df_filtered = train_df[ train_df["store_nbr"].isin(valid_stores) ]


In [10]:
train_df_filtered = train_df_filtered.reset_index()
train_df_filtered.drop(["index"], axis=1, inplace=True)

In [11]:
train_df_filtered.head(4)

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,


In [19]:
train_df_filtered["store_nbr"].nunique()

54

In [20]:
#top_values_with_percentage(train_df_filtered, "store_nbr", "unit_sales")

In [14]:
def count_percent(series, n=3):
    counts = series.value_counts().head(n)
    percentages = counts / series.count() * 100
    result = pd.DataFrame({'Count': counts, 'Percentage': percentages})
    return result

In [15]:
count_percent(train_df["item_nbr"], n=10)

Unnamed: 0_level_0,Count,Percentage
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1
502331,83475,0.066516
314384,83450,0.066496
364606,83308,0.066382
265559,83047,0.066174
559870,82513,0.065749
1036689,82134,0.065447
273528,82108,0.065426
564533,82086,0.065409
261052,81774,0.06516
414353,81755,0.065145


In [16]:
len(train_df_filtered)

125497040

In [17]:
# assuming df.reset_index()["store_nbr"] gives you the list of valid stores:
valid_item = count_percent(train_df_filtered["item_nbr"], n=500).reset_index()["item_nbr"].tolist()

# Filter train_df to only rows whose store_nbr is in that list:
train_df_filtered = train_df_filtered[ train_df_filtered["item_nbr"].isin(valid_item) ]
train_df_filtered.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
5,5,2013-01-01,25,108786,3.0,
6,6,2013-01-01,25,108797,1.0,
7,7,2013-01-01,25,108952,1.0,
8,8,2013-01-01,25,111397,13.0,
9,9,2013-01-01,25,114790,3.0,


In [29]:
len(train_df_filtered)

34824868

In [19]:
print(train_df_filtered["store_nbr"].nunique())
print(train_df_filtered["item_nbr"].nunique())

54
500


In [20]:
 # Save to a new CSV
 
train_df_filtered.to_csv(
    "../output/data/20250630_train_top_store_500_item.csv",
    index=False
)

In [7]:
train_df["date"] = pd.to_datetime(train_df["date"])
min_date = train_df['date'].min()
cutoff_date = min_date + pd.Timedelta(days=90)

# 3. Filter to the first 30 days
first_n_days = train_df[
    (train_df['date'] >= min_date) &
    (train_df['date'] <= cutoff_date)
].copy()

# 4. (Optional) Sort if you want chronological order
first_n_days.sort_values(['date', 'store_nbr', 'item_nbr'], inplace=True)

In [8]:
first_n_days.head(5)

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion
0,0,2013-01-01,25,103665,7.0,
1,1,2013-01-01,25,105574,1.0,
2,2,2013-01-01,25,105575,2.0,
3,3,2013-01-01,25,108079,1.0,
4,4,2013-01-01,25,108701,1.0,


In [9]:
first_n_days.reset_index(drop=True, inplace=True)

In [10]:
first_n_days["date"].unique()

<DatetimeArray>
['2013-01-01 00:00:00', '2013-01-02 00:00:00', '2013-01-03 00:00:00',
 '2013-01-04 00:00:00', '2013-01-05 00:00:00', '2013-01-06 00:00:00',
 '2013-01-07 00:00:00', '2013-01-08 00:00:00', '2013-01-09 00:00:00',
 '2013-01-10 00:00:00', '2013-01-11 00:00:00', '2013-01-12 00:00:00',
 '2013-01-13 00:00:00', '2013-01-14 00:00:00', '2013-01-15 00:00:00',
 '2013-01-16 00:00:00', '2013-01-17 00:00:00', '2013-01-18 00:00:00',
 '2013-01-19 00:00:00', '2013-01-20 00:00:00', '2013-01-21 00:00:00',
 '2013-01-22 00:00:00', '2013-01-23 00:00:00', '2013-01-24 00:00:00',
 '2013-01-25 00:00:00', '2013-01-26 00:00:00', '2013-01-27 00:00:00',
 '2013-01-28 00:00:00', '2013-01-29 00:00:00', '2013-01-30 00:00:00',
 '2013-01-31 00:00:00', '2013-02-01 00:00:00', '2013-02-02 00:00:00',
 '2013-02-03 00:00:00', '2013-02-04 00:00:00', '2013-02-05 00:00:00',
 '2013-02-06 00:00:00', '2013-02-07 00:00:00', '2013-02-08 00:00:00',
 '2013-02-09 00:00:00', '2013-02-10 00:00:00', '2013-02-11 00:00:00',
 '20

In [11]:
 # Save to a new CSV
first_n_days.to_csv(
    "./data/train_first_90_days.csv",
    index=False
)

print(f"Kept records from {min_date.date()} to {cutoff_date.date()} – {len(first_n_days)} rows saved.")


Kept records from 2013-01-01 to 2013-04-01 – 3738322 rows saved.


In [12]:
train_df.groupby(["store_nbr","item_nbr"]).agg({"id":"count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,id
store_nbr,item_nbr,Unnamed: 2_level_1
1,96995,187
1,99197,185
1,103520,1119
1,103665,1358
1,105574,1546
...,...,...
54,2110456,47
54,2113343,8
54,2113914,36
54,2116416,11


In [13]:
train_df["item_nbr"].nunique()

4036

In [14]:
train_df["store_nbr"].nunique()

54

In [15]:
train_df.shape

(125497040, 6)

In [16]:
train_df.groupby("store_nbr").agg({"unit_sales":"sum"}).sort_values("unit_sales",ascending=False).head(10)

Unnamed: 0_level_0,unit_sales
store_nbr,Unnamed: 1_level_1
44,62087544.0
45,54498012.0
47,50948308.0
3,50481900.0
49,43420088.0
46,41896052.0
48,35933132.0
51,32911484.0
8,30491336.0
50,28653018.0


In [9]:
top_n_by_m(train_df)

Unnamed: 0_level_0,unit_sales
store_nbr,Unnamed: 1_level_1
3,24060.347656
8,14659.328125
6,13520.485352
7,11997.501953
5,10598.619141
2,10266.71875
4,10200.083984
9,9757.633789
1,7417.147949
25,2511.618896


In [11]:
value_counts_with_percentage(train_df, "item_nbr")

Unnamed: 0_level_0,item_nbr_count,item_nbr_percentage
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1
165718,10,0.10
103665,10,0.10
165705,10,0.10
165704,10,0.10
115267,10,0.10
...,...,...
268446,1,0.01
279125,1,0.01
302824,1,0.01
554145,1,0.01


In [13]:
df = train_df.groupby("date")["unit_sales"].sum().reset_index()  # Aggregate daily sales

In [14]:
df.shift(5)

Unnamed: 0,date,unit_sales
0,,
1,,
