In [7]:
import pandas as pd
import torch
import torch.nn as nn
from collections import defaultdict, deque

In [38]:
import sys
sys.path.insert(0, "../code")   
from model_utils import load_models_from_dir
from utils import RunningMedian, SmoothOnlineMedian

In [9]:
%load_ext autoreload
%autoreload 2

In [10]:
# Read the test workbook, order the rows by date and then by store_item key,
# and renumber the index so it follows the sorted order.
df = (
    pd.read_excel("../output/data/test_top_10_store_10_store_item.xlsx")
    .sort_values(by=["date", "store_item"])
    .reset_index(drop=True)
)
df.head()


Unnamed: 0,date,store_item,store_nbr,item_nbr,onpromotion,unit_sales
0,2017-08-16,3_1047679,3,1047679,0,0
1,2017-08-16,3_114790,3,114790,1,0
2,2017-08-16,3_305229,3,305229,0,0
3,2017-08-16,3_314384,3,314384,1,0
4,2017-08-16,3_364606,3,364606,0,0


In [11]:
# Display a copy of the frame ordered by date only; the result is not assigned back to df.
df.sort_values("date")


Unnamed: 0,date,store_item,store_nbr,item_nbr,onpromotion,unit_sales
0,2017-08-16,3_1047679,3,1047679,0,0
72,2017-08-16,50_305229,50,305229,0,0
71,2017-08-16,50_114790,50,114790,1,0
70,2017-08-16,50_1047679,50,1047679,0,0
69,2017-08-16,49_584028,49,584028,0,0
...,...,...,...,...,...,...
1527,2017-08-31,45_581078,45,581078,0,0
1526,2017-08-31,45_567623,45,567623,0,0
1525,2017-08-31,45_502331,45,502331,0,0
1535,2017-08-31,46_502331,46,502331,0,0


In [12]:

def preprocess_test_df(df, feature_cols):
    """Return a type-normalised copy of the test frame.

    The input frame is left untouched, so re-running the calling cell (or
    inspecting the raw frame afterwards) gives the same result every time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``onpromotion``, ``date`` and
        every column named in ``feature_cols``.
    feature_cols : list of str
        Columns to cast to ``float32``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``onpromotion`` is an integer 0/1 flag, ``date``
        is a datetime column and ``feature_cols`` are ``float32``.
    """
    # Work on a copy: assigning columns on the argument would silently modify
    # the caller's DataFrame.
    out = df.copy()
    # bool -> int collapses any non-zero value to 1.
    out['onpromotion'] = out['onpromotion'].astype(bool).astype(int)
    out['date'] = pd.to_datetime(out['date'])
    out[feature_cols] = out[feature_cols].astype('float32')
    return out



In [13]:
# Columns handed to preprocess_test_df, which casts them to float32.
feature_cols = ["unit_sales"]
feature_cols

['unit_sales']

In [14]:
# Run the preprocessing, rebind df to its result, and inspect the resulting column dtypes.
df = preprocess_test_df(df, feature_cols)
df.dtypes

date           datetime64[ns]
store_item             object
store_nbr               int64
item_nbr                int64
onpromotion             int64
unit_sales            float32
dtype: object

In [15]:
# Preview the first five rows after preprocessing.
df.head(5)

Unnamed: 0,date,store_item,store_nbr,item_nbr,onpromotion,unit_sales
0,2017-08-16,3_1047679,3,1047679,0,0.0
1,2017-08-16,3_114790,3,114790,1,0.0
2,2017-08-16,3_305229,3,305229,0,0.0
3,2017-08-16,3_314384,3,314384,1,0.0
4,2017-08-16,3_364606,3,364606,0,0.0


In [16]:
%pwd

'/Users/yvesgreatti/github/kaggle_favorita_grocery_sales_forecasting/notebook'

In [17]:
# Load the trained models through the project helper, called with its default arguments.
# The entry inspected in the next cell is keyed by a store_item string and holds a
# (network, list of input feature names) tuple.
models = load_models_from_dir()

In [18]:
# NOTE(review): the bare `models` expression below produces no output — the recorded
# output shows only the value of the final expression — so it can likely be removed.
models
# Inspect one entry: the (network, input feature names) tuple stored under "3_1047679".
models["3_1047679"]

(ShallowNN(
   (net): Sequential(
     (0): Linear(in_features=21, out_features=64, bias=True)
     (1): Tanh()
     (2): Linear(in_features=64, out_features=21, bias=True)
     (3): Sigmoid()
   )
 ),
 ['sales_day_1',
  'sales_day_2',
  'sales_day_3',
  'sales_day_4',
  'sales_day_5',
  'sales_day_6',
  'sales_day_7',
  'store_med_day_1',
  'store_med_day_2',
  'store_med_day_3',
  'store_med_day_4',
  'store_med_day_5',
  'store_med_day_6',
  'store_med_day_7',
  'item_med_day_1',
  'item_med_day_2',
  'item_med_day_3',
  'item_med_day_4',
  'item_med_day_5',
  'item_med_day_6',
  'item_med_day_7'])

In [19]:
# Read the scaled training workbook (lagged inputs plus y_* target columns).
data = pd.read_excel(
    "../output/data/scaled_train_nonoverlap_top_10_store_item_X_y.xlsx",
)
data.head()

Unnamed: 0,date,store_item,store,item,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,sales_day_6,...,y_store_med_day_5,y_store_med_day_6,y_store_med_day_7,y_item_med_day_1,y_item_med_day_2,y_item_med_day_3,y_item_med_day_4,y_item_med_day_5,y_item_med_day_6,y_item_med_day_7
0,2013-01-07,3_1047679,3,1047679,0.45873,0.073355,0.098931,0.201942,0.392058,0.811775,...,0.248855,0.293631,0.37344,0.655298,0.424149,0.176024,0.202652,0.437086,0.675746,0.711467
1,2013-01-07,3_114790,3,114790,0.022222,0.002373,0.003231,0.006214,0.01083,0.017841,...,0.248855,0.293631,0.37344,0.01827,0.018576,0.040971,0.030303,0.027815,0.068742,0.069051
2,2013-01-07,3_305229,3,305229,0.070635,0.012513,0.020134,0.019029,0.080144,0.075825,...,0.248855,0.293631,0.37344,0.060901,0.074303,0.110774,0.056818,0.047682,0.089494,0.123305
3,2013-01-07,3_314384,3,314384,0.084127,0.014671,0.02088,0.021359,0.051986,0.080285,...,0.248855,0.293631,0.37344,0.123021,0.160991,0.201821,0.155303,0.156291,0.280156,0.314427
4,2013-01-07,3_364606,3,364606,0.099206,0.022654,0.026597,0.028738,0.062094,0.11686,...,0.248855,0.293631,0.37344,0.210719,0.243034,0.295903,0.301136,0.196026,0.315175,0.383477


In [20]:
# Inspect the dtype of every column in the training frame.
data.dtypes

date                 datetime64[ns]
store_item                   object
store                         int64
item                          int64
sales_day_1                 float64
sales_day_2                 float64
sales_day_3                 float64
sales_day_4                 float64
sales_day_5                 float64
sales_day_6                 float64
sales_day_7                 float64
store_med_day_1             float64
store_med_day_2             float64
store_med_day_3             float64
store_med_day_4             float64
store_med_day_5             float64
store_med_day_6             float64
store_med_day_7             float64
item_med_day_1              float64
item_med_day_2              float64
item_med_day_3              float64
item_med_day_4              float64
item_med_day_5              float64
item_med_day_6              float64
item_med_day_7              float64
y_sales_day_1               float64
y_sales_day_2               float64
y_sales_day_3               

In [21]:
# List the column labels of the training frame.
data.columns

Index(['date', 'store_item', 'store', 'item', 'sales_day_1', 'sales_day_2',
       'sales_day_3', 'sales_day_4', 'sales_day_5', 'sales_day_6',
       'sales_day_7', 'store_med_day_1', 'store_med_day_2', 'store_med_day_3',
       'store_med_day_4', 'store_med_day_5', 'store_med_day_6',
       'store_med_day_7', 'item_med_day_1', 'item_med_day_2', 'item_med_day_3',
       'item_med_day_4', 'item_med_day_5', 'item_med_day_6', 'item_med_day_7',
       'y_sales_day_1', 'y_sales_day_2', 'y_sales_day_3', 'y_sales_day_4',
       'y_sales_day_5', 'y_sales_day_6', 'y_sales_day_7', 'y_store_med_day_1',
       'y_store_med_day_2', 'y_store_med_day_3', 'y_store_med_day_4',
       'y_store_med_day_5', 'y_store_med_day_6', 'y_store_med_day_7',
       'y_item_med_day_1', 'y_item_med_day_2', 'y_item_med_day_3',
       'y_item_med_day_4', 'y_item_med_day_5', 'y_item_med_day_6',
       'y_item_med_day_7'],
      dtype='object')

In [22]:

# Keep, for every store_item key, the single row with the latest date:
# sort chronologically, then take the last row of each group.
last_day_data = (
    data.sort_values(by="date")
    .groupby("store_item", as_index=False)
    .tail(1)
)
last_day_data

Unnamed: 0,date,store_item,store,item,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,sales_day_6,...,y_store_med_day_5,y_store_med_day_6,y_store_med_day_7,y_item_med_day_1,y_item_med_day_2,y_item_med_day_3,y_item_med_day_4,y_item_med_day_5,y_item_med_day_6,y_item_med_day_7
23862,2017-08-08,49_314384,49,314384,0.050794,0.012945,0.010937,0.027184,0.064260,0.064228,...,,,,,,,,,,
23870,2017-08-08,50_114790,50,114790,0.020635,0.003452,0.003480,0.012039,0.028159,0.033006,...,,,,,,,,,,
23869,2017-08-08,50_1047679,50,1047679,0.016667,0.004099,0.004474,0.010485,0.020939,0.050847,...,,,,,,,,,,
23868,2017-08-08,49_584028,49,584028,0.068733,0.013985,0.018653,0.045159,0.070055,0.091351,...,,,,,,,,,,
23867,2017-08-08,49_582864,49,582864,0.030834,0.022324,0.019993,0.040632,0.022336,0.046503,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23826,2017-08-08,45_581078,45,581078,0.007937,0.003883,0.004226,0.006602,0.015162,0.019625,...,,,,,,,,,,
23825,2017-08-08,45_567623,45,567623,0.046032,0.010140,0.009943,0.022913,0.028159,0.057984,...,,,,,,,,,,
23823,2017-08-08,45_364606,45,364606,0.053968,0.014239,0.015411,0.039223,0.074368,0.090990,...,,,,,,,,,,
23834,2017-08-08,46_502331,46,502331,0.032540,0.008846,0.011683,0.009320,0.038989,0.070473,...,,,,,,,,,,


In [23]:
# Report how many distinct store_item keys appear in last_day_data.
print("Unique (store, item) pairs:", last_day_data["store_item"].nunique())

Unique (store, item) pairs: 100


In [24]:
# List the column labels of last_day_data.
last_day_data.columns

Index(['date', 'store_item', 'store', 'item', 'sales_day_1', 'sales_day_2',
       'sales_day_3', 'sales_day_4', 'sales_day_5', 'sales_day_6',
       'sales_day_7', 'store_med_day_1', 'store_med_day_2', 'store_med_day_3',
       'store_med_day_4', 'store_med_day_5', 'store_med_day_6',
       'store_med_day_7', 'item_med_day_1', 'item_med_day_2', 'item_med_day_3',
       'item_med_day_4', 'item_med_day_5', 'item_med_day_6', 'item_med_day_7',
       'y_sales_day_1', 'y_sales_day_2', 'y_sales_day_3', 'y_sales_day_4',
       'y_sales_day_5', 'y_sales_day_6', 'y_sales_day_7', 'y_store_med_day_1',
       'y_store_med_day_2', 'y_store_med_day_3', 'y_store_med_day_4',
       'y_store_med_day_5', 'y_store_med_day_6', 'y_store_med_day_7',
       'y_item_med_day_1', 'y_item_med_day_2', 'y_item_med_day_3',
       'y_item_med_day_4', 'y_item_med_day_5', 'y_item_med_day_6',
       'y_item_med_day_7'],
      dtype='object')

In [28]:
# Median of the sales_day_7 column within each store and within each item,
# computed over the rows of last_day_data.
latest_sales = last_day_data["sales_day_7"]
store_medians = latest_sales.groupby(last_day_data["store"]).median()
item_medians = latest_sales.groupby(last_day_data["item"]).median()
print("Store medians:", store_medians)
print("Item medians:", item_medians)

Store medians: store
3     0.038100
8     0.022668
44    0.054992
45    0.053851
46    0.047131
47    0.059836
48    0.040984
49    0.047309
50    0.040984
51    0.023361
Name: sales_day_7, dtype: float64
Item medians: item
114790     0.019672
305229     0.029098
314384     0.057787
364606     0.053689
502331     0.045492
567623     0.020902
581078     0.015164
582864     0.051872
584028     0.088464
1047679    0.097541
Name: sales_day_7, dtype: float64


In [39]:
# Create one SmoothOnlineMedian per store, seeded with that store's median and
# sharing a single eta value.
eta = 0.001
store_smoothOnlineMedians = {}
for store in store_medians.keys():
    seed_median = store_medians[store]
    print(f"Store {store}: {seed_median}")
    store_smoothOnlineMedians[store] = SmoothOnlineMedian(seed_median, eta=eta)
    

Store 3: 0.03809959068894386
Store 8: 0.022668032906949524
Store 44: 0.054992213845253
Store 45: 0.053850820288062096
Store 46: 0.04713114909827709
Store 47: 0.05983606539666653
Store 48: 0.04098360612988472
Store 49: 0.047308607026934624
Store 50: 0.04098360799252987
Store 51: 0.023360655643045906


In [40]:
# Display the store -> SmoothOnlineMedian mapping.
store_smoothOnlineMedians

{3: <utils.SmoothOnlineMedian at 0x13d3bf350>,
 8: <utils.SmoothOnlineMedian at 0x13a591b20>,
 44: <utils.SmoothOnlineMedian at 0x13d3bf980>,
 45: <utils.SmoothOnlineMedian at 0x13d3bf9b0>,
 46: <utils.SmoothOnlineMedian at 0x13d3bf380>,
 47: <utils.SmoothOnlineMedian at 0x13d3bee40>,
 48: <utils.SmoothOnlineMedian at 0x13d3bf0e0>,
 49: <utils.SmoothOnlineMedian at 0x13d3beea0>,
 50: <utils.SmoothOnlineMedian at 0x13d3bd730>,
 51: <utils.SmoothOnlineMedian at 0x13d3bcbf0>}

In [34]:
def _running_medians(frame, key, value_col="sales_day_7"):
    """Return ``{group label: median}`` of ``value_col`` for each ``key`` group.

    Each group's values are fed one at a time into a fresh ``RunningMedian``
    and its ``median()`` is recorded under the group's label.
    """
    medians = {}
    for label, group in frame.groupby(key):
        rm = RunningMedian()
        for val in group[value_col]:
            rm.add(val)
        medians[label] = rm.median()
    return medians


# NOTE(review): these assignments rebind store_medians / item_medians as plain dicts;
# an earlier cell binds the same names to pandas Series.

# Compute store-level medians for sales_day_7
store_medians = _running_medians(last_day_data, "store")

# Compute item-level medians for sales_day_7
item_medians = _running_medians(last_day_data, "item")

# Convert to DataFrames for viewing
store_medians_df = pd.DataFrame(store_medians.items(), columns=["store", "sales_day_7_median"])
item_medians_df = pd.DataFrame(item_medians.items(), columns=["item", "sales_day_7_median"])


In [35]:
# Display the per-store medians as a table.
store_medians_df

Unnamed: 0,store,sales_day_7_median
0,3,0.0381
1,8,0.022668
2,44,0.054992
3,45,0.053851
4,46,0.047131
5,47,0.059836
6,48,0.040984
7,49,0.047309
8,50,0.040984
9,51,0.023361


In [47]:
# Names of the seven lagged sales columns, sales_day_1 .. sales_day_7.
sales_cols = [f"sales_day_{day}" for day in range(1, 8)]

# store_item key -> bounded window (at most 7 entries) of sales values.
# Keys that were never seeded get an empty window on first access.
sales_history = defaultdict(lambda: deque(maxlen=7))

# Seed each key's window with the seven lag values from its last_day_data row,
# pairing the key column with the lag columns row by row.
lag_rows = last_day_data[sales_cols].to_numpy().tolist()
for key, window in zip(last_day_data["store_item"], lag_rows):
    sales_history[key] = deque(window, maxlen=7)
sales_history

defaultdict(<function __main__.<lambda>()>,
            {'49_314384': deque([0.05079365149140358,
                    0.01294498424977064,
                    0.01093711145222187,
                    0.02718446590006351,
                    0.06425993144512177,
                    0.0642283633351326,
                    0.05573770403862],
                   maxlen=7),
             '50_114790': deque([0.02063492126762867,
                    0.003451995784416795,
                    0.003479990176856518,
                    0.01203883532434702,
                    0.02815884537994862,
                    0.03300624340772629,
                    0.01803278736770153],
                   maxlen=7),
             '50_1047679': deque([0.01666666753590107,
                    0.004099245183169842,
                    0.004474272951483727,
                    0.01048543676733971,
                    0.02093862928450108,
                    0.05084745585918427,
                    0.034426230937

In [None]:
# Score every test row with the model stored for its store_item key and collect
# the results in a new frame, leaving df itself unchanged.
predictions = []

for _, row in df.iterrows():
    # The test frame identifies a series by its 'store_item' column, which is
    # also the key used by the `models` mapping.
    sid = row['store_item']
    model_info = models.get(sid)
    if model_info is not None:
        # Each entry is a (network, input feature names) tuple.
        model, model_feature_cols = model_info
        # NOTE(review): preprocess_row is neither defined nor imported in this
        # notebook. It has to turn a raw test row into the inputs named in
        # model_feature_cols, which are not columns of df — TODO implement.
        features = preprocess_row(row, model_feature_cols)
        # float32 matches the default parameter dtype of nn.Linear layers.
        input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)  # Add batch dimension
        with torch.no_grad():
            # NOTE(review): the inspected network has 21 outputs, and .item()
            # only accepts single-element tensors — decide which output is the
            # unit_sales forecast and select it before calling .item().
            pred = model(input_tensor).item()
    else:
        pred = 0  # or np.nan, or some fallback model
    predictions.append(pred)

# Build the scored frame from df rather than writing into an undefined test_df.
test_df = df.assign(unit_sales=predictions)