In [1]:
import pandas as pd
import torch
import torch.nn as nn
import pickle

In [2]:
import sys
sys.path.insert(0, "../src")   
from model_utils import load_models_from_dir
from model_utils import predict_next_days_for_sids

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
test_df = pd.read_excel(
        "../output/data/2025_05_30_test_top_10_store_10_store_item.xlsx",
)
test_df = test_df.sort_values(by=['date', 'store_item'])
test_df = test_df.reset_index(drop=True)
test_df.head(5)


Unnamed: 0,id,date,store_item,store_nbr,item_nbr,onpromotion,unit_sales
0,125506246,2017-08-16,3_1047679,3,1047679,0,0
1,125504870,2017-08-16,3_114790,3,114790,1,0
2,125505116,2017-08-16,3_305229,3,305229,0,0
3,125505140,2017-08-16,3_314384,3,314384,1,0
4,125505228,2017-08-16,3_364606,3,364606,0,0


In [5]:
test_df["date"].nunique()
test_df["date"].min(), test_df["date"].max()

(Timestamp('2017-08-16 00:00:00'), Timestamp('2017-08-31 00:00:00'))

In [6]:

def preprocess_test_df(df, feature_cols):
    df['onpromotion'] = df['onpromotion'].astype(bool).astype(int)
    df[feature_cols] = df[feature_cols].astype('float32')
    df["date"] = pd.to_datetime(df["date"])
    df.sort_values("date", inplace=True)
    df.reset_index(drop=True, inplace=True)
    df["store_item"] = df["store_item"].astype(str)
    df.rename(columns={"store_nbr": "store", "item_nbr": "item"}, inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df




In [7]:
test_df = preprocess_test_df(test_df, ["unit_sales"])
test_df.dtypes

id                      int64
date           datetime64[ns]
store_item             object
store                   int64
item                    int64
onpromotion             int64
unit_sales            float32
dtype: object

In [8]:
test_df.head(5)

Unnamed: 0,id,date,store_item,store,item,onpromotion,unit_sales
0,125506246,2017-08-16,3_1047679,3,1047679,0,0.0
1,125688463,2017-08-16,50_305229,50,305229,0,0.0
2,125688217,2017-08-16,50_114790,50,114790,1,0.0
3,125689593,2017-08-16,50_1047679,50,1047679,0,0.0
4,125684994,2017-08-16,49_584028,49,584028,0,0.0


In [9]:
test_df.query("store_item == '3_1047679'")

Unnamed: 0,id,date,store_item,store,item,onpromotion,unit_sales
0,125506246,2017-08-16,3_1047679,3,1047679,0,0.0
175,125716900,2017-08-17,3_1047679,3,1047679,0,0.0
271,125927554,2017-08-18,3_1047679,3,1047679,0,0.0
372,126138208,2017-08-19,3_1047679,3,1047679,0,0.0
472,126348862,2017-08-20,3_1047679,3,1047679,0,0.0
572,126559516,2017-08-21,3_1047679,3,1047679,0,0.0
671,126770170,2017-08-22,3_1047679,3,1047679,0,0.0
772,126980824,2017-08-23,3_1047679,3,1047679,0,0.0
872,127191478,2017-08-24,3_1047679,3,1047679,0,0.0
972,127402132,2017-08-25,3_1047679,3,1047679,0,0.0


In [19]:
ttrain_df = pd.read_excel("../output/data/20250529_train_top_10_store_10_item_sales_cyclical_features_16_days_X_y.xlsx")
#train_df = pd.read_excel("../output/data/20250529_scaled_train_top_10_store_10_item_sales_cyclical_features_16_days_X_y.xlsx")
train_df.head(5)

Unnamed: 0,store_item,store,item,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,sales_day_6,sales_day_7,...,y_monthofyear_cos_7,y_monthofyear_cos_8,y_monthofyear_cos_9,y_monthofyear_cos_10,y_monthofyear_cos_11,y_monthofyear_cos_12,y_monthofyear_cos_13,y_monthofyear_cos_14,y_monthofyear_cos_15,y_monthofyear_cos_16
0,3_1047679,3,1047679,398.0,520.0,543.0,910.0,784.0,467.0,460.0,...,0.8660254,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,3_1047679,3,1047679,224.0,520.0,474.0,251.0,302.0,288.0,332.0,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,3_1047679,3,1047679,245.0,308.0,243.0,286.0,231.0,267.0,583.0,...,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17
3,3_1047679,3,1047679,173.0,352.0,231.0,292.0,525.0,736.0,502.0,...,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17,6.123234000000001e-17
4,3_1047679,3,1047679,429.0,419.0,504.0,623.0,293.0,250.0,384.0,...,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5


In [20]:
window_size = 16
cyclical_features = (
  [f"dayofweek_sin_{i}" for i in range(1, window_size+1)]
  + [f"dayofweek_cos_{i}" for i in range(1, window_size+1)]
  + [f"weekofmonth_sin_{i}" for i in range(1, window_size+1)]
  + [f"weekofmonth_cos_{i}" for i in range(1, window_size+1)]
  + [f"monthofyear_sin_{i}" for i in range(1, window_size+1)]
  + [f"monthofyear_cos_{i}" for i in range(1, window_size+1)]
)

sales_features = (
  [f"sales_day_{i}" for i in range(1, window_size+1)]
  + [f"store_med_day_{i}" for i in range(1, window_size+1)]
  + [f"item_med_day_{i}" for i in range(1, window_size+1)]
)
feature_cols = sales_features + cyclical_features
label_cols = [f'y_{c}' for c in feature_cols]
meta_cols = ["start_date", "store_item", "store", "item"]
train_df = train_df[meta_cols + feature_cols + label_cols]
train_df.head()

KeyError: "['start_date'] not in index"

In [22]:
sid = "50_114790"
train_df.query("store_item == @sid")

Unnamed: 0,store_item,store,item,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,sales_day_6,sales_day_7,...,y_monthofyear_cos_7,y_monthofyear_cos_8,y_monthofyear_cos_9,y_monthofyear_cos_10,y_monthofyear_cos_11,y_monthofyear_cos_12,y_monthofyear_cos_13,y_monthofyear_cos_14,y_monthofyear_cos_15,y_monthofyear_cos_16
7374,50_114790,50,114790,18.0,12.0,21.0,50.0,42.0,26.0,30.0,...,8.660254e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01
7375,50_114790,50,114790,17.0,43.0,33.0,22.0,19.0,18.0,20.0,...,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01,5.000000e-01
7376,50_114790,50,114790,18.0,22.0,31.0,25.0,16.0,20.0,46.0,...,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17
7377,50_114790,50,114790,25.0,27.0,16.0,29.0,64.0,42.0,42.0,...,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17,6.123234e-17
7378,50_114790,50,114790,19.0,15.0,30.0,35.0,52.0,24.0,23.0,...,-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01,-5.000000e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7473,50_114790,50,114790,18.0,38.0,33.0,21.0,14.0,6.0,14.0,...,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00
7474,50_114790,50,114790,32.0,20.0,13.0,20.0,33.0,30.0,50.0,...,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,-1.000000e+00,0.000000e+00,0.000000e+00
7475,50_114790,50,114790,1.0,0.0,0.0,26.0,47.0,28.0,17.0,...,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01
7476,50_114790,50,114790,23.0,24.0,51.0,45.0,28.0,35.0,39.0,...,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01,-8.660254e-01


In [34]:
# Get the latest row per store_item
idx = train_df.groupby("store_item")["start_date"].idxmax()
last_date_df = train_df.loc[idx].reset_index(drop=True)

# Identify y_ columns
y_cols = [col for col in train_df.columns if col.startswith("y_")]

# Meta columns to carry over
meta_cols = ["start_date", "store_item", "store", "item"]

# Build new rows
new_rows = []

for _, row in last_date_df.iterrows():
    new_X = {col.replace("y_", "", 1): row[col] for col in y_cols}  # only first "y_" is removed
    new_y = {col: 0.0 for col in y_cols}
    meta = {col: row[col] for col in meta_cols}
    meta["start_date"] = pd.to_datetime(meta["start_date"]) + pd.Timedelta(days=16)
    new_row = {**meta, **new_X, **new_y}
    new_rows.append(new_row)

# Final DataFrame with new rows
last_date_df = pd.DataFrame(new_rows)
last_date_df


Unnamed: 0,start_date,store_item,store,item,sales_day_1,sales_day_2,sales_day_3,sales_day_4,sales_day_5,sales_day_6,...,y_monthofyear_cos_7,y_monthofyear_cos_8,y_monthofyear_cos_9,y_monthofyear_cos_10,y_monthofyear_cos_11,y_monthofyear_cos_12,y_monthofyear_cos_13,y_monthofyear_cos_14,y_monthofyear_cos_15,y_monthofyear_cos_16
0,2017-07-31,3_1047679,3,1047679,0.332237,0.073328,0.358347,0.319505,0.100755,0.274984,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2017-07-31,3_114790,3,114790,0.015625,0.004226,0.013491,0.013622,0.005178,0.019243,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-07-31,3_305229,3,305229,0.034539,0.015909,0.046374,0.021672,0.009709,0.034140,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-07-31,3_314384,3,314384,0.059211,0.012926,0.083474,0.047678,0.019417,0.052142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2017-07-31,3_364606,3,364606,0.057566,0.022123,0.107926,0.046440,0.021791,0.052142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2017-07-31,8_502331,8,502331,0.021382,0.011683,0.030354,0.014861,0.008846,0.021726,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,2017-07-31,8_567623,8,567623,0.013158,0.002486,0.013491,0.004954,0.002157,0.011794,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,2017-07-31,8_581078,8,581078,0.007401,0.003977,0.011804,0.004334,0.003020,0.010552,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,2017-07-31,8_582864,8,582864,0.024513,0.007462,0.033646,0.025017,0.005979,0.024430,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
print("Unique (store, item) pairs:", last_date_df["store_item"].nunique())

Unique (store, item) pairs: 100


In [36]:
models = load_models_from_dir()

In [37]:

with open('../output/data/20250529_top10_store_10_item_minmax_y_scaler.pkl', 'rb') as f:
    y_scaler = pickle.load(f)

In [38]:
# --- Extract model and feature columns ---
sid = "49_584028"
model, feature_cols = models[sid]
model.eval()

# --- Prepare input ---
input_data = last_date_df.query("store_item == @sid")
x = input_data[feature_cols].values.astype("float32")
x_tensor = torch.tensor(x)

# --- Make prediction ---
with torch.no_grad():
    y_pred = model(x_tensor)

# --- Convert prediction to DataFrame with feature column names ---
y_pred_scaled = pd.DataFrame(y_pred.numpy(), columns=feature_cols)
y_pred_df = pd.DataFrame(y_scaler.inverse_transform(y_pred_scaled), columns=feature_cols)

# --- Keep only 'sales_day_' columns ---
sales_day_cols = [col for col in y_pred_df.columns if col.startswith("sales_day_")]
sales_pred_df = y_pred_df[sales_day_cols]

# --- Extract metadata ---
meta = input_data.iloc[0][["store_item", "store", "item"]].to_dict()
start_date = pd.to_datetime(input_data.iloc[0]["start_date"]) + pd.Timedelta(days=15)
# --- Build final output: one row per predicted day ---
rows = []
for i, col in enumerate(sales_day_cols):
    row = {
        "date": start_date + pd.Timedelta(days=i + 1),
        **meta,  # Add metadata first
        "unit_sales": sales_pred_df.at[0, col]  # Add 'sales' last
    }
    rows.append(row)

# --- Final DataFrame ---
final_df = pd.DataFrame(rows)
final_df


Unnamed: 0,date,store_item,store,item,unit_sales
0,2017-08-16,49_584028,49,584028,-0.792584
1,2017-08-17,49_584028,49,584028,-0.791266
2,2017-08-18,49_584028,49,584028,-0.79171
3,2017-08-19,49_584028,49,584028,-0.794866
4,2017-08-20,49_584028,49,584028,-0.790937
5,2017-08-21,49_584028,49,584028,-0.792082
6,2017-08-22,49_584028,49,584028,-0.78912
7,2017-08-23,49_584028,49,584028,-0.792353
8,2017-08-24,49_584028,49,584028,-0.795245
9,2017-08-25,49_584028,49,584028,-0.795531


In [39]:
final_df.dtypes

date          datetime64[ns]
store_item            object
store                  int64
item                   int64
unit_sales           float32
dtype: object

In [40]:
test_df.dtypes

id                      int64
date           datetime64[ns]
store_item             object
store                   int64
item                    int64
onpromotion             int64
unit_sales            float32
dtype: object

In [41]:

# Step 2: Merge with test_df using a left join

merged_df = test_df.query("store_item == @sid").merge(
    final_df,
    on=["date", "store_item", "store", "item"],
    how="left",
    suffixes=("", "_pred")
)

# Step 3: Replace test_df unit_sales with predicted ones where available
merged_df["unit_sales"] = merged_df["unit_sales_pred"].combine_first(merged_df["unit_sales"])

# Step 4: Drop the helper column
merged_df = merged_df.drop(columns=["unit_sales_pred"])
merged_df


Unnamed: 0,id,date,store_item,store,item,onpromotion,unit_sales
0,125684994,2017-08-16,49_584028,49,584028,0,-0.792584
1,125895648,2017-08-17,49_584028,49,584028,0,-0.791266
2,126106302,2017-08-18,49_584028,49,584028,1,-0.79171
3,126316956,2017-08-19,49_584028,49,584028,0,-0.794866
4,126527610,2017-08-20,49_584028,49,584028,0,-0.790937
5,126738264,2017-08-21,49_584028,49,584028,0,-0.792082
6,126948918,2017-08-22,49_584028,49,584028,0,-0.78912
7,127159572,2017-08-23,49_584028,49,584028,0,-0.792353
8,127370226,2017-08-24,49_584028,49,584028,0,-0.795245
9,127580880,2017-08-25,49_584028,49,584028,1,-0.795531


In [42]:
all_preds_df = predict_next_days_for_sids(last_date_df, models, y_scaler)
all_preds_df.head()


Unnamed: 0,date,store_item,store,item,unit_sales
0,2017-08-16,45_1047679,45,1047679,0.133876
1,2017-08-17,45_1047679,45,1047679,0.279128
2,2017-08-18,45_1047679,45,1047679,0.28843
3,2017-08-19,45_1047679,45,1047679,0.095799
4,2017-08-20,45_1047679,45,1047679,0.49954


In [43]:
# Step 2: Merge with test_df using a left join
merged_df = test_df.merge(
    all_preds_df,
    on=["date", "store_item", "store", "item"],
    how="left",
    suffixes=("", "_pred")
)

# Step 3: Replace test_df unit_sales with predicted ones where available
merged_df["unit_sales"] = merged_df["unit_sales_pred"].combine_first(merged_df["unit_sales"])

# Step 4: Drop the helper column
merged_df = merged_df.drop(columns=["unit_sales_pred"])
merged_df

Unnamed: 0,id,date,store_item,store,item,onpromotion,unit_sales
0,125506246,2017-08-16,3_1047679,3,1047679,0,338.339172
1,125688463,2017-08-16,50_305229,50,305229,0,-0.802054
2,125688217,2017-08-16,50_114790,50,114790,1,-0.801325
3,125689593,2017-08-16,50_1047679,50,1047679,0,30.347908
4,125684994,2017-08-16,49_584028,49,584028,0,-0.792584
...,...,...,...,...,...,...,...
1595,128829180,2017-08-31,45_581078,45,581078,0,41.948650
1596,128829153,2017-08-31,45_567623,45,567623,0,-0.792051
1597,128829055,2017-08-31,45_502331,45,502331,0,-0.785579
1598,128832956,2017-08-31,46_502331,46,502331,0,-0.793558


In [44]:
merged_df.query("store_item == '3_1047679'")
    

Unnamed: 0,id,date,store_item,store,item,onpromotion,unit_sales
0,125506246,2017-08-16,3_1047679,3,1047679,0,338.339172
175,125716900,2017-08-17,3_1047679,3,1047679,0,412.000946
271,125927554,2017-08-18,3_1047679,3,1047679,0,388.579803
372,126138208,2017-08-19,3_1047679,3,1047679,0,522.115356
472,126348862,2017-08-20,3_1047679,3,1047679,0,592.43927
572,126559516,2017-08-21,3_1047679,3,1047679,0,453.25412
671,126770170,2017-08-22,3_1047679,3,1047679,0,446.928162
772,126980824,2017-08-23,3_1047679,3,1047679,0,378.550537
872,127191478,2017-08-24,3_1047679,3,1047679,0,271.746216
972,127402132,2017-08-25,3_1047679,3,1047679,0,500.431335


In [45]:
merged_df.query("store_item == '50_1047679'")


Unnamed: 0,id,date,store_item,store,item,onpromotion,unit_sales
3,125689593,2017-08-16,50_1047679,50,1047679,0,30.347908
106,125900247,2017-08-17,50_1047679,50,1047679,0,134.643295
205,126110901,2017-08-18,50_1047679,50,1047679,0,44.89032
302,126321555,2017-08-19,50_1047679,50,1047679,0,52.235256
403,126532209,2017-08-20,50_1047679,50,1047679,0,112.364883
505,126742863,2017-08-21,50_1047679,50,1047679,0,77.546318
606,126953517,2017-08-22,50_1047679,50,1047679,0,34.292873
702,127164171,2017-08-23,50_1047679,50,1047679,0,28.732626
805,127374825,2017-08-24,50_1047679,50,1047679,0,30.62026
909,127585479,2017-08-25,50_1047679,50,1047679,0,33.89941


In [46]:
merged_df.head()

Unnamed: 0,id,date,store_item,store,item,onpromotion,unit_sales
0,125506246,2017-08-16,3_1047679,3,1047679,0,338.339172
1,125688463,2017-08-16,50_305229,50,305229,0,-0.802054
2,125688217,2017-08-16,50_114790,50,114790,1,-0.801325
3,125689593,2017-08-16,50_1047679,50,1047679,0,30.347908
4,125684994,2017-08-16,49_584028,49,584028,0,-0.792584


In [48]:
merged_df.to_excel("../output/data/20250530_merged_submission.xlsx", index=False)

In [47]:
submission_df = merged_df[["id", "unit_sales"]]
submission_df["unit_sales"] = submission_df["unit_sales"].clip(lower=0)
submission_df.to_csv('../output/data/20250530_submission.csv', float_format='%.4f', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df["unit_sales"] = submission_df["unit_sales"].clip(lower=0)
