In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Load the dataset
merged_df = pd.read_csv("../dataset/merged_retail_dataset.csv")
merged_df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Year,Month,Week
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2010,2,5
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315,2010,2,6
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315,2010,2,7
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315,2010,2,8
4,1,1,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315,2010,3,9


### Feature Engineering

In [3]:
# Convert Date column to datetime if not already
merged_df["Date"] = pd.to_datetime(merged_df["Date"])

# Add more time-based features
merged_df["Quarter"] = merged_df["Date"].dt.quarter
merged_df["DayOfWeek"] = merged_df["Date"].dt.dayofweek
merged_df["IsMonthStart"] = merged_df["Date"].dt.is_month_start.astype(int)
merged_df["IsMonthEnd"] = merged_df["Date"].dt.is_month_end.astype(int)

In [4]:
# Select only numeric columns
num_cols = merged_df.select_dtypes(include=["float64", "int64"]).columns

# Fill NaNs in numeric columns only
merged_df[num_cols] = merged_df[num_cols].fillna(0)

In [5]:
# Sort first
merged_df = merged_df.sort_values(["Store", "Dept", "Date"])

# Add lag features (1, 2, 3 weeks back)
for lag in [1, 2, 3]:
    merged_df[f"Weekly_Sales_Lag{lag}"] = (
        merged_df.groupby(["Store", "Dept"])["Weekly_Sales"].shift(lag)
    )

In [12]:
# Rolling means (4-week and 12-week averages)
merged_df["Rolling_4"] = (
    merged_df.groupby(["Store", "Dept"])["Weekly_Sales"].transform(lambda x: x.shift(1).rolling(4).mean())
)

merged_df["Rolling_12"] = (
    merged_df.groupby(["Store", "Dept"])["Weekly_Sales"].transform(lambda x: x.shift(1).rolling(12).mean())
)

In [13]:
# Rolling mean and std over past 4 and 12 weeks
for window in [4, 12]:
    merged_df[f"Weekly_Sales_MA{window}"] = (
        merged_df.groupby(["Store", "Dept"])["Weekly_Sales"]
        .transform(lambda x: x.shift(1).rolling(window).mean())
    )
    merged_df[f"Weekly_Sales_STD{window}"] = (
        merged_df.groupby(["Store", "Dept"])["Weekly_Sales"]
        .transform(lambda x: x.shift(1).rolling(window).std())
    )


In [14]:
merged_df["IsHoliday_Lag1"] = merged_df.groupby("Store")["IsHoliday"].shift(1).fillna(False)
merged_df["IsHoliday_Lead1"] = merged_df.groupby("Store")["IsHoliday"].shift(-1).fillna(False)

  merged_df["IsHoliday_Lag1"] = merged_df.groupby("Store")["IsHoliday"].shift(1).fillna(False)
  merged_df["IsHoliday_Lead1"] = merged_df.groupby("Store")["IsHoliday"].shift(-1).fillna(False)


In [15]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
merged_df["Type"] = le.fit_transform(merged_df["Type"])

In [16]:
num_cols = merged_df.select_dtypes(include=["float64", "int64"]).columns
merged_df[num_cols] = merged_df[num_cols].fillna(0)

In [24]:
merged_df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Weekly_Sales_Lag2,Weekly_Sales_Lag3,Weekly_Sales_MA4,Weekly_Sales_STD4,Weekly_Sales_MA12,Weekly_Sales_STD12,IsHoliday_Lag1,IsHoliday_Lead1,Rolling_4,Rolling_12
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,True,0.0,0.0
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,False,0.0,0.0
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,...,24924.5,0.0,0.0,0.0,0.0,0.0,True,False,0.0,0.0
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,...,46039.49,24924.5,0.0,0.0,0.0,0.0,False,False,0.0,0.0
4,1,1,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,...,41595.55,46039.49,32990.77,12832.106391,0.0,0.0,False,False,32990.77,0.0
