# Feature Engineering and Modeling Dataset Construction

This notebook implements the modeling assumptions and feature design decisions
established in the exploratory analysis. The goal is to construct a
leakage-safe, reproducible modeling dataset suitable for time-based validation
and forecasting.


In [2]:
#imports
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw input tables from the data folder.
DATA_DIR = "../data/raw"

# train / features / test are read with their "Date" column parsed to datetime;
# stores is read without any date parsing.
date_kwargs = {"parse_dates": ["Date"]}

stores = pd.read_csv(f"{DATA_DIR}/stores.csv")
features = pd.read_csv(f"{DATA_DIR}/features.csv", **date_kwargs)
train = pd.read_csv(f"{DATA_DIR}/train.csv", **date_kwargs)
# NOTE(review): `test` is loaded here but not referenced by any later cell
# shown in this notebook.
test = pd.read_csv(f"{DATA_DIR}/test.csv", **date_kwargs)


In [4]:
# Merge following the cardinality / relationships established in the EDA:
# many training rows map to one (Store, Date) row of `features`.
# validate="many_to_one" makes pandas raise if those keys are not unique
# on the `features` side.
feature_keys = ["Store", "Date"]
train_feat = pd.merge(
    train,
    features,
    how="left",
    on=feature_keys,
    validate="many_to_one",
)

# The join must neither drop nor duplicate training rows.
assert train_feat.shape[0] == train.shape[0]


In [6]:
# Attach the store-level table (one row per Store) to every training row.
# validate="many_to_one" makes pandas raise if "Store" is duplicated in `stores`.
train_merged = pd.merge(
    train_feat,
    stores,
    how="left",
    on="Store",
    validate="many_to_one",
)

# Row count must be unchanged by the join.
assert train_merged.shape[0] == train_feat.shape[0]
train_merged.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,True,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,,,,,,211.289143,8.106,False,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,,,,,,211.319643,8.106,False,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,,,,,,211.350143,8.106,False,A,151315


In [8]:
# The train/features merge left two holiday flags: IsHoliday_x (left side,
# from `train`) and IsHoliday_y (right side, from `features`).
# Verify they agree on every row BEFORE discarding one of them. The check is
# an assertion so that a mismatch (or a missing right-hand value) stops the
# notebook instead of being silently ignored.
holiday_flags_agree = train_merged["IsHoliday_x"].eq(train_merged["IsHoliday_y"])
assert holiday_flags_agree.all(), (
    f"{int((~holiday_flags_agree).sum())} rows have conflicting "
    "IsHoliday_x / IsHoliday_y values"
)

# The columns are identical, so keep a single one under the original name.
train_merged = (
    train_merged
    .drop(columns=["IsHoliday_y"])
    .rename(columns={"IsHoliday_x": "IsHoliday"})
)


In [9]:
# Order rows chronologically inside each (Store, Dept) series and renumber the
# index, so the row-based shifts used for lagged features look at earlier dates.
series_order = ["Store", "Dept", "Date"]
train_merged = train_merged.sort_values(by=series_order, ignore_index=True)


## Lagged Sales Features

Based on exploratory time series diagnostics, residual demand exhibits weak
short-term autocorrelation and no long-memory effects. As a result, lagged
sales features are constructed using only short horizons to capture recent
demand context without overfitting noise or encoding time trends.

Lag features are created within each (Store, Department) series and use
strictly historical information via time-aware shifting to prevent data
leakage. Missing values introduced by lagging represent periods with
insufficient history and will be handled consistently at model training time.


In [10]:
def add_sales_history_features(df, lags=(1, 2), roll_window=4):
    """Return a copy of ``df`` with lagged and trailing-mean sales columns.

    All features are computed separately inside every (Store, Dept) series and
    use only rows that come before the current one, so the current row's
    ``Weekly_Sales`` never enters its own features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Store", "Dept" and "Weekly_Sales" and already be sorted
        by date within each (Store, Dept) series.
    lags : iterable of int
        Row offsets; each one adds a ``sales_lag_<lag>`` column.
    roll_window : int
        Number of preceding rows averaged into ``sales_roll_<roll_window>``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``df`` itself is not modified. Rows without enough history
        hold NaN in the new columns.
    """
    sales_by_series = df.groupby(["Store", "Dept"])["Weekly_Sales"]

    new_columns = {f"sales_lag_{lag}": sales_by_series.shift(lag) for lag in lags}

    # Shift AND roll inside transform so the window is scoped to one series.
    # Calling .rolling() on the output of a grouped shift would roll over the
    # whole column and would only stay leak-free thanks to the NaN that each
    # series starts with (and min_periods defaulting to the window size).
    new_columns[f"sales_roll_{roll_window}"] = sales_by_series.transform(
        lambda sales: sales.shift(1).rolling(roll_window).mean()
    )
    return df.assign(**new_columns)


# Short horizons only (1-2 weeks) plus a 4-row trailing mean for momentum,
# following the EDA.
# NOTE(review): shifts are row-based. If a series has missing weeks, lag 1 is
# the previous observed row rather than exactly one week earlier - confirm
# that every (Store, Dept) series is contiguous or that this is acceptable.
SALES_LAGS = (1, 2)
SALES_ROLL_WINDOW = 4

train_merged = add_sales_history_features(
    train_merged, lags=SALES_LAGS, roll_window=SALES_ROLL_WINDOW
)

train_merged[
    ["Store", "Dept", "Date", "Weekly_Sales"]
    + [f"sales_lag_{lag}" for lag in SALES_LAGS]
    + [f"sales_roll_{SALES_ROLL_WINDOW}"]
].head(10)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,sales_lag_1,sales_lag_2,sales_roll_4
0,1,1,2010-02-05,24924.5,,,
1,1,1,2010-02-12,46039.49,24924.5,,
2,1,1,2010-02-19,41595.55,46039.49,24924.5,
3,1,1,2010-02-26,19403.54,41595.55,46039.49,
4,1,1,2010-03-05,21827.9,19403.54,41595.55,32990.77
5,1,1,2010-03-12,21043.39,21827.9,19403.54,32216.62
6,1,1,2010-03-19,22136.64,21043.39,21827.9,25967.595
7,1,1,2010-03-26,26229.21,22136.64,21043.39,21102.8675
8,1,1,2010-04-02,57258.43,26229.21,22136.64,22809.285
9,1,1,2010-04-09,42960.91,57258.43,26229.21,31666.9175


In [12]:
# Calendar features for seasonality signals.
train_merged["Month"] = train_merged["Date"].dt.month
# ISO week number; cast to a plain int because isocalendar() returns a
# nullable unsigned integer dtype.
train_merged["Week"] = train_merged["Date"].dt.isocalendar().week.astype(int)
train_merged["Year"] = train_merged["Date"].dt.year

# Based on the EDA, sales vary before and after holidays, so add two flags per
# (Store, Dept) series:
#   Holiday_Lead = 1 when the NEXT row of the series is a holiday week
#   Holiday_Lag  = 1 when the PREVIOUS row of the series was a holiday week
# Working on a bool view keeps this cell re-runnable after IsHoliday has been
# converted to int below. shift(fill_value=False) fills the row with no
# neighbour directly, avoiding the object-dtype downcasting FutureWarning that
# shift(...).fillna(False) raises.
# NOTE(review): the last row of every series gets Holiday_Lead = 0 because the
# following week is not in this frame - confirm this matches how the feature
# will be built at prediction time.
is_holiday = train_merged["IsHoliday"].astype(bool)
holiday_by_series = is_holiday.groupby([train_merged["Store"], train_merged["Dept"]])

train_merged["Holiday_Lead"] = holiday_by_series.shift(-1, fill_value=False).astype(int)
train_merged["Holiday_Lag"] = holiday_by_series.shift(1, fill_value=False).astype(int)

# Encode the boolean holiday flag as 0/1.
train_merged["IsHoliday"] = is_holiday.astype(int)

train_merged.head()

  .fillna(False)
  .fillna(False)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Type,Size,sales_lag_1,sales_lag_2,sales_roll_4,Month,Week,Year,Holiday_Lead,Holiday_Lag
0,1,1,2010-02-05,24924.5,0,42.31,2.572,,,,...,A,151315,,,,2,5,2010,1,0
1,1,1,2010-02-12,46039.49,1,38.51,2.548,,,,...,A,151315,24924.5,,,2,6,2010,0,0
2,1,1,2010-02-19,41595.55,0,39.93,2.514,,,,...,A,151315,46039.49,24924.5,,2,7,2010,0,1
3,1,1,2010-02-26,19403.54,0,46.63,2.561,,,,...,A,151315,41595.55,46039.49,,2,8,2010,0,0
4,1,1,2010-03-05,21827.9,0,46.5,2.625,,,,...,A,151315,19403.54,41595.55,32990.77,3,9,2010,0,0


In [15]:
# Raw week (1..52) and month (1..12) numbers put December/January and week 52/
# week 1 far apart even though they are adjacent in time. Replace them with
# sin/cos pairs so the calendar wraps around, then drop the raw columns.
# Year stays as a plain number: the EDA showed mild non-stationarity that a
# cyclical encoding would hide.
# NOTE(review): ISO week numbers can reach 53; with a period of 52 such a week
# lands on the same point as week 1 - confirm this is acceptable.
month_angle = 2 * np.pi * train_merged["Month"] / 12
week_angle = 2 * np.pi * train_merged["Week"] / 52

train_merged = train_merged.assign(
    Sin_Month=np.sin(month_angle),
    Cos_Month=np.cos(month_angle),
    Sin_Week=np.sin(week_angle),
    Cos_Week=np.cos(week_angle),
).drop(columns=["Week", "Month"])

train_merged.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,sales_lag_1,sales_lag_2,sales_roll_4,Year,Holiday_Lead,Holiday_Lag,Sin_Month,Cos_Month,Sin_Week,Cos_Week
0,1,1,2010-02-05,24924.5,0,42.31,2.572,,,,...,,,,2010,1,0,0.866025,0.5,0.568065,0.822984
1,1,1,2010-02-12,46039.49,1,38.51,2.548,,,,...,24924.5,,,2010,0,0,0.866025,0.5,0.663123,0.748511
2,1,1,2010-02-19,41595.55,0,39.93,2.514,,,,...,46039.49,24924.5,,2010,0,1,0.866025,0.5,0.748511,0.663123
3,1,1,2010-02-26,19403.54,0,46.63,2.561,,,,...,41595.55,46039.49,,2010,0,0,0.866025,0.5,0.822984,0.568065
4,1,1,2010-03-05,21827.9,0,46.5,2.625,,,,...,19403.54,41595.55,32990.77,2010,0,0,1.0,6.123234000000001e-17,0.885456,0.464723


In [16]:
# Store and Dept are identifiers, not quantities: cast them to strings so they
# are treated as categories rather than numbers.
for id_column in ("Store", "Dept"):
    train_merged[id_column] = train_merged[id_column].astype(str)

In [17]:
# Persist the engineered feature table without the index column.
# NOTE(review): this writes a derived file into DATA_DIR ("../data/raw");
# consider a separate processed-data folder. CSV does not keep dtypes, so the
# string cast of Store/Dept has to be re-applied when the file is read back.
model_features_path = DATA_DIR + "/model_features.csv"
train_merged.to_csv(model_features_path, index=False)


##  Feature Engineering Complete

### Objective
Create forecast-safe, model-ready features for weekly Store–Department sales using insights from prior EDA.

### Key Decisions
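- Merged the training data with the store-week features (on Store and Date) and the store metadata (on Store) using validated many-to-one left joins, following the table relationships identified in EDA.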
- Encoded seasonal effects using cyclical transformations (sin / cos) for week and month to preserve calendar continuity.
- Retained year as a non-cyclical feature to capture mild non-stationarity observed in EDA.
- Modeled holiday effects explicitly using IsHoliday and lead/lag indicators.
- Captured short-term explainable history via Store–Dept lagged sales and rolling means (no long autoregressive structure).
- Preserved Store and Dept identifiers to allow heterogeneous behavior across entities.
- External variables (temperature, fuel price, markdowns, CPI, unemployment) were kept in raw form.

### Important Notes
- No rows were dropped in this notebook.
- Lag- and roll-induced nulls are expected and will be handled during train/validation assembly.
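- No scaling or categorical encoding (e.g. one-hot) was applied here; this is deferred to the modeling stage if required. The only transformations are the cyclical week/month encoding, the 0/1 holiday flags, and casting Store and Dept to strings.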

### Output
- Final feature table saved to `../data/raw/model_features.csv`.

 Next step: 03_modeling.ipynb — time-based split, baseline model training, and evaluation.
