In [None]:
# 📦 Imports
import pandas as pd
import numpy as np
import os

In [2]:
# 📂 Load dataset
# Read the extended retail sales CSV; the "date" column is parsed to datetimes on load.
data_path = "../data/processed/retail_sales_extended.csv"
df = pd.read_csv(filepath_or_buffer=data_path, parse_dates=["date"])

In [None]:
# 🧹 Clean up
# Drop the "Unnamed: 0" column if it is present (presumably a leftover index
# column from an earlier CSV export — confirm against the input file).
# errors="ignore" makes this cell idempotent: re-running it, or running it on a
# CSV that lacks the column, no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# 🔁 Order rows by item, then by date within each item, so that any
# per-item lag/shift features are computed over chronologically ordered rows.
df = df.sort_values(by=["item", "date"])

In [None]:
# # 🧠 Add more lag features
# for lag in [1, 3, 7]:
#     df[f"sales_lag_{lag}"] = df.groupby("item")["units_sold"].shift(lag)


In [None]:
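# # 📈 Rolling average & trend (over 30 days)
# # NOTE(review): the trend line below needs a "sales_7d_avg" column, which is not
# # created anywhere in this notebook — presumably it comes from the input CSV; confirm before re-enabling.
# df["sales_30d_avg"] = df.groupby("item")["units_sold"].transform(lambda x: x.rolling(30, min_periods=1).mean().shift(1))
# df["sales_30d_trend"] = df["sales_30d_avg"] - df["sales_7d_avg"]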

In [5]:
# 🗓️ Encode time-based features
# Both calendar features are derived from the parsed "date" column.
date_parts = df["date"].dt
df = df.assign(
    day_of_week=date_parts.dayofweek,  # 0=Monday, 6=Sunday
    month=date_parts.month,
)

In [6]:
# 🔤 Encode categorical features as integer category codes (label encoding).
# Each column is overwritten with its codes; pandas assigns code -1 to missing values.
for col in ("item", "category", "brand"):
    df[col] = df[col].astype("category").cat.codes

In [7]:
# 🧽 Handle missing values
# Replaces every NaN in the frame with 0. This is applied to all columns,
# not only to lag-feature columns.
df = df.fillna(value=0)

In [8]:
# 💾 Save engineered features
output_path = "../data/processed/retail_sales_features.csv"
# index=False: the row index is not written out as a CSV column.
df.to_csv(path_or_buf=output_path, index=False)

# Confirmation message showing where the file was written.
print(f"✅ Feature engineering complete. Saved to {output_path}")

✅ Feature engineering complete. Saved to ../data/processed/retail_sales_features.csv
