In [1]:
import pandas as pd
import numpy as np

# Read the cleaned dataset, convert Date to datetime, and order the rows by
# Store, Dept, Date so the per-group lag features built later follow date order.
df = (
    pd.read_csv("../data/processed_data/cleaned_data.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by=['Store', 'Dept', 'Date'])
)


In [2]:
# Time-based features: break Date into its calendar components.
dates = df['Date'].dt
calendar_features = {
    'Year': dates.year,
    'Month': dates.month,
    'Week': dates.isocalendar().week,
    'Day': dates.day,
    'DayOfWeek': dates.dayofweek,  # 0 = Monday ... 6 = Sunday
}
for feature_name, feature_values in calendar_features.items():
    df[feature_name] = feature_values

# 1 when the date falls on Saturday (5) or Sunday (6), otherwise 0.
weekend_days = [5, 6]
df['IsWeekend'] = df['DayOfWeek'].isin(weekend_days).astype(int)


In [5]:
# Lag features: the previous one and two observations of Weekly_Sales within
# each (Store, Dept) series.
# NOTE(review): shift() lags by row position, so these are 1- and 2-week lags
# only if every series has exactly one row per week with no gaps — TODO confirm.
sales_by_series = df.groupby(['Store', 'Dept'])['Weekly_Sales']
df['Lag_1'] = sales_by_series.shift(1)
df['Lag_2'] = sales_by_series.shift(2)

# Drop only the rows that lack lag history. A bare dropna() would also discard
# every row that has a missing value in any unrelated column.
df.dropna(subset=['Lag_1', 'Lag_2'], inplace=True)


In [6]:
# Interaction features: every MarkDown column multiplied by the holiday flag,
# stored as '<MarkDownN>_holiday'.
markdown_cols = [
    'MarkDown1',
    'MarkDown2',
    'MarkDown3',
    'MarkDown4',
    'MarkDown5',
]
holiday_flag = df['IsHoliday_x']

for col in markdown_cols:
    interaction_name = f'{col}_holiday'
    df[interaction_name] = df[col].mul(holiday_flag)


In [7]:
# Categorical encoding: replace 'Type' with 'Type_<level>' indicator columns,
# omitting the first level, and append them after the remaining columns.
type_indicators = pd.get_dummies(df['Type'], prefix='Type', drop_first=True)
df = pd.concat([df.drop(columns=['Type']), type_indicators], axis=1)


In [8]:
# Write the engineered feature table to CSV (row index omitted) and confirm.
featured_path = "../data/processed_data/featured_data.csv"
df.to_csv(featured_path, index=False)
print("Featured dataset saved.")


Featured dataset saved.
