In [3]:
# ============================================
# 02 — FEATURE ENGINEERING
# ============================================

from pathlib import Path

import numpy as np
import pandas as pd

# --------------------------------------------
# Load Data
# --------------------------------------------

# Read the raw extract, parse `date` into datetimes, and order the rows by
# SKU and then by date, so the positional shift/rolling operations further
# down walk each SKU's rows in date order.
df = (
    pd.read_csv("../data/raw/FMCG_2022_2024.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['sku', 'date'])
)

print("Original shape:", df.shape)

# --------------------------------------------
# DATA CLEANING
# --------------------------------------------

# Negative quantities (data corrections / returns) are floored at zero.
# No rows are dropped in this step, so the shape printed below is the same
# as the shape printed right after loading.
for quantity_col in ('units_sold', 'stock_available', 'delivered_qty'):
    df[quantity_col] = df[quantity_col].clip(lower=0)

print("Shape after cleaning:", df.shape)

# --------------------------------------------
# Calendar Features
# --------------------------------------------

# All calendar features are derived from the parsed `date` column.
date_parts = df['date'].dt
iso_calendar = date_parts.isocalendar()

df['day_of_week'] = date_parts.dayofweek            # Monday=0 ... Sunday=6
df['month'] = date_parts.month
df['week_of_year'] = iso_calendar.week.astype(int)  # ISO 8601 week number

# Saturday (5) and Sunday (6) are flagged with 1, every other day with 0.
weekend_mask = df['day_of_week'] >= 5
df['is_weekend'] = weekend_mask.astype(int)

# --------------------------------------------
# Lag Features (grouped by SKU)
# --------------------------------------------

# Each lag_k column holds the `units_sold` value found k rows earlier within
# the same SKU; the first k rows of every SKU have no predecessor and are NaN.
# NOTE(review): shift() counts rows, not calendar days. lag_7 means "7 days
# ago" only if every SKU has exactly one row per date -- TODO confirm. If a
# SKU has several rows per date, the grouping key needs the extra
# dimension(s) that identify a single series.
sku_sales = df.groupby('sku')['units_sold']
for offset in (1, 7, 14):
    df[f'lag_{offset}'] = sku_sales.shift(offset)

# --------------------------------------------
# Rolling Features
# --------------------------------------------

def _previous_7_rows(sales):
    """Return a 7-row rolling window over `sales` that excludes the current row."""
    # shift(1) moves every value down one row, so the window ending at a row
    # covers only the 7 rows before it and never that row's own units_sold.
    return sales.shift(1).rolling(7)


# Statistics are computed separately inside each SKU group; rows that do not
# yet have 7 earlier rows in their group come out as NaN.
units_by_sku = df.groupby('sku')['units_sold']

df['rolling_mean_7'] = units_by_sku.transform(
    lambda sales: _previous_7_rows(sales).mean()
)
df['rolling_std_7'] = units_by_sku.transform(
    lambda sales: _previous_7_rows(sales).std()
)

# --------------------------------------------
# Remove initial NaNs (from lags)
# --------------------------------------------

# Drop only the warm-up rows whose lag/rolling features could not be computed.
# A bare dropna() would also discard any row that has a missing value in an
# unrelated raw column, silently shrinking the dataset.
engineered_cols = ['lag_1', 'lag_7', 'lag_14', 'rolling_mean_7', 'rolling_std_7']
df = df.dropna(subset=engineered_cols).reset_index(drop=True)

print("Final shape after feature engineering:", df.shape)

# Create the output directory if it is missing; to_csv raises when the parent
# folder does not exist (e.g. on a fresh checkout).
output_path = Path("../data/processed/FMCG_features.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

Original shape: (190757, 14)
Shape after cleaning: (190757, 14)
Final shape after feature engineering: (190337, 23)
