# Feature Engineering Exploration

This notebook explores feature engineering techniques for time series forecasting.

## Objectives
1. Create temporal features
2. Generate lag features
3. Build rolling window statistics
4. Add holiday indicators
5. Analyze feature importance
6. Select optimal features

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

sys.path.append(str(Path.cwd().parent))

from src.data_loader import load_sales_data
from src.feature_engineering import FeatureEngineer

sns.set_style("whitegrid")
%matplotlib inline

## 1. Load Data

In [None]:
# Load the raw sales records and collapse them to one row per store and day.
RAW_SALES_PATH = Path.cwd().parent / "data" / "raw" / "sales_data.parquet"
df = load_sales_data(RAW_SALES_PATH)

# One named aggregation replaces the earlier "two group-bys + merge":
#   - sales    -> summed over all rows of the store/day
#   - is_promo -> max over the rows (assuming a 0/1 or boolean flag, this marks
#                 the store/day as promoted when any row was on promotion)
# The resulting columns are store_id, date, sales, is_promo, as before.
df_agg = (
    df.groupby(['store_id', 'date'])
    .agg(sales=('sales', 'sum'), is_promo=('is_promo', 'max'))
    .reset_index()
)

print(f"Data shape: {df_agg.shape}")
df_agg.head()

## 2. Temporal Features

In [None]:
# Build the feature engineer on the aggregated frame and add temporal features
engineer = FeatureEngineer(df_agg, date_column='date')
df_temporal = engineer.add_temporal_features()

# Every column that was not in the input frame was added by this step
input_columns = set(df_agg.columns)
temporal_cols = [name for name in df_temporal.columns if name not in input_columns]

print("Temporal features created:")
print(temporal_cols)

# Preview the date next to the first five new columns
preview_columns = ['date', *temporal_cols[:5]]
df_temporal[preview_columns].head(10)

## 3. Lag Features

In [None]:
# Add lagged copies of the sales target, passing store_id as the grouping column
lag_periods = [1, 7, 14, 28]
df_lags = engineer.add_lag_features(
    target_column='sales',
    lags=lag_periods,
    group_columns=['store_id'],
)

# Lag columns are identified by the substring 'lag' in their name
lag_cols = [name for name in df_lags.columns if 'lag' in name]
print(f"\nLag features: {lag_cols}")

# Correlate sales with its lags for store 1, using only rows without missing values
is_store_1 = df_lags['store_id'] == 1
store_1 = df_lags.loc[is_store_1].dropna()
corr_matrix = store_1[['sales', *lag_cols]].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Lag Feature Correlations (Store 1)')
fig.tight_layout()
plt.show()

## 4. Rolling Window Features

In [None]:
# Add rolling mean/std of sales over 7-, 14- and 28-row windows,
# passing store_id as the grouping column
df_rolling = engineer.add_rolling_features(
    target_column='sales',
    windows=[7, 14, 28],
    stats=['mean', 'std'],
    group_columns=['store_id'],
)

rolling_cols = [name for name in df_rolling.columns if 'rolling' in name]
print(f"Rolling features: {rolling_cols}")

# Store 1 only, indexed by date so the x-axis is the timeline
store_1_data = df_rolling.loc[df_rolling['store_id'] == 1].set_index('date')

# Each panel: (title, [(column, legend label, alpha), ...]); alpha=None keeps the default
panel_specs = [
    (
        'Sales with Rolling Means',
        [
            ('sales', 'Actual', 0.5),
            ('sales_rolling_7_mean', '7-day MA', None),
            ('sales_rolling_28_mean', '28-day MA', None),
        ],
    ),
    (
        'Rolling Standard Deviation',
        [
            ('sales_rolling_7_std', '7-day Std', None),
            ('sales_rolling_28_std', '28-day Std', None),
        ],
    ),
]

fig, axes = plt.subplots(2, 1, figsize=(14, 8))

for ax, (panel_title, series_specs) in zip(axes, panel_specs):
    for column, legend_label, alpha in series_specs:
        ax.plot(store_1_data.index, store_1_data[column], label=legend_label, alpha=alpha)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Holiday Features

In [None]:
# Create holiday features
df_holidays = engineer.add_holiday_features()

holiday_cols = [col for col in df_holidays.columns if 'holiday' in col]
print(f"Holiday features: {holiday_cols}")

# Analyze holiday impact: summary statistics of sales split by the is_holiday flag.
# Groups are relabelled by value rather than by position, so the cell still works
# when only one group is present (e.g. no holidays in the data) and never depends
# on the group sort order. The False/True keys also match 0/1 integer flags,
# because 0 == False and 1 == True hash identically in Python.
holiday_impact = (
    df_holidays.groupby('is_holiday')['sales']
    .agg(['mean', 'median', 'std'])
    .rename(index={False: 'Non-Holiday', True: 'Holiday'})
    .rename_axis(None)  # drop the 'is_holiday' index name from the printed table
)
print("\nHoliday Impact:")
print(holiday_impact)

## 6. Create All Features

In [None]:
# Create complete feature set
df_features = engineer.create_all_features(
    target_column='sales',
    group_columns=['store_id']
)


def _count_columns(columns, include, exclude=()):
    """Count column names containing any `include` token and none of the `exclude` tokens."""
    return sum(
        1
        for name in columns
        if any(token in name for token in include)
        and not any(token in name for token in exclude)
    )


feature_names = list(df_features.columns)

# 'day' is a substring of 'holiday', so a plain substring match would count every
# holiday column as temporal too. Columns that belong to one of the more specific
# categories are therefore excluded from the temporal count.
TEMPORAL_TOKENS = ['year', 'month', 'day', 'week']
NON_TEMPORAL_TOKENS = ['lag', 'rolling', 'holiday', 'promo']

print(f"Total features: {len(feature_names)}")
print("\nFeature categories:")
print(f"  - Temporal: {_count_columns(feature_names, TEMPORAL_TOKENS, NON_TEMPORAL_TOKENS)}")
print(f"  - Lag: {_count_columns(feature_names, ['lag'])}")
print(f"  - Rolling: {_count_columns(feature_names, ['rolling'])}")
print(f"  - Holiday: {_count_columns(feature_names, ['holiday'])}")
print(f"  - Promotion: {_count_columns(feature_names, ['promo'])}")

## 7. Feature Importance (Quick ML Model)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Prepare data for one store; rows with any missing value are dropped
store_data = df_features[df_features['store_id'] == 1].dropna()
if store_data.empty:
    # Fail early with a readable message instead of an opaque estimator error
    raise ValueError(
        "No complete rows for store_id == 1 after dropna(); cannot fit the model."
    )

# Candidate features: everything except the keys and the target
candidate_cols = [col for col in store_data.columns
                  if col not in ['date', 'store_id', 'sales']]

# The random forest needs numeric input, so keep numeric/bool columns only and
# report anything that was left out rather than dropping it silently.
X = store_data[candidate_cols].select_dtypes(include=['number', 'bool'])
feature_cols = list(X.columns)
skipped_cols = [col for col in candidate_cols if col not in feature_cols]
if skipped_cols:
    print(f"Skipping non-numeric columns: {skipped_cols}")

y = store_data['sales']

# Train quick RF model (fixed random_state for reproducible importances)
# NOTE(review): importances are computed on the data the model was fitted on, and
# any feature derived from same-day sales would leak the target - confirm how the
# rolling features are shifted in FeatureEngineer.
rf = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
rf.fit(X, y)

# Feature importance, most important first
importance_df = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

# Plot top 20
top_20 = importance_df.head(20)
positions = range(len(top_20))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(positions, top_20['importance'])
ax.set_yticks(positions)
ax.set_yticklabels(top_20['feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 20 Feature Importances')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
plt.show()

print("\nTop 10 Features:")
print(importance_df.head(10))

## 8. Key Takeaways

### Feature Engineering Insights:

1. **Lag Features**: Sales correlate strongly with recent lags (1 and 7 days).
2. **Rolling Features**: Rolling statistics capture trend and volatility effectively.
3. **Holiday Features**: Holidays have a significant impact on sales.
4. **Temporal Features**: Day of week shows strong patterns.
5. **Feature Importance**: Lag features are typically the most important.

### Recommendations:

- Focus on lag features (1, 7, 14, 28 days)
- Include rolling means and standard deviations
- Add holiday indicators and proximity features
- Use cyclical encoding for temporal features
- Consider interaction features for promotions