# 03. Feature Engineering Analysis

This notebook visualizes the engineered features and their relationships with the target electricity price.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Put the parent of the current working directory on the import path so the
# `src` package can be imported below.
# NOTE(review): assumes the kernel is started one level below the project root — confirm.
PROJECT_ROOT = Path.cwd().parent
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

# Project-local imports; they must stay after the sys.path adjustment above.
from src.features import DataProcessor
from src.utils import load_data

# One shared seaborn theme for every figure in this notebook.
sns.set_theme(style="whitegrid", palette="viridis")

# Read the processed feature table and report its size.
df = load_data('processed', 'se3_features_v1.parquet')
n_rows = len(df)
n_features = len(df.columns)
print(f"Loaded {n_rows} rows and {n_features} features.")

## 1. Feature Correlation Heatmap

Identify which features are most strongly correlated with the target price, and spot potential multicollinearity among the features themselves.

In [None]:
# Correlation matrix of every numeric column, drawn as a diverging heatmap
# centred on zero (cell annotations switched off).
fig, ax = plt.subplots(figsize=(15, 10))
numeric_columns = df.select_dtypes(include=[np.number])
corr = numeric_columns.corr()
sns.heatmap(corr, annot=False, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Feature Correlation Matrix", fontsize=15)
plt.show()

## 2. Target vs. Key Features

Visualize how the target price relates to its 24h rolling mean and 24h lag, and how it is distributed by hour of day and by the peak-morning flag.

In [None]:
# Number of rows drawn for the scatter panels, and the seed that makes the
# draw repeatable across Restart & Run All.
SCATTER_SAMPLE_SIZE = 2000
SCATTER_SEED = 42

# Draw a single seeded sample, capped at the table size, and reuse it for both
# scatter panels. `DataFrame.sample` raises ValueError when asked for more rows
# than exist (sampling without replacement), and an unseeded draw would change
# the figure on every run.
scatter_sample = df.sample(
    n=min(SCATTER_SAMPLE_SIZE, len(df)),
    random_state=SCATTER_SEED,
)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 24h Rolling Mean
sns.scatterplot(data=scatter_sample, x='value_rolling_mean_24h', y='value', alpha=0.3, ax=axes[0, 0])
axes[0, 0].set_title("Price vs. 24h Rolling Mean")

# Lag 24h
sns.scatterplot(data=scatter_sample, x='value_lag_24', y='value', alpha=0.3, ax=axes[0, 1])
axes[0, 1].set_title("Price vs. 24h Lag")

# Hour of day distribution (uses the full table, not the sample)
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue`; once the installed version is confirmed, consider hue='hour', legend=False.
sns.boxplot(data=df, x='hour', y='value', ax=axes[1, 0], palette="flare")
axes[1, 0].set_title("Price Distribution by Hour")

# Is Peak Morning vs Standard (uses the full table, not the sample)
sns.violinplot(data=df, x='is_peak_morning', y='value', ax=axes[1, 1])
axes[1, 1].set_title("Price: Peak Morning vs. Other Hours")

plt.tight_layout()
plt.show()

## 3. Distribution of New Features

Check for skewness and outliers in the engineered features, using the 24h rolling standard deviation (price volatility) as an example.

In [None]:
# Histogram with a KDE overlay for the 24h rolling standard deviation column.
fig, ax = plt.subplots(figsize=(12, 6))
volatility = df['value_rolling_std_24h']
sns.histplot(volatility, kde=True, color='purple', ax=ax)
ax.set_title("Distribution of 24h Price Volatility (Rolling Std)")
plt.show()