# 02. Senior Exploratory Data Analysis (EDA)
---
**Project**: Swedish Electricity Price Forecasting (2024-2025)

This notebook focuses on extracting strategic insights from the raw price data across all four bidding zones (SE1-SE4). 
We aim to identify:
1. **Volatility Patterns**: When do prices spike or drop below zero?
2. **Distribution Characteristics**: Are the prices heavily skewed?
3. **Regional Dynamics**: How do the Northern (hydro-rich) and Southern (consumption-heavy) zones interact?

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# --- Project path bootstrap ---
# The project root is taken to be the parent of the current working directory;
# it is appended to sys.path (once) so the `src` package below can be imported.
PROJECT_ROOT = Path.cwd().parent
_root_entry = str(PROJECT_ROOT)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.utils import get_data_path, convert_to_swedish_time

# --- Output location for exported figures ---
# Created up front (including missing parents) so later savefig calls succeed.
FIGURES_DIR = PROJECT_ROOT.joinpath("reports", "figures")
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# --- Plot styling ---
sns.set_context("talk")
plt.style.use('seaborn-v0_8-whitegrid')
palette = sns.color_palette("viridis", 4)
# Fixed colour per bidding zone so every figure uses the same mapping.
ZONE_COLORS = {
    'SE1': '#2ecc71',
    'SE2': '#3498db',
    'SE3': '#f1c40f',
    'SE4': '#e74c3c',
}

print(f"Environment ready. Visuals will be exported to: {FIGURES_DIR}")

## 1. Professional Data Loading
Synchronizing multiple CSV sources into a unified timestamp-indexed DataFrame.

In [ ]:
# Directory that is globbed for the raw price CSV files by the loader below.
# NOTE(review): the actual location is decided by src.utils.get_data_path — confirm there.
RAW_DATA_DIR = get_data_path('raw')

def _first_matching_column(columns, token, source):
    """Return the first column name containing *token*, or raise naming *source*."""
    match = next((c for c in columns if token in c), None)
    if match is None:
        raise ValueError(
            f"No column containing '{token}' found in {source} "
            f"(available columns: {list(columns)})"
        )
    return match


def load_unified_prices():
    """Load every raw price CSV into one long-format DataFrame.

    Files matching ``*_PRICES_*.csv`` inside ``RAW_DATA_DIR`` are read in
    sorted (deterministic) order. For each file the columns whose names
    contain 'MTU', 'Price' and 'Area' are located and reduced to three
    columns: ``timestamp`` (start of the MTU interval), ``area`` (with the
    ``BZN|`` prefix removed) and ``price`` (numeric; unparsable values
    become NaN and the row is dropped).

    Returns:
        DataFrame with columns ``timestamp``, ``area`` and ``price``, one row
        per (timestamp, area) pair, sorted by area and then timestamp.

    Raises:
        FileNotFoundError: if no file in ``RAW_DATA_DIR`` matches the pattern.
        ValueError: if a file lacks one of the expected columns.
    """
    # Sorted so the load order does not depend on filesystem enumeration order.
    files = sorted(RAW_DATA_DIR.glob("*_PRICES_*.csv"))
    if not files:
        # Without this guard pd.concat([]) fails with an opaque
        # "No objects to concatenate" error.
        raise FileNotFoundError(
            f"No files matching '*_PRICES_*.csv' found in {RAW_DATA_DIR}"
        )

    chunks = []
    for f in files:
        df_tmp = pd.read_csv(f)
        mtu_col = _first_matching_column(df_tmp.columns, 'MTU', f)
        p_col = _first_matching_column(df_tmp.columns, 'Price', f)
        area_col = _first_matching_column(df_tmp.columns, 'Area', f)

        # Standardizing Time: keep only the interval start (text before ' - ')
        # and strip any '(CET)' / '(CEST)' marker before parsing.
        interval_start = (
            df_tmp[mtu_col]
            .str.split(' - ').str[0]
            .str.replace(r'\(CE[S]?T\)', '', regex=True)
            .str.strip()
        )
        df_tmp['timestamp'] = pd.to_datetime(
            interval_start, dayfirst=True, format='mixed'
        )
        df_tmp['area'] = df_tmp[area_col].str.replace(r'BZN\|', '', regex=True)
        df_tmp['price'] = pd.to_numeric(df_tmp[p_col], errors='coerce')

        chunks.append(df_tmp[['timestamp', 'area', 'price']])

    full_df = pd.concat(chunks, ignore_index=True).dropna()
    # Localize and convert using project utils
    # NOTE(review): exact timezone handling lives in convert_to_swedish_time — confirm there.
    full_df = convert_to_swedish_time(full_df, 'timestamp')

    # Overlapping files can yield several rows for the same (timestamp, area);
    # they are collapsed to their mean so a later pivot sees unique keys.
    initial_count = len(full_df)
    full_df = full_df.groupby(['timestamp', 'area'])['price'].mean().reset_index()
    final_count = len(full_df)
    if initial_count > final_count:
        print(f"Removed {initial_count - final_count} duplicate entries.")

    return full_df.sort_values(['area', 'timestamp'])

# Long format: one row per (timestamp, area) with its price.
df_all = load_unified_prices()

# Wide format: one row per timestamp, one column per bidding zone.
# The loader has already collapsed duplicate (timestamp, area) pairs,
# which pivot() needs in order to reshape without error.
df_pivot = df_all.pivot(
    index='timestamp',
    columns='area',
    values='price',
)

_ts = df_all['timestamp']
print(f"Dataset: {_ts.min()} to {_ts.max()}")
df_pivot.head()

## 2. Figure 1: Performance Timeline (Multi-Scale Trend)
Comparison of daily average prices and their 30-day moving averages across the four Swedish bidding zones to identify macroeconomic shifts.

In [ ]:
# Single wide canvas shared by all bidding zones.
fig, ax = plt.subplots(figsize=(18, 8))

for area in df_pivot.columns:
    zone_color = ZONE_COLORS.get(area)

    # Collapse each zone's series to one mean value per calendar day.
    daily = df_pivot[area].resample('D').mean()
    ax.plot(daily.index, daily, label=area, color=zone_color, alpha=0.9, linewidth=1.5)

    # Thick, faint line: 30-day rolling mean of the daily values.
    # It is intentionally unlabeled so the legend lists each zone once.
    trend = daily.rolling(30).mean()
    ax.plot(daily.index, trend, color=zone_color, alpha=0.3, linewidth=4)

ax.set_title("Swedish Electricity Price Trends: Daily Avg & 30D Trend", fontsize=20, pad=20, fontweight='bold')
ax.set_ylabel("Price (EUR/MWh)", fontsize=14)
ax.legend(frameon=True, facecolor='white', shadow=True)
ax.grid(True, which='both', linestyle=':', alpha=0.6)
fig.tight_layout()
fig.savefig(FIGURES_DIR / "01_price_timeline_hq.png", dpi=300)
plt.show()

## 3. Figure 2: Anatomy of Volatility (Distribution & Outliers)
Using Boxenplots and Violin plots to expose the density of negative prices and extreme spikes.

In [ ]:
# Two side-by-side panels (left one 1.5x wider): a boxenplot of the full price
# range and a violin plot zoomed into the most common range.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8), gridspec_kw={'width_ratios': [1.5, 1]})

# Same zone order in both panels so they can be compared column by column.
zone_order = ['SE1', 'SE2', 'SE3', 'SE4']

# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; confirm the installed seaborn version before switching to
# hue='area', legend=False.

# 1. High-Density Distribution
sns.boxenplot(data=df_all, x='area', y='price', palette=ZONE_COLORS, ax=ax1, order=zone_order)
ax1.set_title("Price Distribution Density (Boxenplot)", fontsize=16, fontweight='bold')
ax1.axhline(0, color='black', linestyle='--', linewidth=1)  # marks the negative-price boundary
ax1.set_ylabel("Price (EUR/MWh)")

# 2. Violin Plot for Peak Density
sns.violinplot(data=df_all, x='area', y='price', palette=ZONE_COLORS, ax=ax2, inner='quartile', order=zone_order)
ax2.set_title("Probability Density (Violin Plot)", fontsize=16, fontweight='bold')
ax2.set_ylim(-50, 200) # Zoom into the most active range
# Match the left panel instead of showing the raw column name 'price'.
ax2.set_ylabel("Price (EUR/MWh)")

plt.suptitle("Outlier and Skewness Analysis across Bidding Zones", fontsize=22, y=1.05)
plt.tight_layout()
# The suptitle sits at y=1.05, i.e. above the figure canvas. bbox_inches='tight'
# grows the saved area to include it; without it the exported PNG crops the title.
plt.savefig(FIGURES_DIR / "02_distribution_volatility.png", dpi=300, bbox_inches='tight')
plt.show()

## 4. Figure 3: Regional Interconnectivity (Correlation Matrix)
Quantifying how much the Southern zones (SE3/SE4) decouple from the Northern zones (SE1/SE2).

In [ ]:
# Pairwise correlation between the zone price columns.
corr = df_pivot.corr()

# Hide the upper triangle and the diagonal: the matrix is symmetric, so only
# the cells below the diagonal carry distinct information.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
heatmap_style = dict(
    annot=True,
    cmap='RdYlGn',
    fmt=".3f",
    linewidths=2,
    cbar_kws={"shrink": .8},
)
sns.heatmap(corr, mask=mask, ax=ax, **heatmap_style)

ax.set_title("Pearson Correlation: Regional Price Synchronization", fontsize=18, fontweight='bold', pad=20)
fig.savefig(FIGURES_DIR / "03_regional_correlation.png", dpi=300)
plt.show()

# Extra: hour-of-day vs. month heatmap of the mean SE3 price
print("Analyzing Hourly Seasonality for SE3...")

# Independent copy of the SE3 column with the calendar parts of the index
# attached as plain columns, so they can serve as pivot keys.
df_se3 = df_pivot[['SE3']].assign(
    hour=df_pivot.index.hour,
    month=df_pivot.index.month,
)

# Rows: hour of day, columns: month, cell: mean SE3 price for that combination.
pivot_seas = df_se3.pivot_table(
    index='hour',
    columns='month',
    values='SE3',
    aggfunc='mean',
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pivot_seas, cmap='magma', annot=False, ax=ax)
ax.set_title("Seasonality Heatmap: Hour vs Month (SE3 Avg Price)", fontsize=16)
fig.savefig(FIGURES_DIR / "04_hourly_seasonality_se3.png", dpi=300)
plt.show()