# Base Load Analysis

## Objectives:
1. Separate base load from total energy consumption
2. Identify consumption patterns (weekday vs weekend, seasonal, time-of-day)
3. Model selection and validation for load prediction
4. Anomaly detection in energy consumption
5. Load clustering and classification
6. Create features for load forecasting models

## Key Analyses:
- Base load extraction and decomposition
- Consumption pattern analysis by time period
- Load forecasting model development
- Anomaly detection and outlier identification
- Load clustering (typical vs atypical days)
- Peak demand analysis and load balancing opportunities
- Energy efficiency metrics and benchmarking

In [ ]:
# Import required libraries
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import asyncio
import warnings
warnings.filterwarnings('ignore')

# Add pems_v2 directory to path for imports
sys.path.append(str(Path('../pems_v2').resolve()))

# Import project modules
from analysis.core.data_extraction import DataExtractor
from analysis.analyzers.base_load_analysis import BaseLoadAnalyzer
from config.settings import PEMSSettings

# Set up plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

## 1. Load Energy Consumption Data

Load total energy consumption and related data for base load analysis

In [None]:
# Project objects shared by the rest of the notebook: the settings, the data
# extractor built from them, and the base load analyzer.
settings = PEMSSettings()
extractor = DataExtractor(settings)
base_load_analyzer = BaseLoadAnalyzer()

# Analysis window: the 120 days ending at the moment this cell is executed.
end_date = datetime.now()
start_date = end_date - timedelta(days=120)

print("Analysis period: {:%Y-%m-%d} to {:%Y-%m-%d}".format(start_date, end_date))

In [None]:
# Extract consumption and related data
async def load_consumption_data():
    """Fetch every dataset the base load analysis uses for the analysis window.

    Reads the notebook-level ``extractor``, ``start_date`` and ``end_date``.
    The extractions are awaited one after another, in the order of the
    returned tuple. EV and battery extraction are best-effort: if either
    raises, a warning is printed and an empty ``DataFrame`` is returned in its
    place. Exceptions from the other extractions propagate to the caller.

    Returns:
        Tuple ``(consumption_data, pv_data, room_data, relay_data,
        weather_data, ev_data, battery_data)``.
    """

    async def _load_or_empty(method_name, label):
        # Best-effort loader: any failure degrades to an empty frame.
        try:
            return await getattr(extractor, method_name)(start_date, end_date)
        except Exception as e:
            print(f"Warning: Could not load {label} data: {e}")
            return pd.DataFrame()

    print("Loading energy consumption data...")
    consumption = await extractor.extract_energy_consumption(start_date, end_date)

    print("Loading PV production data...")
    pv = await extractor.extract_pv_data(start_date, end_date)

    print("Loading room temperature data...")
    rooms = await extractor.extract_room_temperatures(start_date, end_date)

    print("Loading heating relay data...")
    relays = await extractor.extract_relay_states(start_date, end_date)

    print("Loading weather data...")
    weather = await extractor.extract_weather_data(start_date, end_date)

    print("Loading EV charging data...")
    ev = await _load_or_empty('extract_ev_data', 'EV')

    print("Loading battery data...")
    battery = await _load_or_empty('extract_battery_data', 'battery')

    return consumption, pv, rooms, relays, weather, ev, battery

# Load data
consumption_data, pv_data, room_data, relay_data, weather_data, ev_data, battery_data = await load_consumption_data()

print(f"\nData loaded:")
print(f"  Consumption records: {len(consumption_data)}")
print(f"  PV records: {len(pv_data)}")
print(f"  Room data: {len(room_data)} rooms")
print(f"  Relay data: {len(relay_data)} rooms")
print(f"  Weather records: {len(weather_data)}")
print(f"  EV records: {len(ev_data)}")
print(f"  Battery records: {len(battery_data)}")

# Display consumption data columns
if not consumption_data.empty:
    print(f"\nConsumption data columns: {list(consumption_data.columns)}")

## 2. Data Preparation and Base Load Extraction

Prepare consumption data and extract base load components

In [None]:
# Build the hourly frame `consumption_hourly` used by the later cells.
# Whenever preparation fails it is left as an empty DataFrame so that the
# later cells can detect that and skip their work.
if consumption_data.empty:
    print("Error: No consumption data available")
    consumption_hourly = pd.DataFrame()
else:
    # Hourly means of the raw readings
    consumption_hourly = consumption_data.resample('H').mean()

    # Candidate columns, matched case-insensitively on the column name
    power_columns = [name for name in consumption_hourly.columns if 'power' in name.lower()]
    energy_columns = [name for name in consumption_hourly.columns if 'energy' in name.lower()]

    print(f"Power columns found: {power_columns}")
    print(f"Energy columns found: {energy_columns}")

    if power_columns:
        # The first power column is taken as the total consumption
        total_power_col = power_columns[0]
        consumption_hourly['total_power'] = consumption_hourly[total_power_col]
    elif not pv_data.empty and 'ACPowerToUser' in pv_data.columns:
        # Fallback: use the PV data's ACPowerToUser reading as total power
        pv_hourly = pv_data['ACPowerToUser'].resample('H').mean()
        consumption_hourly = consumption_hourly.merge(
            pv_hourly.to_frame('total_power'),
            left_index=True, right_index=True, how='outer')
    else:
        print("Warning: No suitable consumption data found")
        consumption_hourly = pd.DataFrame()

    if consumption_hourly.empty or 'total_power' not in consumption_hourly.columns:
        print("Error: Could not prepare consumption data")
        consumption_hourly = pd.DataFrame()
    else:
        # Floor negative readings at zero, then cap (not drop) everything
        # above the 99.5th percentile of the floored values
        floored_power = consumption_hourly['total_power'].clip(lower=0)
        upper_limit = floored_power.quantile(0.995)
        consumption_hourly['total_power'] = floored_power.clip(upper=upper_limit)

        # Calendar features derived from the index
        hourly_index = consumption_hourly.index
        consumption_hourly['hour'] = hourly_index.hour
        consumption_hourly['day_of_week'] = hourly_index.dayofweek
        consumption_hourly['month'] = hourly_index.month
        consumption_hourly['is_weekend'] = consumption_hourly['day_of_week'].isin([5, 6])
        # Working hours: hours 8 through 18 inclusive on non-weekend days
        consumption_hourly['is_working_hours'] = (
            consumption_hourly['hour'].between(8, 18)
            & ~consumption_hourly['is_weekend']
        )

        print(f"\nPrepared consumption data: {len(consumption_hourly)} hours")
        print(f"Average consumption: {consumption_hourly['total_power'].mean():.1f} W")
        print(f"Peak consumption: {consumption_hourly['total_power'].max():.1f} W")
        print(f"Minimum consumption: {consumption_hourly['total_power'].min():.1f} W")

In [None]:
# Extract base load using multiple methods
def extract_base_load(consumption_data, method='percentile'):
    """Estimate the base load of a consumption frame with several methods.

    Args:
        consumption_data: DataFrame with a ``total_power`` column. The hour of
            day is read from an ``hour`` column when present, otherwise from
            the index when it is a ``DatetimeIndex``. The 24-row rolling
            window and the 168-row threshold below treat rows as hours.
        method: Kept for backward compatibility; it is currently unused and
            every estimate is always computed.

    Returns:
        ``None`` when the frame is empty, has no ``total_power`` column, or
        holds no non-missing power values. Otherwise a dict mapping estimate
        name to value:

        * ``percentile_10`` / ``percentile_5`` - low quantiles of the power.
        * ``night_minimum`` / ``night_median`` - statistics over hours 2-5
          inclusive; omitted when no hour information or no non-missing night
          reading is available.
        * ``rolling_min_mean`` / ``rolling_min_median`` - mean and median of a
          centred 24-row rolling minimum.
        * ``stl_trend_min`` / ``stl_trend_p10`` - from the STL trend; only for
          more than 168 rows and only when the decomposition succeeds.
    """
    if consumption_data.empty or 'total_power' not in consumption_data.columns:
        return None

    power_series = consumption_data['total_power'].dropna()
    if power_series.empty:
        # Nothing but missing values: every estimate would be NaN
        return None

    base_load_methods = {}

    # Method 1: Percentile approach
    base_load_methods['percentile_10'] = power_series.quantile(0.10)
    base_load_methods['percentile_5'] = power_series.quantile(0.05)

    # Method 2: Night hours (2-5 AM). Fall back to the index when the caller
    # did not add an 'hour' column instead of failing with a KeyError.
    if 'hour' in consumption_data.columns:
        hour_of_day = consumption_data['hour'].to_numpy()
    elif isinstance(consumption_data.index, pd.DatetimeIndex):
        hour_of_day = consumption_data.index.hour.to_numpy()
    else:
        hour_of_day = None

    if hour_of_day is not None:
        night_mask = (hour_of_day >= 2) & (hour_of_day <= 5)
        # Drop missing readings so an all-NaN night cannot yield a NaN estimate
        night_power = consumption_data.loc[night_mask, 'total_power'].dropna()
        if not night_power.empty:
            base_load_methods['night_minimum'] = night_power.min()
            base_load_methods['night_median'] = night_power.median()

    # Method 3: Rolling minimum (centred 24-row window)
    rolling_min = power_series.rolling(window=24, center=True).min()
    base_load_methods['rolling_min_mean'] = rolling_min.mean()
    base_load_methods['rolling_min_median'] = rolling_min.median()

    # Method 4: Statistical decomposition (needs more than a week of hours)
    if len(power_series) > 168:
        try:
            # NOTE(review): STL is not imported in the notebook's visible
            # import cell (presumably statsmodels' STL - confirm). While it is
            # undefined, the NameError is caught below and the STL estimates
            # are simply left out.
            # period=24 (daily cycle of hourly rows) is passed explicitly:
            # dropna() above can strip the index frequency that STL would
            # otherwise need in order to infer the period. seasonal=13 is the
            # length of the seasonal smoother, not a period.
            stl = STL(power_series.interpolate(), seasonal=13, period=24)
            result = stl.fit()
            trend_component = result.trend
            base_load_methods['stl_trend_min'] = trend_component.min()
            base_load_methods['stl_trend_p10'] = trend_component.quantile(0.10)
        except Exception as e:
            print(f"STL decomposition failed: {e}")

    return base_load_methods

# Run the base load estimators and split consumption into base + variable load
if consumption_hourly.empty:
    print("Error: No consumption data for base load extraction")
else:
    base_load_estimates = extract_base_load(consumption_hourly)

    if not base_load_estimates:
        print("Error: Could not estimate base load")
    else:
        print("\nBase Load Estimation Results:")
        print("=" * 50)
        for method, value in base_load_estimates.items():
            print(f"{method:20s}: {value:6.1f} W")

        # Prefer the night median; fall back to the 10th percentile
        base_load_estimate = (
            base_load_estimates['night_median']
            if 'night_median' in base_load_estimates
            else base_load_estimates['percentile_10']
        )

        # Variable load is whatever exceeds the constant base load, floored at 0
        consumption_hourly['base_load'] = base_load_estimate
        consumption_hourly['variable_load'] = (
            consumption_hourly['total_power'] - base_load_estimate
        ).clip(lower=0)

        print(f"\nSelected base load estimate: {base_load_estimate:.1f} W")
        print(f"Average variable load: {consumption_hourly['variable_load'].mean():.1f} W")
        print(f"Base load percentage: {base_load_estimate/consumption_hourly['total_power'].mean()*100:.1f}%")

## 3. Consumption Pattern Analysis

Analyze consumption patterns by time period and identify typical behaviors

In [None]:
# Summarise consumption by hour of day, day of week and month
if consumption_hourly.empty or 'total_power' not in consumption_hourly.columns:
    print("No consumption data available for pattern analysis")
else:
    def _power_stats(frame, key):
        """Return mean/std/min/max of total_power per value of column *key*."""
        return frame.groupby(key)['total_power'].agg(['mean', 'std', 'min', 'max'])

    # Hour-of-day profiles, split by weekday / weekend
    daily_patterns = {
        'weekday': _power_stats(consumption_hourly[~consumption_hourly['is_weekend']], 'hour'),
        'weekend': _power_stats(consumption_hourly[consumption_hourly['is_weekend']], 'hour'),
        'all_days': _power_stats(consumption_hourly, 'hour'),
    }

    # Day-of-week and month profiles
    weekly_pattern = _power_stats(consumption_hourly, 'day_of_week')
    monthly_pattern = _power_stats(consumption_hourly, 'month')

    print("\nConsumption Pattern Analysis:")
    print("=" * 60)

    weekday_mean = daily_patterns['weekday']['mean']
    weekend_mean = daily_patterns['weekend']['mean']

    # Hours with the highest average consumption
    peak_hour_weekday, peak_power_weekday = weekday_mean.idxmax(), weekday_mean.max()
    peak_hour_weekend, peak_power_weekend = weekend_mean.idxmax(), weekend_mean.max()

    print(f"Peak consumption - Weekdays: {peak_hour_weekday:02d}:00 ({peak_power_weekday:.0f}W)")
    print(f"Peak consumption - Weekends: {peak_hour_weekend:02d}:00 ({peak_power_weekend:.0f}W)")

    # Hours with the lowest average consumption
    low_hour_weekday, low_power_weekday = weekday_mean.idxmin(), weekday_mean.min()
    low_hour_weekend, low_power_weekend = weekend_mean.idxmin(), weekend_mean.min()

    print(f"Low consumption - Weekdays: {low_hour_weekday:02d}:00 ({low_power_weekday:.0f}W)")
    print(f"Low consumption - Weekends: {low_hour_weekend:02d}:00 ({low_power_weekend:.0f}W)")

    # Load factor: mean of the hourly means over the largest hourly maximum
    all_days = daily_patterns['all_days']
    daily_load_factor = all_days['mean'].mean() / all_days['max'].max()
    print(f"\nDaily load factor: {daily_load_factor:.3f}")

    # Relative difference of the weekend average against the weekday average
    weekend_avg = weekend_mean.mean()
    weekday_avg = weekday_mean.mean()
    weekend_difference = (weekend_avg - weekday_avg) / weekday_avg * 100

    print(f"Weekend vs weekday difference: {weekend_difference:+.1f}%")

In [None]:
# Visualize consumption patterns in a 2x3 grid. Runs only when the pattern
# analysis cell has produced `daily_patterns`.
if 'daily_patterns' in locals():
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Daily patterns comparison (shaded band: weekday mean +/- one std)
    daily_patterns['weekday']['mean'].plot(ax=axes[0,0], label='Weekday', linewidth=2, marker='o')
    daily_patterns['weekend']['mean'].plot(ax=axes[0,0], label='Weekend', linewidth=2, marker='s')
    axes[0,0].fill_between(daily_patterns['weekday'].index,
                          daily_patterns['weekday']['mean'] - daily_patterns['weekday']['std'],
                          daily_patterns['weekday']['mean'] + daily_patterns['weekday']['std'],
                          alpha=0.3)
    axes[0,0].set_title('Daily Consumption Patterns')
    axes[0,0].set_xlabel('Hour of Day')
    axes[0,0].set_ylabel('Power (W)')
    axes[0,0].legend()
    axes[0,0].grid(True, alpha=0.3)

    # Weekly pattern. Tick labels are looked up from the days actually present
    # in the data rather than assuming all seven bars exist.
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    weekly_pattern['mean'].plot(kind='bar', ax=axes[0,1], color='orange', alpha=0.8)
    axes[0,1].set_title('Weekly Consumption Pattern')
    axes[0,1].set_xlabel('Day of Week (0=Monday)')
    axes[0,1].set_ylabel('Average Power (W)')
    axes[0,1].set_xticklabels([day_names[int(day)] for day in weekly_pattern.index], rotation=0)
    axes[0,1].grid(True, alpha=0.3)

    # Monthly pattern. The analysis window covers only a few months, so the
    # chart has fewer than 12 bars; a fixed Jan..Dec label list would not match
    # the bar count and would label the first bar 'Jan' whatever month it is.
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    monthly_pattern['mean'].plot(kind='bar', ax=axes[0,2], color='green', alpha=0.8)
    axes[0,2].set_title('Monthly Consumption Pattern')
    axes[0,2].set_xlabel('Month')
    axes[0,2].set_ylabel('Average Power (W)')
    axes[0,2].set_xticklabels([month_names[int(month) - 1] for month in monthly_pattern.index],
                              rotation=45)
    axes[0,2].grid(True, alpha=0.3)

    # Base load vs variable load (only when the base load cell has run)
    if 'variable_load' in consumption_hourly.columns:
        daily_base = consumption_hourly.groupby('hour')['base_load'].mean()
        daily_variable = consumption_hourly.groupby('hour')['variable_load'].mean()

        daily_base.plot(ax=axes[1,0], label='Base Load', linewidth=2, color='red')
        daily_variable.plot(ax=axes[1,0], label='Variable Load', linewidth=2, color='blue')
        axes[1,0].set_title('Base Load vs Variable Load')
        axes[1,0].set_xlabel('Hour of Day')
        axes[1,0].set_ylabel('Power (W)')
        axes[1,0].legend()
        axes[1,0].grid(True, alpha=0.3)

    # Load duration curve: power sorted descending against the share of time
    sorted_loads = consumption_hourly['total_power'].sort_values(ascending=False)
    hours = np.arange(len(sorted_loads)) / len(sorted_loads) * 100

    axes[1,1].plot(hours, sorted_loads, linewidth=2, color='purple')
    axes[1,1].axhline(y=consumption_hourly['total_power'].mean(), color='red',
                     linestyle='--', label=f'Average: {consumption_hourly["total_power"].mean():.0f}W')
    if 'base_load_estimate' in locals():
        axes[1,1].axhline(y=base_load_estimate, color='green',
                         linestyle='--', label=f'Base Load: {base_load_estimate:.0f}W')
    axes[1,1].set_title('Load Duration Curve')
    axes[1,1].set_xlabel('Time (%)')
    axes[1,1].set_ylabel('Power (W)')
    axes[1,1].legend()
    axes[1,1].grid(True, alpha=0.3)

    # Consumption distribution with mean and median markers
    consumption_hourly['total_power'].hist(bins=50, ax=axes[1,2], alpha=0.7, color='skyblue')
    axes[1,2].axvline(consumption_hourly['total_power'].mean(), color='red',
                     linestyle='--', label=f'Mean: {consumption_hourly["total_power"].mean():.0f}W')
    axes[1,2].axvline(consumption_hourly['total_power'].median(), color='orange',
                     linestyle='--', label=f'Median: {consumption_hourly["total_power"].median():.0f}W')
    axes[1,2].set_title('Consumption Distribution')
    axes[1,2].set_xlabel('Power (W)')
    axes[1,2].set_ylabel('Frequency')
    axes[1,2].legend()
    axes[1,2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 4. Seasonal Decomposition

Decompose consumption data into trend, seasonal, and residual components

In [None]:
# Seasonal decomposition of the hourly consumption with STL.
# NOTE(review): STL is not imported in the notebook's visible import cell;
# while it is undefined the NameError is caught below and reported as a
# failed decomposition - confirm the intended import.

# Stays None unless the decomposition below runs to completion
decomposition_results = None

if consumption_hourly.empty or len(consumption_hourly) <= 168:  # more than 168 hourly rows required
    print("Insufficient data for seasonal decomposition")
else:
    power_series = consumption_hourly['total_power'].interpolate()

    try:
        # Daily cycle: period of 24 rows, seasonal smoother length 25
        stl_daily = STL(power_series, seasonal=25, period=24)
        result_daily = stl_daily.fit()

        trend, seasonal, residual = result_daily.trend, result_daily.seasonal, result_daily.resid

        # Share of the series variance not left in the other two components
        series_variance = np.var(power_series)
        trend_strength = 1 - np.var(residual + seasonal) / series_variance
        seasonal_strength = 1 - np.var(residual + trend) / series_variance

        print("\nSeasonal Decomposition Results:")
        print("=" * 50)
        print(f"Trend strength: {trend_strength:.3f}")
        print(f"Seasonal strength: {seasonal_strength:.3f}")
        print(f"Residual variance: {np.var(residual):.1f}")

        # One stacked panel per component, sharing the time axis
        fig, axes = plt.subplots(4, 1, figsize=(15, 12), sharex=True)
        panels = (
            (power_series, 'Original Consumption', 'blue'),
            (trend, 'Trend Component', 'red'),
            (seasonal, 'Seasonal Component (Daily)', 'green'),
            (residual, 'Residual Component', 'orange'),
        )
        for panel_ax, (component, panel_title, panel_color) in zip(axes, panels):
            component.plot(ax=panel_ax, title=panel_title, color=panel_color)
            panel_ax.set_ylabel('Power (W)')
            panel_ax.grid(True, alpha=0.3)
        axes[3].set_xlabel('Date')

        plt.tight_layout()
        plt.show()

        # Residuals further than three standard deviations from zero
        residual_std = residual.std()
        anomaly_threshold = 3 * residual_std
        anomalies = residual[residual.abs() > anomaly_threshold]

        print(f"\nAnomaly Detection (3-sigma rule):")
        print(f"Residual standard deviation: {residual_std:.1f} W")
        print(f"Anomaly threshold: ±{anomaly_threshold:.1f} W")
        print(f"Anomalies detected: {len(anomalies)} ({len(anomalies)/len(residual)*100:.2f}%)")

        # Keep the components for the anomaly detection cell
        decomposition_results = {
            'trend': trend,
            'seasonal': seasonal,
            'residual': residual,
            'trend_strength': trend_strength,
            'seasonal_strength': seasonal_strength,
            'anomalies': anomalies
        }

    except Exception as e:
        print(f"Seasonal decomposition failed: {e}")

## 5. Load Forecasting Model Development

Develop and validate load forecasting models

In [None]:
# Prepare features for load forecasting. Produces `features_df`, the feature
# matrix `X` and the target `y` (total_power) for the training cell.
if not consumption_hourly.empty:
    # Create feature matrix
    features_df = consumption_hourly.copy()

    # Add lag features
    for lag in [1, 2, 3, 24, 48, 168]:  # 1h, 2h, 3h, 1d, 2d, 1w
        features_df[f'power_lag_{lag}h'] = features_df['total_power'].shift(lag)

    # Add rolling statistics over the preceding hours only. Without shift(1)
    # each window would include the current hour's total_power, i.e. the very
    # value the model is asked to predict.
    past_power = features_df['total_power'].shift(1)
    for window in [6, 12, 24]:  # 6h, 12h, 24h windows
        features_df[f'power_rolling_mean_{window}h'] = past_power.rolling(window).mean()
        features_df[f'power_rolling_std_{window}h'] = past_power.rolling(window).std()

    # Add cyclical time features
    features_df['hour_sin'] = np.sin(2 * np.pi * features_df['hour'] / 24)
    features_df['hour_cos'] = np.cos(2 * np.pi * features_df['hour'] / 24)
    features_df['day_sin'] = np.sin(2 * np.pi * features_df['day_of_week'] / 7)
    features_df['day_cos'] = np.cos(2 * np.pi * features_df['day_of_week'] / 7)
    features_df['month_sin'] = np.sin(2 * np.pi * features_df['month'] / 12)
    features_df['month_cos'] = np.cos(2 * np.pi * features_df['month'] / 12)

    # Add weather features if available
    if not weather_data.empty:
        weather_hourly = weather_data.resample('H').mean()
        weather_features = ['temperature_2m', 'cloudcover', 'shortwave_radiation']
        available_weather = [col for col in weather_features if col in weather_hourly.columns]

        if available_weather:
            features_df = pd.merge(features_df, weather_hourly[available_weather],
                                 left_index=True, right_index=True, how='left')

    # Add heating load estimate (if relay data available)
    if relay_data:
        # Per-room power used to turn the relay signal into watts; rooms not
        # listed fall back to 1000.
        # NOTE(review): presumably nominal heater ratings in W - confirm.
        room_power_map = {
            'obyvacka': 2000, 'kuchyn': 1500, 'loznice': 1500,
            'detsky_pokoj': 1000, 'koupelna': 800, 'pracovna': 1200
        }

        total_heating_power = 0
        for room, relay_df in relay_data.items():
            if relay_df.empty:
                continue

            room_power = room_power_map.get(room, 1000)
            relay_hourly = relay_df['value'].resample('H').mean()
            heating_power = relay_hourly * room_power

            if isinstance(total_heating_power, pd.Series):
                # fill_value=0: an hour missing for one room must not turn the
                # whole sum into NaN (which dropna() below would then discard)
                total_heating_power = total_heating_power.add(heating_power, fill_value=0)
            else:
                total_heating_power = heating_power

        if isinstance(total_heating_power, pd.Series) and not total_heating_power.empty:
            features_df = pd.merge(features_df, total_heating_power.to_frame('heating_power'),
                                 left_index=True, right_index=True, how='left')

    # Remove rows with missing values
    features_df = features_df.dropna()

    print(f"\nFeature Engineering Completed:")
    print(f"Total features: {len(features_df.columns)}")
    print(f"Training samples: {len(features_df)}")

    # Select features for modeling: everything except the target and the
    # columns derived directly from it
    excluded_columns = {'total_power', 'base_load', 'variable_load'}
    if 'total_power_col' in locals() and total_power_col in features_df.columns:
        # total_power was copied from this raw column, so keeping it as a
        # feature would hand the target to the model
        excluded_columns.add(total_power_col)

    feature_columns = [col for col in features_df.columns
                      if col not in excluded_columns]

    X = features_df[feature_columns]
    y = features_df['total_power']

    print(f"Model features: {len(feature_columns)}")
    print(f"Sample feature names: {feature_columns[:10]}")
else:
    print("No consumption data available for forecasting model")
    features_df = pd.DataFrame()

In [None]:
# Train and validate forecasting models on X / y from the feature cell.
# NOTE(review): TimeSeriesSplit, LinearRegression, RandomForestRegressor,
# StandardScaler, mean_absolute_error, mean_squared_error and r2_score are not
# imported in the notebook's visible import cell (presumably scikit-learn) -
# they must be in scope for this cell to run; confirm.
if not features_df.empty and len(features_df) > 100:
    # Time-ordered cross-validation: each test fold follows its training fold
    tscv = TimeSeriesSplit(n_splits=5)

    # Initialize models
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    model_results = {}

    for name, model in models.items():
        print(f"\nTraining {name}...")

        cv_scores = {'mae': [], 'rmse': [], 'r2': []}

        for train_idx, test_idx in tscv.split(X):
            # Fit the scaler on the training fold only. Scaling with
            # statistics computed over all rows would leak information from
            # the test fold into training.
            fold_scaler = StandardScaler()
            X_train = fold_scaler.fit_transform(X.iloc[train_idx])
            X_test = fold_scaler.transform(X.iloc[test_idx])
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # Train model
            model.fit(X_train, y_train)

            # Make predictions
            y_pred = model.predict(X_test)

            # Calculate metrics
            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)

            cv_scores['mae'].append(mae)
            cv_scores['rmse'].append(rmse)
            cv_scores['r2'].append(r2)

        # Store average results
        model_results[name] = {
            'mae_mean': np.mean(cv_scores['mae']),
            'mae_std': np.std(cv_scores['mae']),
            'rmse_mean': np.mean(cv_scores['rmse']),
            'rmse_std': np.std(cv_scores['rmse']),
            'r2_mean': np.mean(cv_scores['r2']),
            'r2_std': np.std(cv_scores['r2'])
        }

    # Display model performance
    print("\nModel Performance (Cross-Validation):")
    print("=" * 70)
    print(f"{'Model':20s} {'MAE (W)':12s} {'RMSE (W)':12s} {'R²':12s}")
    print("-" * 70)

    for name, results in model_results.items():
        mae_str = f"{results['mae_mean']:.1f}±{results['mae_std']:.1f}"
        rmse_str = f"{results['rmse_mean']:.1f}±{results['rmse_std']:.1f}"
        r2_str = f"{results['r2_mean']:.3f}±{results['r2_std']:.3f}"

        print(f"{name:20s} {mae_str:12s} {rmse_str:12s} {r2_str:12s}")

    # Pick the model with the best mean cross-validated R²
    best_model_name = max(model_results.keys(), key=lambda x: model_results[x]['r2_mean'])
    best_model = models[best_model_name]

    # Back-test on the last 24 rows: fit on everything before them, then
    # predict them. Scoring these rows with a model that was trained on them
    # would overstate the accuracy. (len(features_df) > 100 is guaranteed by
    # the enclosing condition, so the hold-out split is always valid.)
    holdout_scaler = StandardScaler()
    best_model.fit(holdout_scaler.fit_transform(X.iloc[:-24]), y.iloc[:-24])
    last_features = holdout_scaler.transform(X.iloc[-24:])
    forecast_24h = best_model.predict(last_features)

    # Train final model on all data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    best_model.fit(X_scaled, y)

    # Feature importance (for Random Forest)
    if best_model_name == 'Random Forest':
        feature_importance = pd.DataFrame({
            'feature': feature_columns,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)

        print(f"\nTop 10 Important Features ({best_model_name}):")
        print("-" * 50)
        for _, row in feature_importance.head(10).iterrows():
            print(f"{row['feature']:30s}: {row['importance']:.4f}")

    # Create forecast dataframe. The index is taken from the rows themselves:
    # dropna() in the feature cell can leave gaps, so a regenerated hourly
    # range could attach the wrong timestamps.
    forecast_df = pd.DataFrame({
        'actual': y.iloc[-24:].values,
        'forecast': forecast_24h
    }, index=features_df.index[-24:])

    # Calculate forecast accuracy
    forecast_mae = mean_absolute_error(forecast_df['actual'], forecast_df['forecast'])
    forecast_rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast']))

    print(f"\n24-hour Forecast Accuracy:")
    print(f"MAE: {forecast_mae:.1f} W")
    print(f"RMSE: {forecast_rmse:.1f} W")

    # Plot forecast
    plt.figure(figsize=(12, 6))
    plt.plot(forecast_df.index, forecast_df['actual'], 'b-', linewidth=2, label='Actual')
    plt.plot(forecast_df.index, forecast_df['forecast'], 'r--', linewidth=2, label='Forecast')
    plt.fill_between(forecast_df.index,
                    forecast_df['forecast'] - forecast_rmse,
                    forecast_df['forecast'] + forecast_rmse,
                    alpha=0.3, color='red', label='±RMSE band')
    plt.title(f'24-hour Load Forecast ({best_model_name})')
    plt.xlabel('Time')
    plt.ylabel('Power (W)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

else:
    print("Insufficient data for load forecasting model development")
    model_results = {}

## 6. Anomaly Detection

Detect anomalies and unusual consumption patterns

In [None]:
# Anomaly detection using multiple methods; results end up in `anomaly_results`.
# NOTE(review): IsolationForest is not imported in the notebook's visible
# import cell (presumably sklearn.ensemble) - it must be in scope for this
# cell to run; confirm.

# Work on non-missing readings only: a single NaN would turn every z-score
# into NaN, and IsolationForest rejects NaN input.
if not consumption_hourly.empty and 'total_power' in consumption_hourly.columns:
    power_series = consumption_hourly['total_power'].dropna()
else:
    power_series = pd.Series(dtype=float)

if not power_series.empty:

    # Method 1: Statistical outliers (Z-score, population standard deviation).
    # Computed with pandas so the cell does not depend on a `stats` name that
    # the visible import cell never defines.
    z_scores = ((power_series - power_series.mean()) / power_series.std(ddof=0)).abs()
    z_threshold = 3
    z_anomalies = power_series[z_scores > z_threshold]

    # Method 2: Isolation Forest
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    power_reshaped = power_series.values.reshape(-1, 1)
    iso_outliers = iso_forest.fit_predict(power_reshaped)
    iso_anomalies = power_series[iso_outliers == -1]

    # Method 3: Residual-based anomalies (if decomposition available)
    residual_anomalies = pd.Series(dtype=float)
    if 'decomposition_results' in locals() and decomposition_results:
        residual_anomalies = decomposition_results['anomalies']

    # Method 4: Inter-quartile range (IQR)
    Q1 = power_series.quantile(0.25)
    Q3 = power_series.quantile(0.75)
    IQR = Q3 - Q1
    iqr_lower = Q1 - 1.5 * IQR
    iqr_upper = Q3 + 1.5 * IQR
    iqr_anomalies = power_series[(power_series < iqr_lower) | (power_series > iqr_upper)]

    print("\nAnomaly Detection Results:")
    print("=" * 50)
    print(f"Total data points: {len(power_series)}")
    print(f"Z-score outliers (>3σ): {len(z_anomalies)} ({len(z_anomalies)/len(power_series)*100:.2f}%)")
    print(f"Isolation Forest outliers: {len(iso_anomalies)} ({len(iso_anomalies)/len(power_series)*100:.2f}%)")
    if len(residual_anomalies) > 0:
        print(f"Residual-based outliers: {len(residual_anomalies)} ({len(residual_anomalies)/len(power_series)*100:.2f}%)")
    print(f"IQR outliers: {len(iqr_anomalies)} ({len(iqr_anomalies)/len(power_series)*100:.2f}%)")

    # Combine anomalies (union of all methods)
    all_anomaly_indices = set()
    all_anomaly_indices.update(z_anomalies.index)
    all_anomaly_indices.update(iso_anomalies.index)
    if len(residual_anomalies) > 0:
        all_anomaly_indices.update(residual_anomalies.index)
    all_anomaly_indices.update(iqr_anomalies.index)

    # Select by membership mask instead of .loc[list(set)]: this keeps the
    # series in its original time order (a set has no meaningful order, so
    # tail() below would otherwise not return the latest anomalies) and
    # ignores residual timestamps that have no reading in power_series.
    combined_anomalies = power_series[power_series.index.isin(list(all_anomaly_indices))]

    print(f"\nCombined anomalies: {len(combined_anomalies)} ({len(combined_anomalies)/len(power_series)*100:.2f}%)")

    # Analyze anomaly patterns
    if len(combined_anomalies) > 0:
        anomaly_df = pd.DataFrame({
            'power': combined_anomalies,
            'hour': combined_anomalies.index.hour,
            'day_of_week': combined_anomalies.index.dayofweek,
            'month': combined_anomalies.index.month
        })

        print("\nAnomaly Pattern Analysis:")
        print("-" * 30)

        # Hour distribution
        hour_dist = anomaly_df['hour'].value_counts().sort_index()
        if len(hour_dist) > 0:
            most_common_hour = hour_dist.idxmax()
            print(f"Most common anomaly hour: {most_common_hour:02d}:00 ({hour_dist[most_common_hour]} cases)")

        # Day of week distribution
        day_dist = anomaly_df['day_of_week'].value_counts().sort_index()
        if len(day_dist) > 0:
            days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            most_common_day = day_dist.idxmax()
            print(f"Most common anomaly day: {days[most_common_day]} ({day_dist[most_common_day]} cases)")

        # Power range analysis
        high_anomalies = anomaly_df[anomaly_df['power'] > power_series.quantile(0.95)]
        low_anomalies = anomaly_df[anomaly_df['power'] < power_series.quantile(0.05)]

        print(f"High consumption anomalies: {len(high_anomalies)} (>{power_series.quantile(0.95):.0f}W)")
        print(f"Low consumption anomalies: {len(low_anomalies)} (<{power_series.quantile(0.05):.0f}W)")

        # Recent anomalies (the five latest, thanks to the time ordering above)
        recent_anomalies = combined_anomalies.tail(5)
        if len(recent_anomalies) > 0:
            print(f"\nRecent anomalies:")
            for timestamp, power in recent_anomalies.items():
                print(f"  {timestamp}: {power:.0f}W")

    # Visualize anomalies
    plt.figure(figsize=(15, 8))

    # Plot consumption with anomalies highlighted
    plt.subplot(2, 1, 1)
    plt.plot(power_series.index, power_series.values, 'b-', alpha=0.7, linewidth=1, label='Normal')

    if len(combined_anomalies) > 0:
        plt.scatter(combined_anomalies.index, combined_anomalies.values,
                   c='red', s=50, label=f'Anomalies ({len(combined_anomalies)})', zorder=5)

    plt.title('Consumption with Anomalies Highlighted')
    plt.ylabel('Power (W)')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Plot anomaly distribution by hour
    plt.subplot(2, 1, 2)
    if len(combined_anomalies) > 0:
        hourly_anomalies = combined_anomalies.groupby(combined_anomalies.index.hour).size()
        # Count of all readings per hour; it already includes the anomalous
        # ones, so the rate is anomalies / total (not anomalies / (total + anomalies))
        hourly_total = power_series.groupby(power_series.index.hour).size()

        hours = range(24)
        anomaly_counts = [hourly_anomalies.get(h, 0) for h in hours]
        total_counts = [hourly_total.get(h, 0) for h in hours]
        anomaly_rates = [a / n * 100 if n > 0 else 0 for a, n in zip(anomaly_counts, total_counts)]

        plt.bar(hours, anomaly_rates, alpha=0.7, color='red')
        plt.title('Anomaly Rate by Hour of Day')
        plt.xlabel('Hour')
        plt.ylabel('Anomaly Rate (%)')
        plt.xticks(range(0, 24, 2))
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Store anomaly results
    anomaly_results = {
        'z_score_anomalies': z_anomalies,
        'isolation_forest_anomalies': iso_anomalies,
        'iqr_anomalies': iqr_anomalies,
        'combined_anomalies': combined_anomalies,
        'anomaly_rate': len(combined_anomalies) / len(power_series) * 100
    }

    if len(residual_anomalies) > 0:
        anomaly_results['residual_anomalies'] = residual_anomalies

else:
    print("No consumption data available for anomaly detection")
    anomaly_results = {}

## 7. Load Clustering Analysis

Cluster consumption patterns to identify typical vs atypical days

In [None]:
# Load clustering analysis
#
# Clusters complete 24-hour load profiles with K-Means, then prints and plots
# per-cluster statistics.
# Reads: `consumption_hourly` (DatetimeIndex, 'total_power' column); expects
#        StandardScaler and KMeans to be in scope from an earlier cell.
# Writes: `clustering_results` (always defined; {} when there is too little data).
if not consumption_hourly.empty and len(consumption_hourly) >= 48:  # Need at least 2 days

    # Create daily load profiles: one 24-element array per calendar date.
    # Reindexing onto a full hourly grid turns missing hours into NaN so that
    # incomplete days can be dropped below.
    # NOTE(review): assumes samples sit exactly on the hour - confirm upstream resampling.
    daily_profiles = consumption_hourly['total_power'].groupby(consumption_hourly.index.date).apply(
        lambda x: x.reindex(pd.date_range(x.index.min().normalize(),
                                          x.index.min().normalize() + pd.Timedelta(hours=23),
                                          freq='H')).values
    )

    # Filter complete days only (24 hours, no gaps)
    complete_days = daily_profiles[daily_profiles.apply(lambda x: len(x) == 24 and not np.any(np.isnan(x)))]

    if len(complete_days) >= 7:  # Need at least a week
        # Convert to a (days x 24) matrix for clustering
        profile_matrix = np.vstack(complete_days.values)

        # Normalize each hour-of-day column so no single hour dominates the distance
        scaler_cluster = StandardScaler()
        profile_matrix_scaled = scaler_cluster.fit_transform(profile_matrix)

        # Determine optimal number of clusters using elbow method
        max_clusters = min(8, len(complete_days) // 2)
        inertias = []

        for k in range(2, max_clusters + 1):
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            kmeans.fit(profile_matrix_scaled)
            inertias.append(kmeans.inertia_)

        # Find elbow point: the k at which the inertia curve bends most sharply,
        # i.e. where its discrete second difference is largest.
        if len(inertias) >= 3:
            inertia_curvature = np.diff(inertias, n=2)
            # inertia_curvature[i] is centred on inertias[i + 1], which belongs
            # to k = i + 3 because the sweep starts at k = 2.
            optimal_k = int(np.argmax(inertia_curvature)) + 3
        else:
            optimal_k = 3  # Default when the sweep is too short to show a bend

        optimal_k = min(optimal_k, max_clusters)

        # Perform clustering with optimal k
        kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        cluster_labels = kmeans_final.fit_predict(profile_matrix_scaled)

        # Analyze clusters (statistics are computed on the unscaled profiles)
        cluster_analysis = {}

        for cluster_id in range(optimal_k):
            cluster_mask = cluster_labels == cluster_id
            cluster_profiles = profile_matrix[cluster_mask]
            cluster_dates = complete_days.index[cluster_mask]

            # Hour-by-hour mean and spread of the cluster's days
            cluster_mean = cluster_profiles.mean(axis=0)
            cluster_std = cluster_profiles.std(axis=0)

            # Cluster characteristics
            daily_total = cluster_profiles.sum(axis=1)
            peak_hour = int(cluster_mean.argmax())
            peak_power = cluster_mean.max()
            min_power = cluster_mean.min()

            # Day type analysis
            cluster_dates_dt = pd.to_datetime(cluster_dates)
            weekdays = int((cluster_dates_dt.weekday < 5).sum())
            weekends = int((cluster_dates_dt.weekday >= 5).sum())

            cluster_analysis[cluster_id] = {
                'count': len(cluster_profiles),
                'mean_profile': cluster_mean,
                'std_profile': cluster_std,
                'daily_total_mean': daily_total.mean(),
                'daily_total_std': daily_total.std(),
                'peak_hour': peak_hour,
                'peak_power': peak_power,
                'min_power': min_power,
                # Load factor = average / peak, matching the definition used for
                # the overall load factor in the efficiency analysis; NaN when
                # the peak is not positive so the division cannot blow up.
                'load_factor': cluster_mean.mean() / peak_power if peak_power > 0 else float('nan'),
                'weekdays': weekdays,
                'weekends': weekends,
                'dates': cluster_dates.tolist()
            }

        print(f"\nLoad Clustering Analysis ({optimal_k} clusters):")
        print("=" * 80)
        print(f"{'Cluster':8s} {'Days':6s} {'Weekdays':9s} {'Weekends':9s} {'Peak Hour':10s} {'Peak (W)':9s} {'Load Factor':12s}")
        print("-" * 80)

        for cluster_id, analysis in cluster_analysis.items():
            print(f"{cluster_id:8d} {analysis['count']:6d} {analysis['weekdays']:9d} {analysis['weekends']:9d} "
                 f"{analysis['peak_hour']:9d}:00 {analysis['peak_power']:8.0f} {analysis['load_factor']:11.3f}")

        # Visualize clusters
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # One colour per cluster, reused across all panels
        colors = plt.cm.Set1(np.linspace(0, 1, optimal_k))

        # Panel 1: mean profile per cluster with a +/- 1 std band
        for cluster_id, analysis in cluster_analysis.items():
            hours = range(24)
            mean_profile = analysis['mean_profile']
            std_profile = analysis['std_profile']

            axes[0,0].plot(hours, mean_profile, linewidth=2, color=colors[cluster_id],
                          label=f'Cluster {cluster_id} ({analysis["count"]} days)')
            axes[0,0].fill_between(hours, mean_profile - std_profile, mean_profile + std_profile,
                                  alpha=0.3, color=colors[cluster_id])

        axes[0,0].set_title('Daily Load Profile Clusters')
        axes[0,0].set_xlabel('Hour of Day')
        axes[0,0].set_ylabel('Power (W)')
        axes[0,0].legend()
        axes[0,0].grid(True, alpha=0.3)

        # Panel 2: cluster size distribution
        cluster_sizes = [analysis['count'] for analysis in cluster_analysis.values()]
        cluster_names = [f'Cluster {i}' for i in range(optimal_k)]

        axes[0,1].bar(cluster_names, cluster_sizes, color=colors[:optimal_k], alpha=0.8)
        axes[0,1].set_title('Cluster Size Distribution')
        axes[0,1].set_ylabel('Number of Days')
        axes[0,1].grid(True, alpha=0.3)

        # Panel 3: weekday vs weekend distribution
        weekday_counts = [analysis['weekdays'] for analysis in cluster_analysis.values()]
        weekend_counts = [analysis['weekends'] for analysis in cluster_analysis.values()]

        x = np.arange(optimal_k)
        width = 0.35

        axes[1,0].bar(x - width/2, weekday_counts, width, label='Weekdays', alpha=0.8)
        axes[1,0].bar(x + width/2, weekend_counts, width, label='Weekends', alpha=0.8)
        axes[1,0].set_title('Day Type Distribution by Cluster')
        axes[1,0].set_xlabel('Cluster')
        axes[1,0].set_ylabel('Number of Days')
        axes[1,0].set_xticks(x)
        axes[1,0].set_xticklabels([f'C{i}' for i in range(optimal_k)])
        axes[1,0].legend()
        axes[1,0].grid(True, alpha=0.3)

        # Panel 4: daily total consumption by cluster (error bars = std across days)
        daily_totals = [analysis['daily_total_mean'] for analysis in cluster_analysis.values()]
        daily_totals_std = [analysis['daily_total_std'] for analysis in cluster_analysis.values()]

        axes[1,1].bar(cluster_names, daily_totals, yerr=daily_totals_std,
                     color=colors[:optimal_k], alpha=0.8, capsize=5)
        axes[1,1].set_title('Daily Total Consumption by Cluster')
        axes[1,1].set_ylabel('Daily Total (Wh)')
        axes[1,1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Describe each cluster in words
        print(f"\nCluster Characteristics:")
        print("-" * 50)

        for cluster_id, analysis in cluster_analysis.items():
            weekday_pct = analysis['weekdays'] / analysis['count'] * 100

            if weekday_pct > 80:
                day_type = "Weekday pattern"
            elif weekday_pct < 20:
                day_type = "Weekend pattern"
            else:
                day_type = "Mixed pattern"

            peak_time = analysis['peak_hour']
            if 6 <= peak_time <= 9:
                peak_desc = "Morning peak"
            elif 17 <= peak_time <= 21:
                peak_desc = "Evening peak"
            elif 10 <= peak_time <= 16:
                peak_desc = "Daytime peak"
            else:
                peak_desc = "Night peak"

            # A NaN load factor fails both comparisons and is reported as "Low"
            load_factor = analysis['load_factor']
            if load_factor > 0.7:
                load_desc = "High load factor (stable)"
            elif load_factor > 0.5:
                load_desc = "Medium load factor"
            else:
                load_desc = "Low load factor (variable)"

            print(f"Cluster {cluster_id}: {day_type}, {peak_desc}, {load_desc}")

        # Store clustering results
        clustering_results = {
            'optimal_clusters': optimal_k,
            'cluster_labels': cluster_labels,
            'cluster_analysis': cluster_analysis,
            'profile_matrix': profile_matrix,
            'complete_days': complete_days
        }

    else:
        print(f"Insufficient complete days for clustering ({len(complete_days)} < 7)")
        clustering_results = {}
else:
    print("Insufficient data for load clustering analysis")
    clustering_results = {}

## 8. Energy Efficiency Analysis

Compute energy efficiency metrics and benchmark them against fixed rating thresholds

In [None]:
# Energy efficiency analysis
#
# Derives summary efficiency figures from `consumption_hourly`, prints them and
# rates them against fixed thresholds. `efficiency_metrics` is always defined
# afterwards ({} when there is no data).
if consumption_hourly.empty:
    print("No consumption data available for efficiency analysis")
    efficiency_metrics = {}
else:
    efficiency_metrics = {}

    # Headline consumption figures
    total_period_hours = len(consumption_hourly)
    total_energy_kwh = consumption_hourly['total_power'].sum() / 1000  # Convert Wh to kWh
    avg_power_w = consumption_hourly['total_power'].mean()
    peak_power_w = consumption_hourly['total_power'].max()

    # Load factor: average demand relative to the observed peak
    load_factor = avg_power_w / peak_power_w

    # Capacity factor: average demand relative to an assumed rated capacity
    assumed_max_capacity = 15000  # 15kW assumed maximum load
    capacity_factor = avg_power_w / assumed_max_capacity

    # Per-day aggregates
    daily_energy = consumption_hourly['total_power'].resample('D').sum() / 1000  # kWh per day
    daily_peak = consumption_hourly['total_power'].resample('D').max()
    daily_avg = consumption_hourly['total_power'].resample('D').mean()

    # Key order matters: it becomes the column order when the metrics are exported
    efficiency_metrics.update({
        'total_energy_kwh': total_energy_kwh,
        'avg_power_w': avg_power_w,
        'peak_power_w': peak_power_w,
        'load_factor': load_factor,
        'capacity_factor': capacity_factor,
        'daily_energy_mean': daily_energy.mean(),
        'daily_energy_std': daily_energy.std(),
        'daily_peak_mean': daily_peak.mean(),
        'daily_peak_std': daily_peak.std(),
    })

    # Peak demand analysis: hours spent above fractions of the observed peak
    thresholds = [0.5, 0.7, 0.8, 0.9]
    peak_threshold = peak_power_w

    for threshold in thresholds:
        threshold_power = threshold * peak_threshold
        hours_above = (consumption_hourly['total_power'] > threshold_power).sum()
        efficiency_metrics.update({
            f'hours_above_{int(threshold*100)}pct_peak': hours_above,
            f'pct_time_above_{int(threshold*100)}pct_peak': hours_above / total_period_hours * 100,
        })

    # Base load share (only when an earlier cell produced an estimate)
    if 'base_load_estimate' in locals():
        base_load_ratio = base_load_estimate / avg_power_w
        efficiency_metrics.update({
            'base_load_ratio': base_load_ratio,
            'variable_load_ratio': 1 - base_load_ratio,
        })

    # Demand variability: standard deviation relative to the mean
    coefficient_of_variation = consumption_hourly['total_power'].std() / consumption_hourly['total_power'].mean()
    efficiency_metrics['coefficient_of_variation'] = coefficient_of_variation

    # Seasonal spread of monthly averages (needs more than one calendar month)
    monthly_avg = consumption_hourly.groupby(consumption_hourly.index.month)['total_power'].mean()
    if len(monthly_avg) > 1:
        seasonal_variation = (monthly_avg.max() - monthly_avg.min()) / monthly_avg.mean()
        efficiency_metrics['seasonal_variation'] = seasonal_variation

    # Time-of-use comparison: working hours are treated as the "peak" window
    peak_hours = consumption_hourly['is_working_hours']
    peak_consumption = consumption_hourly[peak_hours]['total_power'].mean()
    off_peak_consumption = consumption_hourly[~peak_hours]['total_power'].mean()

    if off_peak_consumption > 0:
        peak_to_offpeak_ratio = peak_consumption / off_peak_consumption
        efficiency_metrics['peak_to_offpeak_ratio'] = peak_to_offpeak_ratio

    # ---- Report ----
    print("\nEnergy Efficiency Analysis:")
    print("=" * 60)

    print("\nConsumption Summary:")
    print(f"  Total energy: {efficiency_metrics['total_energy_kwh']:.1f} kWh")
    print(f"  Average power: {efficiency_metrics['avg_power_w']:.0f} W")
    print(f"  Peak power: {efficiency_metrics['peak_power_w']:.0f} W")
    print(f"  Analysis period: {total_period_hours} hours ({total_period_hours/24:.1f} days)")

    print("\nEfficiency Metrics:")
    print(f"  Load factor: {efficiency_metrics['load_factor']:.3f}")
    print(f"  Capacity factor: {efficiency_metrics['capacity_factor']:.3f}")
    print(f"  Coefficient of variation: {efficiency_metrics['coefficient_of_variation']:.3f}")

    # Optional metrics are printed only when they were computed above
    if 'base_load_ratio' in efficiency_metrics:
        print(f"  Base load ratio: {efficiency_metrics['base_load_ratio']:.3f}")
        print(f"  Variable load ratio: {efficiency_metrics['variable_load_ratio']:.3f}")

    if 'peak_to_offpeak_ratio' in efficiency_metrics:
        print(f"  Peak/off-peak ratio: {efficiency_metrics['peak_to_offpeak_ratio']:.3f}")

    if 'seasonal_variation' in efficiency_metrics:
        print(f"  Seasonal variation: {efficiency_metrics['seasonal_variation']:.3f}")

    print("\nPeak Demand Analysis:")
    for threshold in thresholds:
        pct_time = efficiency_metrics[f'pct_time_above_{int(threshold*100)}pct_peak']
        print(f"  Time above {int(threshold*100)}% of peak: {pct_time:.1f}%")

    # ---- Benchmarking ----
    print("\nEfficiency Benchmarking:")
    print("-" * 30)

    # Load factor: first lower bound exceeded wins, otherwise the worst rating
    load_rating = next(
        (rating for lower_bound, rating in (
            (0.8, "Excellent (very stable load)"),
            (0.6, "Good (moderately stable)"),
            (0.4, "Fair (variable load)"),
        ) if load_factor > lower_bound),
        "Poor (highly variable)",
    )

    print(f"Load factor rating: {load_rating}")

    # Base load share (if available)
    if 'base_load_ratio' in efficiency_metrics:
        base_ratio = efficiency_metrics['base_load_ratio']
        base_rating = next(
            (rating for lower_bound, rating in (
                (0.7, "High base load (check for inefficiencies)"),
                (0.4, "Moderate base load (typical)"),
            ) if base_ratio > lower_bound),
            "Low base load (efficient variable loads)",
        )

        print(f"Base load rating: {base_rating}")

    # Variability: first upper bound not reached wins, otherwise "high"
    cv = coefficient_of_variation
    var_rating = next(
        (rating for upper_bound, rating in (
            (0.3, "Low variability (stable consumption)"),
            (0.6, "Moderate variability (typical)"),
        ) if cv < upper_bound),
        "High variability (check for anomalies)",
    )

    print(f"Variability rating: {var_rating}")

## 9. Summary and Recommendations

Generate actionable insights and load optimization recommendations

In [None]:
# Summary cell: turns the results left behind by the earlier analysis cells
# into two lists (`insights`, `recommendations`) and prints them together with
# a short statistics recap. Each section is skipped when its input is missing
# or empty.
print("\nBase Load Analysis - Key Insights and Recommendations:")
print("=" * 80)

insights = []
recommendations = []

# Base load: share of the average consumption
if 'base_load_estimate' in locals():
    base_pct = base_load_estimate / consumption_hourly['total_power'].mean() * 100
    insights.append(f"Base load: {base_load_estimate:.0f}W ({base_pct:.1f}% of average consumption)")

    # The two ranges cannot overlap, so at most one recommendation is added
    if base_pct > 60:
        recommendations.append("High base load detected - audit always-on devices for efficiency")
    if base_pct < 30:
        recommendations.append("Low base load indicates good energy management")

# Load patterns: relative gap between weekday and weekend peaks
if 'daily_patterns' in locals():
    weekday_peak = daily_patterns['weekday']['mean'].max()
    weekend_peak = daily_patterns['weekend']['mean'].max()
    peak_diff = abs(weekday_peak - weekend_peak) / weekday_peak * 100

    insights.append(f"Peak consumption difference (weekday vs weekend): {peak_diff:.1f}%")

    if peak_diff > 30:
        recommendations.append("Significant weekday/weekend pattern - optimize for different schedules")

# Forecasting models: report the one with the highest mean R²
if model_results:
    best_model = max(model_results, key=lambda name: model_results[name]['r2_mean'])
    best_scores = model_results[best_model]
    best_r2 = best_scores['r2_mean']
    best_mae = best_scores['mae_mean']

    insights.append(f"Best forecasting model: {best_model} (R²={best_r2:.3f}, MAE={best_mae:.0f}W)")

    if best_r2 > 0.8:
        recommendations.append("High-quality load forecasting possible - implement predictive control")
    if best_r2 < 0.6:
        recommendations.append("Load forecasting challenging - investigate additional features")

# Anomalies: overall rate
if anomaly_results:
    anomaly_rate = anomaly_results['anomaly_rate']
    insights.append(f"Anomaly rate: {anomaly_rate:.2f}% of consumption data")

    if anomaly_rate > 5:
        recommendations.append("High anomaly rate detected - investigate consumption irregularities")
    if anomaly_rate < 1:
        recommendations.append("Low anomaly rate indicates stable consumption patterns")

# Clustering: number of profiles and whether any is weekday-dominated (>80%)
if clustering_results:
    n_clusters = clustering_results['optimal_clusters']
    insights.append(f"Load patterns clustered into {n_clusters} distinct daily profiles")

    cluster_analysis = clustering_results['cluster_analysis']
    weekday_clusters = len([
        analysis for analysis in cluster_analysis.values()
        if analysis['weekdays'] / analysis['count'] > 0.8
    ])

    if weekday_clusters > 0:
        recommendations.append("Implement weekday-specific load management strategies")

    recommendations.append("Use identified load patterns for demand response optimization")

# Efficiency: load factor and peak/off-peak ratio
if efficiency_metrics:
    load_factor = efficiency_metrics['load_factor']
    insights.append(f"Load factor: {load_factor:.3f}")

    if load_factor < 0.5:
        recommendations.append("Low load factor - consider load balancing and peak shaving")
    if load_factor > 0.8:
        recommendations.append("High load factor indicates efficient system utilization")

    if 'peak_to_offpeak_ratio' in efficiency_metrics:
        ratio = efficiency_metrics['peak_to_offpeak_ratio']
        if ratio > 2:
            recommendations.append("High peak/off-peak ratio - implement time-of-use optimization")

# Seasonality (key is only present when more than one month was analysed)
if 'seasonal_variation' in efficiency_metrics:
    seasonal_var = efficiency_metrics['seasonal_variation']
    insights.append(f"Seasonal variation: {seasonal_var:.3f}")

    if seasonal_var > 0.5:
        recommendations.append("High seasonal variation - adapt control strategies by season")

# Numbered listings
print("\nKey Insights:")
for i, insight in enumerate(insights, 1):
    print(f"{i}. {insight}")

print("\nOptimization Recommendations:")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

# Statistics recap; every line depends on the same guard that defined its value
print("\nSummary Statistics:")
print("-" * 40)
if efficiency_metrics:
    print(f"Total energy analyzed: {efficiency_metrics['total_energy_kwh']:.1f} kWh")
    print(f"Analysis period: {total_period_hours/24:.1f} days")
    print(f"Average daily consumption: {efficiency_metrics['daily_energy_mean']:.1f} kWh")
    print(f"Peak power demand: {efficiency_metrics['peak_power_w']:.0f} W")

if 'base_load_estimate' in locals():
    print(f"Estimated base load: {base_load_estimate:.0f} W")

if model_results:
    print(f"Best forecast accuracy: {best_mae:.0f}W MAE")

if clustering_results:
    print(f"Load pattern clusters: {n_clusters}")

print("\nBase load analysis completed.")

In [None]:
# Save base load analysis results
#
# Bundles the outputs of the earlier cells into one dictionary and writes:
#   - a pickle of the full results dictionary,
#   - a CSV sample of the forecasting features (when available),
#   - a one-row CSV of the efficiency metrics (when available),
#   - a plain-text summary of insights and recommendations.
import pickle
from pathlib import Path

# Create comprehensive results dictionary
base_load_analysis_results = {
    'analysis_period': {'start': start_date, 'end': end_date},
    'consumption_summary': {
        'total_hours': len(consumption_hourly) if not consumption_hourly.empty else 0,
        'base_load_estimate': base_load_estimate if 'base_load_estimate' in locals() else None,
        'efficiency_metrics': efficiency_metrics
    },
    'daily_patterns': daily_patterns if 'daily_patterns' in locals() else {},
    'model_results': model_results,
    # Only the combined anomaly series is dropped; every other entry is stored as-is
    'anomaly_results': {k: v for k, v in anomaly_results.items() if k != 'combined_anomalies'},
    # Drop the raw profile matrix and per-day profiles to keep the pickle small
    'clustering_results': {k: v for k, v in clustering_results.items() if k not in ['profile_matrix', 'complete_days']},
    'insights': insights,
    'recommendations': recommendations
}

# Add decomposition results if available
if 'decomposition_results' in locals() and decomposition_results:
    base_load_analysis_results['decomposition_summary'] = {
        'trend_strength': decomposition_results['trend_strength'],
        'seasonal_strength': decomposition_results['seasonal_strength'],
        'anomalies_count': len(decomposition_results['anomalies'])
    }

# Output locations (path is relative to the notebook's working directory)
results_dir = Path('../../../data/processed')
results_dir.mkdir(parents=True, exist_ok=True)

results_pickle_path = results_dir / 'base_load_analysis_results.pkl'
features_sample_path = results_dir / 'load_forecasting_features_sample.csv'
efficiency_csv_path = results_dir / 'energy_efficiency_metrics.csv'
summary_text_path = results_dir / 'base_load_analysis_summary.txt'

# Save as pickle for programmatic use
with open(results_pickle_path, 'wb') as f:
    pickle.dump(base_load_analysis_results, f)

# Save forecasting features if available
if not features_df.empty:
    features_sample = features_df.head(1000)  # Save sample for reference
    features_sample.to_csv(features_sample_path)

# Save efficiency metrics as CSV
if efficiency_metrics:
    efficiency_df = pd.DataFrame([efficiency_metrics])
    efficiency_df.to_csv(efficiency_csv_path, index=False)

# Save summary as text.
# The insights may contain non-ASCII characters (e.g. "R²"), so the encoding is
# fixed to UTF-8 instead of relying on the platform/locale default, which can
# raise UnicodeEncodeError on write.
with open(summary_text_path, 'w', encoding='utf-8') as f:
    f.write("Base Load Analysis Summary\n")
    f.write("=" * 40 + "\n\n")
    f.write(f"Analysis Period: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n")
    f.write(f"Data Points: {len(consumption_hourly) if not consumption_hourly.empty else 0} hours\n\n")

    f.write("Key Insights:\n")
    for i, insight in enumerate(insights, 1):
        f.write(f"{i}. {insight}\n")

    f.write("\nRecommendations:\n")
    for i, rec in enumerate(recommendations, 1):
        f.write(f"{i}. {rec}\n")

    if efficiency_metrics:
        f.write("\nEfficiency Metrics:\n")
        f.write(f"  Load Factor: {efficiency_metrics['load_factor']:.3f}\n")
        f.write(f"  Average Power: {efficiency_metrics['avg_power_w']:.0f} W\n")
        f.write(f"  Peak Power: {efficiency_metrics['peak_power_w']:.0f} W\n")
        f.write(f"  Daily Energy (avg): {efficiency_metrics['daily_energy_mean']:.1f} kWh\n")

# Report only the files that were actually written
print("\nBase load analysis results saved to:")
print(f"  - {results_pickle_path}")
if efficiency_metrics:
    print(f"  - {efficiency_csv_path}")
print(f"  - {summary_text_path}")
if not features_df.empty:
    print(f"  - {features_sample_path}")