# PV Production Analysis

## Objectives:
1. Analyze daily and seasonal production patterns
2. Correlate production with weather conditions
3. Calculate system efficiency metrics
4. Identify anomalies and degradation
5. Create production forecasting features

## Key Analyses:
- Daily production curves by season
- Clear sky vs actual production
- Temperature efficiency analysis
- Weather correlation heatmaps
- Anomaly detection results
- Self-consumption patterns
- Battery cycling analysis

In [ ]:
# Import required libraries
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import asyncio
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
warnings.filterwarnings('ignore')

# Add pems_v2 directory to path for imports
sys.path.append(str(Path('../pems_v2').resolve()))

# Import project modules
from analysis.core.data_extraction import DataExtractor
from analysis.analyzers.pattern_analysis import PVAnalyzer
from config.settings import PEMSSettings

# Set up plotting style
# NOTE: the statements below change process-wide plotting settings, so their
# relative order is deliberately left untouched.
# Select the named matplotlib style sheet used for all subsequent figures.
plt.style.use('seaborn-v0_8-darkgrid')
# Set seaborn's default colour palette.
sns.set_palette('husl')
# IPython line magic: only valid when this cell runs inside IPython/Jupyter.
%matplotlib inline

## 1. Load PV Production Data

Load comprehensive PV data including production, battery state, and weather conditions

In [ ]:
# Build the configuration object, then the helpers used by later cells
settings = PEMSSettings()
extractor = DataExtractor(settings)
pv_analyzer = PVAnalyzer()

# The analysis window is the 90 days that end at the moment this cell runs
end_date = datetime.now()
start_date = end_date - timedelta(days=90)

print("Analysis period: {:%Y-%m-%d} to {:%Y-%m-%d}".format(start_date, end_date))

In [None]:
# Extract PV and weather data
async def load_pv_data():
    """Fetch every dataset this notebook analyses for the configured window.

    The extractor queries are awaited one after another, each over the
    module-level ``start_date`` .. ``end_date`` range, with a progress line
    printed before each group.

    Returns:
        A ``(pv_data, weather_data, current_weather, price_data)`` tuple in
        that order.
    """
    window = (start_date, end_date)

    print("Loading PV production data...")
    pv_frame = await extractor.extract_pv_data(*window)

    print("Loading weather data...")
    weather_frame = await extractor.extract_weather_data(*window)
    current_frame = await extractor.extract_current_weather(*window)

    print("Loading energy prices...")
    price_frame = await extractor.extract_energy_prices(*window)

    return pv_frame, weather_frame, current_frame, price_frame

# Load data
pv_data, weather_data, current_weather, price_data = await load_pv_data()

print(f"\nData loaded:")
print(f"  PV records: {len(pv_data)}")
print(f"  Weather records: {len(weather_data)}")
print(f"  Current weather records: {len(current_weather)}")
print(f"  Price records: {len(price_data) if price_data is not None else 0}")

## 2. Daily Production Patterns

Analyze typical daily production curves by season

In [None]:
# Derive calendar columns from the timestamp index (skipped for an empty frame)
if not pv_data.empty:
    timestamps = pv_data.index
    pv_data['hour'] = timestamps.hour
    pv_data['month'] = timestamps.month
    pv_data['day_of_year'] = timestamps.dayofyear

    # Three-month season blocks: Dec-Feb is Winter, Mar-May Spring, and so on
    month_to_season = {
        month: season
        for season, months in (
            ('Winter', (12, 1, 2)),
            ('Spring', (3, 4, 5)),
            ('Summer', (6, 7, 8)),
            ('Autumn', (9, 10, 11)),
        )
        for month in months
    }
    pv_data['season'] = pv_data['month'].map(month_to_season)

In [None]:
# Plot daily production patterns by season
# Draws one panel per season with the mean hourly power and a 10th-90th
# percentile band, then prints peak statistics for each season with data.
if not pv_data.empty and 'InputPower' in pv_data.columns:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    axes = axes.flatten()

    seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
    colors = ['green', 'orange', 'brown', 'blue']

    for ax, season, color in zip(axes, seasons, colors):
        season_data = pv_data[pv_data['season'] == season]
        ax.set_title(f'{season} Daily Production Profile')

        if season_data.empty:
            # The analysis window need not cover every season; label the
            # panel instead of leaving an unexplained blank plot.
            ax.text(0.5, 0.5, 'No data in analysis period',
                    ha='center', va='center', transform=ax.transAxes)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        # Hourly statistics. GroupBy.quantile skips NaN readings just like
        # mean does; np.percentile would return NaN for any hour containing
        # a single missing sample and break the band.
        power_by_hour = season_data.groupby('hour')['InputPower']
        hourly_stats = pd.DataFrame({
            'mean': power_by_hour.mean(),
            'p10': power_by_hour.quantile(0.10),
            'p90': power_by_hour.quantile(0.90),
        })

        # Mean curve with the percentile band behind it
        ax.plot(hourly_stats.index, hourly_stats['mean'],
                label=f'{season} Average', color=color, linewidth=2)
        ax.fill_between(hourly_stats.index,
                        hourly_stats['p10'],
                        hourly_stats['p90'],
                        alpha=0.3, color=color,
                        label='10th-90th percentile')

        ax.set_xlabel('Hour of Day')
        ax.set_ylabel('Power (W)')
        ax.set_xlim(0, 23)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.suptitle('PV Production Patterns by Season', fontsize=16)
    plt.tight_layout()
    plt.show()

    # Print peak production statistics
    print("\nPeak Production Statistics by Season:")
    print("=" * 50)
    for season in seasons:
        season_data = pv_data[pv_data['season'] == season]
        if not season_data.empty:
            max_power = season_data['InputPower'].max()
            # Highest reading of each calendar day, averaged over the season
            avg_peak = season_data.groupby(season_data.index.date)['InputPower'].max().mean()
            print(f"{season:10s}: Max={max_power:6.0f}W, Avg Daily Peak={avg_peak:6.0f}W")

## 3. Weather Correlation Analysis

Analyze how weather conditions affect PV production

In [None]:
# Merge PV and weather data for correlation analysis
# Produces `merged_data` (hourly means, inner-joined on the timestamp index).
# `merged_data` is only defined when at least one PV column is available;
# later cells check for it with `'merged_data' in locals()`.
if not pv_data.empty and not weather_data.empty:
    # Select only the PV columns that exist so a missing optional column
    # (e.g. SOC) does not raise KeyError.
    pv_merge_cols = [
        col for col in ['InputPower', 'SOC', 'ACPowerToGrid', 'ACPowerToUser']
        if col in pv_data.columns
    ]

    if pv_merge_cols:
        # Resample to hourly for cleaner correlation. Non-numeric columns are
        # excluded first because averaging them raises in recent pandas.
        pv_hourly = pv_data[pv_merge_cols].resample('H').mean()
        weather_hourly = (
            weather_data.select_dtypes(include=['number', 'bool']).resample('H').mean()
        )

        # Merge datasets
        merged_data = pd.merge(pv_hourly, weather_hourly,
                               left_index=True, right_index=True, how='inner')

        # Add current weather data if available
        if not current_weather.empty:
            current_hourly = (
                current_weather.select_dtypes(include=['number', 'bool']).resample('H').mean()
            )
            merged_data = pd.merge(merged_data, current_hourly,
                                   left_index=True, right_index=True, how='left')

        print(f"Merged data shape: {merged_data.shape}")
        print(f"Available columns: {list(merged_data.columns)}")
    else:
        print("No PV power columns available; skipping PV/weather merge")

In [None]:
# Weather correlation heatmap
# Correlates the available weather and PV columns of `merged_data`, draws the
# lower triangle of the matrix and lists the weather correlations above 0.3.
if 'merged_data' in locals() and not merged_data.empty:
    # Candidate columns; only those present in the merged frame are used
    weather_cols = ['temperature_2m', 'cloudcover', 'shortwave_radiation',
                    'direct_radiation', 'diffuse_radiation']
    pv_cols = ['InputPower', 'ACPowerToGrid', 'ACPowerToUser']

    present = set(merged_data.columns)
    available_weather = [col for col in weather_cols if col in present]
    available_pv = [col for col in pv_cols if col in present]

    if available_weather and available_pv:
        # Correlation over rows where every selected column has a value
        selected = available_weather + available_pv
        corr_data = merged_data[selected].dropna()
        correlation_matrix = corr_data.corr()

        # Hide the upper triangle and the diagonal
        mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

        plt.figure(figsize=(10, 8))
        sns.heatmap(
            correlation_matrix,
            mask=mask,
            annot=True,
            fmt='.2f',
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=1,
            cbar_kws={"shrink": .8},
        )
        plt.title('Weather-PV Production Correlation Matrix', fontsize=14)
        plt.tight_layout()
        plt.show()

        # List, per PV column, the weather correlations with |r| > 0.3
        print("\nStrongest Weather-PV Correlations:")
        print("=" * 50)
        for pv_col in available_pv:
            ranked = correlation_matrix.loc[available_weather, pv_col].sort_values(ascending=False)
            print(f"\n{pv_col}:")
            for weather_col, corr in ranked[ranked.abs() > 0.3].items():
                print(f"  {weather_col:25s}: {corr:+.3f}")

In [None]:
# Temperature efficiency analysis
# Uses hours with InputPower > 100 to relate an efficiency proxy
# (power per unit shortwave radiation) to air temperature.
if 'merged_data' in locals() and {'temperature_2m', 'InputPower'}.issubset(merged_data.columns):
    # Filter for meaningful production periods
    production_data = merged_data[merged_data['InputPower'] > 100]

    has_radiation = 'shortwave_radiation' in production_data.columns
    if has_radiation:
        # Hours without temperature or radiation would turn the polynomial
        # fit and the regression below into NaN, so drop them up front.
        production_data = production_data.dropna(
            subset=['temperature_2m', 'shortwave_radiation']
        )
    production_data = production_data.copy()

    if has_radiation and not production_data.empty:
        # Calculate efficiency proxy (production per unit radiation);
        # the +1 keeps the denominator away from zero.
        production_data['efficiency'] = (
            production_data['InputPower'] / (production_data['shortwave_radiation'] + 1)
        )

        # Bin temperature data (observed=False keeps empty bins, matching
        # the long-standing default for categorical groupers)
        temp_bins = pd.cut(production_data['temperature_2m'], bins=10)
        efficiency_by_temp = (
            production_data.groupby(temp_bins, observed=False)['efficiency']
            .agg(['mean', 'std', 'count'])
        )

        # Plot efficiency vs temperature
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Efficiency curve, restricted to bins with more than 10 samples
        valid_bins = efficiency_by_temp[efficiency_by_temp['count'] > 10]
        if not valid_bins.empty:
            x_temps = [interval.mid for interval in valid_bins.index]
            ax1.errorbar(x_temps, valid_bins['mean'], yerr=valid_bins['std'],
                         fmt='o-', capsize=5, label='Mean ± Std')
            ax1.set_xlabel('Temperature (°C)')
            ax1.set_ylabel('Efficiency (W per W/m²)')
            ax1.set_title('PV Efficiency vs Temperature')
            ax1.grid(True, alpha=0.3)
            ax1.legend()

        # Scatter plot with trend; fixed seed so reruns draw the same sample
        sample_data = production_data.sample(min(1000, len(production_data)), random_state=42)
        ax2.scatter(sample_data['temperature_2m'], sample_data['InputPower'],
                    alpha=0.5, s=20, label='Data points')

        # Add trend line; a quadratic needs at least three distinct x values
        if production_data['temperature_2m'].nunique() >= 3:
            z = np.polyfit(production_data['temperature_2m'], production_data['InputPower'], 2)
            p = np.poly1d(z)
            temp_range = np.linspace(production_data['temperature_2m'].min(),
                                     production_data['temperature_2m'].max(), 100)
            ax2.plot(temp_range, p(temp_range), 'r-', linewidth=2, label='Trend')

        ax2.set_xlabel('Temperature (°C)')
        ax2.set_ylabel('PV Power (W)')
        ax2.set_title('PV Production vs Temperature')
        ax2.grid(True, alpha=0.3)
        ax2.legend()

        plt.tight_layout()
        plt.show()

        # Calculate temperature coefficient (linear fit of the efficiency
        # proxy against temperature), only with a reasonable sample size
        if len(production_data) > 100:
            slope, intercept, r_value, p_value, std_err = stats.linregress(
                production_data['temperature_2m'], production_data['efficiency']
            )
            print(f"\nTemperature Coefficient Analysis:")
            print(f"  Efficiency change per °C: {slope:.4f}")
            print(f"  R-squared: {r_value**2:.3f}")
            print(f"  Statistical significance (p-value): {p_value:.4f}")

## 4. System Performance Metrics

Calculate key performance indicators for the PV system

In [None]:
# Calculate daily performance metrics
# Builds `daily_metrics` (one row per day) from whichever source columns are
# present, then prints mean / max / total for every populated metric.
if not pv_data.empty:
    daily_metrics = pd.DataFrame()
    source_cols = pv_data.columns

    def _daily_kwh_from_power(power_w):
        """Convert a power series in W to daily kWh, assuming 15-minute samples."""
        return (power_w * 0.25 / 1000).resample('D').sum()

    # Production is already an energy column, so it is summed directly
    if 'solar_energy_kwh' in source_cols:
        daily_metrics['total_production_kwh'] = pv_data['solar_energy_kwh'].resample('D').sum()

    # Power columns are converted to energy before the daily sum
    for power_col, metric_col in (
        ('ACPowerToUser', 'self_consumed_kwh'),
        ('ACPowerToGrid', 'exported_kwh'),
    ):
        if power_col in source_cols:
            daily_metrics[metric_col] = _daily_kwh_from_power(pv_data[power_col])

    # Battery throughput counts energy in both directions (absolute value)
    if 'battery_energy_kwh' in source_cols:
        daily_metrics['battery_throughput_kwh'] = pv_data['battery_energy_kwh'].abs().resample('D').sum()

    # Self-consumption ratio, limited to the 0..1 range
    if {'self_consumed_kwh', 'total_production_kwh'}.issubset(daily_metrics.columns):
        denominator = daily_metrics['total_production_kwh'] + 0.001  # Avoid division by zero
        daily_metrics['self_consumption_ratio'] = (
            daily_metrics['self_consumed_kwh'] / denominator
        ).clip(0, 1)

    # Summary table
    print("\nPV System Performance Summary:")
    print("=" * 60)

    for col in daily_metrics.columns:
        metric = daily_metrics[col]
        if not metric.notna().any():
            continue

        mean_val, max_val = metric.mean(), metric.max()
        if 'ratio' in col:
            print(f"{col:30s}: Mean={mean_val:6.1%}, Max={max_val:6.1%}")
        else:
            total_val = metric.sum()
            print(f"{col:30s}: Mean={mean_val:6.1f}, Max={max_val:6.1f}, Total={total_val:8.1f}")

In [None]:
# Visualize performance metrics over time
# Three stacked panels of 7-day moving averages: energy flows,
# self-consumption ratio and battery throughput.
# `daily_metrics` only exists when the metrics cell ran with PV data, so check
# for it first (same `in locals()` guard the other cells use) to avoid a
# NameError on an empty dataset.
if 'daily_metrics' in locals() and not daily_metrics.empty:
    fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)

    # Production and consumption
    if 'total_production_kwh' in daily_metrics.columns:
        energy_flows = [
            ('total_production_kwh', '7-day avg Production', 'orange'),
            ('self_consumed_kwh', '7-day avg Self-consumption', 'green'),
            ('exported_kwh', '7-day avg Export', 'blue'),
        ]
        for flow_col, flow_label, flow_color in energy_flows:
            if flow_col in daily_metrics.columns:
                daily_metrics[flow_col].rolling(7).mean().plot(
                    ax=axes[0], label=flow_label, color=flow_color, linewidth=2
                )
        axes[0].set_ylabel('Energy (kWh/day)')
        axes[0].set_title('Daily Energy Flows (7-day moving average)')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

    # Self-consumption ratio
    if 'self_consumption_ratio' in daily_metrics.columns:
        daily_metrics['self_consumption_ratio'].rolling(7).mean().plot(
            ax=axes[1], color='purple', linewidth=2
        )
        axes[1].set_ylabel('Ratio')
        axes[1].set_title('Self-Consumption Ratio (7-day moving average)')
        axes[1].set_ylim(0, 1)
        axes[1].grid(True, alpha=0.3)

    # Battery usage
    if 'battery_throughput_kwh' in daily_metrics.columns:
        daily_metrics['battery_throughput_kwh'].rolling(7).mean().plot(
            ax=axes[2], color='red', linewidth=2
        )
        axes[2].set_ylabel('Energy (kWh/day)')
        axes[2].set_title('Battery Throughput (7-day moving average)')
        axes[2].grid(True, alpha=0.3)

    # Label the shared x-axis on the bottom panel explicitly instead of
    # relying on pyplot's notion of the "current" axes.
    axes[-1].set_xlabel('Date')
    plt.tight_layout()
    plt.show()

## 5. Anomaly Detection

Identify unusual patterns in PV production that might indicate issues

In [None]:
# Prepare data for anomaly detection
# Produces `daylight_data`: hourly InputPower statistics (plus mean SOC when
# available) with hour/month columns, restricted to hours 6..19.
if not pv_data.empty and 'InputPower' in pv_data.columns:
    # Create hourly aggregates for anomaly detection. SOC is optional, so it
    # is only aggregated when the column exists (avoids a KeyError).
    hourly_aggs = {'InputPower': ['mean', 'max', 'std']}
    if 'SOC' in pv_data.columns:
        hourly_aggs['SOC'] = 'mean'

    hourly_data = pv_data[list(hourly_aggs)].resample('H').agg(hourly_aggs)

    # Flatten column names, e.g. ('InputPower', 'mean') -> 'InputPower_mean'
    hourly_data.columns = ['_'.join(col).strip() for col in hourly_data.columns.values]

    # Add time features
    hourly_data['hour'] = hourly_data.index.hour
    hourly_data['month'] = hourly_data.index.month

    # Filter daylight hours only: hourly bins 6 through 19 inclusive,
    # i.e. 06:00 up to 19:59
    daylight_data = hourly_data[(hourly_data['hour'] >= 6) & (hourly_data['hour'] <= 19)].copy()

    print(f"Anomaly detection data shape: {daylight_data.shape}")

In [None]:
# Run anomaly detection using Isolation Forest
# Fits an Isolation Forest on standardized hourly features, plots the flagged
# hours and their scores, and prints a short summary. `anomalies` is read by
# the final summary cell.
if 'daylight_data' in locals() and len(daylight_data) > 100:
    # Prepare features for anomaly detection
    feature_cols = ['InputPower_mean', 'InputPower_max', 'InputPower_std', 'hour', 'month']
    available_features = [col for col in feature_cols if col in daylight_data.columns]

    # Remove NaN values; copy so the new columns below are added to an
    # independent frame rather than a slice of daylight_data
    anomaly_data = daylight_data[available_features].dropna().copy()

    if len(available_features) >= 3 and anomaly_data.empty:
        # e.g. the per-hour std is NaN for hours with a single sample
        print("No complete feature rows available; skipping anomaly detection")

    elif len(available_features) >= 3:
        # Standardize features
        scaler = StandardScaler()
        scaled_features = scaler.fit_transform(anomaly_data)

        # Run Isolation Forest (fixed seed for reproducible results)
        iso_forest = IsolationForest(contamination=0.05, random_state=42)
        anomaly_predictions = iso_forest.fit_predict(scaled_features)

        # Add predictions back to dataframe (-1 = anomaly, 1 = normal)
        anomaly_data['anomaly'] = anomaly_predictions
        anomaly_data['anomaly_score'] = iso_forest.score_samples(scaled_features)

        # Split into flagged and normal hours
        anomalies = anomaly_data[anomaly_data['anomaly'] == -1]
        normal_points = anomaly_data[anomaly_data['anomaly'] == 1]

        print(f"\nAnomaly Detection Results:")
        print(f"  Total hours analyzed: {len(anomaly_data)}")
        print(f"  Anomalies detected: {len(anomalies)} ({len(anomalies)/len(anomaly_data)*100:.1f}%)")

        # Visualize anomalies
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), sharex=True)

        # Plot production with anomalies highlighted. Only non-flagged hours
        # go into the 'Normal' series so the legend matches what is drawn.
        ax1.scatter(normal_points.index, normal_points['InputPower_mean'],
                    c='blue', alpha=0.5, s=10, label='Normal')
        ax1.scatter(anomalies.index, anomalies['InputPower_mean'],
                    c='red', s=50, label='Anomaly', marker='x')
        ax1.set_ylabel('Average Power (W)')
        ax1.set_title('PV Production Anomaly Detection')
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Plot anomaly scores with the 5% quantile as reference line
        ax2.plot(anomaly_data.index, anomaly_data['anomaly_score'],
                 color='purple', alpha=0.7)
        ax2.axhline(y=anomaly_data['anomaly_score'].quantile(0.05),
                    color='red', linestyle='--', label='Anomaly threshold')
        ax2.set_ylabel('Anomaly Score')
        ax2.set_xlabel('Date')
        ax2.set_title('Anomaly Scores Over Time')
        ax2.legend()
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Analyze anomaly patterns
        if len(anomalies) > 0:
            # pandas Index objects have no .mode(); wrap the hours in a
            # Series to compute the most frequent hour.
            hour_modes = pd.Series(anomalies.index.hour).mode()
            most_common_hour = hour_modes.iloc[0] if not hour_modes.empty else 'N/A'

            print("\nAnomaly Pattern Analysis:")
            print(f"  Most common hour: {most_common_hour}")
            print(f"  Average production during anomalies: {anomalies['InputPower_mean'].mean():.0f}W")
            print(f"  Normal average production: {normal_points['InputPower_mean'].mean():.0f}W")

            # Show recent anomalies
            recent_anomalies = anomalies.tail(5)
            if len(recent_anomalies) > 0:
                print("\nRecent Anomalies:")
                for idx, row in recent_anomalies.iterrows():
                    print(f"  {idx}: Power={row['InputPower_mean']:.0f}W")

## 6. Battery Cycling Analysis

Analyze battery charge/discharge patterns and health

In [None]:
# Analyze battery cycling patterns
# Builds `daily_battery` (one row per day): SOC min/max/range/mean and, when
# charge/discharge power is available, operating hours and energy per day.
# The 0.25 factors assume 15-minute samples.
if not pv_data.empty and 'SOC' in pv_data.columns:
    # Calculate battery metrics
    battery_metrics = pd.DataFrame(index=pv_data.index)
    battery_metrics['SOC'] = pv_data['SOC']

    if 'ChargePower' in pv_data.columns and 'DischargePower' in pv_data.columns:
        # Positive = charging, negative = discharging; the +/-10 dead band
        # keeps near-zero readings from counting as operation
        battery_metrics['net_power'] = pv_data['ChargePower'].fillna(0) - pv_data['DischargePower'].fillna(0)
        battery_metrics['is_charging'] = battery_metrics['net_power'] > 10
        battery_metrics['is_discharging'] = battery_metrics['net_power'] < -10

    # Daily battery statistics
    daily_soc = battery_metrics['SOC'].resample('D')
    daily_battery = pd.DataFrame()
    daily_battery['soc_min'] = daily_soc.min()
    daily_battery['soc_max'] = daily_soc.max()
    daily_battery['soc_range'] = daily_battery['soc_max'] - daily_battery['soc_min']
    daily_battery['soc_mean'] = daily_soc.mean()

    if 'net_power' in battery_metrics.columns:
        # Calculate charge/discharge hours
        daily_battery['charge_hours'] = battery_metrics['is_charging'].resample('D').sum() * 0.25
        daily_battery['discharge_hours'] = battery_metrics['is_discharging'].resample('D').sum() * 0.25

        # Calculate energy throughput. Clipping (instead of filtering rows
        # out) keeps every timestamp, so a day without charging or
        # discharging contributes 0 kWh rather than NaN and the daily
        # averages below cover all days consistently.
        net_power = battery_metrics['net_power']
        charge_energy = net_power.clip(lower=0) * 0.25 / 1000
        discharge_energy = (-net_power).clip(lower=0) * 0.25 / 1000
        daily_battery['charge_kwh'] = charge_energy.resample('D').sum()
        daily_battery['discharge_kwh'] = discharge_energy.resample('D').sum()

    # Display battery statistics
    print("\nBattery Usage Statistics:")
    print("=" * 60)
    print(f"Average daily SOC range: {daily_battery['soc_range'].mean():.1f}%")
    print(f"Average minimum SOC: {daily_battery['soc_min'].mean():.1f}%")
    print(f"Average maximum SOC: {daily_battery['soc_max'].mean():.1f}%")

    if 'charge_hours' in daily_battery.columns:
        print(f"Average daily charge time: {daily_battery['charge_hours'].mean():.1f} hours")
        print(f"Average daily discharge time: {daily_battery['discharge_hours'].mean():.1f} hours")
        print(f"Average daily charge energy: {daily_battery['charge_kwh'].mean():.1f} kWh")
        print(f"Average daily discharge energy: {daily_battery['discharge_kwh'].mean():.1f} kWh")

In [None]:
# Visualize battery patterns
# Three panels sharing one date axis: daily SOC envelope, stacked
# charge/discharge hours, and 7-day average energy throughput.
# Everything is drawn with matplotlib calls on the same datetime index:
# a pandas bar plot positions bars at 0..n-1 and would corrupt the shared
# x-axis used by the other two panels.
if 'daily_battery' in locals() and not daily_battery.empty:
    fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)
    days = daily_battery.index

    # SOC patterns
    for soc_col in ['soc_min', 'soc_max', 'soc_mean']:
        axes[0].plot(days, daily_battery[soc_col], label=soc_col)
    axes[0].fill_between(days,
                         daily_battery['soc_min'],
                         daily_battery['soc_max'],
                         alpha=0.3, label='SOC Range')
    axes[0].set_ylabel('SOC (%)')
    axes[0].set_title('Daily Battery State of Charge')
    axes[0].set_ylim(0, 100)
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Charge/discharge hours, stacked (bar width is in days)
    if 'charge_hours' in daily_battery.columns:
        charge_hours = daily_battery['charge_hours'].fillna(0)
        discharge_hours = daily_battery['discharge_hours'].fillna(0)
        axes[1].bar(days, charge_hours, width=0.8, label='charge_hours')
        axes[1].bar(days, discharge_hours, width=0.8,
                    bottom=charge_hours, label='discharge_hours')
        axes[1].set_ylabel('Hours')
        axes[1].set_title('Daily Battery Operation Time')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

    # Energy throughput
    if 'charge_kwh' in daily_battery.columns:
        weekly_energy = daily_battery[['charge_kwh', 'discharge_kwh']].rolling(7).mean()
        for energy_col in weekly_energy.columns:
            axes[2].plot(days, weekly_energy[energy_col], label=energy_col)
        axes[2].set_ylabel('Energy (kWh)')
        axes[2].set_xlabel('Date')
        axes[2].set_title('Daily Battery Energy Throughput (7-day average)')
        axes[2].legend()
        axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 7. Export Policy Analysis

Analyze grid export patterns and optimization with energy prices

In [None]:
# Analyze export patterns with price data
# Combines hourly grid export with hourly prices, reports how much energy was
# exported during the most expensive quarter of hours, and plots daily export
# power, price and export value. The 0.25 factor assumes 15-minute samples.
if (not pv_data.empty and 'ACPowerToGrid' in pv_data.columns
        and price_data is not None and not price_data.empty):
    # Hourly export power and energy
    export_analysis = pd.DataFrame()
    export_analysis['export_power'] = pv_data['ACPowerToGrid'].resample('H').mean()
    export_analysis['export_energy_kwh'] = (pv_data['ACPowerToGrid'] * 0.25 / 1000).resample('H').sum()

    # Merge with price data
    price_hourly = price_data.resample('H').mean()
    export_analysis = pd.merge(export_analysis, price_hourly,
                               left_index=True, right_index=True, how='left')

    if 'price_czk_kwh' in export_analysis.columns:
        # Calculate export value
        export_analysis['export_value_czk'] = (
            export_analysis['export_energy_kwh'] * export_analysis['price_czk_kwh']
        )

        # Identify high-price export periods (top quartile of hourly prices)
        price_threshold = export_analysis['price_czk_kwh'].quantile(0.75)
        export_analysis['high_price_period'] = export_analysis['price_czk_kwh'] > price_threshold

        # Calculate statistics
        total_export = export_analysis['export_energy_kwh'].sum()
        high_price_export = export_analysis[export_analysis['high_price_period']]['export_energy_kwh'].sum()
        total_value = export_analysis['export_value_czk'].sum()

        # Share of export sold at high prices; 0 when nothing was exported
        # (avoids printing nan/inf from a zero division)
        high_price_share = high_price_export / total_export * 100 if total_export > 0 else 0.0

        # Average export price = value earned per exported kWh, using only
        # hours that have a price. A plain mean of the price column would
        # describe the market, not the price the exports actually achieved.
        has_price = export_analysis['price_czk_kwh'].notna()
        priced_export = export_analysis.loc[has_price, 'export_energy_kwh'].sum()
        avg_export_price = total_value / priced_export if priced_export > 0 else float('nan')

        print("\nGrid Export Analysis:")
        print("=" * 60)
        print(f"Total export: {total_export:.1f} kWh")
        print(f"Export during high prices (>{price_threshold:.2f} CZK/kWh): {high_price_export:.1f} kWh ({high_price_share:.1f}%)")
        print(f"Average export price: {avg_export_price:.2f} CZK/kWh")
        print(f"Total export value: {total_value:.0f} CZK")

        # Visualize export patterns
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 8), sharex=True)

        # Export power and prices on twin y-axes
        ax1_twin = ax1.twinx()
        export_analysis['export_power'].resample('D').mean().plot(ax=ax1,
                                                                  color='blue',
                                                                  label='Export Power')
        export_analysis['price_czk_kwh'].resample('D').mean().plot(ax=ax1_twin,
                                                                   color='red',
                                                                   label='Energy Price',
                                                                   alpha=0.7)
        ax1.set_ylabel('Export Power (W)', color='blue')
        ax1_twin.set_ylabel('Price (CZK/kWh)', color='red')
        ax1.set_title('Daily Average Export Power vs Energy Prices')
        ax1.grid(True, alpha=0.3)

        # One legend covering both y-axes; without it the labels passed to
        # the two plot calls are never displayed
        power_handles, power_labels = ax1.get_legend_handles_labels()
        price_handles, price_labels = ax1_twin.get_legend_handles_labels()
        ax1.legend(power_handles + price_handles, power_labels + price_labels, loc='upper left')

        # Export value
        daily_export_value = export_analysis['export_value_czk'].resample('D').sum()
        daily_export_value.rolling(7).mean().plot(ax=ax2, color='green', linewidth=2)
        ax2.set_ylabel('Export Value (CZK/day)')
        ax2.set_xlabel('Date')
        ax2.set_title('Daily Export Value (7-day moving average)')
        ax2.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

## 8. Summary and Feature Creation

Create features for ML model development

In [None]:
# Create comprehensive feature set for PV prediction
# Builds a `features` frame on the PV timestamp index (calendar, cyclical,
# lag, weather and battery columns plus the target) and writes it to parquet.
# Lag and rolling window sizes assume 15-minute samples.
if not pv_data.empty:
    print("\nCreating PV Prediction Features:")
    print("=" * 60)

    features = pd.DataFrame(index=pv_data.index)

    # Time-based features
    features['hour'] = features.index.hour
    features['day_of_year'] = features.index.dayofyear
    features['month'] = features.index.month
    features['is_weekend'] = features.index.weekday.isin([5, 6]).astype(int)

    # Cyclical encoding for time features
    features['hour_sin'] = np.sin(2 * np.pi * features['hour'] / 24)
    features['hour_cos'] = np.cos(2 * np.pi * features['hour'] / 24)
    features['month_sin'] = np.sin(2 * np.pi * features['month'] / 12)
    features['month_cos'] = np.cos(2 * np.pi * features['month'] / 12)

    # Lag features from PV data
    if 'InputPower' in pv_data.columns:
        features['pv_power_lag_1h'] = pv_data['InputPower'].shift(4)  # 1 hour ago (15-min data)
        features['pv_power_lag_24h'] = pv_data['InputPower'].shift(96)  # 24 hours ago
        # Mean of the 12 samples BEFORE the current one. Without the shift
        # the window would contain the current InputPower, which is also the
        # target column below, leaking the answer into the feature.
        features['pv_power_rolling_mean_3h'] = pv_data['InputPower'].shift(1).rolling(12).mean()

    # Weather features (if available)
    # NOTE(review): this is an exact-timestamp left join; if the weather data
    # is coarser than the PV data, rows in between stay NaN - confirm intended.
    if not weather_data.empty:
        weather_features = weather_data[[
            col for col in ['temperature_2m', 'cloudcover', 'shortwave_radiation',
                            'direct_radiation', 'diffuse_radiation']
            if col in weather_data.columns
        ]]
        features = pd.merge(features, weather_features,
                            left_index=True, right_index=True, how='left')

    # Battery state features
    if 'SOC' in pv_data.columns:
        features['battery_soc'] = pv_data['SOC']
        features['battery_available_capacity'] = 100 - features['battery_soc']

    # Target variable
    if 'InputPower' in pv_data.columns:
        features['target_pv_power'] = pv_data['InputPower']

    # Display feature summary (categories are matched by column-name substrings)
    print(f"Total features created: {len(features.columns)}")
    print(f"\nFeature categories:")
    print(f"  - Time-based: {len([col for col in features.columns if any(x in col for x in ['hour', 'day', 'month', 'weekend'])])}")
    print(f"  - Weather: {len([col for col in features.columns if any(x in col for x in ['temp', 'cloud', 'radiation', 'wind'])])}")
    print(f"  - Lag features: {len([col for col in features.columns if 'lag' in col])}")
    print(f"  - Battery: {len([col for col in features.columns if 'battery' in col])}")

    # Save features for later use (path is relative to the working directory)
    features_file = Path('../../../data/processed/pv_prediction_features.parquet')
    features_file.parent.mkdir(parents=True, exist_ok=True)
    features.to_parquet(features_file)
    print(f"\nFeatures saved to: {features_file}")

## 9. Key Insights and Recommendations

Summarize findings and provide actionable recommendations

In [None]:
print("\nPV Production Analysis - Key Insights:")
print("=" * 60)

# Each section below appends an insight only when the result it is based on
# was actually produced by an earlier cell.
insights = []

# Production insights
if not pv_data.empty and 'InputPower' in pv_data.columns:
    max_production = pv_data['InputPower'].max()
    insights.append(f"Peak production capacity: {max_production:.0f}W")
    # Report daily energy only when the energy column exists; printing a
    # placeholder 0.0 kWh would read as "no production".
    if 'solar_energy_kwh' in pv_data.columns:
        avg_daily_production = pv_data['solar_energy_kwh'].resample('D').sum().mean()
        insights.append(f"Average daily production: {avg_daily_production:.1f} kWh")

# Self-consumption insights
if 'daily_metrics' in locals() and 'self_consumption_ratio' in daily_metrics.columns:
    avg_self_consumption = daily_metrics['self_consumption_ratio'].mean()
    insights.append(f"Average self-consumption ratio: {avg_self_consumption:.1%}")

# Weather correlation insights: quote the measured coefficients instead of
# asserting a conclusion that the data may not support
if 'correlation_matrix' in locals() and 'InputPower' in correlation_matrix.columns:
    for weather_col, weather_label in [
        ('shortwave_radiation', 'solar radiation'),
        ('temperature_2m', 'temperature'),
    ]:
        if weather_col in correlation_matrix.index:
            coefficient = correlation_matrix.loc[weather_col, 'InputPower']
            if pd.notna(coefficient):
                insights.append(
                    f"Correlation between {weather_label} and PV production: {coefficient:+.2f}"
                )

# Anomaly insights
if 'anomalies' in locals() and len(anomalies) > 0:
    insights.append(f"Detected {len(anomalies)} production anomalies requiring investigation")

# Battery insights
if 'daily_battery' in locals():
    avg_soc_range = daily_battery['soc_range'].mean()
    insights.append(f"Average daily battery SOC range: {avg_soc_range:.1f}%")

# Display insights
for i, insight in enumerate(insights, 1):
    print(f"{i}. {insight}")

print("\nRecommendations:")
print("=" * 60)
recommendations = [
    "Implement weather-based PV production forecasting for better energy management",
    "Optimize battery charging schedule based on production patterns and price signals",
    "Investigate detected anomalies to identify potential system issues",
    "Consider increasing self-consumption through load shifting during peak production",
    "Monitor temperature coefficients for signs of panel degradation"
]

for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")