# PV Production Analysis

This notebook analyzes photovoltaic (PV) production patterns, seasonal variations, and correlations with weather data.

## Key Analysis Areas:
1. **Production Patterns**: Daily, weekly, and seasonal patterns
2. **Weather Correlations**: Impact of temperature, cloud cover, and solar radiation
3. **Export Policy Impact**: Before/after export policy change analysis
4. **Performance Metrics**: Capacity factor, efficiency analysis
5. **Anomaly Detection**: Identify system issues or unusual performance


In [None]:
import sys
import asyncio
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

# Make the project packages importable: this appends the directory two levels
# above the current working directory (cwd -> parent -> parent) to sys.path.
# NOTE(review): this depends on the kernel's working directory; presumably the
# notebook sits two levels below the repository root — confirm.
sys.path.append(str(Path().absolute().parent.parent))

# Project-local modules; these imports only resolve after the sys.path tweak above.
from analysis.data_extraction import DataExtractor
from analysis.pattern_analysis import PVAnalyzer
from config.settings import PEMSSettings

# Set up plotting: matplotlib style sheet, seaborn colour palette, and inline
# rendering of figures in the notebook output.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline

## 1. Data Loading and Preprocessing

In [None]:
# Build the project objects used by the rest of the notebook: the settings
# object, the data extractor configured from it, and the PV analyzer.
settings = PEMSSettings()
extractor = DataExtractor(settings)
pv_analyzer = PVAnalyzer()

# Analysis window: the 730 days (2 years) that end at the moment this cell runs.
end_date = datetime.now()
start_date = end_date - timedelta(days=730)

print(f"Analysis period: {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}")

In [None]:
# Extract PV and weather data
try:
    pv_data = await extractor.extract_pv_data(start_date, end_date)
    weather_data = await extractor.extract_weather_data(start_date, end_date)
    price_data = await extractor.extract_energy_prices(start_date, end_date)
    
    print(f"PV data shape: {pv_data.shape}")
    print(f"Weather data shape: {weather_data.shape}")
    print(f"Price data shape: {price_data.shape if price_data is not None else 'None'}")
    
    # Display data summary
    print("\nPV Data Columns:")
    print(pv_data.columns.tolist())
    
    print("\nPV Data Sample:")
    display(pv_data.head())
    
except Exception as e:
    print(f"Error loading data: {e}")
    # Load from saved files if extraction fails
    try:
        pv_data = extractor.load_from_parquet("pv_data")
        weather_data = extractor.load_from_parquet("weather_data")
        print("Loaded data from saved parquet files")
    except:
        print("Failed to load saved data. Please run data extraction first.")

## 2. Basic Production Statistics

In [None]:
# Run the PV analysis and print the headline statistics.
#
# `pv_results` is always bound (to an empty dict when there is no data) because
# every later section guards itself with `"<key>" in pv_results`; leaving the
# name undefined would turn those guards into NameErrors.
pv_results = {}

if not pv_data.empty:
    # Run PV analysis
    pv_results = pv_analyzer.analyze_pv_production(pv_data, weather_data)

    # Display basic statistics
    if "basic_stats" in pv_results:
        stats = pv_results["basic_stats"]

        print("=== PV PRODUCTION STATISTICS ===")
        print(f"Total Energy Generated: {stats.get('total_energy_kwh', 0):.1f} kWh")
        print(f"Maximum Power: {stats.get('max_power', 0):.1f} W")
        print(f"Mean Power: {stats.get('mean_power', 0):.1f} W")
        # Capacity factors are scaled by 100 here to print them as percentages.
        print(f"Capacity Factor: {stats.get('capacity_factor', 0)*100:.1f}%")
        print(f"Daylight Capacity Factor: {stats.get('daylight_capacity_factor', 0)*100:.1f}%")
        print(f"Peak Production Months: {', '.join(map(str, stats.get('peak_months', [])))}")
        print(f"Low Production Months: {', '.join(map(str, stats.get('low_months', [])))}")
else:
    print("No PV data available for analysis")

## 3. Production Patterns Visualization

In [None]:
# Four-panel overview of PV production: average hourly profile, average
# monthly energy, average power per weekday, and daily energy over the most
# recent three months of data.

# Hours represented by one InputPower sample; power [W] * hours / 1000 -> kWh.
# NOTE(review): 0.25 assumes 15-minute sampling — confirm against the extractor.
SAMPLE_INTERVAL_HOURS = 0.25

if not pv_data.empty and 'InputPower' in pv_data.columns:
    power = pv_data['InputPower']

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Daily production pattern
    hourly_avg = power.groupby(power.index.hour).mean()
    axes[0, 0].plot(hourly_avg.index, hourly_avg.values, marker='o', linewidth=2)
    axes[0, 0].set_title('Average Daily Production Pattern')
    axes[0, 0].set_xlabel('Hour of Day')
    axes[0, 0].set_ylabel('Average Power (W)')
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Monthly production variation
    # Sum the energy per calendar (year, month), then average each month across
    # years. Grouping on the index fields avoids the resample('M') alias, which
    # pandas 2.2 deprecated in favour of 'ME', and keeps months without any
    # samples out of the average instead of counting them as 0 kWh.
    years = power.index.year.rename('year')
    months = power.index.month.rename('month')
    monthly_total = (
        power.groupby([years, months]).sum() * SAMPLE_INTERVAL_HOURS / 1000  # Convert to kWh
    )
    # reindex guarantees exactly 12 bars; months never observed become NaN (no bar).
    monthly_avg = monthly_total.groupby(level='month').mean().reindex(range(1, 13))
    axes[0, 1].bar(monthly_avg.index, monthly_avg.values)
    axes[0, 1].set_title('Average Monthly Production')
    axes[0, 1].set_xlabel('Month')
    axes[0, 1].set_ylabel('Average Monthly Energy (kWh)')
    axes[0, 1].set_xticks(range(1, 13))

    # 3. Weekly pattern
    # reindex to all seven weekdays (0=Mon .. 6=Sun) so the values always line
    # up with the labels, even when the data does not cover every weekday.
    weekly_avg = power.groupby(power.index.weekday).mean().reindex(range(7))
    weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    axes[1, 0].bar(weekdays, weekly_avg.values)
    axes[1, 0].set_title('Average Production by Day of Week')
    axes[1, 0].set_ylabel('Average Power (W)')

    # 4. Production time series (last 3 months)
    # Explicit calendar cutoff relative to the newest sample; replaces
    # DataFrame.last('3M'), which is deprecated since pandas 2.1.
    cutoff = power.index.max() - pd.DateOffset(months=3)
    recent_data = power[power.index >= cutoff]
    daily_production = recent_data.resample('D').sum() * SAMPLE_INTERVAL_HOURS / 1000
    axes[1, 1].plot(daily_production.index, daily_production.values, alpha=0.7)
    axes[1, 1].set_title('Daily Production (Last 3 Months)')
    axes[1, 1].set_xlabel('Date')
    axes[1, 1].set_ylabel('Daily Energy (kWh)')
    axes[1, 1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## 4. Weather Correlation Analysis

In [None]:
# Bar chart of the PV-vs-weather correlation coefficients, followed by the
# strongest positive and negative entries found in the analysis results.
if "weather_correlations" in pv_results:
    correlations = pv_results["weather_correlations"]

    print("=== WEATHER CORRELATIONS ===")
    corr_data = correlations["correlations"] if "correlations" in correlations else None
    if corr_data:
        # Parallel lists: variable names and their correlation coefficients
        variables = list(corr_data)
        corr_values = [corr_data[name]["correlation"] for name in variables]

        plt.figure(figsize=(12, 6))
        # Negative coefficients are drawn red, everything else green
        bar_colors = ['red' if coeff < 0 else 'green' for coeff in corr_values]
        bars = plt.bar(variables, corr_values, color=bar_colors, alpha=0.7)
        plt.title('PV Production Correlation with Weather Variables')
        plt.ylabel('Correlation Coefficient')
        plt.xticks(rotation=45)
        plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        plt.grid(True, alpha=0.3)

        # Value label just beyond the end of each bar, on the side it points to
        for rect, coeff in zip(bars, corr_values):
            label_x = rect.get_x() + rect.get_width()/2
            label_y = rect.get_height() + 0.01 * np.sign(coeff)
            if coeff > 0:
                anchor = 'bottom'
            else:
                anchor = 'top'
            plt.text(label_x, label_y, f'{coeff:.3f}', ha='center', va=anchor)

        plt.tight_layout()
        plt.show()

    # Each "strongest" entry is indexed as [0] -> variable name, [1] -> details
    if "strongest_positive" in correlations and correlations["strongest_positive"]:
        top_positive = correlations["strongest_positive"]
        print(f"Strongest positive correlation: {top_positive[0]} ({top_positive[1]['correlation']:.3f})")

    if "strongest_negative" in correlations and correlations["strongest_negative"]:
        top_negative = correlations["strongest_negative"]
        print(f"Strongest negative correlation: {top_negative[0]} ({top_negative[1]['correlation']:.3f})")

## 5. Export Policy Impact Analysis

In [None]:
# Report the export-policy findings, handling the "nothing to report" cases first.
if "export_policy" not in pv_results:
    print("Export policy analysis not available (requires price data)")
else:
    export_policy = pv_results["export_policy"]

    print("=== EXPORT POLICY IMPACT ===")
    if "policy_change_date" not in export_policy:
        print("No clear export policy change detected in the data")
    else:
        change_date = export_policy["policy_change_date"]
        print(f"Export policy change detected: {change_date}")
        print(f"Export consistency after change: {export_policy.get('export_consistency', 0):.1%}")
        print(f"Days analyzed post-change: {export_policy.get('days_analyzed', 0)}")

        # Period before the change; each sub-report is printed only if present
        if "pre_export_analysis" in pv_results:
            pre = pv_results["pre_export_analysis"]
            print("\nPre-export period (self-consumption):")
            print(f"  Duration: {pre.get('period_duration_days', 0)} days")
            print(f"  Total production: {pre.get('total_production_kwh', 0):.1f} kWh")
            print(f"  Self-consumption ratio: {pre.get('self_consumption_ratio', 0):.1%}")
            print(f"  Estimated curtailment: {pre.get('estimated_curtailment_kwh', 0):.1f} kWh")

        # Period after the change
        if "post_export_analysis" in pv_results:
            post = pv_results["post_export_analysis"]
            print("\nPost-export period (price-based):")
            print(f"  Total export: {post.get('total_export_kwh', 0):.1f} kWh")
            print(f"  Estimated revenue: {post.get('estimated_total_revenue_czk', 0):.0f} CZK")
            print(f"  Revenue per kWh: {post.get('revenue_per_kwh_czk', 0):.2f} CZK/kWh")

        # Monetary figures reported under "optimization_potential"
        if "optimization_potential" in pv_results:
            potential = pv_results["optimization_potential"]
            print("\nOptimization potential:")
            print(f"  Lost revenue from curtailment: {potential.get('lost_revenue_curtailment_czk', 0):.0f} CZK")
            print(f"  Storage value potential: {potential.get('storage_value_potential_czk', 0):.0f} CZK")

## 6. Seasonal Analysis and Decomposition

In [None]:
# Seasonal summary: peak season, one profile curve per season, and the
# strength figures from the decomposition results.
if "seasonal_patterns" in pv_results:
    seasonal = pv_results["seasonal_patterns"]

    print("=== SEASONAL PATTERNS ===")
    if "peak_season" in seasonal:
        print(f"Peak production season: {seasonal['peak_season']}")

    # Plot seasonal profiles if available
    if "seasonal_profiles" in seasonal:
        # Only entries stored as pandas Series are drawn; anything else is skipped
        plottable = {
            season: profile
            for season, profile in seasonal["seasonal_profiles"].items()
            if isinstance(profile, pd.Series)
        }

        plt.figure(figsize=(12, 8))

        for season, profile in plottable.items():
            plt.plot(profile.index, profile.values, label=season, marker='o', alpha=0.7)

        plt.title('Seasonal Production Profiles')
        plt.xlabel('Hour of Day')
        plt.ylabel('Average Power (W)')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    # STL decomposition results
    if "decomposition" in seasonal:
        decomposition = seasonal["decomposition"]
        seasonal_strength = decomposition.get('seasonal_strength', 0)
        print(f"Seasonal strength: {seasonal_strength:.3f}")
        trend_strength = decomposition.get('trend_strength', 0)
        print(f"Trend strength: {trend_strength:.3f}")

## 7. Performance and Efficiency Analysis

In [None]:
# Efficiency analysis: power output per temperature range and the range
# reported as optimal.
if "efficiency_analysis" in pv_results:
    efficiency = pv_results["efficiency_analysis"]

    print("=== EFFICIENCY ANALYSIS ===")
    if "temperature_efficiency" in efficiency:
        temp_eff = efficiency["temperature_efficiency"]
        print("Power output by temperature range:")
        # The loop variable is deliberately not named `stats`: in a notebook it
        # would overwrite the global `stats` bound by the basic-statistics cell
        # and leave that name pointing at the last temperature bucket.
        for temp_range, range_stats in temp_eff.items():
            print(f"  {temp_range}: {range_stats['mean_power']:.0f}W (max: {range_stats['max_power']:.0f}W, samples: {range_stats['records']})")

    if "optimal_temperature_range" in efficiency:
        print(f"Optimal temperature range: {efficiency['optimal_temperature_range']}")

# Clear sky analysis: share of clear-sky time and mean power, clear vs cloudy.
if "clear_sky_analysis" in pv_results:
    clear_sky = pv_results["clear_sky_analysis"]

    print("\n=== CLEAR SKY PERFORMANCE ===")
    print(f"Clear sky conditions: {clear_sky.get('clear_sky_percentage', 0):.1f}% of time")
    print(f"Clear sky mean power: {clear_sky.get('clear_sky_mean_power', 0):.0f}W")
    print(f"Cloudy sky mean power: {clear_sky.get('cloudy_sky_mean_power', 0):.0f}W")
    # Default of 1 prints "1.0x" (no advantage) when the key is absent.
    print(f"Clear sky advantage: {clear_sky.get('clear_sky_advantage', 1):.1f}x")

## 8. Anomaly Detection

In [None]:
# Print the anomaly-detection summary: counts, the largest anomaly, and the
# first few anomaly dates.
if "anomalies" in pv_results:
    anomalies = pv_results["anomalies"]

    print("=== ANOMALY DETECTION ===")
    print(f"Total anomalies detected: {anomalies.get('total_anomalies', 0)}")
    print(f"Anomaly rate: {anomalies.get('anomaly_percentage', 0):.2f}%")
    print(f"Zero production events: {anomalies.get('zero_production_events', 0)}")

    # Tolerate a missing/None "largest_anomaly" entry and missing sub-keys
    # instead of raising KeyError/TypeError in the middle of the report.
    largest = anomalies.get("largest_anomaly") or {}
    if largest.get("timestamp") and largest.get("value") is not None:
        print(f"Largest anomaly: {largest['value']:.0f}W at {largest['timestamp']}")

    # List the first anomaly dates if available. The length check (rather than
    # truthiness) also works when the dates are array-like, where a bare
    # boolean test would raise.
    anomaly_dates = anomalies.get("anomaly_dates")
    if anomaly_dates is not None and len(anomaly_dates) > 0:
        shown_dates = list(anomaly_dates[:10])  # At most the first 10
        # The label reports how many dates are actually listed.
        print(f"\nFirst {len(shown_dates)} anomaly dates: {', '.join(map(str, shown_dates))}")

## 9. Feature Importance for ML Models

In [None]:
# Report which features matter most for PV prediction and plot their scores.
if "feature_importance" in pv_results:
    feature_imp = pv_results["feature_importance"]

    print("=== FEATURE IMPORTANCE FOR PV PREDICTION ===")
    if "top_5_features" in feature_imp:
        top_features = feature_imp["top_5_features"]
        # map(str, ...) so non-string feature names cannot break the join
        print(f"Top 5 features: {', '.join(map(str, top_features))}")
        print(f"Model R² score: {feature_imp.get('model_score', 0):.3f}")

        # Plot feature importance
        if "feature_importance" in feature_imp:
            importance_dict = feature_imp["feature_importance"]
            if importance_dict:
                # Rank by score before truncating: taking the first ten keys
                # is only a "top 10" if the dict happens to arrive sorted.
                ranked = sorted(
                    importance_dict.items(),
                    key=lambda item: item[1],
                    reverse=True,
                )[:10]  # Top 10
                features = [str(name) for name, _ in ranked]
                importance = [score for _, score in ranked]

                plt.figure(figsize=(10, 6))
                plt.barh(features, importance, alpha=0.7)
                plt.title('Feature Importance for PV Production Prediction')
                plt.xlabel('Importance Score')
                # Inverted y-axis puts the highest-scoring feature at the top
                plt.gca().invert_yaxis()
                plt.tight_layout()
                plt.show()

## 10. Model Performance for Forecasting

In [None]:
# Tabulate the forecasting metrics (mean ± std) reported for each model.
if "prediction_performance" in pv_results:
    pred_perf = pv_results["prediction_performance"]

    print("=== PREDICTION MODEL PERFORMANCE ===")

    # One table row per entry that is a dict carrying "mean_rmse";
    # every other entry of pred_perf is skipped.
    models_data = [
        {
            "Model": model_name,
            "RMSE": f"{metrics['mean_rmse']:.1f} ± {metrics.get('std_rmse', 0):.1f}",
            "MAE": f"{metrics['mean_mae']:.1f} ± {metrics.get('std_mae', 0):.1f}",
            "R²": f"{metrics['mean_r2']:.3f} ± {metrics.get('std_r2', 0):.3f}"
        }
        for model_name, metrics in pred_perf.items()
        if isinstance(metrics, dict) and "mean_rmse" in metrics
    ]

    if not models_data:
        print("No valid model performance data available")
    else:
        models_df = pd.DataFrame(models_data)
        display(models_df)

        if "best_model" in pred_perf:
            print(f"\nBest performing model: {pred_perf['best_model']}")

## Summary and Recommendations

Based on the analysis above, here are the key findings and recommendations for the PV system:

In [None]:
# Turn the analysis results into a short list of recommendations.

# Thresholds used by the rules below.
LOW_CAPACITY_FACTOR = 0.15         # below this -> suggest inspection
HIGH_CAPACITY_FACTOR = 0.25        # above this -> report excellent performance
HIGH_ANOMALY_RATE_PCT = 5          # anomaly percentage considered high
CURTAILMENT_LOSS_CZK = 1000        # lost revenue worth flagging
STRONG_CORRELATION = 0.5           # |correlation| considered strong

print("=== PV SYSTEM ANALYSIS SUMMARY ===")
print()

# Generate recommendations based on analysis results
recommendations = []

if "basic_stats" in pv_results:
    stats = pv_results["basic_stats"]
    # No default of 0 here: a missing capacity factor means "unknown" and must
    # not be reported as a low-performing system.
    capacity_factor = stats.get('capacity_factor')

    if capacity_factor is not None:
        if capacity_factor < LOW_CAPACITY_FACTOR:
            recommendations.append("LOW CAPACITY FACTOR: Consider system inspection for shading, soiling, or equipment issues")
        elif capacity_factor > HIGH_CAPACITY_FACTOR:
            recommendations.append("EXCELLENT PERFORMANCE: System is performing above average")

if "anomalies" in pv_results:
    anomaly_rate = pv_results["anomalies"].get('anomaly_percentage', 0)
    if anomaly_rate > HIGH_ANOMALY_RATE_PCT:
        recommendations.append(f"HIGH ANOMALY RATE ({anomaly_rate:.1f}%): Investigate system reliability")

if "export_policy" in pv_results and "optimization_potential" in pv_results:
    lost_revenue = pv_results["optimization_potential"].get('lost_revenue_curtailment_czk', 0)
    if lost_revenue > CURTAILMENT_LOSS_CZK:
        recommendations.append(f"CURTAILMENT LOSSES: {lost_revenue:.0f} CZK lost - consider battery storage")

if "weather_correlations" in pv_results:
    correlations = pv_results["weather_correlations"]
    if "strongest_negative" in correlations and correlations["strongest_negative"]:
        # Entry is indexed as [0] -> variable name, [1] -> details with "correlation"
        strongest_neg = correlations["strongest_negative"]
        if abs(strongest_neg[1]["correlation"]) > STRONG_CORRELATION:
            recommendations.append(f"STRONG NEGATIVE CORRELATION with {strongest_neg[0]} - optimize for these conditions")

if recommendations:
    print("KEY RECOMMENDATIONS:")
    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec}")
elif not pv_results:
    # Nothing was analysed, so an "all clear" status would be misleading.
    print("SYSTEM STATUS: No analysis results available - run the analysis sections above first")
else:
    print("SYSTEM STATUS: No major issues detected - system performing within normal parameters")

print("\nAnalysis completed successfully!")
print(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")