# Weather Correlation Analysis

## Objectives:
1. Analyze correlations between weather conditions and energy systems
2. Quantify weather impact on PV production, heating demand, and total consumption
3. Develop weather-based prediction models
4. Identify optimal weather conditions for energy efficiency
5. Create weather features for energy management optimization

## Key Analyses:
- PV production vs solar radiation correlation
- Temperature impact on heating demand
- Weather influence on total energy consumption
- Seasonal weather pattern analysis
- Weather-based forecasting model development
- Extreme weather event analysis
- Weather optimization recommendations

In [ ]:
# Import required libraries
# Standard library
import asyncio
import os
import sys
import warnings
from datetime import datetime, timedelta
from pathlib import Path

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

# Add pems_v2 directory to path for imports
sys.path.append(str(Path('./pems_v2').resolve()))

# Import project modules
from analysis.core.data_extraction import DataExtractor
from config.settings import PEMSSettings

# Set up plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

## 1. Load Weather and Energy Data

Load comprehensive weather data and all energy system data for correlation analysis

In [None]:
# Build the project configuration and the extractor that is constructed from it
settings = PEMSSettings()
extractor = DataExtractor(settings)

# Analysis window: the 150 days ending now (for comprehensive weather analysis)
analysis_window = timedelta(days=150)
end_date = datetime.now()
start_date = end_date - analysis_window

print(f"Analysis period: {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}")

In [None]:
# Extract weather and energy data
async def load_weather_and_energy_data():
    """Fetch every dataset used by the correlation analysis.

    Reads the notebook-level ``extractor``, ``start_date`` and ``end_date``.
    Current-weather and battery data are treated as optional: if loading them
    raises, a warning is printed and an empty DataFrame is used instead.
    Failures while loading any other dataset propagate to the caller.

    Returns:
        Tuple ``(weather_data, current_weather, pv_data, consumption_data,
        room_data, relay_data, battery_data)``.
    """

    async def _fetch_optional(method_name, label):
        # The attribute lookup stays inside the try so that a missing
        # extractor method is downgraded to a warning as well.
        try:
            return await getattr(extractor, method_name)(start_date, end_date)
        except Exception as e:
            print(f"Warning: Could not load {label}: {e}")
            return pd.DataFrame()

    print("Loading weather forecast data...")
    forecast = await extractor.extract_weather_data(start_date, end_date)

    print("Loading current weather data...")
    observed = await _fetch_optional("extract_current_weather", "current weather")

    print("Loading PV production data...")
    pv = await extractor.extract_pv_data(start_date, end_date)

    print("Loading energy consumption data...")
    consumption = await extractor.extract_energy_consumption(start_date, end_date)

    print("Loading room temperature data...")
    rooms = await extractor.extract_room_temperatures(start_date, end_date)

    print("Loading heating relay data...")
    relays = await extractor.extract_relay_states(start_date, end_date)

    print("Loading battery data...")
    battery = await _fetch_optional("extract_battery_data", "battery data")

    return forecast, observed, pv, consumption, rooms, relays, battery

# Load data
weather_data, current_weather, pv_data, consumption_data, room_data, relay_data, battery_data = await load_weather_and_energy_data()

print(f"\nData loaded:")
print(f"  Weather forecast records: {len(weather_data)}")
print(f"  Current weather records: {len(current_weather)}")
print(f"  PV records: {len(pv_data)}")
print(f"  Consumption records: {len(consumption_data)}")
print(f"  Room data: {len(room_data)} rooms")
print(f"  Relay data: {len(relay_data)} rooms")
print(f"  Battery records: {len(battery_data)}")

# Display available weather parameters
if not weather_data.empty:
    print(f"\nWeather forecast parameters: {list(weather_data.columns)}")
if not current_weather.empty:
    print(f"Current weather parameters: {list(current_weather.columns)}")

## 2. Merge and Prepare Weather Data

Combine different weather data sources and prepare for correlation analysis

In [None]:
# Merge weather data sources and derive hourly weather features.
#
# Inputs (from the loading cell): `weather_data` (forecast), `current_weather`.
# Outputs: `merged_weather` and `weather_hourly`.
# `weather_hourly` is always defined (empty when no weather data is available)
# so that `weather_hourly.empty` guards in later cells do not raise NameError.
merged_weather = pd.DataFrame()
weather_hourly = pd.DataFrame()

if not weather_data.empty:
    merged_weather = weather_data.copy()
    print(f"Using weather forecast data as primary source: {len(merged_weather)} records")

if not current_weather.empty:
    if merged_weather.empty:
        merged_weather = current_weather.copy()
        print(f"Using current weather data as primary source: {len(merged_weather)} records")
    else:
        # Walk the current-weather columns in their own order so the resulting
        # column order is deterministic (set iteration order is not).
        shared_cols = [c for c in current_weather.columns if c in merged_weather.columns]
        extra_cols = [c for c in current_weather.columns if c not in merged_weather.columns]

        # Forecast values are kept; current weather only fills the forecast's
        # missing values (alignment is by index).
        for col in shared_cols:
            merged_weather[col] = merged_weather[col].fillna(current_weather[col])

        # Columns that only exist in current weather are added as-is; rows
        # without a matching index entry become NaN.
        for col in extra_cols:
            merged_weather[col] = current_weather[col]

        print(f"Merged weather data: {len(merged_weather)} records with {len(merged_weather.columns)} parameters")

if merged_weather.empty:
    print("Error: No weather data available for analysis")
else:
    # Resample weather data to hourly means
    weather_hourly = merged_weather.resample('H').mean()

    # Temperature features
    if 'temperature_2m' in weather_hourly.columns:
        weather_hourly['temp_celsius'] = weather_hourly['temperature_2m']
        # Degree-hours: how far the hour is below 18 / above 24
        weather_hourly['heating_degree_hours'] = np.maximum(18 - weather_hourly['temp_celsius'], 0)
        weather_hourly['cooling_degree_hours'] = np.maximum(weather_hourly['temp_celsius'] - 24, 0)

    # Solar radiation features
    if 'shortwave_radiation' in weather_hourly.columns:
        weather_hourly['solar_irradiance'] = weather_hourly['shortwave_radiation']
        # An hourly mean in W/m² is numerically the energy in Wh/m² for that hour
        weather_hourly['solar_energy_wh_m2'] = weather_hourly['solar_irradiance']

    # Cloud features
    if 'cloudcover' in weather_hourly.columns:
        # Clear-sky factor; assumes cloudcover is a percentage (0-100) - TODO confirm
        weather_hourly['cloud_factor'] = (100 - weather_hourly['cloudcover']) / 100

    # Wind features
    if 'windspeed_10m' in weather_hourly.columns:
        # Falls back to a constant 20 when no temperature column exists
        wind_chill_temp = weather_hourly.get('temp_celsius', 20)
        wind_speed = weather_hourly['windspeed_10m']
        wind_term = wind_speed ** 0.16
        # NOTE(review): these coefficients look like the standard wind-chill
        # index, which expects km/h and cold temperatures - confirm the units
        # of windspeed_10m. The formula is applied for any temperature here.
        weather_hourly['wind_chill_factor'] = np.where(
            wind_speed > 5,
            13.12 + 0.6215 * wind_chill_temp - 11.37 * wind_term + 0.3965 * wind_chill_temp * wind_term,
            wind_chill_temp,
        )

    # Time-based features taken from the (datetime) index
    weather_hourly['hour'] = weather_hourly.index.hour
    weather_hourly['day_of_year'] = weather_hourly.index.dayofyear
    weather_hourly['month'] = weather_hourly.index.month
    # Fixed 06:00-20:59 window, independent of season
    weather_hourly['is_daylight'] = (weather_hourly['hour'] >= 6) & (weather_hourly['hour'] <= 20)

    print(f"\nProcessed weather data: {len(weather_hourly)} hours")
    print(f"Available weather features: {len(weather_hourly.columns)}")
    print(f"Sample features: {list(weather_hourly.columns)[:10]}")

    # Data quality check: percentage of missing values per column
    missing_pct = weather_hourly.isnull().mean() * 100
    high_missing = missing_pct[missing_pct > 20]

    if not high_missing.empty:
        print(f"\nWeather parameters with >20% missing data:")
        for param, pct in high_missing.items():
            print(f"  {param}: {pct:.1f}% missing")
    else:
        print(f"\nWeather data quality: Good (all parameters <20% missing)")

## 3. PV Production vs Weather Correlation

Analyze correlation between solar production and weather conditions

In [None]:
# Analyze PV production vs weather correlation.
#
# Inputs (from earlier cells): `pv_data`, `weather_hourly`.
# Outputs: `pv_weather_analysis` (dict, empty when the analysis could not run)
# and, when the analysis ran, `daylight_data`, `pv_power_col`,
# `efficiency_by_temp` and `cloud_impact`.

# Reset all per-run results up front so that values left over from an earlier
# execution of this cell can never leak into the summary built below.
pv_weather_analysis = {}
production_data = None
efficiency_by_temp = None
cloud_impact = None
optimal_temp = None
max_efficiency = None
temp_coefficient = None

if pv_data.empty or weather_hourly.empty:
    print("PV or weather data not available")
else:
    # Hourly means of the PV data, joined to weather on the shared timestamps
    pv_hourly = pv_data.resample('H').mean()
    pv_weather = pd.merge(pv_hourly, weather_hourly, left_index=True, right_index=True, how='inner')

    if len(pv_weather) <= 24:  # Need sufficient data
        print("Insufficient merged PV-weather data")
    else:
        print(f"\nPV-Weather Correlation Analysis:")
        print(f"Merged dataset: {len(pv_weather)} hours")

        # Focus on daylight hours for solar analysis
        daylight_data = pv_weather[pv_weather['is_daylight']].copy()

        if len(daylight_data) <= 10:
            print("Insufficient daylight data for PV analysis")
        else:
            # Prefer 'InputPower', fall back to 'solar_power'
            pv_power_col = next(
                (col for col in ('InputPower', 'solar_power') if col in daylight_data.columns),
                None,
            )

            if pv_power_col is None:
                print("No PV power data available for correlation analysis")
            else:
                # Pearson correlation of PV power with each available parameter
                weather_params = ['temperature_2m', 'shortwave_radiation', 'direct_radiation',
                                  'diffuse_radiation', 'cloudcover', 'humidity']
                available_params = [p for p in weather_params if p in daylight_data.columns]

                pv_weather_correlations = {
                    param: daylight_data[pv_power_col].corr(daylight_data[param])
                    for param in available_params
                }

                print(f"\nPV Power Correlations (daylight hours only):")
                print("=" * 50)
                sorted_corrs = sorted(pv_weather_correlations.items(), key=lambda x: abs(x[1]), reverse=True)
                for param, corr in sorted_corrs:
                    print(f"{param:25s}: {corr:+.3f}")

                if 'shortwave_radiation' in daylight_data.columns:
                    # Keep only hours with meaningful production (> 100)
                    production_data = daylight_data[daylight_data[pv_power_col] > 100].copy()

                    if len(production_data) > 20:
                        # Power per unit radiation; the +1 avoids division by zero
                        production_data['pv_efficiency'] = (
                            production_data[pv_power_col] / (production_data['shortwave_radiation'] + 1)
                        )

                        # Efficiency vs temperature (10 equal-width bins)
                        if 'temperature_2m' in production_data.columns:
                            temp_bins = pd.cut(production_data['temperature_2m'], bins=10)
                            # observed=False keeps empty bins in the table explicitly
                            efficiency_by_temp = (
                                production_data.groupby(temp_bins, observed=False)['pv_efficiency']
                                .agg(['mean', 'count'])
                            )

                            # Only consider bins with sufficient data. Named
                            # distinctly so it cannot be mistaken for the bins
                            # of another analysis in the shared namespace.
                            efficiency_valid_bins = efficiency_by_temp[efficiency_by_temp['count'] >= 5]

                            if len(efficiency_valid_bins) >= 3:
                                optimal_temp = efficiency_valid_bins['mean'].idxmax().mid
                                max_efficiency = efficiency_valid_bins['mean'].max()

                                print(f"\nPV Efficiency Analysis:")
                                print(f"Optimal temperature: {optimal_temp:.1f}°C")
                                print(f"Maximum efficiency: {max_efficiency:.3f} W per W/m²")

                                # Linear fit of mean efficiency against bin midpoints
                                temp_values = [interval.mid for interval in efficiency_valid_bins.index]
                                eff_values = efficiency_valid_bins['mean'].values
                                temp_coefficient, _intercept, r_value, _p_value, _std_err = stats.linregress(
                                    temp_values, eff_values
                                )

                                print(f"Temperature coefficient: {temp_coefficient:.6f} per °C")
                                print(f"R-squared: {r_value**2:.3f}")

                        # Cloud cover impact
                        if 'cloudcover' in production_data.columns:
                            # include_lowest=True so that exactly 0% cloud cover
                            # lands in 'Clear' instead of being dropped as NaN
                            cloud_bins = pd.cut(production_data['cloudcover'], bins=[0, 25, 50, 75, 100],
                                                labels=['Clear', 'Partly Cloudy', 'Mostly Cloudy', 'Overcast'],
                                                include_lowest=True)

                            cloud_impact = (
                                production_data.groupby(cloud_bins, observed=False)[pv_power_col]
                                .agg(['mean', 'std', 'count'])
                            )

                            print(f"\nCloud Cover Impact on PV Production:")
                            print("-" * 45)
                            print(f"{'Condition':15s} {'Avg Power (W)':12s} {'Std (W)':10s} {'Count':8s}")
                            print("-" * 45)

                            for condition, row in cloud_impact.iterrows():
                                if row['count'] >= 5:  # Only show conditions with sufficient data
                                    print(f"{condition:15s} {row['mean']:11.0f} {row['std']:9.0f} {row['count']:7.0f}")

                # Store PV weather analysis results
                pv_weather_analysis = {
                    'correlations': pv_weather_correlations,
                    'daylight_hours_analyzed': len(daylight_data),
                    'production_hours': len(production_data) if production_data is not None else 0
                }

                # Entries may be None when too few bins had enough data
                if efficiency_by_temp is not None:
                    pv_weather_analysis['temperature_analysis'] = {
                        'optimal_temp': optimal_temp,
                        'max_efficiency': max_efficiency,
                        'temp_coefficient': temp_coefficient
                    }

                if cloud_impact is not None:
                    pv_weather_analysis['cloud_impact'] = cloud_impact.to_dict()

In [None]:
# Visualize PV-weather relationships in a 2x2 figure:
#   [0,0] correlation bar chart      [0,1] PV power vs solar radiation
#   [1,0] efficiency vs temperature  [1,1] production by cloud condition
#
# Reads `pv_weather_analysis`, `daylight_data` and `pv_power_col` from the
# analysis cell; `efficiency_by_temp` and `cloud_impact` are looked up in the
# notebook namespace because they only exist when that part of the analysis ran.
if pv_weather_analysis and 'correlations' in pv_weather_analysis:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Correlation bar chart
    if len(pv_weather_analysis['correlations']) > 0:
        corr_data = pd.Series(pv_weather_analysis['correlations'])
        corr_data.plot(kind='barh', ax=axes[0,0], color='skyblue')
        axes[0,0].set_title('PV Power - Weather Correlations')
        axes[0,0].set_xlabel('Correlation Coefficient')
        axes[0,0].grid(True, alpha=0.3)
        axes[0,0].axvline(x=0, color='black', linewidth=0.5)

    # PV vs Solar Radiation scatter (random sample of at most 500 hours)
    if 'shortwave_radiation' in daylight_data.columns and pv_power_col:
        sample_size = min(500, len(daylight_data))
        sample_data = daylight_data.sample(sample_size)
        has_temperature = 'temperature_2m' in sample_data.columns

        # Colour by temperature only when the column exists; a bare scalar is
        # not a valid colour argument for scatter().
        scatter_kwargs = {'alpha': 0.6}
        if has_temperature:
            scatter_kwargs.update(c=sample_data['temperature_2m'], cmap='viridis')

        scatter = axes[0,1].scatter(sample_data['shortwave_radiation'], sample_data[pv_power_col],
                                    **scatter_kwargs)
        axes[0,1].set_xlabel('Solar Radiation (W/m²)')
        axes[0,1].set_ylabel('PV Power (W)')
        axes[0,1].set_title('PV Power vs Solar Radiation')
        axes[0,1].grid(True, alpha=0.3)

        # Add colorbar for temperature
        if has_temperature:
            cbar = plt.colorbar(scatter, ax=axes[0,1])
            cbar.set_label('Temperature (°C)')

    # Temperature efficiency curve
    efficiency_table = globals().get('efficiency_by_temp')
    if 'temperature_analysis' in pv_weather_analysis and efficiency_table is not None:
        # Same "at least 5 samples per bin" rule as the analysis cell
        plotted_bins = efficiency_table[efficiency_table['count'] >= 5]
        if len(plotted_bins) > 0:
            x_temps = [interval.mid for interval in plotted_bins.index]
            y_effs = plotted_bins['mean'].values

            axes[1,0].plot(x_temps, y_effs, 'o-', linewidth=2, markersize=6)
            axes[1,0].set_xlabel('Temperature (°C)')
            axes[1,0].set_ylabel('PV Efficiency (W per W/m²)')
            axes[1,0].set_title('PV Efficiency vs Temperature')
            axes[1,0].grid(True, alpha=0.3)

            # Add trend line if enough points
            if len(x_temps) >= 3:
                z = np.polyfit(x_temps, y_effs, 1)
                p = np.poly1d(z)
                axes[1,0].plot(x_temps, p(x_temps), 'r--', alpha=0.8, label=f'Trend: {z[0]:.6f}/°C')
                axes[1,0].legend()

    # Cloud cover impact
    cloud_table = globals().get('cloud_impact')
    if 'cloud_impact' in pv_weather_analysis and cloud_table is not None:
        cloud_means = cloud_table['mean']
        cloud_stds = cloud_table['std']

        bars = axes[1,1].bar(range(len(cloud_means)), cloud_means.values,
                             yerr=cloud_stds.values, capsize=5, alpha=0.8)
        axes[1,1].set_xticks(range(len(cloud_means)))
        axes[1,1].set_xticklabels(cloud_means.index, rotation=45)
        axes[1,1].set_ylabel('Average PV Power (W)')
        axes[1,1].set_title('PV Production by Cloud Condition')
        axes[1,1].grid(True, alpha=0.3)

        # Add value labels on bars. Conditions without data have a NaN mean
        # (no bar is drawn) and single-sample conditions have a NaN std; skip
        # or zero those so no label is placed at a NaN coordinate.
        for bar, mean_val, std_val in zip(bars, cloud_means.values, cloud_stds.values):
            if np.isnan(mean_val):
                continue
            label_offset = 0 if np.isnan(std_val) else std_val / 2
            axes[1,1].text(bar.get_x() + bar.get_width()/2, mean_val + label_offset,
                           f'{mean_val:.0f}W', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

## 4. Heating Demand vs Weather Correlation

Analyze relationship between outdoor conditions and heating system usage

In [None]:
# Analyze heating demand vs weather correlation.
#
# Inputs (from earlier cells): `relay_data` (mapping room -> relay DataFrame),
# `weather_hourly`.
# Outputs: `heating_weather_analysis` (dict, empty when the analysis could not
# run) and, when it ran, `heating_weather` (hourly heating power + weather).

# Reset per-run results so nothing from an earlier execution (or from another
# analysis sharing the notebook namespace) leaks into this run.
heating_weather_analysis = {}
heating_threshold = None
min_heating_temp = None
hdd_coefficient = None
hdd_r2 = None
temperature_analysis = None
# Reset to an empty table under its existing name, because other cells may
# look `valid_bins` up in the shared namespace.
valid_bins = pd.DataFrame()

if not relay_data or weather_hourly.empty:
    print("Heating or weather data not available")
else:
    # Calculate total heating demand from all rooms
    total_heating_power = pd.Series(0, index=weather_hourly.index)

    # Room power ratings for heating load calculation
    # (presumably watts, given the "(W)" labels printed below)
    room_power_ratings = {
        'obyvacka': 2000, 'kuchyn': 1500, 'loznice': 1500,
        'detsky_pokoj': 1000, 'koupelna': 800, 'pracovna': 1200,
        'chodba': 500, 'spiz': 300
    }

    rooms_with_heating = []

    for room, relay_df in relay_data.items():
        if relay_df.empty or 'value' not in relay_df.columns:
            continue

        # Hourly mean of the relay value, scaled by the room's rating
        # (rooms without a rating default to 1000)
        relay_hourly = relay_df['value'].resample('H').mean()
        heating_power = relay_hourly * room_power_ratings.get(room, 1000)

        # fill_value=0 treats hours missing on one side as zero power
        total_heating_power = total_heating_power.add(heating_power, fill_value=0)
        rooms_with_heating.append(room)

    if not rooms_with_heating:
        print("No heating relay data available")
    else:
        # Merge heating and weather data on the shared timestamps
        heating_weather = pd.merge(total_heating_power.to_frame('heating_power'),
                                   weather_hourly, left_index=True, right_index=True, how='inner')

        print(f"\nHeating-Weather Correlation Analysis:")
        print(f"Rooms included: {', '.join(rooms_with_heating)}")
        print(f"Merged dataset: {len(heating_weather)} hours")

        # Pearson correlation of heating power with each available parameter
        weather_params = ['temperature_2m', 'humidity', 'windspeed_10m', 'cloudcover',
                          'heating_degree_hours', 'wind_chill_factor']
        available_params = [p for p in weather_params if p in heating_weather.columns]

        heating_weather_correlations = {
            param: heating_weather['heating_power'].corr(heating_weather[param])
            for param in available_params
        }

        print(f"\nHeating Power Correlations:")
        print("=" * 40)
        sorted_corrs = sorted(heating_weather_correlations.items(), key=lambda x: abs(x[1]), reverse=True)
        for param, corr in sorted_corrs:
            print(f"{param:25s}: {corr:+.3f}")

        # Analyze temperature-heating relationship
        if 'temperature_2m' in heating_weather.columns:
            temp_bins = pd.cut(heating_weather['temperature_2m'], bins=20)
            heating_by_temp = (
                heating_weather.groupby(temp_bins, observed=False)['heating_power']
                .agg(['mean', 'count'])
            )

            # Only consider bins with sufficient data
            valid_bins = heating_by_temp[heating_by_temp['count'] >= 5]

            if len(valid_bins) >= 5:
                # Heating threshold: upper edge of the WARMEST bin whose mean
                # heating still exceeds 10% of the overall average. Bins are
                # ordered cold -> warm, so scan from the warm end; scanning
                # from the cold end would always stop at the coldest bin.
                significant_power = heating_weather['heating_power'].mean() * 0.1
                for temp_bin, row in valid_bins.iloc[::-1].iterrows():
                    if row['mean'] > significant_power:
                        heating_threshold = temp_bin.right
                        break

                # Upper edge of the warmest bin averaging more than 100 W
                non_zero_heating = valid_bins[valid_bins['mean'] > 100]
                if len(non_zero_heating) > 0:
                    min_heating_temp = max(interval.right for interval in non_zero_heating.index)

                print(f"\nHeating Temperature Analysis:")
                # Compare against None: 0.0°C is a valid result but is falsy
                if heating_threshold is not None:
                    print(f"Heating threshold: {heating_threshold:.1f}°C")
                if min_heating_temp is not None:
                    print(f"Minimum heating temperature: {min_heating_temp:.1f}°C")

                temperature_analysis = {
                    'heating_threshold': heating_threshold,
                    'min_heating_temp': min_heating_temp
                }

                # Linear model: heating power vs heating degree-hours
                if 'heating_degree_hours' in heating_weather.columns:
                    hdd_data = heating_weather[heating_weather['heating_degree_hours'] > 0]
                    # The regression cannot handle NaN in the target
                    hdd_data = hdd_data.dropna(subset=['heating_degree_hours', 'heating_power'])

                    if len(hdd_data) > 10:
                        X = hdd_data['heating_degree_hours'].values.reshape(-1, 1)
                        y = hdd_data['heating_power'].values

                        model = LinearRegression().fit(X, y)
                        hdd_coefficient = model.coef_[0]
                        hdd_r2 = model.score(X, y)

                        print(f"Heating degree relationship:")
                        print(f"  Coefficient: {hdd_coefficient:.1f} W per degree-hour")
                        print(f"  R-squared: {hdd_r2:.3f}")

        # Analyze seasonal heating patterns
        heating_weather['month'] = heating_weather.index.month
        monthly_heating = heating_weather.groupby('month')['heating_power'].agg(['mean', 'sum', 'count'])

        print(f"\nMonthly Heating Pattern:")
        print("-" * 40)
        print(f"{'Month':8s} {'Avg (W)':10s} {'Total (kWh)':12s} {'Hours':8s}")
        print("-" * 40)

        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

        for month, row in monthly_heating.iterrows():
            if row['count'] > 0:
                month_name = month_names[month - 1]
                # Sum of hourly mean power (W) is energy in Wh; convert to kWh
                total_kwh = row['sum'] / 1000
                print(f"{month_name:8s} {row['mean']:9.0f} {total_kwh:11.1f} {row['count']:7.0f}")

        # Store heating weather analysis
        heating_weather_analysis = {
            'correlations': heating_weather_correlations,
            'rooms_analyzed': rooms_with_heating,
            'hours_analyzed': len(heating_weather),
            'monthly_pattern': monthly_heating.to_dict()
        }

        if temperature_analysis is not None:
            heating_weather_analysis['temperature_analysis'] = temperature_analysis

        if hdd_coefficient is not None:
            heating_weather_analysis['hdd_analysis'] = {
                'coefficient': hdd_coefficient,
                'r_squared': hdd_r2
            }

In [None]:
# Visualize heating-weather relationships in a 2x2 figure:
#   [0,0] correlation bar chart   [0,1] heating power vs temperature scatter
#   [1,0] binned heating curve    [1,1] monthly heating demand
#
# Reads `heating_weather_analysis` and `heating_weather` from the analysis
# cell. The binned curve is recomputed here from `heating_weather` so the plot
# never depends on a same-named variable left behind by another analysis.
if heating_weather_analysis and 'correlations' in heating_weather_analysis:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Heating correlation bar chart
    if len(heating_weather_analysis['correlations']) > 0:
        corr_data = pd.Series(heating_weather_analysis['correlations'])
        corr_data.plot(kind='barh', ax=axes[0,0], color='lightcoral')
        axes[0,0].set_title('Heating Power - Weather Correlations')
        axes[0,0].set_xlabel('Correlation Coefficient')
        axes[0,0].grid(True, alpha=0.3)
        axes[0,0].axvline(x=0, color='black', linewidth=0.5)

    if 'temperature_2m' in heating_weather.columns:
        # Heating vs Temperature scatter (random sample of at most 1000 hours)
        sample_size = min(1000, len(heating_weather))
        sample_data = heating_weather.sample(sample_size)

        # Color by hour to show daily patterns
        scatter = axes[0,1].scatter(sample_data['temperature_2m'], sample_data['heating_power'],
                                    c=sample_data.index.hour, cmap='tab20', alpha=0.6, s=20)
        axes[0,1].set_xlabel('Temperature (°C)')
        axes[0,1].set_ylabel('Heating Power (W)')
        axes[0,1].set_title('Heating Power vs Temperature')
        axes[0,1].grid(True, alpha=0.3)

        # Add colorbar for hour
        cbar = plt.colorbar(scatter, ax=axes[0,1])
        cbar.set_label('Hour of Day')

        # Add trend line. Rows with a missing value are dropped first, because
        # np.polyfit cannot fit data containing NaN.
        trend_points = sample_data[['temperature_2m', 'heating_power']].dropna()
        if len(trend_points) > 10:
            z = np.polyfit(trend_points['temperature_2m'], trend_points['heating_power'], 1)
            p = np.poly1d(z)
            temp_range = np.linspace(trend_points['temperature_2m'].min(),
                                     trend_points['temperature_2m'].max(), 100)
            axes[0,1].plot(temp_range, p(temp_range), 'r-', linewidth=2, alpha=0.8)

        # Temperature binned heating: 20 equal-width bins, keeping only bins
        # with at least 5 hours of data
        heating_temp_bins = pd.cut(heating_weather['temperature_2m'], bins=20)
        binned_heating = (
            heating_weather.groupby(heating_temp_bins, observed=False)['heating_power']
            .agg(['mean', 'count'])
        )
        binned_heating = binned_heating[binned_heating['count'] >= 5]

        if len(binned_heating) > 0:
            x_temps = [interval.mid for interval in binned_heating.index]
            y_heating = binned_heating['mean'].values

            axes[1,0].plot(x_temps, y_heating, 'o-', linewidth=2, markersize=6, color='red')
            axes[1,0].set_xlabel('Temperature (°C)')
            axes[1,0].set_ylabel('Average Heating Power (W)')
            axes[1,0].set_title('Heating Power vs Temperature (Binned)')
            axes[1,0].grid(True, alpha=0.3)

            # Highlight heating threshold (read from the stored results;
            # compared against None because 0.0°C is a valid threshold)
            stored_threshold = heating_weather_analysis.get('temperature_analysis', {}).get('heating_threshold')
            if stored_threshold is not None:
                axes[1,0].axvline(x=stored_threshold, color='orange', linestyle='--',
                                  label=f'Threshold: {stored_threshold:.1f}°C')
                axes[1,0].legend()

    # Monthly heating pattern
    if 'monthly_pattern' in heating_weather_analysis:
        monthly_data = pd.DataFrame(heating_weather_analysis['monthly_pattern'])
        if not monthly_data.empty and 'mean' in monthly_data.columns:
            month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

            # Filter months with data
            available_months = [m for m in monthly_data.index if m <= 12]
            month_labels = [month_names[m-1] for m in available_months]
            month_values = [monthly_data.loc[m, 'mean'] for m in available_months]

            bars = axes[1,1].bar(month_labels, month_values, color='lightblue', alpha=0.8)
            axes[1,1].set_ylabel('Average Heating Power (W)')
            axes[1,1].set_title('Monthly Heating Demand')
            # Rotate the existing tick labels instead of re-setting label text
            # without fixed tick positions
            axes[1,1].tick_params(axis='x', labelrotation=45)
            axes[1,1].grid(True, alpha=0.3)

            # Add value labels
            for bar, value in zip(bars, month_values):
                axes[1,1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + value*0.01,
                               f'{value:.0f}', ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.show()

## 5. Total Energy Consumption vs Weather

Analyze correlation between weather and total building energy consumption

In [None]:
# Analyze total consumption vs weather correlation
if not consumption_data.empty and not weather_hourly.empty:
    
    # Prepare consumption data
    consumption_hourly = consumption_data.resample('H').mean()
    
    # Find total power column
    power_columns = [col for col in consumption_hourly.columns if 'power' in col.lower()]
    if power_columns:
        total_power_col = power_columns[0]
    elif not pv_data.empty and 'ACPowerToUser' in pv_data.columns:
        # Use PV self-consumption as proxy for total consumption
        pv_hourly = pv_data['ACPowerToUser'].resample('H').mean()
        consumption_hourly = pd.merge(consumption_hourly, pv_hourly.to_frame('total_power'),
                                    left_index=True, right_index=True, how='outer')
        total_power_col = 'total_power'
    else:
        total_power_col = None
    
    if total_power_col and total_power_col in consumption_hourly.columns:
        # Merge consumption and weather data
        consumption_weather = pd.merge(consumption_hourly[[total_power_col]], 
                                     weather_hourly, left_index=True, right_index=True, how='inner')
        
        print(f"\nTotal Consumption-Weather Correlation Analysis:")
        print(f"Merged dataset: {len(consumption_weather)} hours")
        
        # Calculate correlations with weather parameters
        weather_params = ['temperature_2m', 'humidity', 'windspeed_10m', 'cloudcover',
                         'shortwave_radiation', 'heating_degree_hours', 'cooling_degree_hours']
        available_params = [p for p in weather_params if p in consumption_weather.columns]
        
        consumption_weather_correlations = {}
        
        for param in available_params:
            correlation = consumption_weather[total_power_col].corr(consumption_weather[param])
            consumption_weather_correlations[param] = correlation
        
        # Display correlations
        print(f"\nTotal Consumption Correlations:")
        print("=" * 45)
        sorted_corrs = sorted(consumption_weather_correlations.items(), key=lambda x: abs(x[1]), reverse=True)
        
        for param, corr in sorted_corrs:
            print(f"{param:25s}: {corr:+.3f}")
        
        # Analyze consumption patterns by weather conditions
        
        # Temperature impact
        if 'temperature_2m' in consumption_weather.columns:
            temp_bins = pd.cut(consumption_weather['temperature_2m'], 
                              bins=[-20, 0, 5, 10, 15, 20, 25, 30, 35, 50], 
                              labels=['<0°C', '0-5°C', '5-10°C', '10-15°C', '15-20°C', 
                                     '20-25°C', '25-30°C', '30-35°C', '>35°C'])
            
            consumption_by_temp = consumption_weather.groupby(temp_bins)[total_power_col].agg(['mean', 'std', 'count'])
            
            # Only show bins with sufficient data
            valid_temp_bins = consumption_by_temp[consumption_by_temp['count'] >= 10]
            
            if len(valid_temp_bins) > 0:
                print(f"\nConsumption by Temperature Range:")
                print("-" * 50)
                print(f"{'Range':12s} {'Avg (W)':10s} {'Std (W)':10s} {'Count':8s}")
                print("-" * 50)
                
                for temp_range, row in valid_temp_bins.iterrows():
                    print(f"{temp_range:12s} {row['mean']:9.0f} {row['std']:9.0f} {row['count']:7.0f}")
        
        # Solar radiation impact
        if 'shortwave_radiation' in consumption_weather.columns:
            # Separate day and night analysis
            day_data = consumption_weather[consumption_weather['is_daylight']]
            night_data = consumption_weather[~consumption_weather['is_daylight']]
            
            if len(day_data) > 10 and len(night_data) > 10:
                day_corr = day_data[total_power_col].corr(day_data['shortwave_radiation'])
                night_avg = night_data[total_power_col].mean()
                day_avg = day_data[total_power_col].mean()
                
                print(f"\nDay/Night Consumption Analysis:")
                print(f"Day-time average: {day_avg:.0f} W")
                print(f"Night-time average: {night_avg:.0f} W")
                print(f"Day/night ratio: {day_avg/night_avg:.2f}")
                print(f"Solar correlation (daytime): {day_corr:.3f}")
        
        # Seasonal consumption analysis
        consumption_weather['season'] = consumption_weather['month'].map({
            12: 'Winter', 1: 'Winter', 2: 'Winter',
            3: 'Spring', 4: 'Spring', 5: 'Spring',
            6: 'Summer', 7: 'Summer', 8: 'Summer',
            9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
        })
        
        seasonal_consumption = consumption_weather.groupby('season')[total_power_col].agg(['mean', 'std', 'count'])
        
        print(f"\nSeasonal Consumption Pattern:")
        print("-" * 40)
        print(f"{'Season':10s} {'Avg (W)':10s} {'Std (W)':10s} {'Count':8s}")
        print("-" * 40)
        
        for season, row in seasonal_consumption.iterrows():
            if row['count'] > 0:
                print(f"{season:10s} {row['mean']:9.0f} {row['std']:9.0f} {row['count']:7.0f}")
        
        # Weather extremes analysis
        if 'temperature_2m' in consumption_weather.columns:
            # Define extreme conditions
            temp_95 = consumption_weather['temperature_2m'].quantile(0.95)
            temp_5 = consumption_weather['temperature_2m'].quantile(0.05)
            
            hot_days = consumption_weather[consumption_weather['temperature_2m'] >= temp_95]
            cold_days = consumption_weather[consumption_weather['temperature_2m'] <= temp_5]
            normal_days = consumption_weather[
                (consumption_weather['temperature_2m'] > temp_5) & 
                (consumption_weather['temperature_2m'] < temp_95)
            ]
            
            if len(hot_days) > 5 and len(cold_days) > 5:
                print(f"\nExtreme Weather Impact:")
                print(f"Hot days (>{temp_95:.1f}°C): {hot_days[total_power_col].mean():.0f}W avg ({len(hot_days)} hours)")
                print(f"Cold days (<{temp_5:.1f}°C): {cold_days[total_power_col].mean():.0f}W avg ({len(cold_days)} hours)")
                print(f"Normal days: {normal_days[total_power_col].mean():.0f}W avg ({len(normal_days)} hours)")
        
        # Store consumption weather analysis
        consumption_weather_analysis = {
            'correlations': consumption_weather_correlations,
            'hours_analyzed': len(consumption_weather),
            'seasonal_pattern': seasonal_consumption.to_dict()
        }
        
        if 'day_avg' in locals():
            consumption_weather_analysis['day_night_analysis'] = {
                'day_average': day_avg,
                'night_average': night_avg,
                'day_night_ratio': day_avg/night_avg,
                'solar_correlation_day': day_corr
            }
        
        if 'hot_days' in locals():
            consumption_weather_analysis['extreme_weather'] = {
                'hot_threshold': temp_95,
                'cold_threshold': temp_5,
                'hot_consumption': hot_days[total_power_col].mean(),
                'cold_consumption': cold_days[total_power_col].mean(),
                'normal_consumption': normal_days[total_power_col].mean()
            }
    
    else:
        print("No suitable total power consumption data found")
        consumption_weather_analysis = {}
else:
    print("Consumption or weather data not available")
    consumption_weather_analysis = {}

## 6. Weather-Based Prediction Models

Develop models to predict energy parameters based on weather conditions

In [None]:
# Develop weather-based prediction models
#
# For each energy target (PV production, heating demand, total consumption)
# this cell fits three regressors on weather features, scores them on a
# held-out 30% split, prints a comparison table and records the outcome in
# `weather_prediction_models`.

# Imported here so the cell is self-contained: the notebook's import cell only
# brings in RandomForestRegressor, train_test_split, mean_squared_error and
# r2_score from scikit-learn.
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler


def _complete_rows(data, feature_cols, target_col):
    """Return (X, y) limited to rows where every feature AND the target are present.

    Dropping NaN targets matters: scikit-learn estimators raise on NaN in y.
    """
    complete = data[feature_cols + [target_col]].dropna()
    return complete[feature_cols], complete[target_col]


def _fit_weather_models(X, y):
    """Fit Linear, Ridge and Random Forest regressors and score them on a 30% hold-out.

    Returns:
        (X_train, X_test, y_train, y_test, scaler, models, results) where
        `models` maps model name -> fitted estimator and `results` maps
        model name -> {'mae', 'rmse', 'r2'} measured on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    # The linear models are trained on standardised features; the scaler is
    # fitted on the training split only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    models = {
        'Linear': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42)
    }

    results = {}
    for name, model in models.items():
        if name == 'Random Forest':
            # Tree ensembles are scale-invariant, so they get the raw features.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
        else:
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)

        results[name] = {
            'mae': mean_absolute_error(y_test, y_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
            'r2': r2_score(y_test, y_pred)
        }

    return X_train, X_test, y_train, y_test, scaler, models, results


def _report_weather_models(feature_cols, X_train, X_test, results):
    """Print the performance table and return (best_model_name, summary dict).

    The best model is the one with the highest test-set R². The summary dict
    is the entry stored in `weather_prediction_models`.
    """
    print(f"Training samples: {len(X_train)}")
    print(f"Test samples: {len(X_test)}")
    print(f"Features used: {', '.join(feature_cols)}")
    print(f"\nModel Performance:")
    print(f"{'Model':15s} {'MAE (W)':10s} {'RMSE (W)':10s} {'R²':8s}")
    print("-" * 45)

    for name, scores in results.items():
        print(f"{name:15s} {scores['mae']:9.0f} {scores['rmse']:9.0f} {scores['r2']:7.3f}")

    best_name = max(results.keys(), key=lambda x: results[x]['r2'])
    print(f"\nBest model: {best_name}")

    summary = {
        'features': feature_cols,
        'results': results,
        'best_model': best_name,
        'training_samples': len(X_train)
    }
    return best_name, summary


weather_prediction_models = {}

# Model 1: PV Production Prediction
if pv_weather_analysis and 'daylight_data' in locals():
    if len(daylight_data) > 50 and pv_power_col in daylight_data.columns:
        print("\nDeveloping PV Production Prediction Model:")
        print("=" * 50)

        # Prepare features
        pv_features = ['shortwave_radiation', 'direct_radiation', 'diffuse_radiation',
                      'temperature_2m', 'cloudcover', 'humidity']
        available_pv_features = [f for f in pv_features if f in daylight_data.columns]

        if len(available_pv_features) >= 2:
            # Filter out zero/very low radiation periods. The filter is only
            # applied when the radiation column exists; indexing the frame
            # with a scalar fallback comparison would raise a KeyError.
            if 'shortwave_radiation' in daylight_data.columns:
                pv_model_data = daylight_data[daylight_data['shortwave_radiation'] > 50].copy()
            else:
                pv_model_data = daylight_data.copy()

            X_pv, y_pv = _complete_rows(pv_model_data, available_pv_features, pv_power_col)

            # Size check is done after dropping incomplete rows, matching the
            # heating and consumption models below.
            if len(X_pv) > 30:
                (X_pv_train, X_pv_test, y_pv_train, y_pv_test,
                 scaler_pv, pv_models, pv_model_results) = _fit_weather_models(X_pv, y_pv)

                best_pv_model, weather_prediction_models['pv_production'] = _report_weather_models(
                    available_pv_features, X_pv_train, X_pv_test, pv_model_results)

# Model 2: Heating Demand Prediction
if heating_weather_analysis and 'heating_weather' in locals():
    if len(heating_weather) > 50:
        print("\n\nDeveloping Heating Demand Prediction Model:")
        print("=" * 50)

        # Prepare features
        heating_features = ['temperature_2m', 'windspeed_10m', 'humidity',
                           'heating_degree_hours', 'wind_chill_factor']
        available_heating_features = [f for f in heating_features if f in heating_weather.columns]

        if len(available_heating_features) >= 2:
            # No low-heating filter is applied; work on a copy of the merged data.
            heating_model_data = heating_weather.copy()

            X_heat, y_heat = _complete_rows(
                heating_model_data, available_heating_features, 'heating_power')

            if len(X_heat) > 30:
                (X_heat_train, X_heat_test, y_heat_train, y_heat_test,
                 scaler_heat, heat_models, heat_model_results) = _fit_weather_models(X_heat, y_heat)

                best_heat_model, weather_prediction_models['heating_demand'] = _report_weather_models(
                    available_heating_features, X_heat_train, X_heat_test, heat_model_results)

# Model 3: Total Consumption Prediction
if consumption_weather_analysis and 'consumption_weather' in locals():
    if len(consumption_weather) > 50 and total_power_col:
        print("\n\nDeveloping Total Consumption Prediction Model:")
        print("=" * 50)

        # Prepare features
        consumption_features = ['temperature_2m', 'shortwave_radiation', 'cloudcover',
                               'humidity', 'heating_degree_hours', 'cooling_degree_hours']
        available_consumption_features = [f for f in consumption_features if f in consumption_weather.columns]

        if len(available_consumption_features) >= 2:
            X_cons, y_cons = _complete_rows(
                consumption_weather, available_consumption_features, total_power_col)

            if len(X_cons) > 30:
                (X_cons_train, X_cons_test, y_cons_train, y_cons_test,
                 scaler_cons, cons_models, cons_model_results) = _fit_weather_models(X_cons, y_cons)

                best_cons_model, weather_prediction_models['total_consumption'] = _report_weather_models(
                    available_consumption_features, X_cons_train, X_cons_test, cons_model_results)

if not weather_prediction_models:
    print("\nInsufficient data for weather-based prediction models")
else:
    print(f"\n\nWeather-based models developed: {list(weather_prediction_models.keys())}")

## 7. Extreme Weather Analysis

Analyze system performance during extreme weather conditions

In [None]:
# Extreme weather event analysis
#
# Flags hours beyond percentile thresholds of temperature, wind speed and
# solar radiation, then compares PV and heating power on the calendar days
# containing those hours against the overall average.


def _describe_extreme_event(event_rows, threshold):
    """Summarise the hours selected by one extreme-weather threshold.

    'dates' is a plain list with one calendar date per selected hour. A list
    (rather than the numpy array returned by `DatetimeIndex.date`) keeps
    truthiness checks and `set(...)` well-defined for any number of hours.
    """
    return {
        'threshold': threshold,
        'hours': len(event_rows),
        'dates': list(event_rows.index.date)
    }


def _performance_during_extremes(system_data, power_col, baseline, events, key_prefix):
    """Print and return average power on days that contain an extreme event.

    Args:
        system_data: hourly frame with a DatetimeIndex and a `power_col` column.
        power_col: name of the power column to average.
        baseline: average power over all of `system_data`; the ratio is
            reported as 0 when the baseline is not positive.
        events: mapping event name -> dict produced by `_describe_extreme_event`.
        key_prefix: prefix for the result keys (e.g. 'pv' -> 'pv_hot_extreme').

    Returns:
        dict mapping '<key_prefix>_<event>' to average power, ratio to the
        baseline and number of hours analysed.
    """
    # numpy arrays have no .isin(); wrap the per-row dates in an Index once.
    row_dates = pd.Index(system_data.index.date)
    impacts = {}

    for event_type, event in events.items():
        if not event['dates']:
            continue

        # Select every hour of the days on which the event occurred
        during_event = system_data[row_dates.isin(set(event['dates']))]
        if len(during_event) == 0:
            continue

        event_avg = during_event[power_col].mean()
        performance_ratio = event_avg / baseline if baseline > 0 else 0

        print(f"{event_type:15s}: {event_avg:6.0f}W avg ({performance_ratio:5.1%} of normal)")

        impacts[f'{key_prefix}_{event_type}'] = {
            'average_power': event_avg,
            'performance_ratio': performance_ratio,
            'hours_analyzed': len(during_event)
        }

    return impacts


if not weather_hourly.empty:
    print("\nExtreme Weather Event Analysis:")
    print("=" * 50)

    extreme_weather_events = {}

    # Temperature extremes: top and bottom 1% of hours
    if 'temperature_2m' in weather_hourly.columns:
        temp_data = weather_hourly['temperature_2m'].dropna()
        if len(temp_data) > 100:
            temp_p99 = temp_data.quantile(0.99)
            temp_p1 = temp_data.quantile(0.01)

            hot_extreme = weather_hourly[weather_hourly['temperature_2m'] >= temp_p99]
            cold_extreme = weather_hourly[weather_hourly['temperature_2m'] <= temp_p1]

            extreme_weather_events['hot_extreme'] = _describe_extreme_event(hot_extreme, temp_p99)
            extreme_weather_events['cold_extreme'] = _describe_extreme_event(cold_extreme, temp_p1)

    # Wind extremes: top 5% of hours
    if 'windspeed_10m' in weather_hourly.columns:
        wind_data = weather_hourly['windspeed_10m'].dropna()
        if len(wind_data) > 100:
            wind_p95 = wind_data.quantile(0.95)
            high_wind = weather_hourly[weather_hourly['windspeed_10m'] >= wind_p95]

            extreme_weather_events['high_wind'] = _describe_extreme_event(high_wind, wind_p95)

    # Solar radiation extremes: top and bottom 5% of hours
    if 'shortwave_radiation' in weather_hourly.columns:
        solar_data = weather_hourly['shortwave_radiation'].dropna()
        if len(solar_data) > 100:
            solar_p95 = solar_data.quantile(0.95)
            solar_p5 = solar_data.quantile(0.05)

            high_solar = weather_hourly[weather_hourly['shortwave_radiation'] >= solar_p95]
            low_solar = weather_hourly[weather_hourly['shortwave_radiation'] <= solar_p5]

            extreme_weather_events['high_solar'] = _describe_extreme_event(high_solar, solar_p95)
            extreme_weather_events['low_solar'] = _describe_extreme_event(low_solar, solar_p5)

    # Display extreme weather summary
    print(f"\nExtreme Weather Event Summary:")
    print("-" * 40)

    for event_type, data in extreme_weather_events.items():
        threshold = data['threshold']
        hours = data['hours']
        unique_dates = len(set(data['dates']))

        print(f"{event_type:15s}: {threshold:6.1f} threshold, {hours:3d} hours, {unique_dates:2d} days")

    # Analyze energy system performance during extremes
    extreme_performance = {}

    # PV performance during extreme weather
    if pv_weather_analysis and 'daylight_data' in locals():
        print(f"\nPV Performance During Extreme Weather:")
        print("-" * 45)

        normal_pv = daylight_data[pv_power_col].mean() if pv_power_col in daylight_data.columns else 0

        # Without the PV power column there is nothing to compare
        if pv_power_col in daylight_data.columns:
            extreme_performance.update(_performance_during_extremes(
                daylight_data, pv_power_col, normal_pv, extreme_weather_events, 'pv'))

    # Heating performance during extreme weather
    if heating_weather_analysis and 'heating_weather' in locals():
        print(f"\nHeating Performance During Extreme Weather:")
        print("-" * 50)

        normal_heating = heating_weather['heating_power'].mean()

        extreme_performance.update(_performance_during_extremes(
            heating_weather, 'heating_power', normal_heating, extreme_weather_events, 'heating'))

    # Identify most impactful weather events
    if extreme_performance:
        print(f"\nMost Impactful Weather Events:")
        print("-" * 35)

        # Rank by absolute deviation of the performance ratio from 1.0 (normal)
        impact_ranking = [
            (event, abs(perf['performance_ratio'] - 1.0), perf['performance_ratio'])
            for event, perf in extreme_performance.items()
        ]
        impact_ranking.sort(key=lambda x: x[1], reverse=True)

        for event, deviation, ratio in impact_ranking[:5]:  # Top 5 most impactful
            impact_desc = "increase" if ratio > 1 else "decrease"
            print(f"{event:20s}: {abs(ratio-1)*100:4.1f}% {impact_desc}")

    # Store extreme weather analysis
    extreme_weather_analysis = {
        'events': extreme_weather_events,
        'performance_impact': extreme_performance,
        'analysis_period_hours': len(weather_hourly)
    }
else:
    print("No weather data available for extreme weather analysis")
    extreme_weather_analysis = {}

## 8. Summary and Weather Optimization Recommendations

Generate actionable insights and weather-based optimization recommendations

In [None]:
# Turn the stored analysis dictionaries into human-readable insights and
# recommendations, print them, and finish with summary statistics. Every
# section is optional and only runs when the matching analysis produced data.


def _strongest_correlation(correlations):
    """Return the (parameter, coefficient) pair with the largest |coefficient|.

    NaN coefficients (e.g. from a constant weather column) are ignored because
    NaN comparisons are always False and would make the result order-dependent.
    Returns None when no usable coefficient remains, including for an empty dict.
    """
    usable = [(param, corr) for param, corr in correlations.items() if pd.notna(corr)]
    if not usable:
        return None
    return max(usable, key=lambda x: abs(x[1]))


print("\nWeather Correlation Analysis - Key Insights and Recommendations:")
print("=" * 80)

# Generate insights based on weather correlation analysis
insights = []
recommendations = []

# PV-weather insights
if pv_weather_analysis and 'correlations' in pv_weather_analysis:
    strongest_pv_corr = _strongest_correlation(pv_weather_analysis['correlations'])
    if strongest_pv_corr is not None:
        insights.append(f"Strongest PV correlation: {strongest_pv_corr[0]} ({strongest_pv_corr[1]:+.3f})")

    if 'temperature_analysis' in pv_weather_analysis:
        temp_analysis = pv_weather_analysis['temperature_analysis']
        if temp_analysis.get('optimal_temp'):
            insights.append(f"Optimal PV temperature: {temp_analysis['optimal_temp']:.1f}°C")
            recommendations.append("Monitor PV efficiency during hot weather (>30°C)")

    if strongest_pv_corr is not None and abs(strongest_pv_corr[1]) > 0.7:
        recommendations.append(f"Use {strongest_pv_corr[0]} for accurate PV forecasting")

# Heating-weather insights
if heating_weather_analysis and 'correlations' in heating_weather_analysis:
    strongest_heat_corr = _strongest_correlation(heating_weather_analysis['correlations'])
    if strongest_heat_corr is not None:
        insights.append(f"Strongest heating correlation: {strongest_heat_corr[0]} ({strongest_heat_corr[1]:+.3f})")

    if 'temperature_analysis' in heating_weather_analysis:
        temp_analysis = heating_weather_analysis['temperature_analysis']
        if temp_analysis.get('heating_threshold'):
            threshold = temp_analysis['heating_threshold']
            insights.append(f"Heating threshold temperature: {threshold:.1f}°C")
            recommendations.append(f"Pre-heat buildings when temperature drops below {threshold:.0f}°C")

    if 'hdd_analysis' in heating_weather_analysis:
        hdd_coeff = heating_weather_analysis['hdd_analysis']['coefficient']
        insights.append(f"Heating demand: {hdd_coeff:.0f}W per degree-hour below 18°C")
        recommendations.append("Use heating degree hours for demand forecasting")

# Total consumption insights
if consumption_weather_analysis and 'correlations' in consumption_weather_analysis:
    strongest_cons_corr = _strongest_correlation(consumption_weather_analysis['correlations'])
    if strongest_cons_corr is not None:
        insights.append(f"Strongest consumption correlation: {strongest_cons_corr[0]} ({strongest_cons_corr[1]:+.3f})")

    if 'day_night_analysis' in consumption_weather_analysis:
        day_night = consumption_weather_analysis['day_night_analysis']
        ratio = day_night['day_night_ratio']
        insights.append(f"Day/night consumption ratio: {ratio:.2f}")

        if ratio > 1.5:
            recommendations.append("Significant day/night difference - optimize time-of-use schedules")

    if 'extreme_weather' in consumption_weather_analysis:
        extreme = consumption_weather_analysis['extreme_weather']
        # Percentage change relative to consumption in the non-extreme hours
        hot_impact = (extreme['hot_consumption'] - extreme['normal_consumption']) / extreme['normal_consumption'] * 100
        cold_impact = (extreme['cold_consumption'] - extreme['normal_consumption']) / extreme['normal_consumption'] * 100

        if abs(hot_impact) > 10:
            insights.append(f"Hot weather impact: {hot_impact:+.1f}% consumption change")
        if abs(cold_impact) > 10:
            insights.append(f"Cold weather impact: {cold_impact:+.1f}% consumption change")

# Weather prediction model insights
if weather_prediction_models:
    for model_type, model_data in weather_prediction_models.items():
        best_model = model_data['best_model']
        best_r2 = model_data['results'][best_model]['r2']

        insights.append(f"{model_type.replace('_', ' ').title()} prediction: {best_model} model (R²={best_r2:.3f})")

        if best_r2 > 0.7:
            recommendations.append(f"Implement weather-based {model_type.replace('_', ' ')} forecasting")
        elif best_r2 < 0.5:
            recommendations.append(f"Improve {model_type.replace('_', ' ')} model with additional features")

# Extreme weather insights
if extreme_weather_analysis and 'performance_impact' in extreme_weather_analysis:
    # Keep events whose performance ratio deviates more than 20% from normal
    high_impact_events = [
        (event, perf['performance_ratio'])
        for event, perf in extreme_weather_analysis['performance_impact'].items()
        if abs(perf['performance_ratio'] - 1.0) > 0.2
    ]

    if high_impact_events:
        insights.append(f"High-impact weather events: {len(high_impact_events)} identified")
        recommendations.append("Develop weather-adaptive control strategies for extreme conditions")

        # Find most problematic event
        worst_event = max(high_impact_events, key=lambda x: abs(x[1] - 1.0))
        event_name = worst_event[0].replace('_', ' ').title()
        impact_pct = (worst_event[1] - 1.0) * 100
        recommendations.append(f"Priority: Address {event_name} impact ({impact_pct:+.0f}% performance change)")

# Seasonal insights
if heating_weather_analysis and 'monthly_pattern' in heating_weather_analysis:
    monthly_data = pd.DataFrame(heating_weather_analysis['monthly_pattern'])
    if not monthly_data.empty and 'mean' in monthly_data.columns:
        winter_months = [12, 1, 2]
        summer_months = [6, 7, 8]

        winter_avg = monthly_data.loc[monthly_data.index.isin(winter_months), 'mean'].mean()
        summer_avg = monthly_data.loc[monthly_data.index.isin(summer_months), 'mean'].mean()

        # Either average is NaN when the analysis period has no such months
        if not np.isnan(winter_avg) and not np.isnan(summer_avg):
            seasonal_ratio = winter_avg / summer_avg if summer_avg > 0 else float('inf')
            insights.append(f"Seasonal heating variation: {seasonal_ratio:.1f}x winter vs summer")

            if seasonal_ratio > 5:
                recommendations.append("High seasonal variation - implement season-specific control strategies")

# Display insights and recommendations
print("\nKey Insights:")
for i, insight in enumerate(insights, 1):
    print(f"{i}. {insight}")

print("\nWeather-Based Optimization Recommendations:")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

# Summary statistics
print("\nSummary Statistics:")
print("-" * 40)
if not weather_hourly.empty:
    print(f"Weather data analyzed: {len(weather_hourly)} hours ({len(weather_hourly)/24:.1f} days)")
    print(f"Weather parameters: {len(weather_hourly.columns)}")

if pv_weather_analysis:
    print(f"PV-weather correlations: {len(pv_weather_analysis.get('correlations', {}))}")
    print(f"PV daylight hours analyzed: {pv_weather_analysis.get('daylight_hours_analyzed', 0)}")

if heating_weather_analysis:
    print(f"Heating-weather correlations: {len(heating_weather_analysis.get('correlations', {}))}")
    print(f"Heating rooms analyzed: {len(heating_weather_analysis.get('rooms_analyzed', []))}")

if weather_prediction_models:
    print(f"Weather-based models developed: {len(weather_prediction_models)}")

    for model_type, model_data in weather_prediction_models.items():
        best_r2 = max(r['r2'] for r in model_data['results'].values())
        print(f"  {model_type.replace('_', ' ').title()}: R² = {best_r2:.3f}")

if extreme_weather_analysis:
    event_count = len(extreme_weather_analysis.get('events', {}))
    impact_count = len(extreme_weather_analysis.get('performance_impact', {}))
    print(f"Extreme weather events analyzed: {event_count}")
    print(f"Performance impacts identified: {impact_count}")

print(f"\nWeather correlation analysis completed.")

In [None]:
# Save weather correlation analysis results
import pickle
from pathlib import Path

# Bundle every analysis artifact into one dictionary and persist it.
# The weather summary falls back to zero/empty placeholders when the hourly
# weather frame is empty.
if weather_hourly.empty:
    weather_data_summary = {
        'hours_analyzed': 0,
        'parameters_available': [],
        'data_quality': 'No data',
    }
else:
    weather_data_summary = {
        'hours_analyzed': len(weather_hourly),
        'parameters_available': list(weather_hourly.columns),
        'data_quality': 'Good',
    }

weather_correlation_results = {
    'analysis_period': {'start': start_date, 'end': end_date},
    'weather_data_summary': weather_data_summary,
    'pv_weather_analysis': pv_weather_analysis,
    'heating_weather_analysis': heating_weather_analysis,
    'consumption_weather_analysis': consumption_weather_analysis,
    'weather_prediction_models': weather_prediction_models,
    'extreme_weather_analysis': extreme_weather_analysis,
    'insights': insights,
    'recommendations': recommendations,
}

# Output directory (relative to the current working directory); created on
# demand, including any missing parents.
results_dir = Path('../../../data/processed')
results_dir.mkdir(parents=True, exist_ok=True)

# Pickle keeps the full nested structure for programmatic reuse.
pickle_path = results_dir / 'weather_correlation_results.pkl'
with pickle_path.open('wb') as pickle_file:
    pickle.dump(weather_correlation_results, pickle_file)

# Flatten the per-system correlation mappings into one long-format table
# (one row per system / weather parameter pair) and export it as CSV.
correlations_data = []

for system_label, system_analysis in (
    ('PV Production', pv_weather_analysis),
    ('Heating Demand', heating_weather_analysis),
    ('Total Consumption', consumption_weather_analysis),
):
    # Skip systems whose analysis is empty or carries no correlations entry.
    if not system_analysis or 'correlations' not in system_analysis:
        continue
    correlations_data.extend(
        {'system': system_label, 'weather_parameter': weather_param, 'correlation': coefficient}
        for weather_param, coefficient in system_analysis['correlations'].items()
    )

# The CSV is only written when at least one correlation row was collected.
if correlations_data:
    correlations_df = pd.DataFrame(correlations_data)
    correlations_df.to_csv(results_dir / 'weather_correlations.csv', index=False)

# Export one CSV row per (prediction target, model) pair with its error
# metrics, the feature list joined into a single string, and the sample count.
if weather_prediction_models:
    model_results_data = [
        {
            'prediction_target': target_name,
            'model': estimator_name,
            'mae': metrics['mae'],
            'rmse': metrics['rmse'],
            'r2': metrics['r2'],
            'features_used': ', '.join(target_entry['features']),
            'training_samples': target_entry['training_samples'],
        }
        for target_name, target_entry in weather_prediction_models.items()
        for estimator_name, metrics in target_entry['results'].items()
    ]

    # Nothing is written when no target has any evaluated model.
    if model_results_data:
        model_results_df = pd.DataFrame(model_results_data)
        model_results_df.to_csv(results_dir / 'weather_prediction_models.csv', index=False)

# Save summary as text
# Write a human-readable report: analysis period, numbered insights and
# recommendations, the best model per prediction target, and the extreme
# weather events section when present.
# The encoding is pinned to UTF-8 because the report contains non-ASCII text
# ("R²"); with the platform default encoding the write raises
# UnicodeEncodeError on locales whose code page lacks that character.
with open(results_dir / 'weather_correlation_summary.txt', 'w', encoding='utf-8') as f:
    f.write("Weather Correlation Analysis Summary\n")
    f.write("=" * 40 + "\n\n")
    f.write(f"Analysis Period: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n")
    f.write(f"Weather Data Hours: {len(weather_hourly) if not weather_hourly.empty else 0}\n\n")

    f.write("Key Insights:\n")
    for i, insight in enumerate(insights, 1):
        f.write(f"{i}. {insight}\n")

    f.write("\nRecommendations:\n")
    for i, rec in enumerate(recommendations, 1):
        f.write(f"{i}. {rec}\n")

    if weather_prediction_models:
        f.write("\nWeather Prediction Model Performance:\n")
        for model_type, model_data in weather_prediction_models.items():
            # Each entry names its own best model; report that model's metrics.
            best_model = model_data['best_model']
            best_results = model_data['results'][best_model]
            f.write(f"  {model_type.replace('_', ' ').title()}: {best_model}\n")
            f.write(f"    R² = {best_results['r2']:.3f}, MAE = {best_results['mae']:.0f}\n")

    if extreme_weather_analysis and 'events' in extreme_weather_analysis:
        f.write("\nExtreme Weather Events:\n")
        for event_type, data in extreme_weather_analysis['events'].items():
            f.write(f"  {event_type.replace('_', ' ').title()}: {data['hours']} hours\n")

# List the files that were actually written by this cell.
print("\nWeather correlation analysis results saved to:")
print(f"  - {results_dir / 'weather_correlation_results.pkl'}")
if correlations_data:
    print(f"  - {results_dir / 'weather_correlations.csv'}")
# The model CSV is only written when at least one result row exists, so the
# report uses the same condition. `model_results_data` is only defined when
# `weather_prediction_models` is truthy; the short-circuit keeps this safe.
if weather_prediction_models and model_results_data:
    print(f"  - {results_dir / 'weather_prediction_models.csv'}")
print(f"  - {results_dir / 'weather_correlation_summary.txt'}")