# Weather Correlation Analysis

This notebook analyzes correlations between weather conditions and energy consumption patterns, focusing on heating demand and PV production.

## Key Analysis Areas:
1. **Temperature Impact**: Heating demand vs outdoor temperature
2. **Solar Radiation**: PV production correlation with solar irradiance
3. **Seasonal Patterns**: Weather-driven energy consumption changes
4. **Predictive Features**: Weather variables for energy forecasting
5. **Thermal Comfort**: Indoor vs outdoor temperature relationships


In [None]:
import sys
import asyncio
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Add parent directory to path
sys.path.append(str(Path().absolute().parent.parent))

from analysis.data_extraction import DataExtractor
from analysis.thermal_analysis import ThermalAnalyzer
from analysis.pattern_analysis import PVAnalyzer
from config.settings import PEMSSettings

# Set up plotting: select matplotlib's 'seaborn-v0_8' style sheet and make
# seaborn's "husl" palette the default colour cycle for the figures below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

## 1. Data Loading and Preprocessing

In [None]:
# Build the project settings and the helper objects used by the cells below.
settings = PEMSSettings()
extractor = DataExtractor(settings)
thermal_analyzer = ThermalAnalyzer()
pv_analyzer = PVAnalyzer()

# Analysis window: the 365 days leading up to the moment this cell runs.
end_date = datetime.now()
start_date = end_date - timedelta(days=365)  # 1 year

print(f"Analysis period: {start_date:%Y-%m-%d} to {end_date:%Y-%m-%d}")

In [None]:
# Extract all relevant data
try:
    weather_data = await extractor.extract_weather_data(start_date, end_date)
    pv_data = await extractor.extract_pv_data(start_date, end_date)
    room_data = await extractor.extract_room_data(start_date, end_date)
    consumption_data = await extractor.extract_consumption_data(start_date, end_date)
    relay_states = await extractor.extract_relay_states(start_date, end_date)
    
    print(f"Weather data shape: {weather_data.shape}")
    print(f"PV data shape: {pv_data.shape}")
    print(f"Room data available for {len(room_data)} rooms")
    print(f"Consumption data shape: {consumption_data.shape}")
    print(f"Relay states available for {len(relay_states)} rooms")
    
    # Display weather data overview
    print("\nWeather data columns:")
    print(weather_data.columns.tolist())
    
    print("\nWeather data sample:")
    display(weather_data.head())
    
except Exception as e:
    print(f"Error loading data: {e}")
    # Load from saved files if extraction fails
    try:
        weather_data = extractor.load_from_parquet("weather_data")
        pv_data = extractor.load_from_parquet("pv_data")
        room_data = extractor.load_from_parquet("room_data")
        print("Loaded data from saved parquet files")
    except:
        print("Failed to load saved data. Please run data extraction first.")

## 2. Temperature vs Heating Demand Analysis

In [None]:
# Temperature vs heating demand.
#
# Combines the per-room relay states into one total heating-demand series,
# joins it with the weather data on a shared 15-minute grid and draws four
# views: raw scatter with linear trend, binned means, the heating base
# temperature and hourly profiles per temperature category.
if relay_states and not weather_data.empty:
    # Bring every room onto the same 15-minute grid *before* adding them up.
    # Adding the raw series in place only works when all rooms share exactly
    # the same timestamps; any mismatch silently turns the sum into NaN and
    # drops timestamps the first room does not have.
    # NOTE(review): assumes the first column of each frame is the relay state.
    room_demand = [
        data.iloc[:, 0].resample('15min').mean().rename(room)
        for room, data in relay_states.items()
        if not data.empty
    ]

    if room_demand:
        # min_count=1 leaves windows in which no room reported anything as NaN
        # instead of reporting a misleading demand of 0.
        heating_aligned = (
            pd.concat(room_demand, axis=1)
            .sum(axis=1, min_count=1)
            .to_frame('heating_demand')
        )
        weather_aligned = weather_data.resample('15min').mean()  # 15-minute averages

        # Merge datasets
        combined_data = pd.merge(weather_aligned, heating_aligned,
                                 left_index=True, right_index=True, how='inner')

        if 'temperature' in combined_data.columns:
            # Create temperature bins for analysis
            temp_bins = np.arange(-20, 35, 2.5)  # 2.5°C bins
            combined_data['temp_bin'] = pd.cut(combined_data['temperature'], bins=temp_bins)

            # Heating demand statistics per temperature bin. observed=False
            # spells out the categorical grouping behaviour; empty bins are
            # removed again by the dropna() further down.
            heating_by_temp = combined_data.groupby('temp_bin', observed=False)['heating_demand'].agg([
                'mean', 'std', 'count'
            ]).reset_index()
            # Bin midpoints as plain floats so they can be plotted and compared.
            heating_by_temp['temp_mid'] = heating_by_temp['temp_bin'].apply(
                lambda x: x.mid if pd.notna(x) else np.nan
            ).astype(float)

            # Plot temperature vs heating demand
            fig, axes = plt.subplots(2, 2, figsize=(15, 12))

            # 1. Scatter plot with trend line
            valid_data = combined_data.dropna(subset=['temperature', 'heating_demand'])
            has_valid_data = len(valid_data) > 0
            if has_valid_data:
                axes[0, 0].scatter(valid_data['temperature'], valid_data['heating_demand'],
                                   alpha=0.3, s=1)

                # Linear trend; a straight line only needs its two end points.
                z = np.polyfit(valid_data['temperature'], valid_data['heating_demand'], 1)
                p = np.poly1d(z)
                temp_span = np.array([valid_data['temperature'].min(),
                                      valid_data['temperature'].max()])
                axes[0, 0].plot(temp_span, p(temp_span),
                                "r--", alpha=0.8, linewidth=2)

                axes[0, 0].set_xlabel('Outdoor Temperature (°C)')
                axes[0, 0].set_ylabel('Heating Demand (0-16 relays)')
                axes[0, 0].set_title('Temperature vs Heating Demand (Raw Data)')
                axes[0, 0].grid(True, alpha=0.3)

                # Calculate correlation
                correlation = valid_data['temperature'].corr(valid_data['heating_demand'])
                axes[0, 0].text(0.05, 0.95, f'Correlation: {correlation:.3f}',
                                transform=axes[0, 0].transAxes,
                                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

            # 2. Binned analysis
            valid_bins = heating_by_temp.dropna(subset=['temp_mid', 'mean'])
            if len(valid_bins) > 0:
                axes[0, 1].errorbar(valid_bins['temp_mid'], valid_bins['mean'],
                                    yerr=valid_bins['std'], fmt='o-', capsize=5)
                axes[0, 1].set_xlabel('Temperature Bin Center (°C)')
                axes[0, 1].set_ylabel('Mean Heating Demand')
                axes[0, 1].set_title('Temperature Bins vs Average Heating Demand')
                axes[0, 1].grid(True, alpha=0.3)

            # 3. Heating base temperature analysis
            # Warmest bin whose mean demand still exceeds 0.1; NaN when no
            # bin qualifies.
            heating_threshold = valid_bins[valid_bins['mean'] > 0.1]['temp_mid'].max()
            if pd.notna(heating_threshold):
                axes[1, 0].axvline(heating_threshold, color='red', linestyle='--',
                                   label=f'Heating Base: {heating_threshold:.1f}°C')
                axes[1, 0].plot(valid_bins['temp_mid'], valid_bins['mean'], 'o-')
                axes[1, 0].set_xlabel('Temperature (°C)')
                axes[1, 0].set_ylabel('Heating Demand')
                axes[1, 0].set_title('Heating Base Temperature Analysis')
                axes[1, 0].legend()
                axes[1, 0].grid(True, alpha=0.3)

            # 4. Daily patterns by temperature
            combined_data['hour'] = combined_data.index.hour
            combined_data['temp_category'] = pd.cut(combined_data['temperature'],
                                                    bins=[-30, 0, 10, 20, 40],
                                                    labels=['Very Cold', 'Cold', 'Mild', 'Warm'])

            hourly_patterns = (
                combined_data.groupby(['hour', 'temp_category'], observed=False)['heating_demand']
                .mean()
                .unstack()
            )

            for category in hourly_patterns.columns:
                if hourly_patterns[category].notna().sum() > 0:
                    axes[1, 1].plot(hourly_patterns.index, hourly_patterns[category],
                                    label=category, marker='o', alpha=0.7)

            axes[1, 1].set_xlabel('Hour of Day')
            axes[1, 1].set_ylabel('Average Heating Demand')
            axes[1, 1].set_title('Daily Heating Patterns by Temperature')
            axes[1, 1].legend()
            axes[1, 1].grid(True, alpha=0.3)

            plt.tight_layout()
            plt.show()

            # Only report values that were actually computed above; printing a
            # correlation that was never calculated would raise NameError.
            if pd.notna(heating_threshold):
                print(f"Heating base temperature: {heating_threshold:.1f}°C")
            else:
                print("Heating base temperature: not detected")
            if has_valid_data:
                print(f"Temperature-heating correlation: {correlation:.3f}")
            else:
                print("Temperature-heating correlation: not available (no overlapping data)")
        else:
            print("Temperature column not found in weather data")
    else:
        print("No heating demand data available")
else:
    print("No relay states or weather data available for heating analysis")

## 3. Solar Radiation vs PV Production

In [None]:
# Solar radiation vs PV production.
#
# Joins weather and PV data on a shared 15-minute grid, keeps the daylight
# hours and draws four views: radiation vs power, power per cloud-cover class,
# power per temperature bin under sunny conditions and a weather/PV
# correlation matrix.
if not weather_data.empty and not pv_data.empty:
    # Align weather and PV data
    weather_pv = weather_data.resample('15min').mean()
    pv_aligned = pv_data.resample('15min').mean()

    # Merge datasets
    solar_analysis = pd.merge(weather_pv, pv_aligned,
                              left_index=True, right_index=True, how='inner')

    # Keep hours 6 through 20. Copy the selection because a helper column is
    # added below and must not be written into a view of solar_analysis.
    sample_hours = solar_analysis.index.hour
    daylight_data = solar_analysis[(sample_hours >= 6) & (sample_hours <= 20)].copy()
    has_pv_power = 'InputPower' in daylight_data.columns

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Solar radiation vs PV power
    if 'solar_radiation' in daylight_data.columns and has_pv_power:
        valid_solar = daylight_data.dropna(subset=['solar_radiation', 'InputPower'])

        if len(valid_solar) > 0:
            axes[0, 0].scatter(valid_solar['solar_radiation'], valid_solar['InputPower'],
                               alpha=0.5, s=1)

            # Linear trend; a straight line only needs its two end points.
            z = np.polyfit(valid_solar['solar_radiation'], valid_solar['InputPower'], 1)
            p = np.poly1d(z)
            radiation_span = np.array([valid_solar['solar_radiation'].min(),
                                       valid_solar['solar_radiation'].max()])
            axes[0, 0].plot(radiation_span, p(radiation_span),
                            "r--", alpha=0.8, linewidth=2)

            axes[0, 0].set_xlabel('Solar Radiation (W/m²)')
            axes[0, 0].set_ylabel('PV Power (W)')
            axes[0, 0].set_title('Solar Radiation vs PV Production')
            axes[0, 0].grid(True, alpha=0.3)

            # Calculate correlation
            solar_corr = valid_solar['solar_radiation'].corr(valid_solar['InputPower'])
            axes[0, 0].text(0.05, 0.95, f'Correlation: {solar_corr:.3f}',
                            transform=axes[0, 0].transAxes,
                            bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    # 2. Cloud cover impact
    if 'cloud_cover' in daylight_data.columns and has_pv_power:
        # include_lowest=True puts a cloud cover of exactly 0 into 'Clear';
        # the default right-closed bins would leave it uncategorised.
        # NOTE(review): bins assume cloud_cover is a percentage (0-100).
        daylight_data['cloud_category'] = pd.cut(
            daylight_data['cloud_cover'], bins=[0, 25, 50, 75, 100],
            labels=['Clear', 'Partly Cloudy', 'Mostly Cloudy', 'Overcast'],
            include_lowest=True)

        cloud_power = daylight_data.groupby('cloud_category', observed=False)['InputPower'].agg(
            ['mean', 'std', 'count'])

        valid_cloud = cloud_power.dropna()
        if len(valid_cloud) > 0:
            axes[0, 1].bar(range(len(valid_cloud)), valid_cloud['mean'],
                           yerr=valid_cloud['std'], capsize=5, alpha=0.7)
            axes[0, 1].set_xticks(range(len(valid_cloud)))
            axes[0, 1].set_xticklabels(valid_cloud.index, rotation=45)
            axes[0, 1].set_ylabel('Average PV Power (W)')
            axes[0, 1].set_title('PV Production by Cloud Cover')
            axes[0, 1].grid(True, alpha=0.3)

    # 3. Temperature impact on PV efficiency
    if 'temperature' in daylight_data.columns and has_pv_power:
        # Only consider periods with significant solar radiation. Without a
        # radiation column there is nothing to select, so use an empty frame
        # rather than indexing with a scalar (which raises KeyError).
        if 'solar_radiation' in daylight_data.columns:
            sunny_data = daylight_data[daylight_data['solar_radiation'] > 200]
        else:
            sunny_data = daylight_data.iloc[0:0]

        if len(sunny_data) > 0:
            temp_bins = pd.cut(sunny_data['temperature'], bins=np.arange(-10, 45, 5))
            temp_power = sunny_data.groupby(temp_bins, observed=False)['InputPower'].agg(
                ['mean', 'std', 'count'])
            # Bin midpoints as plain floats for plotting.
            temp_power['temp_mid'] = temp_power.index.map(
                lambda x: x.mid if pd.notna(x) else np.nan
            ).astype(float)

            valid_temp = temp_power.dropna(subset=['temp_mid', 'mean'])
            if len(valid_temp) > 0:
                axes[1, 0].errorbar(valid_temp['temp_mid'], valid_temp['mean'],
                                    yerr=valid_temp['std'], fmt='o-', capsize=5)
                axes[1, 0].set_xlabel('Temperature (°C)')
                axes[1, 0].set_ylabel('Average PV Power (W)')
                axes[1, 0].set_title('PV Efficiency vs Temperature (Sunny Conditions)')
                axes[1, 0].grid(True, alpha=0.3)

    # 4. Weather correlation matrix
    weather_cols = ['temperature', 'humidity', 'pressure', 'wind_speed', 'cloud_cover', 'solar_radiation']
    pv_cols = ['InputPower', 'OutputPower', 'EnergyToday'] if has_pv_power else []

    available_weather = [col for col in weather_cols if col in daylight_data.columns]
    available_pv = [col for col in pv_cols if col in daylight_data.columns]

    if available_weather and available_pv:
        corr_matrix = daylight_data[available_weather + available_pv].corr()

        # Extract correlations between weather and PV
        weather_pv_corr = corr_matrix.loc[available_weather, available_pv]

        im = axes[1, 1].imshow(weather_pv_corr.values, cmap='RdBu_r', vmin=-1, vmax=1)
        axes[1, 1].set_xticks(range(len(available_pv)))
        axes[1, 1].set_yticks(range(len(available_weather)))
        axes[1, 1].set_xticklabels(available_pv, rotation=45)
        axes[1, 1].set_yticklabels(available_weather)
        axes[1, 1].set_title('Weather-PV Correlation Matrix')

        # Annotate each cell; white text on the strongly coloured cells.
        for (i, j), value in np.ndenumerate(weather_pv_corr.values):
            axes[1, 1].text(j, i, f'{value:.2f}',
                            ha="center", va="center",
                            color="black" if abs(value) < 0.5 else "white")

        plt.colorbar(im, ax=axes[1, 1], shrink=0.8)

    plt.tight_layout()
    plt.show()

    # Print strongest correlations
    if available_weather and available_pv:
        print("\n=== STRONGEST WEATHER-PV CORRELATIONS ===")
        for pv_var in available_pv:
            correlations = [(weather_var, weather_pv_corr.loc[weather_var, pv_var])
                            for weather_var in available_weather]
            correlations.sort(key=lambda x: abs(x[1]), reverse=True)

            print(f"\n{pv_var}:")
            for weather_var, corr in correlations[:3]:  # Top 3
                print(f"  {weather_var}: {corr:.3f}")
else:
    print("Weather or PV data not available for solar analysis")

## 4. Seasonal Weather Patterns

In [None]:
# Seasonal weather patterns.
#
# Tags every weather sample with its month and meteorological season, then
# draws four views (seasonal temperature box plot, monthly temperature range,
# temperature vs humidity, seasonal wind speed) and prints per-season
# statistics for whichever of temperature / humidity / wind speed exist.
if not weather_data.empty:
    # Calendar order for plots and printouts (groupby alone sorts the season
    # names alphabetically).
    season_order = ['Winter', 'Spring', 'Summer', 'Autumn']

    # Add seasonal information
    weather_seasonal = weather_data.copy()
    weather_seasonal['month'] = weather_seasonal.index.month
    weather_seasonal['season'] = weather_seasonal['month'].map({
        12: 'Winter', 1: 'Winter', 2: 'Winter',
        3: 'Spring', 4: 'Spring', 5: 'Spring',
        6: 'Summer', 7: 'Summer', 8: 'Summer',
        9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
    })

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Seasonal temperature distribution
    if 'temperature' in weather_seasonal.columns:
        season_temps = [weather_seasonal[weather_seasonal['season'] == season]['temperature'].dropna()
                        for season in season_order]

        # Label the boxes through the axis instead of boxplot's `labels`
        # keyword, which newer matplotlib releases deprecate.
        axes[0, 0].boxplot(season_temps)
        axes[0, 0].set_xticklabels(season_order)
        axes[0, 0].set_ylabel('Temperature (°C)')
        axes[0, 0].set_title('Seasonal Temperature Distribution')
        axes[0, 0].grid(True, alpha=0.3)

    # 2. Monthly weather patterns
    if 'temperature' in weather_seasonal.columns:
        monthly_stats = weather_seasonal.groupby('month')['temperature'].agg(['mean', 'min', 'max'])

        axes[0, 1].fill_between(monthly_stats.index, monthly_stats['min'], monthly_stats['max'],
                                alpha=0.3, label='Min-Max Range')
        axes[0, 1].plot(monthly_stats.index, monthly_stats['mean'], 'o-', linewidth=2, label='Average')
        axes[0, 1].set_xlabel('Month')
        axes[0, 1].set_ylabel('Temperature (°C)')
        axes[0, 1].set_title('Monthly Temperature Patterns')
        axes[0, 1].set_xticks(range(1, 13))
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3)

    # 3. Humidity vs Temperature
    if 'humidity' in weather_seasonal.columns and 'temperature' in weather_seasonal.columns:
        for season in season_order:
            season_data = weather_seasonal[weather_seasonal['season'] == season]
            if len(season_data) > 0:
                axes[1, 0].scatter(season_data['temperature'], season_data['humidity'],
                                   alpha=0.5, s=1, label=season)

        axes[1, 0].set_xlabel('Temperature (°C)')
        axes[1, 0].set_ylabel('Humidity (%)')
        axes[1, 0].set_title('Temperature vs Humidity by Season')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)

    # 4. Wind and pressure patterns
    if 'wind_speed' in weather_seasonal.columns:
        # Calendar order; seasons without any wind data are left out.
        seasonal_wind = (
            weather_seasonal.groupby('season')['wind_speed'].mean()
            .reindex(season_order)
            .dropna()
        )

        axes[1, 1].bar(seasonal_wind.index, seasonal_wind.values, alpha=0.7)
        axes[1, 1].set_ylabel('Average Wind Speed (m/s)')
        axes[1, 1].set_title('Seasonal Wind Patterns')
        axes[1, 1].tick_params(axis='x', rotation=45)
        axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print seasonal statistics
    print("=== SEASONAL WEATHER STATISTICS ===")
    # Aggregate only the columns that exist: asking for a missing column (or
    # passing None as its function list) makes the aggregation fail.
    wanted_stats = {
        'temperature': ['mean', 'min', 'max', 'std'],
        'humidity': ['mean', 'std'],
        'wind_speed': ['mean', 'max'],
    }
    agg_spec = {column: funcs for column, funcs in wanted_stats.items()
                if column in weather_seasonal.columns}

    if agg_spec:
        seasonal_stats = weather_seasonal.groupby('season').agg(agg_spec).round(1)

        for season in season_order:
            if season in seasonal_stats.index:
                print(f"\n{season}:")
                if ('temperature', 'mean') in seasonal_stats.columns:
                    temp_data = seasonal_stats.loc[season, 'temperature']
                    print(f"  Temperature: {temp_data['mean']:.1f}°C (range: {temp_data['min']:.1f} to {temp_data['max']:.1f}°C)")

                if ('humidity', 'mean') in seasonal_stats.columns:
                    hum_data = seasonal_stats.loc[season, 'humidity']
                    print(f"  Humidity: {hum_data['mean']:.1f}% ± {hum_data['std']:.1f}%")

                if ('wind_speed', 'mean') in seasonal_stats.columns:
                    wind_data = seasonal_stats.loc[season, 'wind_speed']
                    print(f"  Wind: {wind_data['mean']:.1f} m/s (max: {wind_data['max']:.1f} m/s)")
    else:
        print("No temperature, humidity or wind speed columns available")
else:
    print("Weather data not available for seasonal analysis")

## 5. Indoor vs Outdoor Temperature Analysis

In [None]:
# Indoor vs outdoor temperature.
#
# Averages the room temperatures into one indoor series, joins it with the
# outdoor temperature on an hourly grid and draws four views: scatter with
# 1:1 and trend lines, the last 30 days, the distribution of the
# indoor-outdoor difference and its seasonal mean.
if room_data and not weather_data.empty:
    # Combine indoor temperature data
    indoor_temps = []
    room_names = []

    for room_name, room_df in room_data.items():
        if not room_df.empty and 'temperature' in room_df.columns:
            indoor_temps.append(room_df['temperature'])
            room_names.append(room_name)

    if indoor_temps:
        # Calculate average indoor temperature
        indoor_avg = pd.concat(indoor_temps, axis=1).mean(axis=1, skipna=True)
        indoor_avg.name = 'indoor_temperature'

        # Align with outdoor temperature
        outdoor_temp = weather_data['temperature'] if 'temperature' in weather_data.columns else None

        if outdoor_temp is not None:
            # Resample to same frequency
            indoor_resampled = indoor_avg.resample('1h').mean()
            outdoor_resampled = outdoor_temp.resample('1h').mean()

            # Keep only the hours present in both series.
            temp_comparison = pd.concat(
                {'indoor': indoor_resampled, 'outdoor': outdoor_resampled},
                axis=1, join='inner')

            fig, axes = plt.subplots(2, 2, figsize=(15, 12))

            # 1. Indoor vs Outdoor scatter
            valid_temps = temp_comparison.dropna()
            if len(valid_temps) > 0:
                axes[0, 0].scatter(valid_temps['outdoor'], valid_temps['indoor'],
                                   alpha=0.5, s=1)

                # Add 1:1 line
                min_temp = min(valid_temps['outdoor'].min(), valid_temps['indoor'].min())
                max_temp = max(valid_temps['outdoor'].max(), valid_temps['indoor'].max())
                axes[0, 0].plot([min_temp, max_temp], [min_temp, max_temp], 'r--',
                                alpha=0.8, label='1:1 Line')

                # Linear trend; a straight line only needs its two end points.
                z = np.polyfit(valid_temps['outdoor'], valid_temps['indoor'], 1)
                p = np.poly1d(z)
                outdoor_span = np.array([valid_temps['outdoor'].min(),
                                         valid_temps['outdoor'].max()])
                axes[0, 0].plot(outdoor_span, p(outdoor_span),
                                "g--", alpha=0.8, linewidth=2, label='Trend Line')

                axes[0, 0].set_xlabel('Outdoor Temperature (°C)')
                axes[0, 0].set_ylabel('Indoor Temperature (°C)')
                axes[0, 0].set_title('Indoor vs Outdoor Temperature')
                axes[0, 0].legend()
                axes[0, 0].grid(True, alpha=0.3)

                # Calculate correlation
                temp_corr = valid_temps['outdoor'].corr(valid_temps['indoor'])
                axes[0, 0].text(0.05, 0.95, f'Correlation: {temp_corr:.3f}',
                                transform=axes[0, 0].transAxes,
                                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

                print(f"Indoor-Outdoor temperature correlation: {temp_corr:.3f}")
                print(f"Trend line: Indoor = {z[0]:.3f} × Outdoor + {z[1]:.1f}")

            # 2. Temperature difference over time
            temp_comparison['difference'] = temp_comparison['indoor'] - temp_comparison['outdoor']

            # Last 30 days relative to the newest sample. DataFrame.last() is
            # deprecated in recent pandas, so slice with an explicit cutoff.
            if temp_comparison.empty:
                recent_temps = temp_comparison
            else:
                cutoff = temp_comparison.index[-1] - pd.Timedelta(days=30)
                recent_temps = temp_comparison[temp_comparison.index > cutoff]

            if len(recent_temps) > 0:
                axes[0, 1].plot(recent_temps.index, recent_temps['indoor'],
                                label='Indoor', alpha=0.7, linewidth=1)
                axes[0, 1].plot(recent_temps.index, recent_temps['outdoor'],
                                label='Outdoor', alpha=0.7, linewidth=1)
                axes[0, 1].fill_between(recent_temps.index,
                                        recent_temps['indoor'], recent_temps['outdoor'],
                                        alpha=0.3, label='Temperature Difference')

                axes[0, 1].set_xlabel('Date')
                axes[0, 1].set_ylabel('Temperature (°C)')
                axes[0, 1].set_title('Indoor vs Outdoor Temperature (Last 30 Days)')
                axes[0, 1].legend()
                axes[0, 1].tick_params(axis='x', rotation=45)
                axes[0, 1].grid(True, alpha=0.3)

            # 3. Temperature difference distribution
            valid_difference = temp_comparison['difference'].dropna()
            if len(valid_difference) > 0:
                mean_difference = temp_comparison['difference'].mean()
                axes[1, 0].hist(valid_difference, bins=50,
                                alpha=0.7, edgecolor='black')
                axes[1, 0].axvline(mean_difference,
                                   color='red', linestyle='--',
                                   label=f'Mean: {mean_difference:.1f}°C')
                axes[1, 0].set_xlabel('Temperature Difference (Indoor - Outdoor) °C')
                axes[1, 0].set_ylabel('Frequency')
                axes[1, 0].set_title('Temperature Difference Distribution')
                axes[1, 0].legend()
                axes[1, 0].grid(True, alpha=0.3)

            # 4. Seasonal temperature differences
            season_order = ['Winter', 'Spring', 'Summer', 'Autumn']
            temp_comparison['month'] = temp_comparison.index.month
            temp_comparison['season'] = temp_comparison['month'].map({
                12: 'Winter', 1: 'Winter', 2: 'Winter',
                3: 'Spring', 4: 'Spring', 5: 'Spring',
                6: 'Summer', 7: 'Summer', 8: 'Summer',
                9: 'Autumn', 10: 'Autumn', 11: 'Autumn'
            })

            # Put the seasons in calendar order (groupby sorts the names
            # alphabetically); seasons without data are simply absent.
            seasonal_diff = temp_comparison.groupby('season')['difference'].agg(['mean', 'std'])
            seasonal_diff = seasonal_diff.reindex(
                [season for season in season_order if season in seasonal_diff.index])

            if len(seasonal_diff) > 0:
                axes[1, 1].bar(seasonal_diff.index, seasonal_diff['mean'],
                               yerr=seasonal_diff['std'], capsize=5, alpha=0.7)
                axes[1, 1].set_ylabel('Temperature Difference (°C)')
                axes[1, 1].set_title('Seasonal Indoor-Outdoor Temperature Difference')
                axes[1, 1].tick_params(axis='x', rotation=45)
                axes[1, 1].grid(True, alpha=0.3)
                axes[1, 1].axhline(y=0, color='black', linestyle='-', alpha=0.3)

            plt.tight_layout()
            plt.show()

            # Print statistics
            print("\n=== TEMPERATURE DIFFERENCE STATISTICS ===")
            print(f"Mean temperature difference: {temp_comparison['difference'].mean():.1f}°C")
            print(f"Standard deviation: {temp_comparison['difference'].std():.1f}°C")
            print(f"Maximum heating effect: {temp_comparison['difference'].max():.1f}°C")
            print(f"Maximum cooling effect: {temp_comparison['difference'].min():.1f}°C")

            for season in season_order:
                if season in seasonal_diff.index:
                    season_data = seasonal_diff.loc[season]
                    print(f"{season}: {season_data['mean']:.1f}°C ± {season_data['std']:.1f}°C")
        else:
            print("Outdoor temperature data not available")
    else:
        print("No indoor temperature data available")
else:
    print("Room data or weather data not available for thermal comfort analysis")

## 6. Principal Component Analysis of Weather Features

In [None]:
# Principal component analysis of the weather variables: standardise the
# available columns, fit a full PCA and visualise variance, loadings, the
# samples in PC1/PC2 space and the component weights.
if weather_data.empty:
    print("Weather data not available for PCA analysis")
else:
    # Candidate numeric weather columns; only those present are used.
    weather_features = ['temperature', 'humidity', 'pressure', 'wind_speed',
                        'cloud_cover', 'solar_radiation', 'precipitation']
    available_features = [name for name in weather_features
                          if name in weather_data.columns]

    if len(available_features) < 3:
        print(f"Not enough weather features for PCA (found {len(available_features)}, need ≥3)")
    else:
        # Rows with any missing feature cannot be used.
        weather_pca_data = weather_data[available_features].dropna()

        if len(weather_pca_data) <= 100:  # Need sufficient data
            print("Insufficient weather data for PCA analysis")
        else:
            # Zero mean / unit variance per feature, then the decomposition.
            scaler = StandardScaler()
            weather_scaled = scaler.fit_transform(weather_pca_data)

            pca = PCA()
            weather_pca = pca.fit_transform(weather_scaled)

            variance_ratio = pca.explained_variance_ratio_
            pc1_label = f'PC1 ({variance_ratio[0]:.1%} variance)'
            pc2_label = f'PC2 ({variance_ratio[1]:.1%} variance)'

            fig, axes = plt.subplots(2, 2, figsize=(15, 12))
            ax_variance, ax_loadings = axes[0, 0], axes[0, 1]
            ax_scatter, ax_weights = axes[1, 0], axes[1, 1]

            # 1. Individual and cumulative explained variance
            cumvar = np.cumsum(variance_ratio)
            ax_variance.bar(range(1, len(variance_ratio) + 1),
                            variance_ratio, alpha=0.7, label='Individual')
            ax_variance.plot(range(1, len(cumvar) + 1), cumvar, 'ro-', label='Cumulative')
            ax_variance.set_xlabel('Principal Component')
            ax_variance.set_ylabel('Explained Variance Ratio')
            ax_variance.set_title('PCA Explained Variance')
            ax_variance.legend()
            ax_variance.grid(True, alpha=0.3)

            # 2. Loadings of every feature on the first two components
            loadings = pca.components_[:2].T * np.sqrt(pca.explained_variance_[:2])

            for feature, (load_pc1, load_pc2) in zip(available_features, loadings):
                ax_loadings.arrow(0, 0, load_pc1, load_pc2,
                                  head_width=0.05, head_length=0.05,
                                  fc='blue', ec='blue', alpha=0.7)
                ax_loadings.text(load_pc1 * 1.1, load_pc2 * 1.1,
                                 feature, fontsize=10, ha='center', va='center')

            ax_loadings.set_xlabel(pc1_label)
            ax_loadings.set_ylabel(pc2_label)
            ax_loadings.set_title('PCA Feature Loadings')
            ax_loadings.grid(True, alpha=0.3)
            ax_loadings.set_aspect('equal')

            # 3. Samples in PC1/PC2 space, coloured by season
            weather_pca_df = pd.DataFrame(weather_pca[:, :2],
                                          columns=['PC1', 'PC2'],
                                          index=weather_pca_data.index)
            weather_pca_df['month'] = weather_pca_df.index.month
            season_months = {
                'Winter': (12, 1, 2),
                'Spring': (3, 4, 5),
                'Summer': (6, 7, 8),
                'Autumn': (9, 10, 11),
            }
            month_to_season = {month: season
                               for season, months in season_months.items()
                               for month in months}
            weather_pca_df['season'] = weather_pca_df['month'].map(month_to_season)

            colors = {'Winter': 'blue', 'Spring': 'green', 'Summer': 'red', 'Autumn': 'orange'}
            for season, season_colour in colors.items():
                season_data = weather_pca_df[weather_pca_df['season'] == season]
                if len(season_data) > 0:
                    ax_scatter.scatter(season_data['PC1'], season_data['PC2'],
                                       c=season_colour, alpha=0.6, s=1, label=season)

            ax_scatter.set_xlabel(pc1_label)
            ax_scatter.set_ylabel(pc2_label)
            ax_scatter.set_title('Weather Patterns in Principal Component Space')
            ax_scatter.legend()
            ax_scatter.grid(True, alpha=0.3)

            # 4. Heatmap of the weights of the first (up to four) components
            n_components = min(4, len(available_features))
            component_matrix = pca.components_[:n_components]

            im = ax_weights.imshow(component_matrix, cmap='RdBu_r', aspect='auto')
            ax_weights.set_xticks(range(len(available_features)))
            ax_weights.set_yticks(range(n_components))
            ax_weights.set_xticklabels(available_features, rotation=45)
            ax_weights.set_yticklabels([f'PC{row + 1}' for row in range(n_components)])
            ax_weights.set_title('Principal Component Weights')

            # Write each weight into its cell; white text on strong colours.
            for (row, col), weight in np.ndenumerate(component_matrix):
                ax_weights.text(col, row, f'{weight:.2f}',
                                ha="center", va="center",
                                color="white" if abs(weight) > 0.5 else "black")

            plt.colorbar(im, ax=ax_weights, shrink=0.8)

            plt.tight_layout()
            plt.show()

            # Text summary
            print("=== PRINCIPAL COMPONENT ANALYSIS SUMMARY ===")
            print(f"Number of components needed for 90% variance: {np.argmax(cumvar >= 0.9) + 1}")
            print(f"Number of components needed for 95% variance: {np.argmax(cumvar >= 0.95) + 1}")

            print("\nFirst 3 Principal Components:")
            for pc_index in range(min(3, len(variance_ratio))):
                print(f"PC{pc_index+1}: {variance_ratio[pc_index]:.1%} variance")

                # Three features with the largest absolute weight
                ranked = sorted(zip(available_features, abs(pca.components_[pc_index])),
                                key=lambda pair: pair[1], reverse=True)
                top_names = [name for name, _ in ranked[:3]]
                print(f"  Top features: {', '.join(top_names)}")

## Summary and Key Findings

In [None]:
print("=== WEATHER CORRELATION ANALYSIS SUMMARY ===")
print()

# The earlier cells only define their result variables when the matching
# analysis actually ran, so every entry below first checks the namespace.
computed = locals()

# Key findings, in the order the analyses appear in the notebook
findings = []

if 'correlation' in computed:
    findings.append(f"Temperature-heating correlation: {correlation:.3f} (stronger = more weather-dependent heating)")

if 'heating_threshold' in computed and pd.notna(heating_threshold):
    findings.append(f"Heating base temperature: {heating_threshold:.1f}°C (heating starts below this temperature)")

if 'solar_corr' in computed:
    findings.append(f"Solar radiation-PV correlation: {solar_corr:.3f} (indicates PV system responsiveness)")

if 'temp_corr' in computed:
    findings.append(f"Indoor-outdoor temperature correlation: {temp_corr:.3f} (thermal envelope performance)")

if 'temp_comparison' in computed and 'difference' in temp_comparison.columns:
    mean_diff = temp_comparison['difference'].mean()
    findings.append(f"Average indoor-outdoor temperature difference: {mean_diff:.1f}°C (heating system effectiveness)")

if 'pca' in computed:
    if 'cumvar' in computed:
        n_components_90 = np.argmax(cumvar >= 0.9) + 1
    else:
        n_components_90 = 'N/A'
    findings.append(f"Weather complexity: {n_components_90} components explain 90% of weather variance")

# Report the findings as a numbered list
if not findings:
    print("Analysis completed but insufficient data for detailed findings.")
else:
    print("KEY FINDINGS:")
    print("\n".join(f"{number}. {finding}" for number, finding in enumerate(findings, 1)))

# Recommendations for energy optimization, each tied to a threshold on one of
# the results above
print("\nRECOMMENDATIONS FOR ENERGY OPTIMIZATION:")

recommendations = []

if 'correlation' in computed and abs(correlation) > 0.7:
    recommendations.append("Strong weather dependence detected - weather-based heating prediction models recommended")

if 'heating_threshold' in computed and heating_threshold > 15:
    recommendations.append(f"High heating threshold ({heating_threshold:.1f}°C) - consider thermal envelope improvements")

if 'solar_corr' in computed and solar_corr > 0.8:
    recommendations.append("Excellent PV-weather correlation - weather-based PV forecasting highly recommended")

if 'temp_corr' in computed and temp_corr < 0.5:
    recommendations.append("Weak indoor-outdoor correlation - investigate thermal envelope issues")

# Fall back to a neutral message when no threshold was crossed
if len(recommendations) == 0:
    recommendations.append("System performing within normal parameters - continue monitoring")

print("\n".join(f"{number}. {rec}" for number, rec in enumerate(recommendations, 1)))

print("\nAnalysis completed successfully!")
print(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")