# Data Quality Analysis with PV Export Constraints

This notebook assesses the quality of the collected data while accounting for the constraints specific to this PV and heating installation:
- Historical period with export disabled (forced self-consumption)
- Current period with price-based conditional export
- 16-room relay-controlled heating system

In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices raised by the libraries imported above.
# Narrow or remove the filter when debugging unexpected results.
warnings.filterwarnings('ignore')

# Configure display options
# max_columns=None lifts the column limit when frames are printed;
# max_rows caps printed frames at 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Import PEMS v2 modules
import sys
# The relative entry is resolved against the kernel's working directory
# (presumably the notebook's folder - TODO confirm it reaches the directory
# that contains the `pems_v2` package). It must run before the imports below.
sys.path.append('../../..')  # Adjust path as needed

from pems_v2.analysis.data_extraction import DataExtractor
from pems_v2.analysis.data_preprocessing import (
    DataValidator, OutlierDetector, GapFiller,
    RelayDataProcessor, PVDataProcessor
)
from pems_v2.analysis.visualization import AnalysisVisualizer

## 1. Data Extraction and Initial Assessment

In [None]:
# Build the extractor and the analysis window (the 730 days ending now).
extractor = DataExtractor()
end_date = datetime.now()
start_date = end_date - timedelta(days=730)  # 2 years of data

print(f"Analysis period: {start_date.date()} to {end_date.date()}")

# Pull every data type for the window in a single call.
print("\nExtracting data...")
all_data = extractor.extract_all_data(
    start_time=start_date.isoformat(), end_time=end_date.isoformat()
)

# Report what came back for each source: a DataFrame is summarised by size
# and date range, a dict is either an error payload or a per-room mapping,
# and anything else is reported as an unknown format.
print("\n=== Data Availability Summary ===")
for source_name, payload in all_data.items():
    if isinstance(payload, pd.DataFrame):
        print(f"{source_name}: {len(payload)} records, {len(payload.columns)} columns")
        if not payload.empty:
            print(f"  Date range: {payload.index.min()} to {payload.index.max()}")
    elif not isinstance(payload, dict):
        print(f"{source_name}: Unknown format")
    elif 'error' in payload:
        print(f"{source_name}: ERROR - {payload['error']}")
    else:
        # Per-room mapping: every key except the bookkeeping ones is a room.
        room_total = sum(1 for key in payload if key not in ('error', 'warning'))
        print(f"{source_name}: {room_total} rooms")

## 2. PV Data Quality Analysis with Export Constraints

In [None]:
# Initialize processors and validators
pv_processor = PVDataProcessor()
validator = DataValidator()
visualizer = AnalysisVisualizer()

# Fetch the PV and price frames. A source is not guaranteed to be a
# DataFrame (the availability summary handles dict payloads carrying an
# 'error' key), so anything else is treated as "no data" rather than
# failing on `.empty` below.
pv_data = all_data.get('pv')
if not isinstance(pv_data, pd.DataFrame):
    pv_data = pd.DataFrame()

price_data = all_data.get('prices')
if not isinstance(price_data, pd.DataFrame):
    price_data = pd.DataFrame()

# Always bind pv_analysis so that later cells can safely test
# `'some_key' in pv_analysis` even when there was no PV data to process.
pv_analysis = {}

if not pv_data.empty:
    print("=== PV Data Validation ===")
    pv_validation = validator.validate_pv_data(pv_data)
    print(f"Valid: {pv_validation['valid']}")

    # A missing or empty warnings entry simply means nothing to report.
    validation_warnings = pv_validation.get('warnings')
    if validation_warnings:
        print("\nWarnings:")
        for warning in validation_warnings:
            print(f"  - {warning}")

    # Process PV data with export constraint analysis
    print("\n=== PV Export Constraint Analysis ===")
    pv_analysis = pv_processor.process_pv_data(pv_data, price_data)

    # Display export period detection (only when a policy change was found)
    if 'export_periods' in pv_analysis:
        periods = pv_analysis['export_periods']
        if 'policy_change_date' in periods:
            print(f"\nExport Policy Change Detected: {periods['policy_change_date']}")
            print(f"Pre-export period: {periods['pre_export_period']['start']} to {periods['pre_export_period']['end']}")
            print(f"Post-export period: {periods['post_export_period']['start']} to {periods['post_export_period']['end']}")

    # Display production analysis; absent metrics are shown as 0
    if 'production_analysis' in pv_analysis:
        prod = pv_analysis['production_analysis']
        print(f"\n=== Production Statistics ===")
        print(f"Total production: {prod.get('total_production_kwh', 0):.1f} kWh")
        print(f"Daily average: {prod.get('daily_avg_kwh', 0):.1f} kWh")
        print(f"Peak power: {prod.get('peak_power_kw', 0):.1f} kW")
        print(f"Capacity factor: {prod.get('capacity_factor', 0):.1%}")
else:
    print("No PV data available for analysis")

## 3. Self-Consumption and Curtailment Analysis

In [None]:
# Everything in this cell needs PV data and a detected set of export periods.
has_export_periods = not pv_data.empty and 'export_periods' in pv_analysis

if has_export_periods:
    # Interactive before/after view, drawn only when a policy change date exists.
    policy_change = pv_analysis['export_periods'].get('policy_change_date')
    if policy_change:
        constraint_fig = visualizer.plot_pv_export_constraint_analysis(
            pv_data,
            price_data,
            policy_change,
            save_path='pv_export_constraints.html',
        )
        constraint_fig.show()

    # Headline curtailment figures; metrics that are absent print as 0.
    if 'curtailment' in pv_analysis:
        curtailment = pv_analysis['curtailment']
        print("\n=== Curtailment Analysis ===")
        print(f"Total curtailment: {curtailment.get('total_curtailment_kwh', 0):.1f} kWh")
        print(f"Average daily curtailment: {curtailment.get('avg_daily_curtailment_kwh', 0):.1f} kWh")
        print(f"Curtailment ratio: {curtailment.get('curtailment_ratio', 0):.1%}")

        # Bar chart of the per-month breakdown, when one was produced.
        if 'curtailment_by_month' in curtailment:
            by_month = pd.Series(curtailment['curtailment_by_month'])
            if not by_month.empty:
                plt.figure(figsize=(12, 6))
                # A Series plot draws onto the figure created just above.
                month_ax = by_month.plot.bar()
                month_ax.set_title('Monthly Curtailment (kWh)')
                month_ax.set_xlabel('Month')
                month_ax.set_ylabel('Curtailment (kWh)')
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.show()

## 4. Relay System Data Quality

In [None]:
# Process relay data
relay_processor = RelayDataProcessor()
room_data = all_data.get('rooms', {})

# Always bind relay_analysis so that later cells can inspect it even when
# there is no usable room data.
relay_analysis = {}

# Check the type before the truth value: `bool(DataFrame)` raises ValueError,
# so a DataFrame must never reach a bare truth test.
rooms_is_mapping = isinstance(room_data, dict)

if rooms_is_mapping and 'error' in room_data:
    # The availability summary treats a dict with an 'error' key as a failed
    # extraction; there is nothing meaningful to hand to the processor.
    print(f"Room data unavailable: {room_data['error']}")
elif rooms_is_mapping and room_data:
    print("=== Relay System Analysis ===")
    relay_analysis = relay_processor.process_relay_data(room_data)

    # Display system-wide statistics; absent metrics are shown as 0
    if 'system_totals' in relay_analysis:
        system = relay_analysis['system_totals']
        print(f"\nSystem Statistics:")
        print(f"Total rooms: {system.get('total_rooms', 0)}")
        print(f"Total capacity: {system.get('total_installed_capacity_kw', 0):.1f} kW")
        print(f"Peak demand: {system.get('peak_demand_kw', 0):.1f} kW")
        print(f"Average demand: {system.get('average_demand_kw', 0):.1f} kW")
        print(f"Load factor: {system.get('load_factor', 0):.1%}")
        print(f"Diversity factor: {system.get('diversity_factor', 0):.2f}")

    # Room-by-room statistics: one row per room that has switching statistics.
    # 'system_totals' is the aggregate entry, not a room, so it is skipped.
    print("\n=== Room-by-Room Analysis ===")
    room_stats = []

    for room_name, room_analysis in relay_analysis.items():
        if room_name == 'system_totals' or not isinstance(room_analysis, dict):
            continue
        if 'switching_statistics' not in room_analysis:
            continue
        stats = room_analysis['switching_statistics']
        room_stats.append({
            'Room': room_name,
            'Power (kW)': room_analysis.get('power_rating_kw', 0),
            'On Time %': stats.get('on_time_percentage', 0),
            'Switches/Day': stats.get('switches_per_day', 0),
            'Avg On Duration (min)': stats.get('avg_on_duration_minutes', 0)
        })

    if room_stats:
        room_df = pd.DataFrame(room_stats)
        room_df = room_df.sort_values('On Time %', ascending=False)
        print(room_df.to_string(index=False))

        # Visualize room usage patterns side by side
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

        # On-time percentage
        room_df.plot(x='Room', y='On Time %', kind='bar', ax=ax1)
        ax1.set_title('Relay On-Time Percentage by Room')
        ax1.set_ylabel('On Time (%)')
        ax1.tick_params(axis='x', rotation=45)

        # Switching frequency
        room_df.plot(x='Room', y='Switches/Day', kind='bar', ax=ax2, color='orange')
        ax2.set_title('Daily Switching Frequency by Room')
        ax2.set_ylabel('Switches per Day')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

## 5. Data Gap Analysis

In [None]:
# Analyze data gaps across all sources
print("=== Data Gap Analysis ===")

# Spacing between consecutive samples above which it counts as a gap.
LARGE_GAP_THRESHOLD = pd.Timedelta(hours=2)


def _summarize_index_gaps(source_label, frame, threshold=LARGE_GAP_THRESHOLD):
    """Summarise the large gaps in a frame's index.

    Args:
        source_label: Text placed in the 'Data Source' column.
        frame: Non-empty DataFrame whose index holds the sample timestamps.
        threshold: Spacing between consecutive index values above which the
            spacing is counted as a gap.

    Returns:
        A dict with the gap count, the longest gap and the summed gap time
        (both in hours; 0 when no spacing exceeds the threshold).
    """
    spacing = frame.index.to_series().diff()
    large_gaps = spacing[spacing > threshold]
    if len(large_gaps) > 0:
        max_gap_hours = large_gaps.max().total_seconds() / 3600
        total_gap_hours = large_gaps.sum().total_seconds() / 3600
    else:
        max_gap_hours = 0
        total_gap_hours = 0
    return {
        'Data Source': source_label,
        'Total Gaps': len(large_gaps),
        'Max Gap (hours)': max_gap_hours,
        'Total Gap Time (hours)': total_gap_hours
    }


gap_summary = []

# PV data gaps
if not pv_data.empty:
    gap_summary.append(_summarize_index_gaps('PV System', pv_data))

# Weather data gaps (a non-DataFrame payload is treated as "no data")
weather_data = all_data.get('weather')
if not isinstance(weather_data, pd.DataFrame):
    weather_data = pd.DataFrame()
if not weather_data.empty:
    gap_summary.append(_summarize_index_gaps('Weather', weather_data))

# Room data gaps: taken from the per-room relay analysis. 'system_totals' is
# the aggregate entry rather than a room, and non-dict entries cannot carry
# a gap analysis, so both are skipped.
if relay_analysis:
    for room_name, room_analysis in relay_analysis.items():
        if room_name == 'system_totals' or not isinstance(room_analysis, dict):
            continue
        if 'gap_analysis' not in room_analysis:
            continue
        gap_info = room_analysis['gap_analysis']
        gap_summary.append({
            'Data Source': f'Room: {room_name}',
            'Total Gaps': gap_info.get('total_gaps', 0),
            'Max Gap (hours)': gap_info.get('largest_gap_hours', 0),
            'Total Gap Time (hours)': gap_info.get('total_gap_duration_hours', 0)
        })

if gap_summary:
    gap_df = pd.DataFrame(gap_summary)
    gap_df = gap_df.sort_values('Total Gap Time (hours)', ascending=False)
    print(gap_df.to_string(index=False))

    # Visualize gaps (the chart is only drawn when there are more than five
    # sources; with fewer, the printed table is the whole report)
    if len(gap_df) > 5:
        # Show only top sources with gaps
        gap_df_top = gap_df.head(10)

        # DataFrame.plot opens its own figure unless it is given an Axes, so
        # create the figure explicitly and pass `ax=`; otherwise an empty
        # figure is left behind and the requested size is ignored.
        fig, ax = plt.subplots(figsize=(12, 6))
        gap_df_top.plot(x='Data Source', y='Total Gap Time (hours)', kind='bar', ax=ax)
        ax.set_title('Data Gaps by Source (Top 10)')
        ax.set_xlabel('Data Source')
        ax.set_ylabel('Total Gap Time (hours)')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

## 6. Outlier Detection

In [None]:
# Flag outliers in the PV power signal and in each room's temperature series.
outlier_detector = OutlierDetector()

print("=== Outlier Detection ===")

# PV production: statistical outliers on the 'InputPower' column.
has_pv_power = not pv_data.empty and 'InputPower' in pv_data.columns
if has_pv_power:
    pv_outliers = outlier_detector.detect_statistical_outliers(
        pv_data['InputPower'],
        method='modified_zscore',
    )
    pv_outlier_count = pv_outliers.sum()
    print(f"\nPV Production outliers: {pv_outlier_count} ({pv_outlier_count/len(pv_data)*100:.2f}%)")

    if pv_outlier_count > 0:
        # Count the flagged rows per hour of day and chart the distribution.
        flagged_rows = pv_data[pv_outliers]
        outlier_hours = flagged_rows.index.hour.value_counts().sort_index()

        plt.figure(figsize=(10, 5))
        hour_ax = outlier_hours.plot.bar()
        hour_ax.set_title('PV Production Outliers by Hour of Day')
        hour_ax.set_xlabel('Hour')
        hour_ax.set_ylabel('Number of Outliers')
        plt.tight_layout()
        plt.show()

# Room temperatures: contextual outliers on the first column whose name
# contains "temp" (case-insensitive); rooms without such a column are skipped.
temp_outlier_summary = []
for room_name, room_frame in room_data.items():
    if not isinstance(room_frame, pd.DataFrame) or room_frame.empty:
        continue

    temp_col = next(
        (name for name in room_frame.columns if 'temp' in name.lower()),
        None,
    )
    if not temp_col:
        continue

    room_flags = outlier_detector.detect_contextual_outliers(
        room_frame[temp_col], context_window='24H'
    )
    flagged_total = room_flags.sum()
    temp_outlier_summary.append({
        'Room': room_name,
        'Outliers': flagged_total,
        'Percentage': f"{flagged_total/len(room_frame)*100:.2f}%"
    })

if temp_outlier_summary:
    print("\nTemperature outliers by room:")
    outlier_df = pd.DataFrame(temp_outlier_summary).sort_values('Outliers', ascending=False)
    print(outlier_df.to_string(index=False))

## 7. Data Quality Recommendations

In [None]:
print("=== Data Quality Recommendations ===")

# Explicit display order for priorities. Sorting the raw labels would be
# alphabetical, which only happens to work for HIGH/MEDIUM and would place
# a LOW entry between them.
PRIORITY_RANK = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}

recommendations = []

# PV system recommendations
if 'curtailment' in pv_analysis:
    curt_ratio = pv_analysis['curtailment'].get('curtailment_ratio', 0)
    if curt_ratio > 0.1:  # More than 10% curtailment
        recommendations.append({
            'Priority': 'HIGH',
            'Area': 'PV System',
            'Issue': f'High curtailment ratio ({curt_ratio:.1%})',
            'Recommendation': 'Consider battery storage or load shifting to capture curtailed energy'
        })

# Export behavior recommendations
if 'export_behavior' in pv_analysis:
    # No default here: a missing metric must not be reported as "0.0%" and
    # trigger a recommendation based on a value that was never measured.
    export_freq = pv_analysis['export_behavior'].get('export_frequency')
    if export_freq is not None and export_freq < 0.5:  # Exporting less than 50% of days
        recommendations.append({
            'Priority': 'MEDIUM',
            'Area': 'PV Export',
            'Issue': f'Low export frequency ({export_freq:.1%})',
            'Recommendation': 'Review price threshold settings for export optimization'
        })

# Relay system recommendations
if 'system_totals' in relay_analysis:
    # No default here either: a fallback of 1 would exceed the threshold and
    # raise a HIGH issue whenever the metric is simply absent.
    # NOTE(review): the 0.8 threshold assumes this metric is a ratio where a
    # higher value means more simultaneous load - confirm against the
    # definition used by RelayDataProcessor.
    diversity_factor = relay_analysis['system_totals'].get('diversity_factor')
    if diversity_factor is not None and diversity_factor > 0.8:  # High simultaneous usage
        recommendations.append({
            'Priority': 'HIGH',
            'Area': 'Relay System',
            'Issue': f'High diversity factor ({diversity_factor:.2f})',
            'Recommendation': 'Implement relay coordination to reduce peak demand'
        })

# Data quality recommendations
if gap_summary:
    total_gap_hours = sum(item['Total Gap Time (hours)'] for item in gap_summary)
    if total_gap_hours > 100:  # More than 100 hours of gaps
        recommendations.append({
            'Priority': 'MEDIUM',
            'Area': 'Data Quality',
            'Issue': f'Significant data gaps ({total_gap_hours:.0f} hours total)',
            'Recommendation': 'Investigate data collection reliability and implement gap filling'
        })

if recommendations:
    rec_df = pd.DataFrame(recommendations)
    # Order by severity; the stable sort keeps insertion order within a
    # priority level, and unknown labels (mapped to NaN) sort last.
    rec_df = rec_df.sort_values(
        'Priority',
        key=lambda labels: labels.map(PRIORITY_RANK),
        kind='stable',
    )
    print(rec_df.to_string(index=False))
else:
    print("No critical data quality issues identified.")

# Print the closing summary (nothing is written to disk here)
print("\n=== Analysis Summary ===")
print(f"Analysis completed for period: {start_date.date()} to {end_date.date()}")
# A source counts as analyzed unless its payload is a dict carrying an 'error' key.
print(f"Data sources analyzed: {len([k for k,v in all_data.items() if not (isinstance(v, dict) and 'error' in v)])}")
print(f"Total recommendations: {len(recommendations)}")
print(f"High priority issues: {len([r for r in recommendations if r.get('Priority') == 'HIGH'])}")