# Solar Irradiance From AC Export

A Jupyter Notebook that does its best to model and construct a historical solar irradiance time series from a solar park's historical AC export data.

## 1. Project Setup

### 1.1 Imports

In [None]:
# --- Imports ---

# Standard Library Imports
from pathlib import Path
import os

# Third-Party Library Imports
import yaml
import pandas as pd
import plotly.io as pio
from dotenv import load_dotenv

print("✅ Libraries loaded successfully.")

***

### 1.2 Configuration

This project uses a two-step configuration process:

1.  **Path Definition (`.env`):** This file defines the project's physical location (`PROJECT_ROOT`) and the name of the configuration file. This separation ensures the notebook is portable across different machines and environments.
2.  **Parameter Definition (`config.yml`):** This file contains the physical and electrical parameters of your solar park(s), including sensitive information like GPS coordinates and detailed system specifications.

**To get started:**

1.  **Configure Paths:** Copy the template file `.env.example` to a new file named `.env`. Open `.env` and set the absolute path for the `PROJECT_ROOT` variable.
2.  **Configure Parks:** Copy the example configuration file `config.example.yml` to `config.yml`. Open `config.yml` and replace the placeholder values with the details of your solar installation.

The cell below loads the environment variables, resolves the final configuration path, and sets up the plotting environment.

In [None]:
# --- Configuration ---

# Load environment variables from the .env file into the process environment.
load_dotenv()

# Resolve paths from environment variables, falling back to defaults.
PROJECT_ROOT_STR = os.getenv("PROJECT_ROOT")
CONFIG_FILENAME = os.getenv("CONFIG_FILENAME", "config.yml")  # Fallback to config.yml
PRODUCTION_AND_PRICE_FILE_PATH = os.getenv(
    "PRODUCTION_AND_PRICE_FILE_PATH",
    "/home/user/solar-irradiance-from-ac-export/production.csv",
)
WEATHER_FILE_PATH = os.getenv(
    "WEATHER_FILE_PATH", "/home/user/solar-irradiance-from-ac-export/weather.csv"
)

if not PROJECT_ROOT_STR:
    # If PROJECT_ROOT is not set in .env, assume the current working directory
    PROJECT_ROOT_STR = os.getcwd()
    print(
        f"⚠️ WARNING: PROJECT_ROOT not set in .env. Using current directory: {PROJECT_ROOT_STR}"
    )

PROJECT_ROOT = Path(PROJECT_ROOT_STR)
CONFIG_PATH = PROJECT_ROOT / CONFIG_FILENAME

print(f"Project Root defined as: {PROJECT_ROOT}")
print(f"Configuration file path: {CONFIG_PATH}")

try:
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document. Normalise to a dict so
        # an empty file is reported as a configuration error below instead of
        # escaping as an uncaught AttributeError on `.get`.
        config = yaml.safe_load(f) or {}

    if not isinstance(config, dict):
        raise ValueError(
            "The top level of the configuration file must be a mapping."
        )

    # Extract park configurations (a `parks:` key with no value loads as None)
    PARK_CONFIGS = config.get("parks") or {}

    if not PARK_CONFIGS:
        raise ValueError(
            "No parks defined under the 'parks' key in the configuration file."
        )

    if not isinstance(PARK_CONFIGS, dict):
        raise ValueError(
            "The 'parks' key must map each park name to its settings."
        )

    # Create a list of park names for easy iteration later
    PARK_NAMES = list(PARK_CONFIGS.keys())

    # --- Load and Validate Target Park for Analysis ---
    TARGET_PARK_NAME = os.getenv("TARGET_PARK_NAME")

    if not TARGET_PARK_NAME:
        raise ValueError("TARGET_PARK_NAME is not set in the .env file. Please specify which park to analyze.")

    if TARGET_PARK_NAME not in PARK_NAMES:
        raise ValueError(
            f"The target park '{TARGET_PARK_NAME}' defined in .env is not found in 'config.yml'.\n"
            f"Available parks in config: {PARK_NAMES}"
        )

    print(f"🎯 Analysis will be performed for target park: '{TARGET_PARK_NAME}'")

    print(
        f"✅ Configuration loaded successfully from '{CONFIG_PATH}' for {len(PARK_NAMES)} park(s): {', '.join(PARK_NAMES)}."
    )

except FileNotFoundError:
    print(f"❌ CONFIGURATION ERROR: The '{CONFIG_PATH}' file was not found.")
    print(
        "Please check your .env file's PROJECT_ROOT setting, and ensure 'config.yml' exists at that location."
    )
    print(
        "If 'config.yml' is missing, copy 'config.example.yml' to 'config.yml' and fill in your park's details."
    )
except yaml.YAMLError as e:
    # Genuine syntax problems in the YAML document.
    print(
        f"❌ CONFIGURATION ERROR: Could not parse '{CONFIG_PATH}'. Please check its format. Details: {e}"
    )
except ValueError as e:
    # Validation failures raised above. The file parsed fine, so report the
    # actual problem rather than blaming the YAML format.
    print(f"❌ CONFIGURATION ERROR: {e}")


# --- Plotting and Display Configuration ---
pio.templates.default = "plotly_dark"

# Set display options for better viewing in Jupyter
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)

print("Plotting and display options set.")

***
***

## 2. Data Loading

### Helper Functions

In [None]:
# --- Data Loading Helper Function ---


def load_park_specific_data(
    file_path: str,
    timestamp_col: str,
    park_name_col: str,
    required_data_cols: list[str],
    target_park_name: str,
    data_name: str,
) -> pd.DataFrame:
    """
    Loads, validates, and filters data for a single specified park from a long-format CSV.

    Progress, a summary and a small random sample are printed along the way.
    Failures are reported via print and never raised to the caller.

    Args:
        file_path (str): Path to the CSV file.
        timestamp_col (str): Name of the timestamp column (becomes the index).
        park_name_col (str): Name of the park identifier column.
        required_data_cols (list): List of required data column names.
        target_park_name (str): The specific park to extract data for.
        data_name (str): A descriptive name for the data (e.g., "Production").

    Returns:
        pandas.DataFrame: A DataFrame containing only the data for the target park,
                          indexed by UTC timestamp and sorted chronologically,
                          with the park_name column removed. Returns an empty
                          DataFrame on failure.
    """
    print(f"--- Loading {data_name} Data for '{target_park_name}' ---")
    print(f"Attempting to load from: {file_path}")

    try:
        # 1. Load the full CSV
        df = pd.read_csv(
            file_path, parse_dates=[timestamp_col], index_col=timestamp_col
        )

        # 2. Basic Column Check
        all_required_cols = required_data_cols + [park_name_col]
        missing = [col for col in all_required_cols if col not in df.columns]
        if missing:
            raise ValueError(f"Missing required columns in {data_name} CSV: {missing}")

        # 3. Data Cleaning and Validation
        df.index = pd.to_datetime(df.index, utc=True)
        df = df.dropna(subset=[park_name_col])
        df[park_name_col] = df[park_name_col].astype(str)

        # 4. Check if Target Park Exists in Data
        if target_park_name not in df[park_name_col].unique():
            raise ValueError(
                f"Target park '{target_park_name}' not found in the {data_name} file."
            )

        # 5. Filter for Target Park and Finalize
        df_park = df[df[park_name_col] == target_park_name].copy()

        # Drop the now-redundant park name column
        df_park = df_park.drop(columns=[park_name_col])

        df_park = df_park.sort_index()
        print(f"✅ {data_name} data for '{target_park_name}' loaded successfully.")
        print(f"   Shape of final DataFrame: {df_park.shape}")
        print(f"   Time range: {df_park.index.min()} to {df_park.index.max()}")
        print("Sample:")
        # Cap the sample size: sample(n=5) raises ValueError on frames with
        # fewer than five rows, which would otherwise be swallowed below and
        # turn a successful load into an empty result.
        print(df_park.sample(n=min(5, len(df_park))))
        return df_park

    except FileNotFoundError:
        print(f"❌ DATA ERROR: The {data_name} file was not found at: {file_path}")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate best-effort boundary for the notebook: report and return
        # an empty frame so the caller can test `.empty`.
        print(f"❌ AN UNEXPECTED ERROR OCCURRED during {data_name} data loading: {e}")
        return pd.DataFrame()

print("✅ Helper function load_park_specific_data defined.")

***

### 2.1 Hourly Production And Spot Price Data

In [None]:
# --- Load Production and Price Data ---

# Column names shared by the long-format CSV inputs, plus the value columns
# that the production/price file must contain.
COL_TIMESTAMP = "timestamp_utc"
COL_PARK_NAME = "park_name"
PRODUCTION_DATA_COLS = ["ac_export_kwh", "spot_price_eur_mwh"]

# Pull only the target park's rows out of the production/price file.
df_production = load_park_specific_data(
    data_name="Production & Price",
    target_park_name=TARGET_PARK_NAME,  # type: ignore
    file_path=PRODUCTION_AND_PRICE_FILE_PATH,
    required_data_cols=PRODUCTION_DATA_COLS,
    park_name_col=COL_PARK_NAME,
    timestamp_col=COL_TIMESTAMP,
)
# Narrow the index type for static checkers and later datetime operations.
assert isinstance(df_production.index, pd.DatetimeIndex)

***

### 2.2 Load Hourly Weather Data

In [None]:
# --- Load and Crop Weather Data ---

# Value columns that the weather file must contain.
WEATHER_DATA_COLS = ["temp_air_c", "wind_speed_m_s", "pressure_hpa", "ghi_w_m2"]

# Pull only the target park's rows out of the weather file.
df_weather = load_park_specific_data(
    data_name="Weather",
    target_park_name=TARGET_PARK_NAME,  # type: ignore
    file_path=WEATHER_FILE_PATH,
    required_data_cols=WEATHER_DATA_COLS,
    park_name_col=COL_PARK_NAME,
    timestamp_col=COL_TIMESTAMP,
)
assert isinstance(df_weather.index, pd.DatetimeIndex)

# Restrict the weather record to the span covered by the production data.
# Skipped when either frame is empty (i.e. a load failed).
if not (df_production.empty or df_weather.empty):
    start_time, end_time = df_production.index.min(), df_production.index.max()

    original_rows = len(df_weather)
    df_weather = df_weather.loc[start_time:end_time].copy()

    print("\nWeather data cropped to production time range.")
    print(f"   Original rows: {original_rows}, Cropped rows: {len(df_weather)}")
    print(f"   New time range: {df_weather.index.min()} to {df_weather.index.max()}")

***
***

## 3. Data Upsampling and Feature Engineering 

### Helper Functions 

In [None]:
# --- Interpolation Helper Function ---

from typing import Literal
import numpy as np


def interpolate_by_gap_size(
    data: pd.Series | pd.DataFrame,
    max_gap_size: int = 1,
    method: Literal[
        "linear",
        "time",
        "index",
        "values",
        "pad",
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "barycentric",
        "polynomial",
        "spline",
        "krogh",
        "piecewise_polynomial",
        "pchip",
        "akima",
        "cubicspline",
        "from_derivatives",
    ] = "linear",
    **kwargs
) -> pd.Series | pd.DataFrame:
    """
    Interpolates NaN values in a Series or DataFrame, but only for gaps
    that are less than or equal to a specified maximum size.

    For a DataFrame, the operation is applied column-wise.

    Args:
        data (pd.Series | pd.DataFrame): Input data with potential NaNs.
        max_gap_size (int): Max consecutive NaNs to interpolate. Gaps larger
                            than this are ignored. Defaults to 1.
        method (str): Interpolation technique (see pandas.Series.interpolate).
                      Defaults to "linear".
        **kwargs: Additional keyword arguments for the interpolate() method.

    Returns:
        pd.Series | pd.DataFrame: New object with specified gaps filled.
    """
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        raise TypeError("Input `data` must be a pandas Series or DataFrame.")
    if not isinstance(max_gap_size, int) or max_gap_size <= 0:
        raise ValueError("`max_gap_size` must be a positive integer.")

    if isinstance(data, pd.Series):
        if data.empty or data.isna().sum() == 0:
            return data.copy()

        # Identify gaps by creating groups based on non-NaN values
        grouper = data.notna().cumsum()
        # Calculate the size of each NaN gap
        gap_sizes = data.isna().groupby(grouper).transform("sum")
        # Create a boolean mask for NaNs that are part of small-enough gaps
        mask_to_fill = data.isna() & (gap_sizes <= max_gap_size)

        # Interpolate the entire series to get the fill values
        fully_interpolated = data.interpolate(method=method, **kwargs) # type: ignore

        # Apply the fill values only where the mask is True
        return data.where(~mask_to_fill, fully_interpolated)

    # If it's a DataFrame, apply this function to each column
    return data.apply(
        lambda col: interpolate_by_gap_size(
            col, max_gap_size=max_gap_size, method=method, **kwargs
        )
    )


print("✅ Helper function interpolate_by_gap_size defined.")

***

### 3.1 Upsample Production Data 

In [None]:
# --- Upsample Production and Price Data to 30-Minute Frequency ---

print(f"--- Upsampling data for park: '{TARGET_PARK_NAME}' ---")

# 1. Determine the actual production period to avoid creating a massive index
first_prod_time: pd.Timestamp = df_production[df_production["ac_export_kwh"] > 0].index.min()
last_prod_time: pd.Timestamp = df_production[df_production["ac_export_kwh"] > 0].index.max()

if pd.isna(first_prod_time) or pd.isna(last_prod_time):
    print("⚠️ No valid production data (> 0 kWh) found. Cannot proceed.")
    df_production_30min = pd.DataFrame()
else:
    print(f"   - Production data range: {first_prod_time} to {last_prod_time}")

    # Crop the hourly data to the actual production period
    df_prod_cropped = df_production.loc[first_prod_time:last_prod_time]

    # Create a full 30-minute time range for this period.
    # The range runs 30 minutes past `last_prod_time` because the hourly power
    # values are re-stamped to the interval midpoint (HH:30) below. Without the
    # extra step the final producing hour would fall outside the range and be
    # silently dropped by the reindex.
    full_30min_range = pd.date_range(
        start=first_prod_time,
        end=last_prod_time + pd.Timedelta(minutes=30),
        freq="30min",
        tz="UTC",
    )

    # 2. Upsample spot price using forward-fill
    # The price at HH:30 is the same as the price at HH:00
    price_30min = (
        df_prod_cropped["spot_price_eur_mwh"]
        .reindex(full_30min_range)
        .ffill()
        .to_frame()
    )

    # 3. Upsample production data from hourly kWh to 30-min average Power (W)
    # Convert kWh (energy) to W (power) for the hourly interval
    power_w_hourly = df_prod_cropped["ac_export_kwh"] * 1000

    # Shift the index 30 mins forward. The power calculated from the interval
    # [HH:00, HH+1:00] is best represented at the midpoint, HH:30.
    power_w_hourly.index = power_w_hourly.index + pd.Timedelta(minutes=30)

    # Reindex to the full 30-min range. This introduces NaNs at HH:00.
    # Use interpolate_by_gap_size(max_gap_size=1) to linearly fill only the
    # points at HH:00, which are surrounded by two HH:30 points.
    power_w_30min = interpolate_by_gap_size(
        power_w_hourly.reindex(full_30min_range), max_gap_size=1, method="linear"
    ).to_frame("ac_export_w") # type: ignore

    # 4. Combine the upsampled series into a single DataFrame
    df_production_30min = price_30min.join(power_w_30min)

    print("\n✅ Production and price data upsampled to 30-minute frequency.")
    print(f"   - Shape of new DataFrame: {df_production_30min.shape}")
    print(
        f"   - New time range: {df_production_30min.index.min()} to {df_production_30min.index.max()}"
    )
    print("   - Sample data:")
    # Cap the sample size so very short periods do not raise in sample().
    print(df_production_30min.sample(n=min(5, len(df_production_30min))))

***

### 3.2 Upsample Weather Data

In [None]:
# --- Upsample Weather Data to 30-Minute Frequency ---

print("--- Upsampling weather data ---")

# 1. The upsampled production frame dictates the target timestamps, so the
#    weather series end up on exactly the same index.
target_30min_range = df_production_30min.index
print(
    "   - Aligning to target time range: "
    f"{target_30min_range.min()} to {target_30min_range.max()}"
)

# 2. Temperature, wind and pressure: place the hourly values on the 30-minute
#    grid and linearly fill the single-step gaps in between.
standard_weather_cols = ["temp_air_c", "wind_speed_m_s", "pressure_hpa"]
df_weather_standard_30min = interpolate_by_gap_size(
    df_weather.loc[:, standard_weather_cols].reindex(index=target_30min_range),
    method="linear",
    max_gap_size=1,
)

# 3. GHI needs re-stamping first. The hourly value at HH:00 is treated as the
#    mean over [HH-1:00, HH:00], so it is moved back 30 minutes to the
#    interval midpoint (HH-1:30) before interpolating.
ghi_hourly = df_weather[["ghi_w_m2"]].copy()
assert isinstance(ghi_hourly.index, pd.DatetimeIndex)
ghi_hourly.index = ghi_hourly.index - pd.Timedelta(minutes=30)

# After the shift the known values sit at HH:30; the top-of-hour points are
# the single-step gaps that get filled.
df_ghi_30min = interpolate_by_gap_size(
    ghi_hourly.reindex(index=target_30min_range),
    method="linear",
    max_gap_size=1,
)

# 4. Stitch the standard variables and GHI back together.
df_weather_30min = df_weather_standard_30min.join(df_ghi_30min)

print("\n✅ Weather data upsampled to 30-minute frequency.")
print(f"   - Shape of new DataFrame: {df_weather_30min.shape}")
print("   - Sample data:")
print(df_weather_30min.sample(n=5))

***

### 3.3 Model PVLIB Features

In [None]:
# --- Model Solar Geometry and Clear-Sky Irradiance with PVLIB ---

import pvlib

# Purpose: derive solar geometry, airmass, extraterrestrial radiation and
# Ineichen clear-sky irradiance for every timestamp of the upsampled weather
# frame, and collect them in `df_pvlib_30min`.
print(f"--- Modeling PVLIB features for '{TARGET_PARK_NAME}' ---")

# 1. Extract location parameters from the main config
# Reads config keys: parks.<name>.location.latitude / .longitude
park_params = PARK_CONFIGS[TARGET_PARK_NAME]
location = pvlib.location.Location(
    latitude=park_params["location"]["latitude"],
    longitude=park_params["location"]["longitude"],
    tz="UTC",
)

# 2. Prepare inputs for pvlib calculations
# The weather frame's index drives all calculations, so every result below
# is computed for the same timestamps.
times = df_weather_30min.index
pressure_pa = df_weather_30min["pressure_hpa"] * 100  # Convert hPa to Pa
temperature_c = df_weather_30min["temp_air_c"]

# 3. Perform pvlib calculations
print("   - Calculating solar position...")
# Measured pressure and temperature are passed through to the solar position
# calculation (presumably for the refraction correction behind
# `apparent_zenith` — confirm against the pvlib docs).
solar_position = location.get_solarposition(
    times, pressure=pressure_pa, temperature=temperature_c  # type: ignore
)
assert isinstance(solar_position, pd.DataFrame)

print("   - Calculating airmass...")
airmass_relative = pvlib.atmosphere.get_relative_airmass(
    zenith=solar_position["apparent_zenith"]
)
airmass_absolute = pvlib.atmosphere.get_absolute_airmass(
    airmass_relative=airmass_relative, pressure=pressure_pa  # type: ignore
)

print("   - Calculating extraterrestrial radiation...")
dni_extra = pvlib.irradiance.get_extra_radiation(times)

print("   - Calculating clear-sky irradiance (Ineichen model)...")
# The precomputed solar position, airmass and extraterrestrial radiation are
# handed over explicitly instead of letting pvlib derive them again.
clearsky_irrad = location.get_clearsky(
    times,
    model="ineichen",
    solar_position=solar_position,
    airmass_absolute=airmass_absolute,
    dni_extra=dni_extra,
)
assert isinstance(clearsky_irrad, pd.DataFrame)
# Suffix the clear-sky columns so they cannot be confused with measured or
# modelled irradiance columns after merging.
clearsky_irrad = clearsky_irrad.rename(
    columns={
        "ghi": "ghi_clearsky_w_m2",
        "dni": "dni_clearsky_w_m2",
        "dhi": "dhi_clearsky_w_m2",
    }
)

print("   - Adding 'is_day' flag...")
# Daytime == sun above the horizon (apparent zenith below 90 degrees).
# NOTE(review): a NaN apparent zenith compares as False and is therefore
# treated as night — confirm that missing weather inputs cannot cause this.
is_day = solar_position["apparent_zenith"] < 90.0

# 4. Assemble the final pvlib DataFrame
# Column-wise concat; the intermediate results are wrapped in named Series so
# each becomes a labelled column.
df_pvlib_30min = pd.concat(
    [
        solar_position[["apparent_zenith", "zenith", "azimuth"]],
        pd.Series(airmass_relative, name="airmass_relative"),
        pd.Series(airmass_absolute, name="airmass_absolute"),
        pd.Series(dni_extra, name="dni_extra_w_m2"),
        clearsky_irrad,
        pd.Series(is_day, name="is_day"),
    ],
    axis=1,
)

print("\n✅ PVLIB features modeled successfully.")
print(f"   - Shape of new DataFrame: {df_pvlib_30min.shape}")
print("   - Sample data:")
print(df_pvlib_30min.sample(n=5))

***

### 3.4 Merge DataFrames 

In [None]:
# --- Merge All 30-Minute DataFrames ---

print("--- Merging all 30-minute data sources ---")

# Attach the weather and pvlib columns to the production frame via their
# shared DatetimeIndex. The production frame is the left side of the join, so
# its rows define the result; timestamps missing from another source are
# filled with NaN.
df_30min = df_production_30min.join(other=[df_weather_30min, df_pvlib_30min])

print("\n✅ All data sources successfully merged into a single DataFrame.")
print(f"   - Final shape: {df_30min.shape}")
print(
    f"   - Final time range: {df_30min.index.min()} to {df_30min.index.max()}"
)
print("   - Sample of merged data:")
print(df_30min.sample(n=5))

***
***

## 4. Anomaly Flagging

### 4.1 Flag Clipping and Curtailment

In [None]:
# --- Flag Clipping and Curtailment Events ---
# Adds two boolean columns to `df_30min`:
#   - is_clipped:   power at (or next to) the park's export limit
#   - is_curtailed: near-zero daytime power while the spot price is negative
# Flags are first set on the HH:30 points (which carry the hourly values) and
# then spread to the HH:00 points that were interpolated from them.

assert isinstance(df_30min.index, pd.DatetimeIndex)
print(f"--- Flagging anomalies for '{TARGET_PARK_NAME}' ---")

# --- Configuration ---
# Fetch park-specific parameters from the config
park_config = PARK_CONFIGS[TARGET_PARK_NAME]
export_limit_w = park_config["export_limit_kw"] * 1000  # kW -> W
# NOTE(review): assumes system.pdc0 is stored in watts — confirm in config.yml
pdc0_w = park_config["system"]["pdc0"]

# Define thresholds for flagging logic
# Flag as clipped if power is within this percentage of the export limit
CLIPPING_TOLERANCE_PCT = 0.01
# Flag as curtailed if power is below this percentage of DC capacity during negative prices
CURTAILMENT_POWER_THRESHOLD_PCT = 0.01

clipping_threshold_w = export_limit_w * (1 - CLIPPING_TOLERANCE_PCT)
curtailment_threshold_w = pdc0_w * CURTAILMENT_POWER_THRESHOLD_PCT

print(f"   - Clipping threshold: >= {clipping_threshold_w:.2f} W")
print(
    f"   - Curtailment threshold: < {curtailment_threshold_w:.2f} W (during negative prices)"
)

# Initialize flag columns to False
df_30min["is_clipped"] = False
df_30min["is_curtailed"] = False

# --- Clipping Detection Logic ---
print("\n1. Applying Clipping Detection Logic...")

# Step 1: Directly flag HH:30 points at or near the export limit
is_hh30 = df_30min.index.minute == 30
direct_clip_mask = is_hh30 & (df_30min["ac_export_w"] >= clipping_threshold_w)
df_30min.loc[direct_clip_mask, "is_clipped"] = True
print(f"   - Found {direct_clip_mask.sum()} HH:30 points with direct clipping.")

# Step 2: Propagate flag to adjacent HH:30 points (Temporal Contamination)
# Work on the HH:30 subset only, so shift(±1) means "previous/next HH:30
# point" rather than the neighbouring 30-minute row.
clipped_at_hh30 = df_30min["is_clipped"][is_hh30]
neighbor_clip_mask = clipped_at_hh30.shift(
    1, fill_value=False
) | clipped_at_hh30.shift(-1, fill_value=False)

# `neighbor_clip_mask` is indexed by the HH:30 timestamps, so the in-place OR
# aligns on the index and sets only those HH:30 rows whose mask value is True.
# Rows flagged in step 1 stay flagged.
df_30min.loc[is_hh30, "is_clipped"] |= neighbor_clip_mask
# The count below is the number of True entries in the neighbour mask; it can
# include points that were already flagged directly in step 1.
print(
    f"   - Propagated clipping flag to {neighbor_clip_mask.sum()} neighboring HH:30 points."
)

# Step 3: Propagate flag to interpolated HH:00 points (Interpolation Contamination)
# On the full 30-minute index, shift(±1) addresses the directly adjacent rows,
# so an HH:00 point is flagged when either neighbouring row is flagged.
is_hh00 = df_30min.index.minute == 0
interpolated_clip_mask = df_30min["is_clipped"].shift(
    1, fill_value=False
) | df_30min["is_clipped"].shift(-1, fill_value=False)
df_30min.loc[is_hh00, "is_clipped"] |= interpolated_clip_mask[is_hh00]
print(f"   - Propagated clipping flag to interpolated HH:00 points.")

# --- Curtailment Detection Logic ---
print("\n2. Applying Curtailment Detection Logic...")

# Step 1: Directly flag HH:30 points with negative prices and low production during the day
# The 'is_day' check prevents flagging nighttime hours where production is naturally zero.
direct_curtail_mask = (
    is_hh30
    & df_30min["is_day"]
    & (df_30min["spot_price_eur_mwh"] < 0)
    & (df_30min["ac_export_w"] < curtailment_threshold_w)
)
df_30min.loc[direct_curtail_mask, "is_curtailed"] = True
print(
    f"   - Found {direct_curtail_mask.sum()} HH:30 points with direct curtailment."
)

# Step 2: Propagate flag to interpolated HH:00 points
# Same adjacent-row propagation as clipping step 3. Unlike clipping, there is
# no HH:30 -> HH:30 neighbour propagation for curtailment.
interpolated_curtail_mask = df_30min["is_curtailed"].shift(
    1, fill_value=False
) | df_30min["is_curtailed"].shift(-1, fill_value=False)
df_30min.loc[is_hh00, "is_curtailed"] |= interpolated_curtail_mask[is_hh00]
print(f"   - Propagated curtailment flag to interpolated HH:00 points.")

# --- Final Summary ---
# Counts and shares are relative to all rows of df_30min.
total_clipped = df_30min["is_clipped"].sum()
total_curtailed = df_30min["is_curtailed"].sum()
print("\n✅ Anomaly flagging complete.")
print(
    f"   - Total points flagged as clipped: {total_clipped} ({total_clipped / len(df_30min):.2%})"
)
print(
    f"   - Total points flagged as curtailed: {total_curtailed} ({total_curtailed / len(df_30min):.2%})"
)

***

### 4.2 Visually Verify Flags 

In [None]:
# --- Visually Verify Anomaly Flags with Interactive Plots ---

import random
import plotly.graph_objects as go


def create_flag_verification_plot(
    df_day: pd.DataFrame,
    flag_col: str,
    title: str,
    flag_color: str,
    normal_color: str = "royalblue",
) -> go.Figure:
    """
    Build an interactive bar chart of one day's AC export, coloured by a flag.

    Args:
        df_day: Rows to plot; must contain "ac_export_w" and `flag_col`.
        flag_col: Name of the boolean flag column to visualise.
        title: Figure title.
        flag_color: Bar colour for rows where the flag is True.
        normal_color: Bar colour for rows where the flag is False.

    Returns:
        The assembled plotly figure (not shown; the caller decides).
    """
    # Work on a private copy so the colour column never leaks into the input.
    plot_frame = df_day.copy()
    palette = {True: flag_color, False: normal_color}
    plot_frame["color"] = plot_frame[flag_col].map(palette)

    # 30-minute period expressed in milliseconds; bars use "middle" alignment
    # within that period.
    half_hour_ms = 30 * 60 * 1000

    bars = go.Bar(
        x=plot_frame.index,
        y=plot_frame["ac_export_w"],
        marker_color=plot_frame["color"],
        customdata=plot_frame[flag_col],
        hovertemplate="<b>Time</b>: %{x|%H:%M}<br><b>Power</b>: %{y:.0f} W<br><b>Flagged</b>: %{customdata}<extra></extra>",
        xperiod=half_hour_ms,
        xperiodalignment="middle",
    )

    figure = go.Figure()
    figure.add_trace(bars)
    figure.update_layout(
        title_text=title,
        xaxis_title="Time [UTC]",
        yaxis_title="AC Export [W]",
        bargap=0.05,
    )
    return figure


assert isinstance(df_30min.index, pd.DatetimeIndex)

# --- Plot 1: Clipping Verification ---
# Distinct calendar days (midnight-normalised timestamps) with any clipped row.
clipped_days = df_30min.loc[df_30min["is_clipped"]].index.normalize().unique() # type: ignore

if clipped_days.empty:
    print("ℹ️ No days with clipping were found to plot.")
else:
    # Pick one affected day at random and plot all of its 30-minute rows.
    random_clipped_day = random.choice(clipped_days)
    df_plot_clip = df_30min[df_30min.index.date == random_clipped_day.date()]

    fig_clip = create_flag_verification_plot(
        df_plot_clip,
        "is_clipped",
        f"Clipping Verification for {random_clipped_day.strftime('%Y-%m-%d')}",
        "crimson",
    )
    fig_clip.show()

# --- Plot 2: Curtailment Verification ---
# Same procedure for the curtailment flag.
curtailed_days = df_30min.loc[df_30min["is_curtailed"]].index.normalize().unique() # type: ignore

if curtailed_days.empty:
    print("ℹ️ No days with curtailment were found to plot.")
else:
    random_curtailed_day = random.choice(curtailed_days)
    df_plot_curtail = df_30min[df_30min.index.date == random_curtailed_day.date()]

    fig_curtail = create_flag_verification_plot(
        df_plot_curtail,
        "is_curtailed",
        f"Curtailment Verification for {random_curtailed_day.strftime('%Y-%m-%d')}",
        "crimson",
    )
    fig_curtail.show()

***
***

## 5. AC Export -> Plane of Array (POA) Estimation

### Helper Functions

In [None]:
# --- Helper Functions ---

# This section defines the core function for modeling POA irradiance from AC power.
# The process is iterative:
# 1. Guess the cell temperature (`T_cell`).
# 2. Calculate the expected DC power from the measured AC power.
# 3. Use the PVWatts DC power model to estimate the POA irradiance required to produce that DC power, given the current `T_cell` guess.
# 4. Recalculate `T_cell` using the estimated POA irradiance and the SAPM temperature model.
# 5. Compare the new `T_cell` with the previous guess. If they are close enough, the process has converged. If not, repeat from step 3 with the new `T_cell`.
#
# This approach effectively reverses the standard power simulation chain (Irradiance -> DC Power -> AC Power) to derive the initial irradiance.

import numpy as np
import pandas as pd
import pvlib


def estimate_poa_and_temp_cell(
    p_ac: float,
    temp_air: float,
    wind_speed: float,
    pdc0: float,
    gamma_pmp: float,
    inverter_efficiency: float,
    temp_model_params: dict[str, float],
    commissioning_date: pd.Timestamp,
    current_timestamp: pd.Timestamp,
    degradation_rate: float,
    *,
    max_iter: int = 10,
    tolerance: float = 0.1,
) -> tuple[float, float, float, float]:
    """
    Iteratively estimates POA irradiance and cell temperature from AC power,
    accounting for system degradation over time.

    The PVWatts-style DC model is inverted for irradiance using the current
    cell temperature guess. The SAPM cell temperature model then yields a new
    temperature from that irradiance. This repeats until two successive
    temperatures differ by less than `tolerance` or `max_iter` is reached.

    Args:
        p_ac: AC power output in Watts.
        temp_air: Ambient air temperature in Celsius.
        wind_speed: Wind speed in m/s.
        pdc0: Nameplate DC power of the system at STC in Watts.
        gamma_pmp: Power temperature coefficient (e.g., -0.004 for -0.4%/°C).
        inverter_efficiency: Nominal inverter efficiency (e.g., 0.985).
        temp_model_params: Parameters for the SAPM cell temperature model,
            passed as keyword arguments to pvlib.temperature.sapm_cell.
        commissioning_date: The timestamp of the system's start of operation.
        current_timestamp: The timestamp of the data point being modeled.
        degradation_rate: The annual degradation rate (e.g., 0.005 for 0.5%).
        max_iter: Maximum number of fixed-point iterations. Defaults to 10.
        tolerance: Convergence threshold on the cell temperature change in °C.
            Defaults to 0.1.

    Returns:
        A tuple containing:
        - Estimated POA irradiance (W/m^2).
        - Estimated cell temperature (°C).
        - Final temperature difference upon convergence or max iterations (°C).
        - Number of iterations performed (can be np.nan).

        All four values are NaN when an input is missing or when the
        effective DC capacity or inverter efficiency is not positive.
        Non-positive `p_ac` yields ``(0.0, temp_air, 0.0, 0)``.

    Raises:
        ValueError: If `max_iter` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("`max_iter` must be at least 1.")

    nan_result = (np.nan, np.nan, np.nan, np.nan)

    # Any missing input makes the estimate meaningless. This includes the
    # degradation rate: a NaN there would otherwise propagate into the
    # irradiance estimate and be silently clamped to 0 by max().
    if any(
        pd.isna(val)
        for val in (
            p_ac,
            temp_air,
            wind_speed,
            pdc0,
            gamma_pmp,
            inverter_efficiency,
            degradation_rate,
        )
    ):
        return nan_result

    # No export means no irradiance; cell temperature equals ambient.
    if p_ac <= 0:
        return 0.0, temp_air, 0.0, 0

    # Calculate effective DC capacity for the current timestamp considering
    # degradation. Timestamps before commissioning are treated as year zero.
    time_delta = current_timestamp - commissioning_date
    years_passed = max(time_delta.total_seconds() / (365.25 * 24 * 3600), 0.0)
    pdc_effective = pdc0 * (1 - degradation_rate) ** years_passed

    # Both quantities are divisors below; non-positive values are invalid
    # configuration and would raise ZeroDivisionError or give negative results.
    if inverter_efficiency <= 0 or not pdc_effective > 0:
        return nan_result

    # Reference conditions (STC) and initial guess
    TEMP_REF = 25.0
    IRRAD_REF = 1000.0
    temp_cell_guess = temp_air + 20.0
    p_dc = p_ac / inverter_efficiency

    # Pre-bind the loop results so they are defined after the loop
    irrad_estimate = np.nan
    temp_cell_new = np.nan
    temp_diff = np.nan

    for i in range(1, max_iter + 1):
        temp_factor = 1 + gamma_pmp * (temp_cell_guess - TEMP_REF)

        # A non-positive temperature factor cannot be inverted; bail out with
        # zero irradiance and an undefined convergence difference.
        if temp_factor <= 0:
            return 0.0, temp_air, np.nan, i

        # Invert P_dc = pdc_effective * (G / G_ref) * temp_factor for G
        irrad_estimate = (p_dc / (pdc_effective * temp_factor)) * IRRAD_REF
        irrad_estimate = max(0.0, irrad_estimate)

        temp_cell_new = pvlib.temperature.sapm_cell(
            poa_global=irrad_estimate,
            temp_air=temp_air,
            wind_speed=wind_speed,
            **temp_model_params,
        )
        temp_diff = abs(temp_cell_new - temp_cell_guess)

        if temp_diff < tolerance:
            return irrad_estimate, temp_cell_new, temp_diff, i

        temp_cell_guess = temp_cell_new

    # Not converged: return the last iterate together with its residual
    return irrad_estimate, temp_cell_new, temp_diff, max_iter


print("✅ Helper function estimate_poa_and_temp_cell defined.")

***

### 5.1 Main Processing and Data Integration

In [None]:
# --- Main Processing and Data Integration ---
# Runs the POA / cell-temperature estimation for every usable daytime row of
# `df_30min` and joins the four result columns back onto it.

# --- Retrieve Park-Specific Configuration ---
try:
    park_config = PARK_CONFIGS[TARGET_PARK_NAME]
    system_config = park_config["system"]
    temp_model_config = park_config["temperature_model"]

    pdc0: float = system_config["pdc0"]
    gamma_pmp: float = system_config["gamma_pmp"]
    inverter_efficiency: float = system_config["inverter_efficiency"]
    degradation_rate: float = system_config["degradation_rate"]
    commissioning_date_str: str = system_config["commissioning_date"]
    commissioning_ts: pd.Timestamp = pd.to_datetime(commissioning_date_str, utc=True)

    print(f"⚙️ Using parameters for '{TARGET_PARK_NAME}':")
    print(f"  - Commissioning Date: {commissioning_ts.date()}")
    print(f"  - Nameplate DC Power (pdc0): {pdc0 / 1e3:,.1f} kW")
    print(f"  - Annual Degradation: {degradation_rate:.1%}")
    print(f"  - Temp. Coefficient (gamma_pmp): {gamma_pmp * 100:.3f} %/°C")
    print(f"  - Inverter Efficiency: {inverter_efficiency:.1%}")

except KeyError as e:
    print(
        f"❌ CONFIGURATION ERROR: Missing key {e} for park '{TARGET_PARK_NAME}' in config.yml."
    )
    raise

# --- Define Temperature Model from Config ---
try:
    model_type: str = temp_model_config["model_type"]
    model_name: str = temp_model_config["model_name"]
    temp_model_parameters = pvlib.temperature.TEMPERATURE_MODEL_PARAMETERS[model_type][
        model_name
    ]
    print(f"\n🌡️ Using {model_type.upper()} temperature model: '{model_name}'")
except KeyError:
    # Read the names via .get(): the KeyError may come from the config lookup
    # itself, in which case model_type/model_name were never bound and using
    # them here would mask the real problem with a NameError.
    _requested_type = temp_model_config.get("model_type")
    _requested_name = temp_model_config.get("model_name")
    print(
        f"❌ CONFIGURATION ERROR: Invalid temperature model '{_requested_type}/{_requested_name}' specified in config.yml."
    )
    print(
        "Please check available models in pvlib.temperature.TEMPERATURE_MODEL_PARAMETERS."
    )
    raise

# --- Prepare DataFrame for Re-running ---
# Drop columns from previous runs to avoid conflicts during the join operation.
cols_to_drop = [
    "poa_estimated_w_m2",
    "temp_cell_estimated_c",
    "temp_convergence_diff",
    "iterations",
]
df_30min = df_30min.drop(columns=cols_to_drop, errors='ignore')

# --- Run POA Estimation ---
# Only rows where measured power reflects the available irradiance are used:
# positive export, daytime, and neither clipped nor curtailed.
estimation_mask = (
    (df_30min["ac_export_w"] > 0) &
    (~df_30min["is_clipped"]) &
    (~df_30min["is_curtailed"]) &
    (df_30min["is_day"])
)
df_analysis = df_30min.loc[estimation_mask].copy()

print(f"\n🔬 Running POA estimation for {len(df_analysis):,} data points (daytime, non-clipped, non-curtailed, positive power).")

if df_analysis.empty:
    # Nothing to estimate. Still create the (all-NaN) result columns so the
    # cleanup below, which reads "iterations", does not fail with a KeyError.
    results = pd.Series(dtype=object)
    results_df = pd.DataFrame(
        index=df_analysis.index, columns=cols_to_drop, dtype=float
    )
else:
    # Row-wise estimation; each call returns a 4-tuple matching cols_to_drop.
    results = df_analysis.apply(
        lambda row: estimate_poa_and_temp_cell(
            p_ac=row["ac_export_w"],
            temp_air=row["temp_air_c"],
            wind_speed=row["wind_speed_m_s"],
            pdc0=pdc0,
            gamma_pmp=gamma_pmp,
            inverter_efficiency=inverter_efficiency,
            temp_model_params=temp_model_parameters,
            commissioning_date=commissioning_ts,
            current_timestamp=row.name,
            degradation_rate=degradation_rate,
        ),
        axis=1,
    )
    results_df = pd.DataFrame(
        results.tolist(),
        index=df_analysis.index,
        columns=cols_to_drop, # Use the list defined earlier
    )

# Left join: rows that were not estimated get NaN in the new columns.
df_30min = df_30min.join(results_df)

print("✅ Estimation complete.")

# --- Final Data Cleanup ---
print("\n🧹 Cleaning up and filling non-estimated periods...")

# For nighttime, irradiance is 0 and cell temperature equals ambient.
night_mask = ~df_30min["is_day"]
df_30min.loc[night_mask, "poa_estimated_w_m2"] = 0.0
df_30min.loc[night_mask, "temp_cell_estimated_c"] = df_30min.loc[night_mask, "temp_air_c"]

# For all remaining NaNs in 'iterations' (night, clipped, curtailed, etc.), fill with 0.
df_30min["iterations"] = df_30min["iterations"].fillna(0).astype(int)

print("✅ Cleanup complete. Clipped/curtailed daytime periods remain as NaN for irradiance and temperature.")

# --- Display Results ---
print("\n📊 Sample of dataframe with new estimated POA and cell temperature columns:")
# Cap the sample size so short frames do not raise in sample().
display(df_30min.sample(min(5, len(df_30min))))

***
***

## 6. Modeling GHI from Estimated POA

In [None]:
import numpy as np
import pandas as pd
import pvlib
from tqdm.notebook import tqdm

print("--- Starting GHI Modeling from Estimated POA ---")

# --- 1. Preparation ---
# Columns written by this cell: four irradiance-related values followed by
# two solver diagnostics.
IRRADIANCE_COLS = [
    "ghi_modeled_w_m2",
    "dni_modeled_w_m2",
    "dhi_modeled_w_m2",
    "kt_modeled",
]
GHI_MODEL_COLS = [
    *IRRADIANCE_COLS,
    "ghi_model_converged",
    "ghi_model_iterations",
]

# Remove any of these columns that already exist; errors="ignore" skips the
# ones that are absent.
df_30min.drop(columns=GHI_MODEL_COLS, inplace=True, errors="ignore")

# Re-create the columns empty: a nullable boolean for the convergence flag,
# NaN for the iteration count and for each irradiance column.
empty_flags = pd.Series(pd.NA, index=df_30min.index, dtype="boolean")
df_30min["ghi_model_converged"] = empty_flags
df_30min["ghi_model_iterations"] = np.nan
for col in IRRADIANCE_COLS:
    df_30min[col] = np.nan

# Site geometry and albedo for the selected park come from the loaded config.
location_params = PARK_CONFIGS[TARGET_PARK_NAME]["location"]

# --- 2. GHI Reverse Transposition with Progress Bar ---
print("Step 1: Calculating GHI for daytime points with valid POA...")

# Only daytime rows with a strictly positive POA estimate are processed.
# The > 0 comparison is False for NaN as well as for zero, so both drop out.
has_positive_poa = df_30min["poa_estimated_w_m2"] > 0
calc_mask = df_30min["is_day"] & has_positive_poa
df_to_process = df_30min.loc[calc_mask]
assert isinstance(df_to_process.index, pd.DatetimeIndex)

print(f"   - Found {len(df_to_process)} points to process.")

if not df_to_process.empty:
    # One (year, month) group per progress-bar step.
    timestamps = df_to_process.index
    monthly_groups = df_to_process.groupby([timestamps.year, timestamps.month])
    results_list = []

    pbar = tqdm(monthly_groups, desc="Modeling GHI (monthly chunks)")
    for (year, month), group in pbar:
        pbar.set_postfix_str(f"{year}-{month:02d}")

        # With full_output=True the call yields three results, used below as
        # the GHI, the convergence flag and the iteration count, in that order.
        ghi_results_chunk = pvlib.irradiance.ghi_from_poa_driesse_2023(
            surface_tilt=location_params["surface_tilt"],
            surface_azimuth=location_params["surface_azimuth"],
            solar_zenith=group["zenith"],
            solar_azimuth=group["azimuth"],
            poa_global=group["poa_estimated_w_m2"],
            dni_extra=group["dni_extra_w_m2"],
            albedo=location_params["albedo"],
            full_output=True,
        )
        ghi_chunk, converged_chunk, iterations_chunk = ghi_results_chunk

        chunk_df = pd.DataFrame(
            {
                "ghi_modeled_w_m2": ghi_chunk,
                "ghi_model_converged": converged_chunk,
                "ghi_model_iterations": iterations_chunk,
            },
            index=group.index,
        )
        results_list.append(chunk_df)

    if results_list:
        # Stitch the monthly pieces together and write them into the
        # matching rows and columns of the main frame.
        all_results_df = pd.concat(results_list)
        df_30min.update(all_results_df)

# --- 3. GHI Decomposition (ERBS Model) ---
# A row is decomposed only if it has a modeled GHI value and its
# convergence flag is True.
has_modeled_ghi = df_30min["ghi_modeled_w_m2"].notna()
solver_converged = df_30min["ghi_model_converged"].eq(True)
decomp_mask = has_modeled_ghi & solver_converged
print(f"Step 2: Decomposing GHI for {decomp_mask.sum()} valid points...")

if decomp_mask.any():
    valid_rows = df_30min.loc[decomp_mask]
    # Map the decomposition's output names onto this notebook's column names.
    output_names = {
        "dni": "dni_modeled_w_m2",
        "dhi": "dhi_modeled_w_m2",
        "kt": "kt_modeled",
    }
    decomposed = pvlib.irradiance.erbs_driesse(
        ghi=valid_rows["ghi_modeled_w_m2"],
        zenith=valid_rows["zenith"],
        datetime_or_doy=valid_rows.index,
    ).rename(columns=output_names)  # type: ignore
    df_30min.update(decomposed)

# --- 4. Data Cleaning and Finalization ---
print("Step 3: Cleaning and finalizing modeled irradiance data...")

# Blank the irradiance columns wherever the convergence flag is False.
failed_convergence_mask = df_30min["ghi_model_converged"].eq(False)
df_30min.loc[failed_convergence_mask, IRRADIANCE_COLS] = np.nan
print(
    f"   - Invalidated {failed_convergence_mask.sum()} points due to GHI model non-convergence."
)

# Blank rows whose modeled DNI exceeds the clear-sky DNI by more than 5 %.
dni_ceiling = df_30min["dni_clearsky_w_m2"] * 1.05
unrealistic_dni_mask = df_30min["dni_modeled_w_m2"] > dni_ceiling
df_30min.loc[unrealistic_dni_mask, IRRADIANCE_COLS] = np.nan
print(
    f"   - Invalidated {unrealistic_dni_mask.sum()} points exceeding the clear-sky DNI limit."
)

# Night rows: missing irradiance values and iteration counts become 0.
# The convergence flag is filled with False on every row that has none.
night_mask = ~df_30min["is_day"]
night_irradiance = df_30min.loc[night_mask, IRRADIANCE_COLS]
df_30min.loc[night_mask, IRRADIANCE_COLS] = night_irradiance.fillna(0)
df_30min["ghi_model_converged"] = df_30min["ghi_model_converged"].fillna(False)
night_iterations = df_30min.loc[night_mask, "ghi_model_iterations"]
df_30min.loc[night_mask, "ghi_model_iterations"] = night_iterations.fillna(0)

# Finally, floor every irradiance column at zero.
for col in IRRADIANCE_COLS:
    if col not in df_30min.columns:
        continue
    df_30min[col] = df_30min[col].clip(lower=0)

print("\n✅ GHI Modeling and Decomposition complete.")

# Show a random sample of five rows: the POA input, the modeled irradiance
# columns, the clear-sky references and the solver diagnostics.
display_cols = [
    "poa_estimated_w_m2",
    *IRRADIANCE_COLS,
    "ghi_clearsky_w_m2",
    "dni_clearsky_w_m2",
    "ghi_model_converged",
    "ghi_model_iterations",
]
sample_rows = df_30min[display_cols].sample(5)
display(sample_rows)