# Solar Irradiance From AC Export

A Jupyter Notebook that does its best to model and construct a historical solar irradiance time series from a solar panel park's historical AC export data.

## 1. Project Setup

### 1.1 Imports

In [None]:
# --- Imports ---

# Standard Library Imports
from pathlib import Path
import os

# Third-Party Library Imports
import yaml
import pandas as pd
import plotly.io as pio
from dotenv import load_dotenv

print("✅ Libraries loaded successfully.")

***

### 1.2 Configuration

This project uses a two-step configuration process:

1.  **Path Definition (`.env`):** This file defines the project's physical location (`PROJECT_ROOT`) and the name of the configuration file. This separation ensures the notebook is portable across different machines and environments.
2.  **Parameter Definition (`config.yml`):** This file contains the physical and electrical parameters of your solar park(s), including sensitive information like GPS coordinates and detailed system specifications.

**To get started:**

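1.  **Configure Paths:** Copy the template file `.env.example` to a new file named `.env`. Open `.env` and set the absolute path for the `PROJECT_ROOT` variable. Also set `TARGET_PARK_NAME` to the park you want to analyze (it is required and must match a key under `parks` in `config.yml`), and, if your data files live elsewhere, `PRODUCTION_AND_PRICE_FILE_PATH` and `WEATHER_FILE_PATH`.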
2.  **Configure Parks:** Copy the example configuration file `config.example.yml` to `config.yml`. Open `config.yml` and replace the placeholder values with the details of your solar installation.

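The cell below loads the environment variables, resolves the final configuration path, loads and validates the park configuration (including the target park selected via `TARGET_PARK_NAME`), and sets up the plotting environment.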

In [None]:
# --- Configuration ---

# Pull the settings from the .env file into the process environment
load_dotenv()

# Input data locations: the environment value wins, otherwise the built-in default
WEATHER_FILE_PATH = os.environ.get(
    "WEATHER_FILE_PATH", "/home/user/solar-irradiance-from-ac-export/weather.csv"
)
PRODUCTION_AND_PRICE_FILE_PATH = os.environ.get(
    "PRODUCTION_AND_PRICE_FILE_PATH",
    "/home/user/solar-irradiance-from-ac-export/production.csv",
)

# Name of the YAML configuration file (defaults to config.yml)
CONFIG_FILENAME = os.environ.get("CONFIG_FILENAME", "config.yml")

# Project root: taken from the environment, or the current working directory
# (with a warning) when PROJECT_ROOT is missing or empty
PROJECT_ROOT_STR = os.environ.get("PROJECT_ROOT")
if PROJECT_ROOT_STR is None or PROJECT_ROOT_STR == "":
    PROJECT_ROOT_STR = os.getcwd()
    print(
        f"⚠️ WARNING: PROJECT_ROOT not set in .env. Using current directory: {PROJECT_ROOT_STR}"
    )

# The configuration file is resolved relative to the project root
PROJECT_ROOT = Path(PROJECT_ROOT_STR)
CONFIG_PATH = PROJECT_ROOT.joinpath(CONFIG_FILENAME)

print(f"Project Root defined as: {PROJECT_ROOT}")
print(f"Configuration file path: {CONFIG_PATH}")

# Load the YAML configuration and validate the park selection.
# Any problem is reported as a printed message instead of a traceback, so the
# notebook cell itself never raises for a bad or missing configuration.
try:
    with CONFIG_PATH.open("r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty file and a list/scalar for documents
    # whose top level is not a mapping; neither supports the .get() used below.
    if not isinstance(config, dict):
        raise ValueError(
            f"'{CONFIG_PATH}' is empty or its top level is not a mapping of settings."
        )

    # Extract park configurations ('parks:' with no value loads as None)
    PARK_CONFIGS = config.get("parks") or {}

    if not isinstance(PARK_CONFIGS, dict):
        raise ValueError(
            "The 'parks' key in the configuration file must map park names to their settings."
        )

    if not PARK_CONFIGS:
        raise ValueError(
            "No parks defined under the 'parks' key in the configuration file."
        )

    # Create a list of park names for easy iteration later
    PARK_NAMES = list(PARK_CONFIGS)

    # --- Load and Validate Target Park for Analysis ---
    TARGET_PARK_NAME = os.getenv("TARGET_PARK_NAME")

    if not TARGET_PARK_NAME:
        raise ValueError(
            "TARGET_PARK_NAME is not set in the .env file. Please specify which park to analyze."
        )

    if TARGET_PARK_NAME not in PARK_NAMES:
        raise ValueError(
            f"The target park '{TARGET_PARK_NAME}' defined in .env is not found in '{CONFIG_PATH.name}'.\n"
            f"Available parks in config: {PARK_NAMES}"
        )

    print(f"🎯 Analysis will be performed for target park: '{TARGET_PARK_NAME}'")

    # map(str, ...) keeps the summary printable even if YAML produced
    # non-string keys (e.g. a park named with a bare number).
    print(
        f"✅ Configuration loaded successfully from '{CONFIG_PATH}' for {len(PARK_NAMES)} park(s): {', '.join(map(str, PARK_NAMES))}."
    )

except FileNotFoundError:
    print(f"❌ CONFIGURATION ERROR: The '{CONFIG_PATH}' file was not found.")
    print(
        f"Please check your .env file's PROJECT_ROOT setting, and ensure '{CONFIG_PATH.name}' exists at that location."
    )
    print(
        f"If '{CONFIG_PATH.name}' is missing, copy 'config.example.yml' to '{CONFIG_PATH.name}' and fill in your park's details."
    )
except yaml.YAMLError as e:
    # Genuine syntax problems in the YAML file
    print(
        f"❌ CONFIGURATION ERROR: Could not parse '{CONFIG_PATH}'. Please check its format. Details: {e}"
    )
except ValueError as e:
    # Validation failures raised above (file content or .env settings); the
    # message already says what is wrong, so it is not labelled a parse error.
    print(f"❌ CONFIGURATION ERROR: {e}")


# --- Plotting and Display Configuration ---
# Every Plotly figure created after this point uses the dark template
pio.templates.default = "plotly_dark"

# Pandas display options for Jupyter: no cap on the number of columns shown
# and a 1000-character display width
pd.options.display.max_columns = None
pd.options.display.width = 1000

print("Plotting and display options set.")

***
***

## 2. Data Loading

### Helper Functions

In [None]:
# --- Data Loading Helper Function ---


def load_park_specific_data(
    file_path: str,
    timestamp_col: str,
    park_name_col: str,
    required_data_cols: list[str],
    target_park_name: str,
    data_name: str,
) -> pd.DataFrame:
    """
    Loads, validates, and filters data for a single specified park from a long-format CSV.

    The CSV is read with ``timestamp_col`` as the index, the index is converted
    to UTC timestamps, rows without a park name are dropped, and only the rows
    whose park name equals ``target_park_name`` are kept. Progress and error
    messages are printed; errors never propagate to the caller.

    Args:
        file_path (str): Path to the CSV file.
        timestamp_col (str): Name of the timestamp column (becomes the index).
        park_name_col (str): Name of the park identifier column.
        required_data_cols (list): List of required data column names.
        target_park_name (str): The specific park to extract data for.
        data_name (str): A descriptive name for the data (e.g., "Production"),
            used only in printed messages.

    Returns:
        pandas.DataFrame: A DataFrame containing only the data for the target park,
                          sorted by timestamp, with the park_name column removed.
                          Returns an empty DataFrame on failure (missing file,
                          missing columns, unknown park, or any other error).
    """
    print(f"--- Loading {data_name} Data for '{target_park_name}' ---")
    print(f"Attempting to load from: {file_path}")

    try:
        # 1. Load the full CSV
        df = pd.read_csv(
            file_path, parse_dates=[timestamp_col], index_col=timestamp_col
        )

        # 2. Basic Column Check
        all_required_cols = [*required_data_cols, park_name_col]
        missing = [col for col in all_required_cols if col not in df.columns]
        if missing:
            raise ValueError(f"Missing required columns in {data_name} CSV: {missing}")

        # 3. Data Cleaning and Validation
        df.index = pd.to_datetime(df.index, utc=True)
        df = df.dropna(subset=[park_name_col])
        # Compare park names as strings so numeric-looking names still match
        df[park_name_col] = df[park_name_col].astype(str)

        # 4. Check if Target Park Exists in Data
        park_mask = df[park_name_col] == target_park_name
        if not park_mask.any():
            raise ValueError(
                f"Target park '{target_park_name}' not found in the {data_name} file."
            )

        # 5. Filter for Target Park, drop the now-redundant park name column,
        #    and order chronologically
        df_park = df.loc[park_mask].drop(columns=[park_name_col]).sort_index()

        print(f"✅ {data_name} data for '{target_park_name}' loaded successfully.")
        print(f"   Shape of final DataFrame: {df_park.shape}")
        print(f"   Time range: {df_park.index.min()} to {df_park.index.max()}")
        print("Sample:")
        # Cap the sample size: asking for more rows than exist raises
        # ValueError, which would discard an otherwise valid small dataset.
        print(df_park.sample(n=min(5, len(df_park))))
        return df_park

    except FileNotFoundError:
        print(f"❌ DATA ERROR: The {data_name} file was not found at: {file_path}")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate catch-all: the notebook reports the problem and carries on
        # with an empty frame instead of aborting the cell.
        print(f"❌ AN UNEXPECTED ERROR OCCURRED during {data_name} data loading: {e}")
        return pd.DataFrame()

print("✅ Helper function load_park_specific_data defined.")

***

### 2.1 Hourly Production And Spot Price Data

In [None]:
# --- Load Production and Price Data ---

# Column names expected in the production CSV: the timestamp and park
# identifier columns, plus the data columns that must be present
COL_TIMESTAMP, COL_PARK_NAME = "timestamp_utc", "park_name"
PRODUCTION_DATA_COLS = ["ac_export_kwh", "spot_price_eur_mwh"]

# Read the production and price rows for the target park via the helper
df_production = load_park_specific_data(
    PRODUCTION_AND_PRICE_FILE_PATH,
    COL_TIMESTAMP,
    COL_PARK_NAME,
    PRODUCTION_DATA_COLS,
    target_park_name=TARGET_PARK_NAME,  # type: ignore
    data_name="Production & Price",
)

***

### 2.2 Load Hourly Weather Data

In [None]:
# --- Load and Crop Weather Data ---

# Data columns that must be present in the weather CSV
WEATHER_DATA_COLS = ["temp_air_c", "wind_speed_m_s", "pressure_hpa", "ghi_w_m2"]

# Read the weather rows for the target park via the helper
df_weather = load_park_specific_data(
    WEATHER_FILE_PATH,
    COL_TIMESTAMP,
    COL_PARK_NAME,
    WEATHER_DATA_COLS,
    target_park_name=TARGET_PARK_NAME,  # type: ignore
    data_name="Weather",
)

# Post-processing: restrict the weather data to the span covered by the
# production data. Skipped when either frame failed to load (is empty).
if not (df_production.empty or df_weather.empty):
    original_rows = len(df_weather)

    # Label-based slice on the timestamp index between the first and last
    # production timestamps
    start_time, end_time = df_production.index.min(), df_production.index.max()
    df_weather = df_weather.loc[start_time:end_time].copy()

    print("\nWeather data cropped to production time range.")
    print(f"   Original rows: {original_rows}, Cropped rows: {len(df_weather)}")
    print(f"   New time range: {df_weather.index.min()} to {df_weather.index.max()}")