# Solar Irradiance From AC Export

A Jupyter Notebook that does its best to model and reconstruct a historical solar irradiance time series from a solar park's historical AC export data.

## 1. Project Setup

### 1.1 Imports

In [None]:
# --- Imports ---

# Standard Library Imports
from pathlib import Path
import os

# Third-Party Library Imports
import yaml
import pandas as pd
import plotly.io as pio
from dotenv import load_dotenv

print("✅ Libraries loaded successfully.")

***

### 1.2 Configuration

This project uses a two-step configuration process:

1.  **Path Definition (`.env`):** This file defines the project's physical location (`PROJECT_ROOT`) and the name of the configuration file. This separation ensures the notebook is portable across different machines and environments.
2.  **Parameter Definition (`config.yml`):** This file contains the physical and electrical parameters of your solar park(s), including sensitive information like GPS coordinates and detailed system specifications.

**To get started:**

1.  **Configure Paths:** Copy the template file `.env.example` to a new file named `.env`. Open `.env` and set the absolute path for the `PROJECT_ROOT` variable.
2.  **Configure Parks:** Copy the example configuration file `config.example.yml` to `config.yml`. Open `config.yml` and replace the placeholder values with the details of your solar installation.

The cell below loads the environment variables, resolves the final configuration path, and sets up the plotting environment.

In [None]:
# --- Configuration ---
#
# Resolves the project root and the configuration file path from environment
# variables (optionally supplied through a .env file), loads the park
# definitions from the YAML file into PARK_CONFIGS / PARK_NAMES, and applies
# the notebook-wide plotting and display options.
#
# Configuration problems are reported with print() rather than raised, so the
# cell always runs to completion. When loading fails, PARK_CONFIGS is left as
# an empty dict and PARK_NAMES as an empty list.

# Load environment variables from .env file
load_dotenv()

# Define paths using environment variables
PROJECT_ROOT_STR = os.getenv("PROJECT_ROOT")
CONFIG_FILENAME = os.getenv("CONFIG_FILENAME", "config.yml")  # Fallback to config.yml

if not PROJECT_ROOT_STR:
    # If PROJECT_ROOT is not set in .env, assume the current working directory
    PROJECT_ROOT_STR = os.getcwd()
    print(
        f"⚠️ WARNING: PROJECT_ROOT not set in .env. Using current directory: {PROJECT_ROOT_STR}"
    )

PROJECT_ROOT = Path(PROJECT_ROOT_STR)
CONFIG_PATH = PROJECT_ROOT / CONFIG_FILENAME

print(f"Project Root defined as: {PROJECT_ROOT}")
print(f"Configuration file path: {CONFIG_PATH}")

# Define the results up front so that later cells can always reference them,
# and so that a failed re-run does not silently keep values from an earlier
# successful run.
PARK_CONFIGS = {}
PARK_NAMES = []

try:
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        # safe_load returns None for an empty document; treat that as "no settings"
        config = yaml.safe_load(f) or {}

    if not isinstance(config, dict):
        raise ValueError(
            "The top level of the configuration file must be a mapping."
        )

    # Extract park configurations ('parks:' with no value loads as None)
    parks = config.get("parks") or {}

    if not isinstance(parks, dict):
        raise ValueError(
            "The 'parks' key must contain a mapping of park names to their settings."
        )

    if not parks:
        raise ValueError(
            "No parks defined under the 'parks' key in the configuration file."
        )

    PARK_CONFIGS = parks

    # Create a list of park names for easy iteration later
    PARK_NAMES = list(PARK_CONFIGS)

    # YAML keys are not necessarily strings (e.g. a park named 42), so convert
    # them explicitly before joining.
    print(
        f"✅ Configuration loaded successfully from '{CONFIG_PATH}' for {len(PARK_NAMES)} park(s): {', '.join(map(str, PARK_NAMES))}."
    )

except FileNotFoundError:
    print(f"❌ CONFIGURATION ERROR: The '{CONFIG_PATH}' file was not found.")
    print(
        f"Please check your .env file's PROJECT_ROOT setting, and ensure '{CONFIG_FILENAME}' exists at that location."
    )
    print(
        f"If '{CONFIG_FILENAME}' is missing, copy 'config.example.yml' to '{CONFIG_FILENAME}' and fill in your park's details."
    )
except yaml.YAMLError as e:
    # The file exists but is not well-formed YAML
    print(
        f"❌ CONFIGURATION ERROR: Could not parse '{CONFIG_PATH}'. Please check its format. Details: {e}"
    )
except ValueError as e:
    # The file parsed, but its content is not usable (wrong structure, no parks, ...)
    print(
        f"❌ CONFIGURATION ERROR: The content of '{CONFIG_PATH}' is not valid. Details: {e}"
    )


# --- Plotting and Display Configuration ---
pio.templates.default = "plotly_dark"

# Set display options for better viewing in Jupyter
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)

print("Plotting and display options set.")

***
***

## 2. Data Loading

### 2.1 Hourly Production And Spot Price Data

In [None]:
# --- Data Loading and Validation ---
#
# Reads the production / spot-price CSV, verifies that the required columns
# are present, compares the park names in the data with those in PARK_NAMES
# (set by the configuration cell), and stores the rows belonging to configured
# parks, sorted by timestamp, in df_production.
#
# Problems are reported with print() rather than raised. df_production is
# always defined afterwards; it is an empty DataFrame when loading failed or
# no row matched a configured park.

# Get the data path
PRODUCTION_AND_PRICE_FILE_PATH = os.getenv(
    "PRODUCTION_AND_PRICE_FILE_PATH",
    "/home/user/solar-irradiance-from-ac-export/production.csv",
)

# Source CSV column names must follow a strict specification
COL_TIMESTAMP = "TimestampUTC"
COL_PARK_NAME = "ParkName"
COL_EXPORT = "AC_Export_kWh"
COL_PRICE = "SpotPriceEUR_MWh"

REQUIRED_COLUMNS = [COL_TIMESTAMP, COL_PARK_NAME, COL_EXPORT, COL_PRICE]

# Start from an empty frame so df_production exists on every exit path,
# including the "file not found" one.
df_production = pd.DataFrame()

print(f"Attempting to load data from: {PRODUCTION_AND_PRICE_FILE_PATH}")

try:
    # 1. Load the CSV
    df_raw = pd.read_csv(
        PRODUCTION_AND_PRICE_FILE_PATH,
        parse_dates=[COL_TIMESTAMP],
        # Ensure the timestamp column is treated as the index upon loading
        index_col=COL_TIMESTAMP,
    )

    # 2. Basic Column Check (the timestamp column has become the index)
    missing = [col for col in REQUIRED_COLUMNS[1:] if col not in df_raw.columns]
    if missing:
        raise ValueError(f"Missing required columns in CSV: {missing}")

    # 3. Data Cleaning and Preparation

    # Ensure the index is a proper DatetimeIndex and set to UTC
    df_raw.index = pd.to_datetime(df_raw.index, utc=True)

    # Filter out any rows where the park name is missing or NaN
    df_raw = df_raw.dropna(subset=[COL_PARK_NAME])

    # Convert ParkName column to string type for reliable comparison
    df_raw[COL_PARK_NAME] = df_raw[COL_PARK_NAME].astype(str)

    # 4. Park Name Validation

    # Identify unique parks in the loaded data
    data_parks = set(df_raw[COL_PARK_NAME].unique())

    # Identify parks defined in the YAML configuration
    config_parks = set(PARK_NAMES)

    # Check for parks present in data but missing in config
    missing_config_parks = data_parks - config_parks

    if missing_config_parks:
        print(
            "⚠️ WARNING: The following parks found in the data are missing from 'config.yml':"
        )
        for park in missing_config_parks:
            print(f"  - {park}")
        print(
            "Please update 'config.yml' with parameters for these parks or ensure names match."
        )

    # Check for parks present in config but missing in data (less critical, but good to know)
    missing_data_parks = config_parks - data_parks
    if missing_data_parks:
        print(
            "ℹ️ INFO: The following parks defined in 'config.yml' were not found in the data:"
        )
        for park in missing_data_parks:
            print(f"  - {park}")

    # 5. Final Filtering and Assignment

    # Filter the DataFrame to only include parks that are defined in the YAML file
    df_production = df_raw[df_raw[COL_PARK_NAME].isin(config_parks)].copy()

    if df_production.empty:
        print(
            "❌ ERROR: The resulting production DataFrame is empty. Check park names and data file path."
        )
    else:
        # Sort chronologically. NOTE: the sampling interval itself is not
        # verified here; the data is only assumed to be hourly.
        df_production = df_production.sort_index()

        print("✅ Data loaded and validated successfully.")
        print(f"   Shape of final DataFrame: {df_production.shape}")
        print(
            f"   Time range: {df_production.index.min()} to {df_production.index.max()}"
        )
        print("Sample:")
        # DataFrame.sample raises ValueError when n exceeds the row count; cap
        # it so a small but valid dataset is not discarded by the broad
        # handler below.
        print(df_production.sample(n=min(5, len(df_production))))

except FileNotFoundError:
    print(
        f"❌ DATA ERROR: The data file was not found at the specified path: {PRODUCTION_AND_PRICE_FILE_PATH}"
    )
    print(
        "Please check the 'PRODUCTION_AND_PRICE_FILE_PATH' variable in your '.env' file."
    )
except Exception as e:
    print(f"❌ AN UNEXPECTED ERROR OCCURRED during data loading: {e}")
    df_production = pd.DataFrame()  # Discard any partially processed result