### Day-ahead Scheduled Commercial Exchanges (SCE) – Cross Border

**Step 1: Extract SCE from ENTSO-E Transparency Platform API**  
- Bidding zones: Germany–Luxembourg (DE-LU) and France (FR)  
  - Bidding zone & code:  
    - DE-LU: `10Y1001A1001A82H`  
    - FR: `10YFR-RTE------C`  
  - Reference: [Area List with EIC Codes](https://transparencyplatform.zendesk.com/hc/en-us/articles/15885757676308-Area-List-with-Energy-Identification-Code-EIC)  
- API Guide: [ENTSO-E API Documentation](https://transparency.entsoe.eu/content/static_content/Static%20content/web%20api/Guide.html)  
- Debug checklist:  
  1. Ensure the base URL is correct.  
  2. Identify the correct namespace by printing out the XML content.  

**Step 2: Explore SCE Distribution**  
- Plot the distribution of `imported` and `exported`.  
- Observation:  
  - Large percentage of **0 values** (no scheduled exchanges).  
  - Presence of **extremely large values** (outliers).  

**Step 3: Standardization – 0 Indicator + Yeo–Johnson Transformation**  
1. **Zero indicator**:  
   - Add a binary column indicating whether the flow is `0`.  
2. **Positive values transformation**:  
   - Apply **Yeo–Johnson** transformation (fit on positive values only).  
   - This reduces skewness and compresses extreme values.


In [None]:
from __future__ import annotations

import time
import re
import requests
import pandas as pd
from typing import Dict, List, Optional, Tuple
from datetime import datetime, timedelta, timezone
import xml.etree.ElementTree as ET
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from zoneinfo import ZoneInfo


class ENTSOSCEExtractor:
    """
    Extract Scheduled Commercial Exchanges (SCE; documentType=A09) from the ENTSO-E Transparency Platform
    between two bidding zones over a given period.

    Notes
    -----
    - Time parameters (periodStart/periodEnd) must be UTC and formatted as 'YYYYMMDDHHMM'. The end is exclusive.
    - API rate limits apply; introduce delays between calls (e.g., >= 1s).
    - Returned timestamps are tz-aware (UTC). You may convert to local time via `to_tz` in `extract_all_flows()`.
    - Quantities are in MW as reported by ENTSO-E.
    - TimeSeries published with curveType 'A03' (variable sized block) omit every point whose quantity
      equals the previous one; those gaps are forward-filled up to the end of the period while parsing.
    """

    BASE_URL = "https://web-api.tp.entsoe.eu/api"
    DOC_TYPE_SCE = "A09"                   # Scheduled Commercial Exchange
    MARKET_DA = "A01"                      # Day-Ahead
    CURVE_VARIABLE_BLOCK = "A03"           # Variable sized block: repeated values are omitted
    ISO8601_RES_MIN = {"PT15M": 15, "PT30M": 30, "PT60M": 60, "PT1H": 60}
    OUTPUT_COLUMNS = ["timestamp", "imported", "exported"]

    # Generic fallback for day/hour/minute ISO 8601 durations not listed in ISO8601_RES_MIN.
    _ISO_DURATION_RE = re.compile(r"^P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?)?$")
    _UTC_FORMATS = ("%Y-%m-%dT%H:%M", "%Y-%m-%dT%H:%M:%S")

    # Common bidding zone EIC codes (extend as needed)
    BZ_EIC: Dict[str, str] = {
        "DE_LU": "10Y1001A1001A82H",
        "FR":    "10YFR-RTE------C",
        "AT":    "10YAT-APG------L",
        "NL":    "10YNL----------L",
        "BE":    "10YBE----------2",
    }

    def __init__(self, api_token: str, retry_total: int = 5, backoff: float = 0.4, timeout: int = 60):
        """
        Parameters
        ----------
        api_token : str
            ENTSO-E API securityToken.
        retry_total : int
            Total number of HTTP retries for transient errors.
        backoff : float
            Exponential backoff factor between retries.
        timeout : int
            Request timeout in seconds.
        """
        self.api_token = api_token
        self.timeout = timeout

        self.session = requests.Session()
        retry = Retry(
            total=retry_total,
            backoff_factor=backoff,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
        self.session.mount("https://", HTTPAdapter(max_retries=retry))
        self.headers = {"User-Agent": "ENTSOSCEExtractor/1.0"}

    # ---------- Public API ----------

    def extract_all_flows(
        self,
        from_bz: str,
        to_bz: str,
        start_date: str,
        end_date: str,
        market: str = MARKET_DA,
        sleep_between_calls: float = 1.0,
        to_tz: Optional[str] = None,
    ) -> pd.DataFrame:
        """
        Retrieve SCE in both directions between two bidding zones and combine into a single time series.

        Parameters
        ----------
        from_bz : str
            Outgoing bidding zone code (e.g., 'FR').
        to_bz : str
            Incoming bidding zone code (e.g., 'DE_LU').
        start_date : str
            Inclusive start date in 'YYYY-MM-DD'.
        end_date : str
            Inclusive end date in 'YYYY-MM-DD'.
        market : str
            Market agreement type (default: Day-Ahead 'A01').
        sleep_between_calls : float
            Seconds to sleep between directional API requests (rate limiting).
        to_tz : Optional[str]
            If provided (e.g., 'Europe/Berlin'), convert timestamps to this timezone.

        Returns
        -------
        pd.DataFrame
            Columns: ['timestamp', 'imported', 'exported'] (always in this order) where
            - 'imported' is flow from `from_bz` -> `to_bz` (MW),
            - 'exported' is flow from `to_bz` -> `from_bz` (MW).
            Timestamp is tz-aware (UTC by default or converted if `to_tz` is supplied).
            A direction for which nothing was retrieved is filled with NaN.

        Raises
        ------
        ValueError
            If a bidding zone alias is unknown or a date is not in 'YYYY-MM-DD' format.
        """
        start_utc = self._to_entsoe_time(start_date)
        end_utc = self._to_entsoe_time(end_date, inclusive_end=True)

        print(f"Extracting flows: {from_bz} → {to_bz}")
        imported = self._get_sce_flows(from_bz, to_bz, start_utc, end_utc, market)

        time.sleep(sleep_between_calls)

        print(f"Extracting flows: {to_bz} → {from_bz}")
        exported = self._get_sce_flows(to_bz, from_bz, start_utc, end_utc, market)

        df_in = self._flows_to_frame(imported, "imported")
        df_out = self._flows_to_frame(exported, "exported")

        if df_in.empty and df_out.empty:
            print("No data retrieved for the specified period.")
            return pd.DataFrame(columns=list(self.OUTPUT_COLUMNS))

        if df_out.empty:
            # float NaN keeps the column numeric (pd.NA would turn it into object dtype)
            combined = df_in.assign(exported=float("nan"))
        elif df_in.empty:
            combined = df_out.assign(imported=float("nan"))
        else:
            combined = pd.merge(df_in, df_out, on="timestamp", how="outer")

        # Fixed column order, sorting and tz conversion
        combined = combined[list(self.OUTPUT_COLUMNS)].sort_values("timestamp").reset_index(drop=True)
        if to_tz:
            combined["timestamp"] = combined["timestamp"].dt.tz_convert(ZoneInfo(to_tz))

        return combined

    # ---------- Internal helpers ----------

    def _get_sce_flows(
        self,
        from_bz: str,
        to_bz: str,
        period_start: str,
        period_end: str,
        market: str,
    ) -> List[Dict[str, object]]:
        """
        Low-level call to ENTSO-E API for SCE (A09) between two EIC codes.
        Returns a list of dicts with UTC timestamps and MW quantities
        (keys: 'timestamp', 'flow_mw'); an empty list on request or parsing failure.
        """
        out_eic = self._to_eic(from_bz)
        in_eic = self._to_eic(to_bz)

        params = {
            "securityToken": self.api_token,
            "documentType": self.DOC_TYPE_SCE,
            "out_Domain": out_eic,
            "in_Domain": in_eic,
            "periodStart": period_start,
            "periodEnd": period_end,
            "contract_MarketAgreement.Type": market,
        }

        try:
            resp = self.session.get(self.BASE_URL, params=params, headers=self.headers, timeout=self.timeout)
            resp.raise_for_status()
        except requests.RequestException as e:
            # The exception text can embed the request URL, which carries the securityToken.
            print(f"API request failed ({from_bz}->{to_bz}): {self._redact(str(e))}")
            return []

        try:
            return self._parse_sce_xml(resp.content)
        except ET.ParseError as e:
            print(f"XML parsing failed ({from_bz}->{to_bz}): {e}")
            return []

    @classmethod
    def _parse_sce_xml(cls, content) -> List[Dict[str, object]]:
        """
        Parse an ENTSO-E market document (bytes or str) into flow records.

        Each record is {'timestamp': tz-aware UTC datetime, 'flow_mw': float}.
        For TimeSeries with curveType 'A03', positions missing between the first reported
        point and the end of the period repeat the last reported quantity.

        Raises
        ------
        xml.etree.ElementTree.ParseError
            If `content` is not well-formed XML.
        """
        root = ET.fromstring(content)

        ns = cls._detect_namespace(root)
        if not ns:
            print("Could not detect XML namespace.")
            return []

        flows: List[Dict[str, object]] = []

        for ts in root.findall(".//ns:TimeSeries", ns):
            curve_el = ts.find("ns:curveType", ns)
            curve_type = (curve_el.text or "").strip() if curve_el is not None else ""
            fill_gaps = curve_type == cls.CURVE_VARIABLE_BLOCK

            for period in ts.findall(".//ns:Period", ns):
                start_el = period.find("ns:timeInterval/ns:start", ns)
                end_el = period.find("ns:timeInterval/ns:end", ns)
                res_el = period.find("ns:resolution", ns)
                if start_el is None or res_el is None:
                    continue

                try:
                    start_dt = cls._parse_utc(start_el.text)
                except ValueError as e:
                    print(f"Skipping period with unparseable start: {e}")
                    continue

                step = cls._resolution_to_timedelta(res_el.text)
                if step is None:
                    # Fallback to 60 minutes if resolution is missing/unexpected
                    step = timedelta(minutes=60)

                quantities: Dict[int, float] = {}
                for point in period.findall(".//ns:Point", ns):
                    pos_el = point.find("ns:position", ns)
                    qty_el = point.find("ns:quantity", ns)
                    if pos_el is None or qty_el is None:
                        continue
                    try:
                        quantities[int(pos_el.text)] = float(qty_el.text)
                    except (TypeError, ValueError):
                        continue

                if not quantities:
                    continue

                if fill_gaps:
                    last_pos = max(quantities)
                    if end_el is not None:
                        try:
                            end_dt = cls._parse_utc(end_el.text)
                        except ValueError:
                            end_dt = None
                        if end_dt is not None:
                            # Number of resolution steps the period spans
                            last_pos = max(last_pos, (end_dt - start_dt) // step)
                    positions = range(min(quantities), last_pos + 1)
                else:
                    positions = sorted(quantities)

                current = None
                for position in positions:
                    # Omitted positions (A03 only) inherit the previous quantity
                    current = quantities.get(position, current)
                    flows.append({"timestamp": start_dt + (position - 1) * step, "flow_mw": current})

        return flows

    @staticmethod
    def _flows_to_frame(flows: List[Dict[str, object]], value_column: str) -> pd.DataFrame:
        """
        Turn flow records into a DataFrame with columns ['timestamp', value_column].

        Duplicate timestamps (e.g. overlapping TimeSeries) keep the last record so the
        later outer merge cannot multiply rows. An empty input yields an empty DataFrame.
        """
        frame = pd.DataFrame(flows)
        if frame.empty:
            return frame
        frame = frame.rename(columns={"flow_mw": value_column})
        return frame.drop_duplicates(subset="timestamp", keep="last")

    def _redact(self, message: str) -> str:
        """Return `message` with the API token masked so it never reaches logs/notebook output."""
        token = self.api_token
        if not token:
            return message
        return message.replace(token, "***")

    @staticmethod
    def _to_eic(bz_or_eic: str) -> str:
        """
        Return an EIC code given a bidding zone alias; pass-through if it already looks like an EIC.

        Aliases are matched case-insensitively and '-' is treated like '_' (so 'DE-LU' == 'DE_LU').

        Raises
        ------
        ValueError
            If the alias is not present in `BZ_EIC`.
        """
        if re.match(r"^10Y[A-Z0-9\-]{10,}$", bz_or_eic):
            return bz_or_eic
        alias = bz_or_eic.strip().upper().replace("-", "_")
        try:
            return ENTSOSCEExtractor.BZ_EIC[alias]
        except KeyError as e:
            raise ValueError(f"Unknown bidding zone '{bz_or_eic}'. Extend BZ_EIC mapping.") from e

    @staticmethod
    def _to_entsoe_time(date_str: str, inclusive_end: bool = False) -> str:
        """
        Convert 'YYYY-MM-DD' to ENTSO-E 'YYYYMMDDHHMM' in UTC.
        - Start: 00:00 inclusive
        - End:   00:00 of the next day if inclusive_end=True (exclusive end per API spec)
        """
        dt = datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
        if inclusive_end:
            dt = dt + timedelta(days=1)  # end is exclusive at 00:00 next day
        return dt.strftime("%Y%m%d%H%M")

    @staticmethod
    def _detect_namespace(root: ET.Element) -> Optional[Dict[str, str]]:
        """Detect the XML namespace used by the ENTSO-E response (None if the root tag has none)."""
        if root.tag.startswith("{"):
            uri = root.tag.split("}")[0].strip("{")
            return {"ns": uri}
        return None

    @staticmethod
    def _parse_utc(text: str) -> datetime:
        """
        Parse ENTSO-E timestamps like '2023-01-01T00:00Z' to tz-aware UTC datetimes.

        Accepts values with or without the trailing 'Z' and with or without seconds.

        Raises
        ------
        ValueError
            If `text` is empty/None or matches none of the supported formats.
        """
        if not text:
            raise ValueError("Empty ENTSO-E timestamp.")
        cleaned = text.strip().rstrip("Z")
        for fmt in ENTSOSCEExtractor._UTC_FORMATS:
            try:
                return datetime.strptime(cleaned, fmt).replace(tzinfo=timezone.utc)
            except ValueError:
                continue
        raise ValueError(f"Unsupported ENTSO-E timestamp: {text!r}")

    @classmethod
    def _resolution_to_timedelta(cls, res: str) -> Optional[timedelta]:
        """
        Map ISO 8601 resolutions (PT15M, PT30M, PT60M/PT1H, and other day/hour/minute
        durations such as P1D) to timedeltas. Returns None for empty, zero-length or
        unsupported values (e.g. month/year durations).
        """
        if not res:
            return None
        res = res.strip()
        if res in cls.ISO8601_RES_MIN:
            return timedelta(minutes=cls.ISO8601_RES_MIN[res])
        match = cls._ISO_DURATION_RE.match(res)
        if match is None:
            return None
        days, hours, minutes = (int(group) if group else 0 for group in match.groups())
        delta = timedelta(days=days, hours=hours, minutes=minutes)
        # A zero step would collapse all points onto one timestamp
        return delta if delta > timedelta(0) else None


In [None]:
# === Sample Usage (SCE FR ↔ DE_LU, Jan 2023) ===
import os

import pandas as pd

# Never commit the ENTSO-E securityToken to the notebook/repository: read it from the
# environment instead (e.g. `export ENTSOE_API_TOKEN=...` before starting Jupyter).
token = os.environ.get("ENTSOE_API_TOKEN")
if not token:
    raise RuntimeError("Set the ENTSOE_API_TOKEN environment variable to your ENTSO-E securityToken.")
ex = ENTSOSCEExtractor(api_token=token)

# 'imported' = FR -> DE_LU, 'exported' = DE_LU -> FR (both in MW)
df = ex.extract_all_flows(
    from_bz="FR",
    to_bz="DE_LU",
    start_date="2023-01-01",
    end_date="2023-01-31",
    market=ENTSOSCEExtractor.MARKET_DA,
    sleep_between_calls=1.0,
    to_tz="Europe/Berlin",  # optional; omit to keep UTC
)

print(df.head())
# Optional: save
df.to_csv("sce_FR_DELU_202301.csv", index=False)


Extracting flows: FR → DE_LU
Extracting flows: DE_LU → FR
                  timestamp  imported  exported
0 2023-01-01 01:00:00+01:00       0.0    7699.4
1 2023-01-01 02:00:00+01:00       NaN    6199.5
2 2023-01-01 03:00:00+01:00       NaN    5619.0
3 2023-01-01 04:00:00+01:00       NaN    4563.6
4 2023-01-01 05:00:00+01:00       NaN    4944.3
