In [2]:
from entsoe import EntsoePandasClient
from entsoe.exceptions import NoMatchingDataError
from tqdm.notebook import tqdm
import pandas as pd
import os
from dotenv import load_dotenv
from retrying import retry

In [3]:
load_dotenv()  # load env file
entsoe_key = os.getenv("ENTSOE_API_KEY")  # Get API keys from .env file

# Instantiate the ENTSOE Client
client = EntsoePandasClient(api_key=entsoe_key)

In [4]:
# Define the overall time range
start_year = 2019
end_year = 2024

# Define the countries AT BE HR CZ FR DE HU LU NL PL RO SK SI   
# HR causing problems in 2019
# SK problems in July of 2024, installed gen cap also early 2024
countries = ["AT", "BE", "HR", "CZ", "FR", "DE_LU", "HU","NL", "PL", "RO", "SK", "SI", "PL", "CH"]

In [5]:
from tqdm.auto import tqdm

@retry(stop_max_attempt_number=5, wait_fixed=5000)
def query_and_save(query_func, filename_template, countries=countries, start_year=start_year, end_year=end_year, **kwargs):
    no_data_countries = []
    error_countries = []
    nan_summary = {}
    successful_queries = 0

    dir_name = filename_template.split('_{}')[0]
    base_dir = os.path.join("data/raw", dir_name)
    os.makedirs(base_dir, exist_ok=True)

    total_jobs = len(countries) * (end_year - start_year + 1)
    
    with tqdm(total=total_jobs, desc=f"Processing {query_func.__name__}") as pbar:
        for country in countries:
            country_dir = os.path.join(base_dir, country.lower())
            os.makedirs(country_dir, exist_ok=True)

            for year in range(start_year, end_year + 1):
                year_dir = os.path.join(country_dir, str(year))
                os.makedirs(year_dir, exist_ok=True)

                start = pd.Timestamp(f"{year}0101", tz="UTC")
                end = pd.Timestamp(f"{year}1231", tz="UTC")
                if year == end_year:
                    end = pd.Timestamp(f"{year}0731", tz="UTC")

                try:
                    data = query_func(country_code=country, start=start, end=end, **kwargs)
                    
                    df_out = pd.DataFrame(data)
                    
                    if df_out.empty:
                        no_data_countries.append(f"{country}_{year}")
                        pbar.update(1)
                        continue

                    nan_count = df_out.isna().sum().sum()
                    total_count = df_out.size
                    nan_summary[f"{country}_{year}"] = (nan_count, total_count)
                    
                    filename = filename_template.format(f"{country}_{year}")
                    filepath = os.path.join(year_dir, filename)
                    df_out.to_parquet(filepath, index=True)

                    successful_queries += 1

                except NoMatchingDataError:
                    no_data_countries.append(f"{country}_{year}")
                except Exception as e:
                    tqdm.write(f"Error querying data for {country} in {year}: {e}")
                    error_countries.append(f"{country}_{year}")
                
                pbar.update(1)

    print("\n Summary:")
    for country_year, (nan_count, total_count) in nan_summary.items():
        if nan_count > 0:
            print(f"{country_year}: {nan_count} NaNs out of {total_count} datapoints")

    print(f"\nSuccessful queries: {successful_queries} / {total_jobs} country-years")

    if no_data_countries:
        print(f"\nNo matching data for: {', '.join(no_data_countries)}")
    
    if error_countries:
        print(f"\nErrors occurred for: {', '.join(error_countries)}")

## Data availability test

### Day-ahead prices

In [6]:
query_and_save(
    query_func=client.query_day_ahead_prices,
    filename_template="day_ahead_prices_{}.parquet",
)

Processing query_day_ahead_prices:   0%|          | 0/84 [00:00<?, ?it/s]

Connection Error, retrying in 0 seconds


Error querying data for SI in 2023: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Connection Error, retrying in 0 seconds


Error querying data for CH in 2021: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Connection Error, retrying in 0 seconds


Error querying data for CH in 2023: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

 Summary:

Successful queries: 81 / 84 country-years

Errors occurred for: SI_2023, CH_2021, CH_2023


In [21]:
# Retry failed queries
ch21 = pd.DataFrame(client.query_day_ahead_prices("CH", start=pd.Timestamp("2021-01-01", tz="UTC"), end=pd.Timestamp("2021-12-31", tz="UTC")))

ch21.to_parquet("data/raw/day_ahead_prices/ch/2021/day_ahead_prices_CH_2021.parquet", index=True)

ch23 = pd.DataFrame(client.query_day_ahead_prices("CH", start=pd.Timestamp("2023-01-01", tz="UTC"), end=pd.Timestamp("2023-12-31", tz="UTC")))

ch23.to_parquet("data/raw/day_ahead_prices/ch/2023/day_ahead_prices_CH_2023.parquet", index=True)

si23 = pd.DataFrame(client.query_day_ahead_prices("SI", start=pd.Timestamp("2023-01-01", tz="UTC"), end=pd.Timestamp("2023-12-31", tz="UTC")))

si23.to_parquet("data/raw/day_ahead_prices/si/2023/day_ahead_prices_SI_2023.parquet", index=True)

### Load forecast

In [7]:
query_and_save(
    query_func=client.query_load_forecast,
    filename_template="load_forecast_{}.parquet",
)

Processing query_load_forecast:   0%|          | 0/84 [00:00<?, ?it/s]

Connection Error, retrying in 0 seconds


Error querying data for HU in 2021: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Connection Error, retrying in 0 seconds


Error querying data for CH in 2024: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

 Summary:

Successful queries: 82 / 84 country-years

Errors occurred for: HU_2021, CH_2024


In [17]:
# Retry failed queries
hu21 = client.query_load_forecast("HU", start=pd.Timestamp("2021-01-01", tz="UTC"), end=pd.Timestamp("2021-12-31", tz="UTC"))

hu21.to_parquet("data/raw/load_forecast/hu/2021/load_forecast_HU_2021.parquet", index=True)

ch24 = client.query_load_forecast("CH", start=pd.Timestamp("2024-01-01", tz="UTC"), end=pd.Timestamp("2024-07-31", tz="UTC"))

ch24.to_parquet("data/raw/load_forecast/ch/2024/load_forecast_HU_2024.parquet", index=True)

### Generation forecast

In [8]:
query_and_save(
    query_func=client.query_generation_forecast,
    filename_template="generation_forecast_{}.parquet",
)

Processing query_generation_forecast:   0%|          | 0/84 [00:00<?, ?it/s]

Connection Error, retrying in 0 seconds


Error querying data for SI in 2020: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

 Summary:

Successful queries: 82 / 84 country-years

No matching data for: HR_2019

Errors occurred for: SI_2020


In [23]:
si20 = pd.DataFrame(client.query_generation_forecast("SI", start=pd.Timestamp("2020-01-01", tz="UTC"), end=pd.Timestamp("2020-12-31", tz="UTC")))

si20.to_parquet("data/raw/generation_forecast/si/2020/generation_forecast_SI_2020.parquet", index=True)

### Wind and solar forecast

In [9]:
query_and_save(
    query_func=client.query_wind_and_solar_forecast,
    filename_template="wind_and_solar_forecast_{}.parquet",
)

Processing query_wind_and_solar_forecast:   0%|          | 0/84 [00:00<?, ?it/s]

Connection Error, retrying in 0 seconds


Error querying data for AT in 2021: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Connection Error, retrying in 0 seconds


Error querying data for HU in 2023: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

 Summary:
FR_2019: 72 NaNs out of 17430 datapoints
FR_2020: 49 NaNs out of 17286 datapoints
FR_2021: 74 NaNs out of 17382 datapoints
FR_2022: 39 NaNs out of 17456 datapoints
FR_2023: 5208 NaNs out of 26139 datapoints
DE_LU_2023: 8 NaNs out of 104835 datapoints
HU_2019: 96 NaNs out of 69890 datapoints
PL_2020: 2398 NaNs out of 17522 datapoints

Successful queries: 81 / 84 country-years

No matching data for: HR_2019

Errors occurred for: AT_2021, HU_2023


In [24]:
at21 = pd.DataFrame(client.query_wind_and_solar_forecast("AT", start=pd.Timestamp("2021-01-01", tz="UTC"), end=pd.Timestamp("2021-12-31", tz="UTC")))

at21.to_parquet("data/raw/wind_and_solar_forecast/at/2021/wind_and_solar_forecast_AT_2021.parquet", index=True)

hu23 = pd.DataFrame(client.query_wind_and_solar_forecast("HU", start=pd.Timestamp("2023-01-01", tz="UTC"), end=pd.Timestamp("2023-12-31", tz="UTC")))

hu23.to_parquet("data/raw/wind_and_solar_forecast/hu/2023/wind_and_solar_forecast_HU_2023.parquet", index=True)

### Installed Generation Capacity

In [10]:
query_and_save(
    query_func=client.query_installed_generation_capacity,
    filename_template="installed_generation_capacity_{}.parquet",
)

Processing query_installed_generation_capacity:   0%|          | 0/84 [00:00<?, ?it/s]


 Summary:

Successful queries: 81 / 84 country-years

No matching data for: SK_2022, SK_2023, SK_2024
