## Setup

In [1]:
from entsoe import EntsoePandasClient
import os
from dotenv import load_dotenv
from data_utils import query_and_save

In [2]:
load_dotenv()  # load env file
entsoe_key = os.getenv("ENTSOE_API_KEY")  # Get API keys from .env file

# Instantiate the ENTSOE Client
client = EntsoePandasClient(api_key=entsoe_key)

## Define Parameters

In [3]:
# Define the overall time range
start_year = 2019
end_year = 2024

# Define the countries, CORE CCR countries are: AT BE HR CZ FR DE HU LU NL PL RO SK SI   
countries = ["AT", "BE", "HR", "CZ", "FR", "DE_LU", "HU","NL", "PL", "RO", "SK", "SI", "PL", "CH"]

In [4]:
# @retry(stop_max_attempt_number=5, wait_fixed=5000)
# def query_and_save(query_func, filename_template, countries=countries, start_year=start_year, end_year=end_year, overwrite=False, **kwargs):
#     no_data_countries = []
#     error_countries = []
#     nan_summary = {}
#     successful_queries = 0
# 
#     dir_name = filename_template.split('_{}')[0]
#     base_dir = os.path.join("raw", dir_name)
#     os.makedirs(base_dir, exist_ok=True)
# 
#     total_jobs = len(countries) * (end_year - start_year + 1)
#     
#     with tqdm(total=total_jobs, desc=f"Processing {query_func.__name__}") as pbar:
#         for country in countries:
#             country_dir = os.path.join(base_dir, country.lower())
#             os.makedirs(country_dir, exist_ok=True)
# 
#             for year in range(start_year, end_year + 1):
#                 start = pd.Timestamp(f"{year}0101", tz="UTC")
#                 end = pd.Timestamp(f"{year}1231", tz="UTC") if year != 2024 else pd.Timestamp(f"{year}0731", tz="UTC")
# 
#                 filename = filename_template.format(f"{country}_{year}")
#                 filepath = os.path.join(country_dir, filename)
# 
#                 if os.path.exists(filepath) and not overwrite:
#                     existing_data = pd.read_parquet(filepath)
#                     if existing_data.index[0].normalize() == start and existing_data.index[-1].normalize() == end:
#                         pbar.update(1)
#                         successful_queries += 1
#                         continue
# 
#                 try:
#                     data = query_func(country_code=country, start=start, end=end, **kwargs)
#                     
#                     df_out = pd.DataFrame(data)
#                     
#                     if df_out.empty:
#                         no_data_countries.append(f"{country}_{year}")
#                         pbar.update(1)
#                         continue
# 
#                     nan_count = df_out.isna().sum().sum()
#                     total_count = df_out.size
#                     nan_summary[f"{country}_{year}"] = (nan_count, total_count)
#                     
#                     df_out.to_parquet(filepath, index=True)
# 
#                     successful_queries += 1
# 
#                 except NoMatchingDataError:
#                     no_data_countries.append(f"{country}_{year}")
#                 except Exception as e:
#                     tqdm.write(f"Error querying data for {country} in {year}: {e}")
#                     error_countries.append(f"{country}_{year}")
#                 
#                 pbar.update(1)
# 
#     print("\n Summary:")
#     for country_year, (nan_count, total_count) in nan_summary.items():
#         if nan_count > 0:
#             print(f"{country_year}: {nan_count} NaNs out of {total_count} datapoints")
# 
#     print(f"\nSuccessful queries: {successful_queries} / {total_jobs} country-years")
# 
#     if no_data_countries:
#         print(f"\nNo matching data for: {', '.join(no_data_countries)}")
#     
#     if error_countries:
#         print(f"\nErrors occurred for: {', '.join(error_countries)}")

## Fetch Data
To fetch data from the ENTSOE API, simply run each cell. If there are connection errors, simply run the cell again. Re-running cells will only overwrite data if it's incomplete, so simply rerunning the cells will also update the data to the current date minus 2 days (to allow for delays with uploading the ENTSOE data)

### Day-ahead Prices

In [5]:
query_and_save(
    query_func=client.query_day_ahead_prices,
    filename_template="day_ahead_prices_{}.parquet",
    countries=countries, start_year=start_year, end_year=end_year)

Processing query_day_ahead_prices:   0%|          | 0/84 [00:00<?, ?it/s]


 Summary:

Successful queries: 84 / 84 country-years


### Load Forecast

In [6]:
query_and_save(
    query_func=client.query_load_forecast,
    filename_template="load_forecast_{}.parquet",
    countries=countries, start_year=start_year, end_year=end_year)

Processing query_load_forecast:   0%|          | 0/84 [00:00<?, ?it/s]


 Summary:

Successful queries: 84 / 84 country-years


### Generation Forecast

In [7]:
query_and_save(
    query_func=client.query_generation_forecast,
    filename_template="generation_forecast_{}.parquet",
    countries=countries, start_year=start_year, end_year=end_year)

Processing query_generation_forecast:   0%|          | 0/84 [00:00<?, ?it/s]


 Summary:

Successful queries: 83 / 84 country-years

No matching data for: HR_2019


### Wind and Solar Forecast

In [8]:
query_and_save(
    query_func=client.query_wind_and_solar_forecast,
    filename_template="wind_and_solar_forecast_{}.parquet",
    countries=countries, start_year=start_year, end_year=end_year)

Processing query_wind_and_solar_forecast:   0%|          | 0/84 [00:00<?, ?it/s]


 Summary:
FR_2019: 72 NaNs out of 17430 datapoints
FR_2020: 49 NaNs out of 17286 datapoints
FR_2021: 74 NaNs out of 17382 datapoints
FR_2022: 39 NaNs out of 17456 datapoints
FR_2023: 5208 NaNs out of 26139 datapoints
DE_LU_2023: 8 NaNs out of 104835 datapoints
HU_2019: 96 NaNs out of 69890 datapoints
PL_2020: 2398 NaNs out of 17522 datapoints

Successful queries: 83 / 84 country-years

No matching data for: HR_2019


### Installed Generation Capacity

In [7]:
query_and_save(
    query_func=client.query_installed_generation_capacity,
    filename_template="installed_generation_capacity_{}.parquet",
    countries=countries, start_year=start_year, end_year=end_year)

Processing query_installed_generation_capacity:   0%|          | 0/84 [00:00<?, ?it/s]


 Summary:

Successful queries: 81 / 84 country-years

No matching data for: SK_2022, SK_2023, SK_2024
