Download data van ENTSO-E

In [4]:
import pandas as pd
from entsoe import EntsoePandasClient
import time

# Insert your API key here once you get it
API_KEY = '82aa28d4-59f3-4e3a-b144-6659aa9415b5'

# Initialize ENTSO-E client
client = EntsoePandasClient(api_key=API_KEY)

# Define parameters
country_code = 'NL'
neighboring_countries = ['BE', 'DE', 'GB', 'DK', 'NO']  # Pas dit aan op basis van de relevante buren
years = [2022, 2023, 2024,2025]  # List of years to fetch

# Data storage
all_data = []

# Function to fetch data with retries
def fetch_with_retries(fetch_func, *args, retries=3, delay=5, **kwargs):
    for attempt in range(retries):
        try:
            return fetch_func(*args, **kwargs)
        except Exception as e:
            print(f"Error: {e}, retrying in {delay} seconds...")
            time.sleep(delay)
    raise Exception(f"Failed to fetch data after {retries} attempts")

# Loop through each year and fetch data separately
for year in years:
    start = pd.Timestamp(f'{year}-01-01', tz='Europe/Amsterdam')
    end = pd.Timestamp(f'{year+1}-01-01', tz='Europe/Amsterdam')  # Exclusive end

    print(f"Fetching load data for {year}...")
    yearly_load = fetch_with_retries(client.query_load, country_code, start=start, end=end).squeeze()  # Convert to 1D Series

    print(f"Fetching load forecast for {year}...")
    yearly_load_forecast = fetch_with_retries(client.query_load_forecast, country_code, start=start, end=end).squeeze()  # Convert to 1D Series

    print(f"Fetching price data for {year}...")
    yearly_price = fetch_with_retries(client.query_day_ahead_prices, country_code, start=start, end=end).squeeze()  # Convert to 1D Series

    # Fetch cross-border flows
    flow_data = {}
    for neighbor in neighboring_countries:
        print(f"Fetching cross-border flow from {neighbor} to {country_code} for {year}...")
        yearly_flow_to = fetch_with_retries(client.query_crossborder_flows, country_code_from=neighbor, 
                                            country_code_to=country_code, start=start, end=end).squeeze()  # Convert to 1D Series
        flow_data[f'Flow_{neighbor}_to_{country_code}'] = yearly_flow_to

        print(f"Fetching cross-border flow from {country_code} to {neighbor} for {year}...")
        yearly_flow_from = fetch_with_retries(client.query_crossborder_flows, country_code_from=country_code, 
                                              country_code_to=neighbor, start=start, end=end).squeeze()  # Convert to 1D Series
        flow_data[f'Flow_{country_code}_to_{neighbor}'] = yearly_flow_from

    # Merge all data
    if not yearly_load.empty and not yearly_price.empty:
        df = pd.DataFrame({'Load': yearly_load, 'Price': yearly_price})
        for col_name, flow_series in flow_data.items():
            if not flow_series.empty:
                df[col_name] = flow_series

        # Store yearly data
        all_data.append(df)
    else:
        print(f"No data for year {year}")

# Concatenate all years into one DataFrame if there is data
if all_data:
    final_data = pd.concat(all_data)
    # Save to CSV
    final_data.to_csv('electricity_data_nl_2022_2025.csv')
    print("Data saved successfully!")
else:
    print("No data to save.")

Fetching load data for 2022...
Fetching load forecast for 2022...
Fetching price data for 2022...
Fetching cross-border flow from BE to NL for 2022...
Fetching cross-border flow from NL to BE for 2022...
Fetching cross-border flow from DE to NL for 2022...
Fetching cross-border flow from NL to DE for 2022...
Fetching cross-border flow from GB to NL for 2022...
Fetching cross-border flow from NL to GB for 2022...
Fetching cross-border flow from DK to NL for 2022...
Fetching cross-border flow from NL to DK for 2022...
Fetching cross-border flow from NO to NL for 2022...
Fetching cross-border flow from NL to NO for 2022...
Fetching load data for 2023...
Fetching load forecast for 2023...
Fetching price data for 2023...
Fetching cross-border flow from BE to NL for 2023...
Fetching cross-border flow from NL to BE for 2023...
Fetching cross-border flow from DE to NL for 2023...
Fetching cross-border flow from NL to DE for 2023...
Fetching cross-border flow from GB to NL for 2023...
Fetching 

Connection Error, retrying in 0 seconds


Error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), retrying in 5 seconds...
Fetching cross-border flow from GB to NL for 2025...
Fetching cross-border flow from NL to GB for 2025...
Fetching cross-border flow from DK to NL for 2025...
Fetching cross-border flow from NL to DK for 2025...
Fetching cross-border flow from NO to NL for 2025...
Fetching cross-border flow from NL to NO for 2025...
Data saved successfully!


Read excel file + data per uur

In [6]:
import pandas as pd
from entsoe import EntsoePandasClient
import time


    # Load the existing CSV file
df = pd.read_csv('electricity_data_nl_2022_2025.csv', index_col=0, parse_dates=True)
print("CSV file loaded successfully!")

# Ensure the index is datetime with UTC
df.index = pd.to_datetime(df.index, utc=True)
print("Index converted to datetime with UTC!")

# Save the utc data to a new CSV file
df.to_csv('electricity_data_nl_2022_2025_utc.csv')

# Shift timestamps by 1 hour to calculate the mean for the past hour
df.index = df.index - pd.Timedelta(hours=1)

# Resample the data to hourly frequency and calculate the mean for each hour
df_hourly = df.resample('h').mean()
print("Data resampled to hourly frequency based on the past hour!")

# Save the resampled data to a new CSV file
df_hourly.to_csv('electricity_data_nl_2022_2025_hourly.csv')
print("Hourly data with flow saved successfully!")



CSV file loaded successfully!
Index converted to datetime with UTC!
Data resampled to hourly frequency based on the past hour!
Hourly data with flow saved successfully!


In [None]:
#Nieuwe variabelen flow toevoegen (per land en totaal)

In [9]:
neighboring_countries = ['BE', 'DE', 'GB', 'DK', 'NO']

for neighbor in neighboring_countries:
    df_hourly[f'Flow_{neighbor}'] = df_hourly[f'Flow_{neighbor}_to_NL'] - df_hourly[f'Flow_NL_to_{neighbor}']

df_hourly['Total_Flow'] = df_hourly['Flow_BE'] + df_hourly['Flow_DE'] + df_hourly['Flow_GB'] + df_hourly['Flow_DK'] + df_hourly['Flow_NO']
df_hourly.to_csv('electricity_data_nl_2022_2025_hourly_flow.csv')


In [10]:
# Generate a concise summary of the dataset
def summary(df):
    print("Dataset Summary:")
    print(df.describe(include='all'))  # Summary statistics
    print("\nMissing Values (NA) per Column:")
    print(df.isna().sum())  # Count of missing values

df_hourly = df_hourly.loc['2022-04-01':'2025-04-01']
summary(df_hourly.loc['2022-04-01':'2025-04-01'])

Dataset Summary:
               Load         Price  Flow_BE_to_NL  Flow_NL_to_BE  \
count  26328.000000  26325.000000   26328.000000   26328.000000   
mean   12450.933648    130.320129     244.056214     785.054543   
std     2182.142153    109.551000     489.528175     830.361191   
min     1213.500000   -500.000000       0.000000       0.000000   
25%    10891.937500     74.590000       0.000000       0.000000   
50%    12285.750000    102.430000       0.000000     552.000000   
75%    13755.000000    149.630000     242.000000    1360.000000   
max    19483.250000    872.960000    3709.000000    4181.000000   

       Flow_DE_to_NL  Flow_NL_to_DE  Flow_GB_to_NL  Flow_NL_to_GB  \
count   26328.000000   26328.000000   26326.000000   26326.000000   
mean      778.989868    1027.419848     209.940715     396.117523   
std       769.073452    1055.603640     349.821157     437.405463   
min         0.000000       0.000000       0.000000       0.000000   
25%        34.750000      23.00000

In [22]:
# Find timestamps with NA values for each column
def list_na_timestamps(df):
    na_timestamps = {}
    for column in df.columns:
        na_timestamps[column] = df[df[column].isna()].index.tolist()
    return na_timestamps

# Generate the list of NA timestamps
na_timestamps = list_na_timestamps(df_hourly)

# Print the NA timestamps for each variable
for column, timestamps in na_timestamps.items():
    print(f"Variable: {column}")
    print(f"NA Timestamps: {timestamps[:50]}")  # Print first 10 timestamps for brevity
    print(f"Total NA Count: {len(timestamps)}\n")

Variable: Load
NA Timestamps: [Timestamp('2025-04-14 07:00:00+0000', tz='UTC'), Timestamp('2025-04-14 08:00:00+0000', tz='UTC'), Timestamp('2025-04-14 09:00:00+0000', tz='UTC'), Timestamp('2025-04-14 10:00:00+0000', tz='UTC'), Timestamp('2025-04-14 11:00:00+0000', tz='UTC'), Timestamp('2025-04-14 12:00:00+0000', tz='UTC'), Timestamp('2025-04-14 13:00:00+0000', tz='UTC'), Timestamp('2025-04-14 14:00:00+0000', tz='UTC'), Timestamp('2025-04-14 15:00:00+0000', tz='UTC'), Timestamp('2025-04-14 16:00:00+0000', tz='UTC'), Timestamp('2025-04-14 17:00:00+0000', tz='UTC'), Timestamp('2025-04-14 18:00:00+0000', tz='UTC'), Timestamp('2025-04-14 19:00:00+0000', tz='UTC'), Timestamp('2025-04-14 20:00:00+0000', tz='UTC')]
Total NA Count: 14

Variable: Price
NA Timestamps: [Timestamp('2022-12-30 22:00:00+0000', tz='UTC'), Timestamp('2023-12-30 22:00:00+0000', tz='UTC'), Timestamp('2024-12-30 22:00:00+0000', tz='UTC')]
Total NA Count: 3

Variable: Flow_BE_to_NL
NA Timestamps: [Timestamp('2025-04-14 07: