# Get EIA Demand Data

Functions to query the OpenData API of the EIA (U.S. Energy Information Administration) for hourly electricity demand data. This notebook generates one CSV file for each 1) EIA balancing authority, 2) EIA region, and 3) the contiguous US, containing the available EIA hourly demand data.


Author:
T. Ruggles
14 June 2019

Updated by:
A. Wongel
January 2025


# EIA API Resources

EIA provides some commands here: https://www.eia.gov/opendata/documentation.php


# EIA Electricity Demand

Web interface for EIA electricity demand data: https://www.eia.gov/opendata/browser/electricity/rto/region-data

A real-time display of the U.S. interconnect is available here: https://www.eia.gov/realtime_grid/


# Details

In cases where the result of the EIA API query skips
an hour, the associated row will have a demand value of `MISSING`.
In cases where the EIA API query returns a null value (`None`) for
an hour, the associated row will have a demand value of `EMPTY`.
These two values are kept distinct to help inform further study of the EIA data set.

In [None]:
import urllib.request
import urllib.parse
import json
import csv
import os
import datetime
from collections import OrderedDict
import shutil

# Getting an EIA API key

EIA provides open data and an API for accessing them. To use their API you must first get a key here: https://www.eia.gov/opendata/register.php

In [None]:
# Prefer the EIA_API_KEY environment variable so the key does not have to be
# stored in the notebook; otherwise replace the placeholder below with your key.
EIA_API_KEY = os.environ.get('EIA_API_KEY', 'YOUR_EIA_API_KEY_HERE') # as a string

# Report only whether a key is configured: printing the key itself would
# leak it into the saved notebook output.
if EIA_API_KEY == 'YOUR_EIA_API_KEY_HERE':
    print('EIA_API_KEY is still the placeholder; set your key before running the queries')
else:
    print('EIA_API_KEY is set ({} characters)'.format(len(EIA_API_KEY)))

# Function definitions

In [None]:
# Query EIA to get list of regions for which hourly electricity demand data is available
def get_regions_data(ID='region-data', data_type='facets[type][]=D&', start='2020-01-01T00', end='2020-01-02T00'):
    """Fetch the records of one `electricity/rto` route of the EIA v2 API.

    Parameters
    ----------
    ID : str
        Route name inserted after `electricity/rto/` in the request URL.
    data_type : str
        Ready-made query-string fragment inserted directly before `start=`;
        it must be empty or end with `&`.
    start, end : str
        Values passed verbatim as the `start` and `end` query parameters.

    Returns
    -------
    The decoded JSON payload of the response.
    """
    # Limiting to a time range to handle EIA API data limit
    url = 'https://api.eia.gov/v2/electricity/rto/{}/data/?api_key={}&{}start={}&end={}'.format(ID, EIA_API_KEY, data_type, start, end)

    # The context manager closes the HTTP response even if reading fails
    with urllib.request.urlopen(url) as regions_query:
        regions_response = regions_query.read().decode('utf-8')

    return json.loads(regions_response)


# Query EIA for hourly electric demand data for a given region
def get_regional_data(region_code, start, end, data_type, ID='region-data'):
    """Download hourly data for one region, requesting it in 180-day chunks.

    Parameters
    ----------
    region_code : str
        Value used for the region facet of the query.
    start, end : str
        Date bounds formatted as '%Y-%m-%d'.
    data_type : str
        Ready-made query-string fragment inserted before `frequency=hourly`;
        it must be empty or end with `&`.
    ID : str
        Route name inserted after `electricity/rto/`. Routes containing
        'sub' are filtered by the 'subba' facet, all others by 'respondent'.

    Returns
    -------
    `{}` when `start` is not before `end` (no request is made). Otherwise a
    dict with the single key 'response': the first chunk's 'response' object
    if only one chunk was needed, else `{'data': [...]}` holding the records
    of all chunks concatenated in request order.

    Raises
    ------
    ValueError
        If `start` or `end` does not match '%Y-%m-%d'.
    KeyError
        If a decoded chunk has no 'response' entry.
    """
    date_format = '%Y-%m-%d'
    # Split in chunks of 6 months to avoid EIA API limit
    chunk_length = datetime.timedelta(180)

    # Get region
    find_reg_by = 'subba' if 'sub' in ID else 'respondent'

    # Parse the bounds once instead of on every loop iteration
    chunk_start_dt = datetime.datetime.strptime(start, date_format)
    end_dt = datetime.datetime.strptime(end, date_format)

    region_data = {}
    while chunk_start_dt < end_dt:
        # Clamp every chunk, including the first, so no request reaches past `end`
        chunk_end_dt = min(chunk_start_dt + chunk_length, end_dt)
        chunk_start = chunk_start_dt.strftime(date_format)
        chunk_end = chunk_end_dt.strftime(date_format)

        url = f'https://api.eia.gov/v2/electricity/rto/{ID}/data/?api_key={EIA_API_KEY}&facets[{find_reg_by}][]={region_code}&{data_type}frequency=hourly&data[0]=value&start={chunk_start}&end={chunk_end}'
        # The context manager closes the HTTP response even if reading fails
        with urllib.request.urlopen(url) as region_query:
            region_response = region_query.read().decode('utf-8')
        region_data_chunk = json.loads(region_response)

        # Merge with previous data
        if 'response' in region_data:
            region_data['response'] = {'data': region_data['response']['data'] + region_data_chunk['response']['data']}
        else:
            region_data['response'] = region_data_chunk['response']

        # The next chunk starts where this one ended
        chunk_start_dt = chunk_end_dt

    # For checking initial raw EIA output
    # with open('data/{}_raw.csv'.format(region_code), 'w', newline='') as csvfile:
    #    csvfile.write(json.dumps(region_data, sort_keys=True, indent=4))

    return region_data


# Generate full hourly date and time series from start date ending the hour before end date
def generate_full_time_series(start_date, end_date):
    """Return one datetime per hour for every whole day from start_date up to, but not including, end_date.

    The day count is `(end_date - start_date).days`; each day contributes
    the 24 timestamps 00:00 through 23:00. An empty list is returned when
    that day count is zero or negative.
    """
    n_days = int((end_date - start_date).days)
    hours_of_day = [datetime.time(hr, 0) for hr in range(24)]

    return [
        datetime.datetime.combine(start_date + datetime.timedelta(day_offset), hour_of_day)
        for day_offset in range(n_days)
        for hour_of_day in hours_of_day
    ]


# Save region hourly electric demand data to a format usable by MEM
def save_file(series_id, region_data, full_date_range, tgt_dir):
    """Write one CSV with a demand value for every hour of `full_date_range`.

    Parameters
    ----------
    series_id : str
        Identifier used for the file name after removing the substrings
        'EBA.', '-ALL' and '.D.H'.
    region_data : dict
        Payload whose `['response']['data']` is an iterable of records with
        a 'period' key (formatted '%Y-%m-%dT%H') and a 'value' key.
    full_date_range : iterable of datetime
        Hours to write, in output order.
    tgt_dir : str
        Directory that receives `<region id>.csv`.

    Each hour is written as 'MISSING' when no record exists for it, as
    'EMPTY' when the record's value is None, and as the record's value
    otherwise. Records whose period is not in `full_date_range` are ignored.

    Raises
    ------
    KeyError
        If `region_data` has no `['response']['data']` or a record has no
        'period'. The table is built before the file is opened, so no
        partial CSV is left behind in that case.
    """
    region_id = series_id.replace('EBA.','').replace('-ALL', '').replace('.D.H','')
    datetime_format = '%Y-%m-%dT%H'

    # Every requested hour starts out as 'MISSING'; insertion order is the row order
    demand_by_hour = OrderedDict(
        (hour.strftime(datetime_format), 'MISSING') for hour in full_date_range
    )

    # Actual realized demand
    for record in region_data['response']['data']:
        period = record['period']
        # Skip dates outside the specified range
        if period not in demand_by_hour:
            continue
        try:
            value = record['value']
        except KeyError:
            # A record without a 'value' entry is reported and left as 'MISSING'
            print("Check date and time formatting for category {} for time {}".format(region_id, period))
            continue
        demand_by_hour[period] = 'EMPTY' if value is None else value

    with open(os.path.join(tgt_dir, '{}.csv'.format(region_id)), 'w', newline='') as csvfile:

        fieldnames = ['date_time', 'demand (MW)']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for period, demand_output in demand_by_hour.items():

            dt = datetime.datetime.strptime(period, datetime_format)

            # From EIA form 930 instructions:
            # "Report all data as hourly integrated values in megawatts by hour ending time."
            writer.writerow({'date_time': dt,
                'demand (MW)': demand_output})
            

# Running the queries

You can adjust the date range of the output CSV files with `start_date` and `end_date`.

In [None]:
# Make data directory
tgt_dir_ba = './data'
# exist_ok avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(tgt_dir_ba, exist_ok=True)

# Called with its defaults, i.e. realized demand ('D') on the 'region-data'
# route for 2020-01-01T00 through 2020-01-02T00; only the region identifiers
# are taken from the result.
regions_data = get_regions_data()
# Get unique respondents and respondent names for all regions
regions = {(record["respondent"], record["respondent-name"]) for record in regions_data["response"]["data"]}

In [None]:
# Date range of interest
start_date = datetime.date(2020, 1, 1) # Start date that includes subregion data
end_date = datetime.date(2025,1, 1) # Can update this as time progresses
full_date_range = generate_full_time_series(start_date, end_date)

# The query bounds are identical for every region, so format them once
query_start = start_date.strftime("%Y-%m-%d")
query_end = end_date.strftime("%Y-%m-%d")

# Download each balancing authority / region and write it to its own CSV
for reg_code, region_name in regions:
    print("Getting data for: {} with region code {}".format(region_name, reg_code))
    # data_type 'D' for realized demand
    region_data = get_regional_data(
        reg_code,
        query_start,
        query_end,
        data_type='facets[type][]=D&',
        ID='region-data',
    )
    save_file(reg_code, region_data, full_date_range, tgt_dir_ba)

# Sub-region demand
A few BAs provide hourly demand at a sub-regional level. See a list here: https://www.eia.gov/opendata/browser/electricity/rto/region-sub-ba-data

Query these regions for their sub-regional demand profiles. The `Public Service Company of New Mexico (PNM)` BA has two listed sub-regions without any data; these are skipped.

Demand for these regions was first reported on:
 * California Independent System Operator (CISO)
  * 2018-07-01 08:00:00
 * ISO New England (ISNE)
  * 2018-07-01 05:00:00
 * Midcontinent Independent System Operator, Inc. (MISO)
  * 2018-07-01 06:00:00
 * New York Independent System Operator (NYIS)
  * 2018-06-19 05:00:00
 * PJM Interconnection, LLC (PJM)
  * 2018-07-01 05:00:00
  * substantial data gap before 2018-09-18
 * Public Service Company of New Mexico (PNM)
  * 2018-07-01 08:00:00
  * Two subregions have no reported data: FREP and JICA
 * Southwest Power Pool (SWPP)
  * 2018-08-31 06:00:00
 * Electric Reliability Council of Texas, Inc. (ERCO)
  * 2019-05-27 06:00:00

By considering data from 1 Jan 2020 through 31 Dec 2024 we have five full years of data to clean and impute for all BAs that report subregions.

In [None]:
# Make data directory
tgt_dir_sub = './data_subregions'
# exist_ok avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(tgt_dir_sub, exist_ok=True)

# One-day query on the sub-BA route, used only to list the subregions;
# no type facet is passed for this route (data_type='').
subregions_data = get_regions_data(ID='region-sub-ba-data', data_type='', start='2020-01-01T00', end='2020-01-02T00')
# Get unique subregion codes, subregion names and parent BAs for all subregions
subregions = {(record["subba"], record["subba-name"], record["parent"]) for record in subregions_data["response"]["data"]}

# Date range of interest
start_date = datetime.date(2020, 1, 1)
end_date = datetime.date(2025, 1, 1) # Can update this as time progresses
full_date_range = generate_full_time_series(start_date, end_date)

# Subregions
for subreg_code, subreg_name, subreg_parent in subregions:

    print("Getting data for: {} with region code {} (belongs to {})".format(subreg_name, subreg_code, subreg_parent))

    subreg_data = get_regional_data(subreg_code, start_date.strftime("%Y-%m-%d"), end_date.strftime("%Y-%m-%d"), data_type='', ID='region-sub-ba-data')

    # Catch errors for some subregions without data
    # NOTE(review): get_regional_data only ever sets a 'response' key, so this
    # top-level 'data'/'error' shape is not produced by it; kept in case the
    # payload shape changes - confirm whether it can be removed.
    if 'data' in subreg_data and 'error' in subreg_data['data']:
        print(subreg_data['data']['error'], "\nSkipping this subregion\n")
        continue

    # get_regional_data returns {} for an empty date range, and a query may
    # come back without any records; skip both instead of writing a CSV that
    # contains only 'MISSING' rows.
    if not subreg_data.get('response', {}).get('data'):
        print("No demand records returned for {}".format(subreg_code), "\nSkipping this subregion\n")
        continue

    save_file(subreg_parent+'-'+subreg_code, subreg_data, full_date_range, tgt_dir_sub)

# Add BA regions to subregions
for reg_code, region_name in regions:
    print("Copying full BA {} {}".format(region_name, reg_code))
    # The file name is the same in both directories, so build it once
    ba_csv_name = '/{}.csv'.format(reg_code)
    # Copy the full region data to the subregion directory
    shutil.copy(tgt_dir_ba + ba_csv_name, tgt_dir_sub + ba_csv_name)