# Get EIA Demand Data

Functions to query the OpenData API of the EIA (U.S. Energy Information Administration) for hourly electricity demand data.  This notebook generates one csv file, containing all available EIA hourly demand data, for each 1) EIA balancing authority, 2) EIA region, and 3) the contiguous US.


Author:
T. Ruggles
14 June 2019

This code was written using `python3.7`.


# EIA API Resources

EIA provides some commands here: https://www.eia.gov/opendata/commands.php


# EIA Electricity Demand

Web interface for EIA electricity demand data: https://www.eia.gov/opendata/qb.php?category=2122628

Web interface for EIA day-ahead forecasted electricity demand data: https://www.eia.gov/opendata/qb.php?category=2122627


A real-time display of the U.S. interconnect is available here: https://www.eia.gov/realtime_grid/


# Details

In the cases where the result of the EIA API query skipped
an hour, the associated row will have a demand value of `MISSING`.
In the cases where the result of the EIA API query returned NONE for
an hour, the associated row will have a demand value of `EMPTY`.
These values are kept distinct to help inform further study of the EIA data set.

Note: the first 5 hours of July 1st 2015 are empty for all BAs.  Because data reporting began using local time, the West Coast BAs are missing 8 hours of reporting for July 1st 2015 (UTC time).

In [1]:
import urllib.request
import urllib.parse
import json
import csv
import os
import datetime
from collections import OrderedDict

# Getting an EIA API key

EIA provides open data and an API for accessing them. To use their API you must first get a key here: https://www.eia.gov/opendata/register.php

In [2]:
# Prefer a key supplied through the EIA_API_KEY environment variable so that
# credentials do not have to live in the notebook; the literal below is only
# the fallback that keeps the notebook's previous behavior.
# NOTE(review): the fallback key is committed in plain text and echoed into the
# cell output -- consider rotating it and removing the literal.
EIA_API_KEY = os.environ.get('EIA_API_KEY', 'hMKhX8Es12io20405E9XdNTmRujScGFCUucidV7H') # as a string
print(EIA_API_KEY)

hMKhX8Es12io20405E9XdNTmRujScGFCUucidV7H


# Function definitions

In [3]:
# Query EIA to get list of regions for which hourly electricity demand data is available
def get_regions_data(ID='region-data'):
    """Query an EIA v2 ``electricity/rto`` route and return the decoded JSON.

    Args:
        ID: Route segment placed after ``electricity/rto/`` in the request
            URL. Defaults to ``'region-data'``.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://api.eia.gov/v2/electricity/rto/{}/data/?api_key={}'.format(ID, EIA_API_KEY)

    # Use the response as a context manager so the connection is closed even
    # if reading or decoding fails.
    with urllib.request.urlopen(url) as regions_query:
        regions_response = regions_query.read().decode('utf-8')

    return json.loads(regions_response)




# EIA changed API mapping and now we need to be able to change between
# category_id and series_id
def category_id_to_series_id_demand(category_id):
    """Return the ``series_id`` of the first child series of an EIA category.

    Args:
        category_id: Category identifier sent as the ``category_id`` query
            parameter.

    Returns:
        The value at ``['category']['childseries'][0]['series_id']`` in the
        decoded response.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        KeyError, IndexError: If the response does not contain a category
            with at least one child series.
    """
    # https (as used by every other request in this file) keeps the API key
    # out of clear text on the wire.
    url = 'https://api.eia.gov/category/?api_key={}&category_id={}&format=json'.format(EIA_API_KEY, category_id)

    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(url) as region_query:
        region_response = region_query.read().decode('utf-8')
    region_data = json.loads(region_response)

    return region_data['category']['childseries'][0]['series_id']



# Query EIA for hourly electric demand data for a given region
def get_regional_data(region_code, start, end, data_type, ID='region-data'):
    """Fetch hourly values for one respondent and dump the raw response to disk.

    Args:
        region_code: Value sent as the ``facets[respondent][]`` filter.
        start: Value sent as the ``start`` query parameter.
        end: Value sent as the ``end`` query parameter.
        data_type: Value sent as the ``facets[type][]`` filter (the calling
            cell uses ``'D'`` for realized and ``'DF'`` for forecasted demand).
        ID: Route segment placed after ``electricity/rto/`` in the request URL.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): no offset/length paging parameters are sent -- confirm
    # against the EIA API documentation that a single response can cover the
    # whole start-end range.
    url = ('https://api.eia.gov/v2/electricity/rto/{}/data/?api_key={}'
           '&facets[respondent][]={}&facets[type][]={}'
           '&frequency=hourly&data[0]=value&start={}&end={}'
           ).format(ID, EIA_API_KEY, region_code, data_type, start, end)

    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(url) as region_query:
        region_response = region_query.read().decode('utf-8')
    region_data = json.loads(region_response)

    # For checking initial raw EIA output.  The dump location is fixed, so make
    # sure it exists even when the caller writes its results somewhere else.
    os.makedirs('data', exist_ok=True)
    with open('data/{}_raw.csv'.format(region_code), 'w', newline='') as csvfile:
        csvfile.write(json.dumps(region_data, sort_keys=True, indent=4))

    return region_data



# Query EIA for forecasted hourly electric demand data for a given region
def get_forecast_regional_data(region, start, end, ID='region-data'):
    """Fetch hourly day-ahead forecast values (type ``DF``) for one respondent.

    The request mirrors the one built by ``get_regional_data`` with the type
    facet fixed to ``DF``; unlike that function, nothing is written to disk.

    Args:
        region: Value sent as the ``facets[respondent][]`` filter.
        start: Value sent as the ``start`` query parameter.
        end: Value sent as the ``end`` query parameter.
        ID: Route segment placed after ``electricity/rto/`` in the request URL.

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.URLError: If ``urlopen`` cannot complete the request.
        ValueError: If the response body is not valid JSON.
    """
    # Request the hourly `value` column and honor the caller's date window;
    # save_file() reads row['value'], which is only asked for via data[0]=value.
    url = ('https://api.eia.gov/v2/electricity/rto/{}/data/?api_key={}'
           '&facets[respondent][]={}&facets[type][]=DF'
           '&frequency=hourly&data[0]=value&start={}&end={}'
           ).format(ID, EIA_API_KEY, region, start, end)

    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(url) as region_query:
        region_response = region_query.read().decode('utf-8')

    return json.loads(region_response)



# Build the complete list of hourly timestamps from start_date up to, but not including, end_date
def generate_full_time_series(start_date, end_date):
    """Return one ``datetime`` per hour for every whole day in the range.

    Args:
        start_date: First day to include.
        end_date: Day at which the series stops; its own hours are excluded.

    Returns:
        A list of ``datetime.datetime`` values, 24 per day, in chronological
        order. The list is empty when the range spans no whole day.
    """
    day_count = int((end_date - start_date).days)
    one_day = datetime.timedelta(days=1)

    return [
        datetime.datetime.combine(start_date + day_index * one_day, datetime.time(hour=hour_of_day))
        for day_index in range(day_count)
        for hour_of_day in range(24)
    ]


# Save region hourly electric demand data to a format usable by MEM
def save_file(series_id, region_data, region_forecast_data, full_date_range, tgt_dir):
    """Write realized and forecast hourly demand for one region to a CSV file.

    The file is ``<tgt_dir>/<region_id>.csv`` where ``region_id`` is
    ``series_id`` with the ``EBA.``, ``-ALL`` and ``.D.H`` fragments removed.
    One row is written per entry of ``full_date_range``. A cell holds
    ``MISSING`` when the response had no row for that hour and ``EMPTY`` when
    the row's value was ``None``.

    Args:
        series_id: Region identifier used to derive the output file name.
        region_data: Decoded response for realized demand; rows are read from
            ``['response']['data']`` and must carry ``'period'`` and ``'value'``.
        region_forecast_data: Decoded response for forecast demand, same layout.
        full_date_range: Iterable of ``datetime`` objects, one per hour to output.
        tgt_dir: Directory in which the CSV file is created.

    Raises:
        KeyError: If a response lacks ``['response']['data']`` or a row lacks
            ``'period'``.
        OSError: If the output file cannot be opened for writing.
    """
    region_id = series_id.replace('EBA.', '').replace('-ALL', '').replace('.D.H', '')

    # One format for both building and parsing the hour keys, so the two
    # cannot drift apart.  Response rows are matched to these keys by `period`.
    period_format = '%Y-%m-%dT%H'

    # hour key -> [realized demand, forecast demand]
    hourly_values = OrderedDict(
        (hour.strftime(period_format), ['MISSING', 'MISSING']) for hour in full_date_range
    )

    # (column index, response, message printed when a row has no 'value')
    sources = (
        (0, region_data,
         "Check date and time formatting for category {} for time {}"),
        (1, region_forecast_data,
         "Check date and time formatting for forecast category {} for time {}"),
    )
    for column, payload, warning in sources:
        for row in payload['response']['data']:
            period = row['period']
            # Skip dates outside the specified range
            if period not in hourly_values:
                continue
            try:
                value = row['value']
            except KeyError:
                # Leave the cell as MISSING and report the offending row.
                print(warning.format(region_id, period))
                continue
            hourly_values[period][column] = 'EMPTY' if value is None else value

    fieldnames = ['date_time', 'demand (MW)', 'forecast demand (MW)']
    with open(os.path.join(tgt_dir, '{}.csv'.format(region_id)), 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for period, (demand_value, forecast_value) in hourly_values.items():
            # From EIA form 930 instructions:
            # "Report all data as hourly integrated values in megawatts by hour ending time."
            writer.writerow({
                'date_time': datetime.datetime.strptime(period, period_format),
                'demand (MW)': demand_value,
                'forecast demand (MW)': forecast_value,
            })
            

# Running the queries

You can adjust the date range of the output CSV files with `start_date` and `end_date`

In [4]:
# Make data directory
tgt_dir = './data'
os.makedirs(tgt_dir, exist_ok=True)

regions_data = get_regions_data()

# Date range of interest
start_date = datetime.date(2015, 7, 1) # EIA demand data starts in July of 2015
end_date = datetime.date(2020, 7, 2) # Can update this as time progresses
full_date_range = generate_full_time_series(start_date, end_date)

# The query bounds are the same for every region, so format them once.
start_str = start_date.strftime("%Y-%m-%d")
end_str = end_date.strftime("%Y-%m-%d")

reg_count = 0

# The listing is walked row by row and a respondent may appear in more than
# one row; remember the codes already handled so each region is downloaded
# and written exactly once.
seen_codes = set()
for region in regions_data['response']['data']:
    reg_code = region['respondent']
    if reg_code in seen_codes:
        continue
    seen_codes.add(reg_code)

    print("Getting data for: {} with series_id {}".format(region['respondent-name'], reg_code))
    # data_type = 'D' for realized demand, 'DF' for forecasted demand
    region_data = get_regional_data(reg_code, start_str, end_str, data_type='D', ID='region-data')
    region_forecast_data = get_regional_data(reg_code, start_str, end_str, data_type='DF', ID='region-data')
    save_file(reg_code, region_data, region_forecast_data, full_date_range, tgt_dir)

Getting data for: Florida Power & Light Co. with series_id FPL


# Sub-region demand
A few BAs provide hourly demand at a sub-regional level. See a list here: https://www.eia.gov/opendata/qb.php?category=3390016

Query these regions for their sub-regional demand profiles. The `Public Service Company of New Mexico (PNM)` BA has two listed sub-regions without any data; these are skipped.

Demand for these regions was first reported on:
 * California Independent System Operator (CISO)
  * 2018-07-01 08:00:00
 * ISO New England (ISNE)
  * 2018-07-01 05:00:00
 * Midcontinent Independent System Operator, Inc. (MISO)
  * 2018-07-01 06:00:00
 * New York Independent System Operator (NYIS)
  * 2018-06-19 05:00:00
 * PJM Interconnection, LLC (PJM)
  * 2018-07-01 05:00:00
  * substantial data gap before 2018-09-18
 * Public Service Company of New Mexico (PNM)
  * 2018-07-01 08:00:00
  * Two subregions have no reported data: FREP and JICA
 * Southwest Power Pool (SWPP)
  * 2018-08-31 06:00:00
 * Electric Reliability Council of Texas, Inc. (ERCO)
  * 2019-05-27 06:00:00

By considering data from 1 Oct 2018 through 30 Sept 2020 we have two full years of data to clean and impute for all except ERCO. For this reason, ERCO will be left as BA-level resolution in the data cleaning and following imputation.

In [None]:
# Download BA-level and sub-region demand for the two-year sub-region window
# and write one CSV per series into ./data_subregions.
#
# NOTE(review): this cell does not match the helper functions defined earlier
# in this notebook and cannot run as written:
#   * get_regional_data() is defined with required `start`, `end` and
#     `data_type` parameters, and get_forecast_regional_data() with required
#     `start` and `end`; the calls below pass only a series id.
#   * the earlier query cell reads the region listing from
#     ['response']['data'], whereas this cell indexes
#     ['category']['childcategories'] / ['childseries'] -- confirm which
#     response layout get_regions_data() actually returns.
#   * get_regions_data() places its argument in the request URL path; confirm
#     that the numeric category ids passed below are valid there.

# Make data directory
tgt_dir = './data_subregions'
if not os.path.exists(tgt_dir):
    os.mkdir(tgt_dir)

regions_data = get_regions_data()
subregion_list_ID = 3390016
subregions_data = get_regions_data(subregion_list_ID)

# Date range of interest
start_date = datetime.date(2018, 10, 1) # Start of the two-year sub-region window
end_date = datetime.date(2020, 10, 1) # Can update this as time progresses
full_date_range = generate_full_time_series(start_date, end_date)


# BA regions (only 2 years compared to above 5 years)
for region in regions_data['category']['childcategories']:

    series_id = category_id_to_series_id_demand(region['category_id'])
    print("Getting data for: {} with series_id {}".format(region['name'], series_id))
    # NOTE(review): missing the start/end/data_type arguments the helper requires.
    region_data = get_regional_data(series_id)
    # NOTE(review): missing the start/end arguments the helper requires.
    region_forecast_data = get_forecast_regional_data(series_id)
    save_file(series_id, region_data, region_forecast_data, full_date_range, tgt_dir)

# Subregions
for region in subregions_data['category']['childcategories']:
    # Fetch the listing for this category; its child series are the sub-regions.
    subregions = get_regions_data(region['category_id'])
    print(subregions['category']['name'])
    for subregion in subregions['category']['childseries']:
        series_id = subregion['series_id']
        
        # Skip the local time version
        if 'hourly - local time' in subregion['name']:
            continue
        
        print("Getting data for: {} with series_id {}".format(subregion['name'], series_id))
        # NOTE(review): missing the start/end/data_type arguments the helper requires.
        region_data = get_regional_data(series_id)
        
        # Catch errors for some subregions without data
        if 'data' in region_data.keys() and 'error' in region_data['data'].keys():
            print(region_data['data']['error'], "\nSkipping this subregion\n")
            continue
        
        # NOTE(review): missing the start/end arguments the helper requires.
        region_forecast_data = get_forecast_regional_data(series_id)
        save_file(series_id, region_data, region_forecast_data, full_date_range, tgt_dir)
    print("\n")

TypeError: get_regions_data() missing 2 required positional arguments: 'start_date' and 'end_date'