In [1]:
import os 
import json
import pickle

import pandas as pd
import numpy as np

In [2]:
# read in the API key, set it as an environment variable, and load eiapy
class EIA():
    """Configure the EIA API key and expose eiapy's query classes.

    Attributes:
        category: the ``eiapy.Category`` class.
        series: the ``eiapy.Series`` class.
        multiseries: the ``eiapy.MultiSeries`` class.
    """

    def __init__(self, credsFilePath=None):
        """Export the API key, import eiapy (installing it if missing), bind its classes.

        Args:
            credsFilePath: path to a JSON credentials file containing an
                ``API_KEY`` entry. Falls back to ``./creds.json`` when falsy.

        Raises:
            ModuleNotFoundError: if eiapy is still not importable after the
                installation attempt.
        """
        # eiapy expects the API key as an environment variable (original
        # author's note), so it is exported before the import below.
        self.set_api_key(credsFilePath)

        try:
            import eiapy as eia
        except ModuleNotFoundError:
            import importlib
            import subprocess
            import sys

            print('eiapy not installed. Package will be installed now.')
            # Run pip through the interpreter executing this code so the
            # package lands in the same environment; a bare `pip` on PATH may
            # belong to a different Python installation.
            result = subprocess.run(
                [sys.executable, '-m', 'pip', 'install', 'eiapy'],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True,
            )
            # Echo pip's output so it is visible where the code runs.
            print(result.stdout)
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
            import eiapy as eia

        self.category = eia.Category
        self.series = eia.Series
        self.multiseries = eia.MultiSeries

    def set_api_key(self, credsFilePath):
        """Read the credentials file and store its ``API_KEY`` in ``os.environ['EIA_KEY']``.

        Raises:
            KeyError: if the credentials file has no ``API_KEY`` entry.
        """
        creds = self.read_creds(credsFilePath)
        os.environ['EIA_KEY'] = creds['API_KEY']

    def read_creds(self, credsFilePath=None):
        """Return the parsed JSON content of the credentials file.

        Args:
            credsFilePath: path to the JSON file; ``./creds.json`` is used
                when this is ``None`` or otherwise falsy.
        """
        if not credsFilePath:
            credsFilePath = './creds.json'
        with open(credsFilePath, 'r') as file:
            return json.load(file)
        

def build_co2_emissions_lookup_table(categoryID=None, saveData=True):
    """Build a dict of the form {stateName1: [seriesInfo1, seriesInfo2, ...], stateName2: [...]}.

    Relies on the module-level ``eia`` object for the category queries.

    Args:
        categoryID: id of the category whose child categories are crawled.
            When falsy, 2251670 is used (the "CO2 emission by state" page,
            https://www.eia.gov/opendata/qb.php?category=2251670).
        saveData: when truthy, the table is also written to
            './seriesLookupTable.json' via ``save_json``.

    Returns:
        dict mapping each child category's name to its 'childseries' list.
    """
    print('Bulding series lookup table...')

    rootID = categoryID if categoryID else 2251670
    rootInfo = eia.category(rootID).get_info()

    # Each child category of the root is treated as one state; its
    # 'childseries' entries describe the emission series available for it.
    lookupTable = {}
    for stateEntry in rootInfo['category']['childcategories']:
        stateName = stateEntry['name']
        print(f"\tCollected data for {stateName}")
        stateInfo = eia.category(stateEntry['category_id']).get_info()
        lookupTable[stateName] = stateInfo['category']['childseries']

    if saveData:
        save_json(lookupTable, './seriesLookupTable.json')
    print('Finished building series lookup table\n')

    return lookupTable

        
def collect_data(seriesLookupTable, saveData=True):
    """Download the full series payload for every entry in the lookup table.

    Relies on the module-level ``eia`` object for the multi-series request.

    Args:
        seriesLookupTable: mapping of state name -> list of dicts, each
            carrying a 'series_id' key.
        saveData: when truthy, the result is also written to
            './unparsedData.json' via ``save_json``.

    Returns:
        A shallow copy of the input mapping in which each state's value is
        replaced by the 'series' list returned for that state's series ids.
    """
    downloaded = seriesLookupTable.copy()

    print('Collecting data...')
    for stateName, seriesInfo in seriesLookupTable.items():
        # One request per state covering all of its series ids.
        seriesIds = [entry['series_id'] for entry in seriesInfo]
        response = eia.multiseries(seriesIds).get_data(all_data=True)
        # The response repeats the lookup-table info and adds the series values.
        downloaded[stateName] = response['series']
        print(f"\tCollected data for {stateName}")
    print('Finished collecting data\n')

    if saveData:
        save_json(downloaded, './unparsedData.json')
    return downloaded
              
              
def parse_data(unparsedData, saveData=True):
    '''Unpack the per-state series dictionary into a single DataFrame.

    Args:
        unparsedData: mapping of state name -> list of series dicts. Each
            series dict is read for its 'name', 'data' (rows of
            [year, value]) and 'units' keys.
        saveData: when truthy, the result is also written to 'parsedData.csv'.

    Returns:
        DataFrame with columns state, sector, fuelType, year, CO2Emission,
        units and a 0..n-1 index. Empty (same columns) when no series
        belongs to one of the recognised sectors.
    '''
    sectors = ('commercial', 'electric', 'industrial', 'residential', 'transportation', 'total')
    columns = ['state', 'sector', 'fuelType', 'year', 'CO2Emission', 'units']
    frames = []

    print('Parsing data...')
    for state, seriesList in unparsedData.items():
        for series in seriesList:
            name = series['name']
            # The sector is the first word of the series title; anything
            # else (e.g. CO2 emission coefficients) is not of interest.
            sector = name.split(' ')[0]
            if sector.lower() not in sectors:
                continue

            # Only split out the fuel type for series we keep: titles of
            # skipped series are not guaranteed to contain ', '.
            fuelType = name.split(', ')[1]

            df = pd.DataFrame(series['data'], columns=['year', 'CO2Emission'])
            df['sector'] = sector
            df['fuelType'] = fuelType
            df['state'] = state
            df['units'] = series['units']
            frames.append(df[columns])

    if frames:
        # ignore_index renumbers the rows 0..n-1 across all series.
        out = pd.concat(frames, axis=0, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        out = pd.DataFrame(columns=columns)

    print('Finished parsing data\n')
    if saveData:
        out.to_csv('parsedData.csv')
    return out

              
def preprocess_data(parsedData, saveData=True):
    """Normalise emission units and dtypes of the parsed data.

    CO2 emission data is presented in 2 different units, 'million metric
    tons CO2' and 'metric tons CO2'. Rows whose units equal 'million metric
    tons CO2' are multiplied by 1e6 so every value is in metric tons CO2;
    the 'units' column is then dropped and 'year' is cast to int64.

    Args:
        parsedData: DataFrame with at least the columns 'units',
            'CO2Emission' and 'year'. It is not modified.
        saveData: when truthy, the result is also written to
            'preprocessedData.csv'.

    Returns:
        A new DataFrame with converted emissions, integer years and no
        'units' column.
    """
    print('Preprocessing data...')
    dat = parsedData.copy()

    # Select rows positionally with a boolean mask rather than by index
    # label, so frames with repeated index labels convert only matching rows.
    isMillionTons = (dat['units'] == 'million metric tons CO2').to_numpy()
    scaled = dat.loc[isMillionTons, 'CO2Emission'] * 1e6  # million metric tons ---> metric tons
    dat.loc[isMillionTons, 'CO2Emission'] = scaled.to_numpy()

    # units can now be dropped: every value is in metric tons CO2
    dat = dat.drop(columns='units')

    # convert year to int
    dat['year'] = dat['year'].astype(np.int64)

    print('Finished preprocessing data\n')
    if saveData:
        dat.to_csv('preprocessedData.csv')
    return dat
              

def save_json(data, filePath):
    """Serialise ``data`` as JSON into ``filePath``, overwriting any existing file."""
    with open(filePath, 'w') as handle:
        json.dump(data, handle)
        
              
def read_json(filePath):
    """Return the object parsed from the JSON file at ``filePath``."""
    with open(filePath, 'r') as handle:
        return json.load(handle)

In [4]:
# Initialize the eiapy wrapper.
# A valid API key from EIA is required; the credentials file is JSON of the form
# {"API_KEY":"insert_your_api_key"}
credsFilePath = './creds.json'
# NOTE: the name `eia` matters - build_co2_emissions_lookup_table and
# collect_data look it up as a module-level global.
eia = EIA(credsFilePath=credsFilePath)

# Run the pipeline for all series in EIA's CO2 Emissions category:
# build the series lookup table -> download the series -> parse -> preprocess.
# saveData=True asks each stage to persist its result as well as return it.
seriesLookupTable = build_co2_emissions_lookup_table()
unparsedData = collect_data(seriesLookupTable, saveData=True)
parsedData = parse_data(unparsedData, saveData=True)
preprocessedData = preprocess_data(parsedData, saveData=True)

Bulding series lookup table...
	Collected data for Alabama
	Collected data for Alaska
	Collected data for Arizona
	Collected data for Arkansas
	Collected data for California
	Collected data for Colorado
	Collected data for Connecticut
	Collected data for Delaware
	Collected data for District of Columbia
	Collected data for Florida
	Collected data for Georgia
	Collected data for Hawaii
	Collected data for Idaho
	Collected data for Illinois
	Collected data for Indiana
	Collected data for Iowa
	Collected data for Kansas
	Collected data for Kentucky
	Collected data for Louisiana
	Collected data for Maine
	Collected data for Maryland
	Collected data for Massachusetts
	Collected data for Michigan
	Collected data for Minnesota
	Collected data for Mississippi
	Collected data for Missouri
	Collected data for Montana
	Collected data for Nebraska
	Collected data for Nevada
	Collected data for New Hampshire
	Collected data for New Jersey
	Collected data for New Mexico
	Collected data for New York
