In [1]:
import pandas as pd

In [9]:
# === GET THE DATA ===
# Download the "Database" sheet of the PNNL ESGC cost/performance workbook.
# The download lives in its own cell so the filtering cells below can be re-run
# during testing without hitting the server again.
link = "https://www.pnnl.gov/sites/default/files/media/file/ESGC_Cost_Performance_Database_v2024.xlsx"
ESGC_raw = pd.read_excel(io=link, sheet_name="Database")

In [49]:
# === START SOME BASIC FILTERING ===
# Each list below is an allow-list for one column of the raw database. Options that
# exist in the database but are deliberately excluded are left as comments.

# Parameters we care about.
parameters_to_keep = [
    "Total Installed Cost ($)",
    "Total Installed Cost ($/kW)",
    "Total Installed Cost ($/kWh)",
    "LCOS ($/kWh)",             # used later to filter to the lowest-cost tech
    "LCOS_Project_Life (yrs)",  # for some tech longer than calendar life: assumes industry-standard augmentation
    "Fixed O&M ($/kW-year)",
    # Excluded: "Calendar Life (yrs)"
    # Excluded: "Primary DOD (%)" (depth of discharge), "Rest After Discharge (hrs)",
    #           "Rest Before Charge (hrs)" -- ignored for a grid-scale model
    #           (vs. installation-scale).
    "RTE (%)",                  # round-trip efficiency
]

# Technologies we care about. Pumped storage and compressed air aren't geographically
# viable in Illinois (see citations in Zotero). PNNL assumes hydrogen storage in salt
# caverns, which also aren't geographically viable in Illinois.
technologies_to_keep = [
    # Excluded: "CAES", "PSH", "Hydrogen"
    "Gravitational",
    "Lead Acid",
    "Lithium-ion LFP",
    "Lithium-ion NMC",
    "Thermal",
    "Vanadium Redox Flow",
    "Zinc",
]

# Years we care about. 2021 data in the db is historical only and doesn't account for
# all the variables included in the 2023 and 2030 projections.
years_to_keep = [
    # Excluded: 2021
    2023,
    2030,
]

# Power levels we care about. Assume that 100 MW is a central estimate, reasonably
# scalable from 50 to 499 MW.
powers_to_keep = [
    # Excluded: 1, 10, 1000
    100,
]

# Column -> allowed values. 'Point' rows are the central estimate.
row_filters = {
    'Estimate_type': ['Point'],
    'Parameter': parameters_to_keep,
    'Technology': technologies_to_keep,
    'Year': years_to_keep,
    'Power_MW': powers_to_keep,
}

# A row survives only if it passes every filter, so AND the per-column masks together.
column_masks = [
    ESGC_raw[column_name].isin(allowed_values)
    for column_name, allowed_values in row_filters.items()
]
keep_row = column_masks[0]
for column_mask in column_masks[1:]:
    keep_row = keep_row & column_mask

# Apply the filter and drop the bookkeeping columns that are no longer needed.
ESGC = ESGC_raw.loc[keep_row].drop(columns=['Estimate_type', 'Parameter_category'])

In [50]:
# === PIVOT THE DATA TO TURN 'PARAMETERS' INTO COLUMNS ===

# Every column other than the parameter name and its value identifies a row in the
# wide table. Index.difference returns the names sorted, and that order is also the
# sort order of the pivoted rows.
long_format_columns = ['Parameter', 'Value']
index_columns = list(ESGC.columns.difference(long_format_columns))

# One output column per distinct 'Parameter', filled from 'Value'.
# pivot_table aggregates with the mean by default, so if the same index/parameter
# combination appeared more than once the values would be averaged.
# NOTE(review): rows with a missing value in any index column are presumably dropped
# by the pivot -- confirm none of the index columns contain NaN.
ESGC = pd.pivot_table(
    ESGC,
    values='Value',
    index=index_columns,
    columns='Parameter',
).reset_index()

In [51]:
# === MORE ADVANCED FILTERING ===

# For each combination of data year, power, and duration keep only the technology with
# the lowest LCOS. Assume that at each combination we'd pick the most economic solution.
group_keys = ['Year', 'Power_MW', 'Duration_hr']
lcos_column = 'LCOS ($/kWh)'

# Rows without an LCOS value cannot be ranked. Dropping them up front means a
# combination whose LCOS values are all missing is skipped instead of breaking the
# idxmin lookup below.
rankable_ESGC = ESGC.dropna(subset=[lcos_column])

# Get unique combinations of data year, power, and duration
unique_combinations = rankable_ESGC.groupby(group_keys)

# Index label of the cheapest row in every group; on ties the first row wins.
# Groups come back sorted by (year, power, duration), which fixes the output row order.
least_cost_rows = unique_combinations[lcos_column].idxmin()

# Select those rows in one step (also works when there are no groups at all) and
# renumber them 0..n-1.
filtered_ESGC = rankable_ESGC.loc[least_cost_rows.tolist()].reset_index(drop=True)

# Spit out a csv
csv_file_path = 'filtered_ESGC.csv'
filtered_ESGC.to_csv(csv_file_path, index=False)

In [52]:
# === ADJUST TO MATCH THE retrieve_costs OUTPUT FORMAT ===

# Columns the model currently utilizes, with the source of each value.
columns_needed = [
    'technology_alias',      # constant "Battery"
    'techdetail',            # concatenation of 'Technology', 'Power_MW', and 'Duration_hr'
    'core_metric_variable',  # renamed from 'Year'
    'CAPEX',                 # renamed from 'Total Installed Cost ($)' -- TODO confirm: should this be per kW? per kWh?
    'FOM',                   # renamed from 'Fixed O&M ($/kW-year)'
    'VOM',                   # constant 0
    'Fuel',                  # constant 0
]

# Other columns from the storage database that are carried along until we figure out
# how to use them.
other_columns = [
    "LCOS_Project_Life (yrs)",  # for some tech longer than calendar life: assumes industry-standard augmentation
    "RTE (%)",                  # round-trip efficiency
    "Total Installed Cost ($/kW)",
    "Total Installed Cost ($/kWh)",
]

# Add the constant and derived columns, rename the database columns to the model's
# names, then keep only the columns listed above, in that order.
filtered_ESGC = (
    filtered_ESGC
    .assign(
        technology_alias='Battery',
        techdetail=lambda frame: (
            frame['Technology']
            + ' ' + frame['Power_MW'].astype(str) + ' MW '
            + frame['Duration_hr'].astype(str) + ' hrs'
        ),
        VOM=0,
        Fuel=0,
    )
    .rename(columns={
        'Year': 'core_metric_variable',
        'Total Installed Cost ($)': 'CAPEX',
        'Fixed O&M ($/kW-year)': 'FOM',
    })
    .loc[:, columns_needed + other_columns]
)


In [53]:
# Show results: the bare expression on the last line makes the notebook render the
# reformatted table as this cell's output.
filtered_ESGC

Parameter,technology_alias,techdetail,core_metric_variable,CAPEX,FOM,VOM,Fuel,LCOS_Project_Life (yrs),RTE (%),Total Installed Cost ($/kW),Total Installed Cost ($/kWh)
0,Battery,Lithium-ion LFP 100 MW 2 hrs,2023,84601000.0,2.56,0,0,25.0,0.83,846.01,423.0
1,Battery,Lithium-ion LFP 100 MW 4 hrs,2023,149089000.0,4.27,0,0,25.0,0.83,1490.89,372.72
2,Battery,Lithium-ion LFP 100 MW 6 hrs,2023,215816000.0,5.96,0,0,25.0,0.83,2158.16,359.69
3,Battery,Thermal 100 MW 8 hrs,2023,280651000.0,32.17,0,0,34.0,0.5,2806.51,350.81
4,Battery,Lithium-ion LFP 100 MW 10 hrs,2023,349067000.0,9.33,0,0,26.0,0.83,3490.67,349.07
5,Battery,Thermal 100 MW 24 hrs,2023,405334000.0,47.69,0,0,34.0,0.48,4053.34,168.89
6,Battery,Thermal 100 MW 100 hrs,2023,808829000.0,102.08,0,0,34.0,0.47,8088.29,80.88
7,Battery,Lithium-ion LFP 100 MW 2 hrs,2030,67477000.0,2.17,0,0,20.0,0.85,674.77,337.38
8,Battery,Lithium-ion LFP 100 MW 4 hrs,2030,116240000.0,3.62,0,0,20.0,0.85,1162.4,290.6
9,Battery,Lithium-ion LFP 100 MW 6 hrs,2030,167164000.0,5.05,0,0,20.0,0.85,1671.64,278.61


# Thoughts on how to handle the data in the other columns

 - **LCOS Project Life**: This is the same as the `lifetime` parameter that is defined at the `technology_alias` level. This can be added as a column to the table for all technologies.
 - **Primary DOD**, **Rest After Discharge (hrs)**, and **Rest Before Charge (hrs)**: I believe all of these can safely be ignored because we're not modeling individual batteries at real-time resolution, but groups of batteries at hourly resolution. Ignoring these variables will give slightly optimistic results.
 - **RTE (%)**: This will need to be included because additional supply is needed to account for the inefficiencies. This could be included in a separate table and only used by the model when batteries are charged and discharged. The values are factored into the LCOS, which has been used to narrow technology choices.

# A note on costs

Right now `CAPEX` contains the total cost for the specific power and duration. Does this need to be converted into a per kW value? What about variable costs by duration?

Also need to confirm whether units should be in kW, MW, kWh, etc.