In [80]:
import pandas as pd
from thefuzz import process

In [81]:
# First and last year (both inclusive) of every exported time series.
START_YEAR = 2020
END_YEAR = 2050

# Country labels written to the 'Country' column of the output.
DEVOLVED_AUTHS = [
    'United Kingdom',
    'Scotland',
    'Wales',
    'Northern Ireland',
]

# Gas labels, spelled as in the '% <gas> Emissions' columns cleaned further down.
GASES = ['CARBON', 'CH4', 'N2O']

# Sector-databook layout: six descriptive columns followed by one column per year.
SD_COLUMNS = [
    'Country',
    'Sector',
    'Subsector',
    'Measure Name',
    'Measure Variable',
    'Variable Unit',
    *range(START_YEAR, END_YEAR + 1),
]

# Workbook that receives the generated sheets.
OUTPUT_FILE = 'sd-industry-test.xlsx'

### Load data

#### Step 1: Get measure definitions

In [71]:
# Read the measure definitions (columns B:D of the 'Measure definitions' sheet,
# header on spreadsheet row 13) from the industry model workbook.
industry_model_path = 'Sam Industry Model.xlsx'
with open(industry_model_path, 'rb') as f:
    measure_defs = pd.read_excel(f, 'Measure definitions', header=12, usecols='B:D')

# Drop incomplete rows, then renumber the index 0..n-1. Later cells look rows up
# with `industry_sectors[i]` for i in range(len(...)), which is a *label* lookup
# on a Series: without the renumbering, any row removed by dropna() would leave a
# gap that raises KeyError (and the trailing rows would never be visited).
measure_defs = measure_defs.dropna().reset_index(drop=True)

# One Series per definition column; all three share the renumbered index.
industry_sectors = measure_defs['EE Sector']
industry_process = measure_defs['Process']
industry_tech = measure_defs['Abatement Technology']

  for idx, row in parser.parse():


#### Step 2: Load NZIP outputs

In [72]:
# Alternative workbook, currently disabled:
# nzip_path = 'N-ZIP Model - V1.1.xlsb'
nzip_path = 'N-ZIP-Model_version1_2_AG_updated_19_12_2023.xlsb'

# Read columns F:CWV of the 'CCC Outputs' sheet; the header sits on spreadsheet row 11.
with open(nzip_path, 'rb') as workbook:
    df = pd.read_excel(
        workbook,
        sheet_name='CCC Outputs',
        header=10,
        usecols='F:CWV',
    )

In [73]:
#process.extract("CCS carbon", cols, limit=10)

In [74]:
# NOTE: everything below is a bare triple-quoted string, not executable code.
# Evaluating the cell only echoes the text back as the cell output; no file is
# read and `df` is not modified. The text is an alternative loader that would
# read the 'DD Outputs' sheet of the industry model and rename its columns from
# a `cols` list.
'''
with open(industry_model_path, 'rb') as f:
    col_idxs = range(2, 2+len(cols))
    df = pd.read_excel(f, 'DD Outputs', header=9, usecols=list(col_idxs))
# correct column names
df.columns = cols
'''

"\nwith open(industry_model_path, 'rb') as f:\n    col_idxs = range(2, 2+len(cols))\n    df = pd.read_excel(f, 'DD Outputs', header=9, usecols=list(col_idxs))\n# correct column names\ndf.columns = cols\n"

In [75]:
# Repair missing and malformed values.
# The per-gas share columns are coerced to numbers (unparseable cells become NaN).
for gas_share_col in ('% CARBON Emissions', '% CH4 Emissions', '% N2O Emissions'):
    df[gas_share_col] = pd.to_numeric(df[gas_share_col], errors='coerce')

# A missing option becomes an empty string so that it can still be concatenated
# and grouped on; every remaining NaN in the frame is then replaced by 0.
df['Selected Option'] = df['Selected Option'].fillna('')
df = df.fillna(0)

In [103]:
# Force the yearly cost columns to numeric; cells that cannot be parsed become 0.
# NOTE(review): only these three column families are coerced. The capex and
# counterfactual columns used by the capex/opex cell are taken as-is - confirm
# they are already numeric.
for y in range(START_YEAR, END_YEAR+1):
    for cost_prefix in ('Total AM costs (£m)', 'AM opex (£m)', 'AM fuel costs (£m)'):
        cost_col = f'{cost_prefix} {y}'
        df[cost_col] = pd.to_numeric(df[cost_col], errors='coerce').fillna(0)
    

### Abatement emissions

In [104]:
def sector_databook_format(df, variable_name, variable_unit):
    """Reshape an aggregated measure table into the sector-databook column layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index levels (or columns, after ``reset_index``) provide
        'Element_sector', 'Process' and 'Selected Option', and which also holds
        every remaining column listed in ``SD_COLUMNS`` ('Country' and the years).
    variable_name : str
        Value written to the 'Measure Variable' column of every row.
    variable_unit : str
        Value written to the 'Variable Unit' column of every row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly the ``SD_COLUMNS`` columns, in that order.
    """
    flat = df.reset_index()

    # Measure name = sector, process and option joined with underscores.
    measure_name = flat['Element_sector'] + '_' + flat['Process'] + '_' + flat['Selected Option']

    labelled = flat.assign(**{
        'Sector': 'Industry',
        'Subsector': flat['Element_sector'],
        'Measure Name': measure_name,
        'Measure Variable': variable_name,
        'Variable Unit': variable_unit,
    })
    return labelled[SD_COLUMNS]

def aggregate_timeseries_contry(df, timeseries, variable_name, variable_unit, weight_col=None, country='United Kingdom', scale=None):
    """Aggregate one yearly time series per measure and format it as sector-databook rows.

    The columns ``'<timeseries> <year>'`` for every year from START_YEAR to
    END_YEAR are optionally weighted and scaled, summed over rows sharing the same
    ('Element_sector', 'Process', 'Selected Option'), restricted to the measures
    listed in the measure definitions, and passed to ``sector_databook_format``.

    Parameters
    ----------
    df : pandas.DataFrame
        Model output holding the key columns, the yearly ``timeseries`` columns
        and, if used, ``weight_col``. It is not modified.
    timeseries : str
        Prefix of the yearly columns to aggregate.
    variable_name, variable_unit : str
        Passed through to ``sector_databook_format``.
    weight_col : str, optional
        Column whose value multiplies every year of the same row.
    country : str
        Written to the 'Country' column.
        NOTE(review): it is only a label - no rows are filtered by country, so
        every country receives the same figures. Confirm whether a per-country
        split is intended.
    scale : number, optional
        Constant factor applied to every value (e.g. a unit conversion).

    Returns
    -------
    pandas.DataFrame
        One row per matching measure, in measure-definition order, with the
        ``SD_COLUMNS`` layout.
    """
    years = list(range(START_YEAR, END_YEAR + 1))
    source_cols = [f'{timeseries} {y}' for y in years]
    key_cols = ['Element_sector', 'Process', 'Selected Option']

    # Work on a detached copy labelled by year, so the caller's frame does not
    # gain 31 scratch columns on every call.
    values = df[source_cols].copy()
    values.columns = years
    if weight_col:
        values = values.multiply(df[weight_col], axis=0)
    # `is not None` so that an explicit scale of 0 is honoured rather than ignored.
    if scale is not None:
        values = values * scale

    # Sum the rows that belong to the same measure.
    grouped = values.groupby([df[col] for col in key_cols]).sum()

    # Keep only measures present in the measure definitions. zip() pairs the three
    # definition columns by position, so it does not depend on their index labels;
    # dict.fromkeys() removes repeated definitions while preserving their order,
    # so a measure defined twice is not emitted twice.
    wanted = dict.fromkeys(zip(industry_sectors, industry_process, industry_tech))
    row_idxs = [key for key in wanted if key in grouped.index]
    selected = grouped.loc[row_idxs] if row_idxs else grouped.iloc[0:0]

    # Label the rows with the country and hand over to the databook formatter.
    selected = selected.assign(Country=country)
    return sector_databook_format(selected, variable_name, variable_unit)

def aggregate_timeseries(df, **kwargs):
    """Run ``aggregate_timeseries_contry`` once per entry of DEVOLVED_AUTHS and stack the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Forwarded unchanged to ``aggregate_timeseries_contry``.
    **kwargs
        Forwarded unchanged as well (``timeseries``, ``variable_name``,
        ``variable_unit`` and optionally ``weight_col`` / ``scale``). ``country``
        is supplied by this function and must not be passed.

    Returns
    -------
    pandas.DataFrame
        The per-country frames concatenated in DEVOLVED_AUTHS order; each frame
        keeps its own row index.
    """
    per_country = []
    for authority in DEVOLVED_AUTHS:
        per_country.append(aggregate_timeseries_contry(df, country=authority, **kwargs))
    return pd.concat(per_country)

In [106]:
# add capex and opex cols
# Derive additional capex and opex per year, relative to the counterfactual.
for y in range(START_YEAR, END_YEAR+1):
    # running costs = opex + fuel, for the abatement measure (AM) and the counterfactual
    am_running = df[f'AM opex (£m) {y}'] + df[f'AM fuel costs (£m) {y}']
    counterfactual_running = df[f'Counterfactual opex (£m) {y}'] + df[f'Counterfactual fuel costs (£m) {y}']

    df[f'capex {y}'] = df[f'AM capex (£m) {y}'] - df[f'Counterfactual capex (£m) {y}']
    df[f'opex {y}'] = am_running - counterfactual_running

In [107]:
# Keyword-argument sets for aggregate_timeseries(): one dict per output variable
# of the 'pathway' sheet. Order is preserved in the output.
function_calls = [
    # Total direct and indirect abatement
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        "variable_name": "Abatement total direct",
        "variable_unit": "MtCO2e",
    },
    {
        "timeseries": "Total indirect emissions abated (MtCO2e)",
        "variable_name": "Abatement total indirect",
        "variable_unit": "MtCO2e",
    },

    # Direct abatement split by gas: the total is weighted by each gas's share column.
    # The nitrous-oxide variable is spelled "N2O" (letter O), matching the
    # "% N2O Emissions" column; it was previously misspelled "N20" (digit zero).
    *(
        {
            "timeseries": "Total direct emissions abated (MtCO2e)",
            "variable_name": f"Abatement emissions {gas}",
            "variable_unit": unit,
            "weight_col": weight_col,
        }
        for gas, unit, weight_col in (
            ("CO2", "MtCO2", "% CARBON Emissions"),
            ("CH4", "MtCO2e", "% CH4 Emissions"),
            ("N2O", "MtCO2e", "% N2O Emissions"),
        )
    ),

    # Additional energy demand: source columns are in GWh, output in TWh (scale 1e-3).
    # Each pair is (fuel as named in the source column, fuel as named in the output).
    *(
        {
            "timeseries": f"Change in {source_fuel} use (GWh)",
            "variable_name": f"Additional demand {output_fuel}",
            "variable_unit": "TWh",
            "scale": 1e-3,
        }
        for source_fuel, output_fuel in (
            ("electricity", "electricity"),
            ("natural gas", "gas"),
            ("petroleum", "petroleum"),
            ("solid fuel", "solid fuel"),
            ("primary bioenergy", "final bioenergy"),
            ("hydrogen", "hydrogen"),
        )
    ),

    # Additional capex and opex (the derived 'capex <year>' / 'opex <year>' columns)
    {
        "timeseries": "capex",
        "variable_name": "Additional capital expenditure",
        "variable_unit": "£m",
    },
    {
        "timeseries": "opex",
        "variable_name": "Additional operational expenditure",
        "variable_unit": "£m",
    },
]

In [113]:
# Build one frame per requested variable, then concatenate once. Growing sd_df
# inside the loop re-copies all earlier rows on every pass, and seeding it with
# an empty object-dtype frame makes the result dtypes depend on how pandas
# treats empty entries in concat (that behaviour is deprecated).
sd_frames = [aggregate_timeseries(df, **args) for args in function_calls]
# pd.concat() rejects an empty list, so fall back to an empty, correctly-shaped frame.
sd_df = pd.concat(sd_frames) if sd_frames else pd.DataFrame(columns=SD_COLUMNS)

In [114]:
# Preview the first rows of the assembled pathway table.
sd_df.head()

Unnamed: 0,Country,Sector,Subsector,Measure Name,Measure Variable,Variable Unit,2020,2021,2022,2023,...,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,United Kingdom,Industry,Iron (Port Talbot Scunthorpe),Iron (Port Talbot Scunthorpe)_Primary Iron Pro...,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,3.278571,3.278571,3.278571,3.278571,3.278904,3.260155,3.243387,3.224962,3.206537,3.188598
1,United Kingdom,Industry,Iron (Port Talbot Scunthorpe),Iron (Port Talbot Scunthorpe)_Primary Iron Pro...,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,3.278571,3.278571,3.278571,3.278571,3.278904,3.260155,3.243387,3.224962,3.206537,3.188598
2,United Kingdom,Industry,Cement,Cement_Biomass Process_BECCS 1 - Calcium Looping,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,United Kingdom,Industry,Food & Drink,Food & Drink_Biomass Process_BECCS 1 - Calcium...,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,United Kingdom,Industry,Lime,Lime_Biomass Process_BECCS 1 - Calcium Looping,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Write the pathway table to OUTPUT_FILE as sheet 'pathway', without the row index.
# The baseline cell further down opens the same file with mode='a' to add a second
# sheet, so this cell has to run first.
sd_df.to_excel(OUTPUT_FILE, index=False, sheet_name='pathway')

### Baseline emissions

In [116]:
# Keyword-argument sets for aggregate_timeseries(): baseline emissions split by
# gas, obtained by weighting the baseline total with each gas's share column.
# The nitrous-oxide variable is spelled "N2O" (letter O), matching the
# "% N2O Emissions" column; it was previously misspelled "N20" (digit zero).
bl_cols = [
    {
        "timeseries": "Baseline emissions (MtCO2e)",
        "variable_name": f"Baseline emissions {gas}",
        "variable_unit": unit,
        "weight_col": weight_col,
    }
    for gas, unit, weight_col in (
        ("CO2", "MtCO2", "% CARBON Emissions"),
        ("CH4", "MtCO2e", "% CH4 Emissions"),
        ("N2O", "MtCO2e", "% N2O Emissions"),
    )
]

In [117]:
# Build one frame per baseline variable, then concatenate once. Seeding the
# result with an empty object-dtype frame makes the dtypes of the year columns
# depend on how pandas treats empty entries in concat (deprecated behaviour);
# if they turned into object columns, the numeric-only sum in the next cell
# would silently drop them.
bl_frames = [aggregate_timeseries(df, **args) for args in bl_cols]
# pd.concat() rejects an empty list, so fall back to an empty, correctly-shaped frame.
bl_df = pd.concat(bl_frames) if bl_frames else pd.DataFrame(columns=SD_COLUMNS)


In [118]:
# Sum the measures of each subsector. 'Country' is part of the grouping key:
# bl_df holds one copy of every measure per entry of DEVOLVED_AUTHS, so grouping
# on subsector and variable alone would add those copies together and inflate
# every baseline figure.
bl_df = bl_df.groupby(['Country', 'Subsector', 'Measure Variable']).sum(numeric_only=True)
bl_df = bl_df.reset_index()

# Append the sheet to the workbook written by the pathway export.
# - engine='openpyxl': append mode is not supported by xlsxwriter, which pandas
#   otherwise prefers for .xlsx files when it is installed.
# - if_sheet_exists='replace': lets this cell be re-run without failing on the
#   already-present 'baseline' sheet.
with pd.ExcelWriter(OUTPUT_FILE, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
    bl_df.to_excel(writer, index=False, sheet_name='baseline')

### Aggregated outputs

## Looking at measure definitions

In [13]:
# Measure identifiers ("sector_process_option") from the measure definitions and
# from the N-ZIP output. Both sides are built the same way, by element-wise
# string concatenation, instead of `industry_sectors[i]` label lookups, which
# break when the definition index has gaps (e.g. after dropna()).
industry_measures = set(
    industry_sectors.astype(str) + '_' + industry_process.astype(str) + '_' + industry_tech.astype(str)
)
nzip_measures = set(
    df['Element_sector'].astype(str) + '_' + df['Process'].astype(str) + '_' + df['Selected Option'].astype(str)
)

# Definitions that have no counterpart in the N-ZIP output.
lost = industry_measures - nzip_measures

In [14]:
# Sizes of: the measure-definition set, the N-ZIP set, and the definitions without an N-ZIP match.
len(industry_measures), len(nzip_measures), len(lost)

(452, 180, 365)

In [15]:
# save list of lost measures to a text file in the same directory as the excel files
# Save the unmatched measures, sorted, one per line, to a text file in the working directory.
# The encoding is fixed to UTF-8 so the file does not depend on the platform's
# default codec (which can fail on non-ASCII measure names).
with open('lost_measures.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(lost)))