In [66]:
import pandas as pd
from thefuzz import process

In [39]:
START_YEAR = 2020
END_YEAR = 2050
DEVOLVED_AUTHS = ['United Kingdom', 'Scotland', 'Wales', 'Northern Ireland']
GASES = ['CARBON', 'CH4', 'N2O']
SD_COLUMNS = ['Country', 'Sector', 'Subsector', 'Measure Name', 'Measure Variable', 'Variable Unit'] + list(range(START_YEAR, END_YEAR+1))

### Load data

#### Step 1: Get measure definitions

In [4]:
industry_model_path = 'Sam Industry Model.xlsx'
with open(industry_model_path, 'rb') as f:
    measure_defs = pd.read_excel(f, 'Measure definitions', header=12, usecols='B:D')
measure_defs = measure_defs.dropna()
industry_sectors = measure_defs['EE Sector']
industry_process = measure_defs['Process']
industry_tech = measure_defs['Abatement Technology']

  for idx, row in parser.parse():


#### Step 2: load NZIP Outputs

In [61]:
nzip_path = 'N-ZIP Model - V1.1.xlsb'
with open(nzip_path, 'rb') as f:
    nzip_df = pd.read_excel(f, 'CCC Outputs', header=10, nrows=0, usecols='F:CWV')
cols = nzip_df.columns

In [73]:
process.extract("CCS carbon", cols, limit=10)

[('% CARBON Emissions', 86),
 ('Electricity Connection Cost? (in NPV)', 51),
 ('CO2 T&S cost from defined point (£/t)', 50),
 ('UK Government Office Region', 47),
 ('NPV', 45),
 ('Baseline in primary bioenergy use (GWh) 2016', 45),
 ('Baseline in primary bioenergy use (GWh) 2017', 45),
 ('Baseline in primary bioenergy use (GWh) 2018', 45),
 ('Baseline in primary bioenergy use (GWh) 2019', 45),
 ('Baseline in primary bioenergy use (GWh) 2020', 45)]

In [6]:
with open(industry_model_path, 'rb') as f:
    col_idxs = range(2, 2+len(cols))
    df = pd.read_excel(f, 'DD Outputs', header=9, usecols=list(col_idxs))
# correct column names
df.columns = cols

  for idx, row in parser.parse():


In [7]:
# fix missing and incorrect values
df['% CARBON Emissions'] = pd.to_numeric(df['% CARBON Emissions'], errors='coerce')
df['% CH4 Emissions'] = pd.to_numeric(df['% CH4 Emissions'], errors='coerce')
df['% N2O Emissions'] = pd.to_numeric(df['% N2O Emissions'], errors='coerce')
df['Selected Option'] = df['Selected Option'].fillna('')
df = df.fillna(0)

### Abatement emissions

In [87]:
def sector_databook_format(df, variable_name, variable_unit):
    df = df.reset_index()
    df['Sector'] = 'Industry'
    df['Subsector'] = df['Element_sector']
    df['Measure Name'] = df['Element_sector'] + '_' + df['Process'] + '_' + df['Selected Option']
    df['Measure Variable'] = variable_name
    df['Variable Unit'] = variable_unit
    df = df[SD_COLUMNS]
    return df

def aggregate_timeseries_contry(df, timeseries, variable_name, variable_unit, weight_col=None, country='United Kingdom', scale=None):

    # get the emissions time series columns
    total_emissions_cols = [f'{timeseries} {y}' for y in range(START_YEAR, END_YEAR+1)]

    # get carbon emissions
    emissions_cols = list(range(START_YEAR, END_YEAR+1))
    df[emissions_cols] = df[total_emissions_cols].copy()
    if weight_col:
        df[emissions_cols] = df[emissions_cols].multiply(df[weight_col], axis=0)
    if scale:
        df[emissions_cols] = df[emissions_cols] * scale

    # sum rows corresponding to the same measure
    agg_emissions_df = df.groupby(['Element_sector', 'Process', 'Selected Option'])[emissions_cols].sum()

    # remove measures not pertaining to the industry model
    row_idxs = [(industry_sectors[i], industry_process[i], industry_tech[i]) for i in range(len(industry_sectors))]
    row_idxs = [idx for idx in row_idxs if idx in agg_emissions_df.index]
    agg_emissions_df = agg_emissions_df.loc[row_idxs]

    # add country column
    agg_emissions_df['Country'] = country

    # format as sector databook
    df = sector_databook_format(agg_emissions_df, variable_name, variable_unit)

    return df

def aggregate_timeseries(df, **kwargs):
    dfs = [aggregate_timeseries_contry(df, country=country, **kwargs) for country in DEVOLVED_AUTHS]
    df = pd.concat(dfs)
    return df

In [75]:
sd_df = pd.DataFrame(columns=SD_COLUMNS)

In [76]:
# add total direct and indirect emissions
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement total direct', variable_unit='MtCO2e')])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Total indirect emissions abated (MtCO2e)", variable_name='Abatement total indirect', variable_unit='MtCO2e')])

# add broken down emissions
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement emissions CO2', variable_unit='MtCO2', weight_col='% CARBON Emissions')])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement emissions CH4', variable_unit='MtCO2e', weight_col='% CH4 Emissions')])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement emissions N20', variable_unit='MtCO2e', weight_col='% N2O Emissions')])

In [77]:
# add demand
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Change in electricity use (GWh)", variable_name='Additional demand electricity', variable_unit='TWh', scale=1e-3)])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Change in natural gas use (GWh)", variable_name='Additional demand gas', variable_unit='TWh', scale=1e-3)])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Change in petroleum use (GWh)", variable_name='Additional demand petroleum', variable_unit='TWh', scale=1e-3)])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Change in solid fuel use (GWh)", variable_name='Additional demand solid fuel', variable_unit='TWh', scale=1e-3)])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Change in primary bioenergy use (GWh)", variable_name='Additional demand final bioenergy', variable_unit='TWh', scale=1e-3)])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="Change in hydrogen use (GWh)", variable_name='Additional demand hydrogen', variable_unit='TWh', scale=1e-3)])

In [78]:
# add capex and opex cols
for y in range(START_YEAR, END_YEAR+1):
    df[f'capex {y}'] = df[f'AM capex (£m) {y}'] - df[f'Counterfactual capex (£m) {y}']
    df[f'opex {y}'] = df[f'Total AM costs (£m) {y}'] - (df[f'Counterfactual opex (£m) {y}'] + df[f'Counterfactual fuel costs (£m) {y}'])

# note, CB6 sector templates instead use
#df['AM opex (£m) 2020'] + df['AM fuel costs 2020'] - (df['Counterfactual opex (£m) 2020'] + df['Counterfactual fuel costs 2020'])

In [80]:
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="capex", variable_name='Additional capital expenditure', variable_unit='£m')])
sd_df = pd.concat([sd_df, aggregate_timeseries(df, timeseries="opex", variable_name='Additional operational expenditure', variable_unit='£m')])

In [81]:
sd_df.head()

Unnamed: 0,Country,Sector,Subsector,Measure Name,Measure Variable,Variable Unit,2020,2021,2022,2023,...,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,United Kingdom,Industry,Manufacturing,Iron (Port Talbot Scunthorpe)_Primary Iron Pro...,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,2.032972,2.032961,2.032951,2.032942,2.032932,2.021504,2.010077,1.99865,1.987223,1.975796
1,United Kingdom,Industry,Manufacturing,Iron (Port Talbot Scunthorpe)_Primary Iron Pro...,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,1.553684,1.553676,1.553669,1.553661,1.553654,1.54492,1.536187,1.527454,1.518721,1.509988
2,United Kingdom,Industry,Manufacturing,Iron (Port Talbot Scunthorpe)_Primary Iron Pro...,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,3.671363,3.671344,3.671326,3.671309,3.671292,3.650654,3.630017,3.60938,3.588745,3.568109
3,United Kingdom,Industry,Manufacturing,Cement_Biomass Process_BECCS 1 - Calcium Looping,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,United Kingdom,Industry,Manufacturing,Food & Drink_Biomass Process_BECCS 1 - Calcium...,Abatement total direct,MtCO2e,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
sd_df.to_excel('sd-test.xlsx', index=False)

### Baseline emissions

In [92]:
bl_df = pd.DataFrame(columns=SD_COLUMNS)
bl_df = pd.concat([bl_df, aggregate_timeseries(df, timeseries="Baseline emissions (MtCO2e)", variable_name='Baseline emissions CO2', variable_unit='MtCO2', weight_col='% CARBON Emissions')])
bl_df = pd.concat([bl_df, aggregate_timeseries(df, timeseries="Baseline emissions (MtCO2e)", variable_name='Baseline emissions CH4', variable_unit='MtCO2e', weight_col='% CH4 Emissions')])
bl_df = pd.concat([bl_df, aggregate_timeseries(df, timeseries="Baseline emissions (MtCO2e)", variable_name='Baseline emissions N20', variable_unit='MtCO2e', weight_col='% N2O Emissions')])

In [97]:
bl_df = bl_df.groupby(['Subsector', 'Measure Variable']).sum(numeric_only=True)
bl_df = bl_df.reset_index()
bl_df

Unnamed: 0,Subsector,Measure Variable,2020,2021,2022,2023,2024,2025,2026,2027,...,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,Ammonia,Baseline emissions CH4,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299,...,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299,0.001299
1,Ammonia,Baseline emissions CO2,5.104342,4.989529,4.917328,4.85741,4.794742,4.727818,4.664229,4.605683,...,4.533179,4.533179,4.533179,4.533179,4.533179,4.533179,4.533179,4.533179,4.533179,4.533179
2,Ammonia,Baseline emissions N20,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139,...,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139,0.001139
3,Cement,Baseline emissions CH4,0.153184,0.158233,0.159213,0.16117,0.161961,0.162895,0.163768,0.165022,...,0.189658,0.189658,0.189658,0.189658,0.189658,0.189658,0.189658,0.189658,0.189658,0.189658
4,Cement,Baseline emissions CO2,25.651155,25.47124,25.041412,24.688678,24.332896,24.044852,23.799433,23.615951,...,22.726644,22.726644,22.726644,22.726644,22.726644,22.726644,22.726644,22.726644,22.726644,22.726644
5,Cement,Baseline emissions N20,0.129719,0.133995,0.134825,0.136482,0.137152,0.137942,0.138682,0.139743,...,0.160606,0.160606,0.160606,0.160606,0.160606,0.160606,0.160606,0.160606,0.160606,0.160606
6,Construction,Baseline emissions CH4,0.005151,0.004952,0.004611,0.004428,0.004268,0.004138,0.004009,0.003877,...,0.003446,0.003446,0.003446,0.003446,0.003446,0.003446,0.003446,0.003446,0.003446,0.003446
7,Construction,Baseline emissions CO2,6.682371,6.424823,5.982405,5.745263,5.536569,5.367929,5.200421,5.0294,...,4.470461,4.470461,4.470461,4.470461,4.470461,4.470461,4.470461,4.470461,4.470461,4.470461
8,Construction,Baseline emissions N20,0.007182,0.006905,0.00643,0.006175,0.005951,0.005769,0.005589,0.005406,...,0.004805,0.004805,0.004805,0.004805,0.004805,0.004805,0.004805,0.004805,0.004805,0.004805
9,Ethylene,Baseline emissions CH4,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037,...,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037,0.007037


In [83]:
df['Baseline emissions (MtCO2e) 2016']

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
4617    0.000331
4618    0.001499
4619    0.000735
4620    0.003247
4621    0.000180
Name: Baseline emissions (MtCO2e) 2016, Length: 4622, dtype: float64

## looking at measure definitions

In [13]:
industry_measures = set([industry_sectors[i] + '_' + industry_process[i] + '_' + industry_tech[i] for i in range(len(industry_sectors))])
nzip_measures = set(df['Element_sector'].astype(str) + '_' + df['Process'].astype(str) + '_' + df['Selected Option'].astype(str))
lost = set(industry_measures) - set(nzip_measures)

In [14]:
len(industry_measures), len(nzip_measures), len(lost)

(452, 180, 365)

In [15]:
# save list of lost measures to a text file in the same directory as the excel files
with open('lost_measures.txt', 'w') as f:
    # write using join
    f.write('\n'.join(sorted(lost)))