In [80]:
import pandas as pd
from thefuzz import process

In [218]:
# First and last year of the exported time series (both inclusive).
START_YEAR = 2021
END_YEAR = 2050

# Country labels attached to the exported rows.
DEVOLVED_AUTHS = ['United Kingdom', 'Scotland', 'Wales', 'Northern Ireland']

# Gas identifiers (not referenced by the cells visible in this notebook).
GASES = ['CARBON', 'CH4', 'N2O']

# Sector-databook layout: the descriptive columns followed by one
# integer-named column per year.
SD_COLUMNS = [
    'Country',
    'Sector',
    'Subsector',
    'Category1 EE_sector',
    'Category2 Process',
    'Category3 Tech',
    'Measure Name',
    'Measure Variable',
    'Variable Unit',
    *range(START_YEAR, END_YEAR + 1),
]

# Workbook that the export cells write to.
OUTPUT_FILE = 'sd-industry-test.xlsx'

### Load data

#### Step 1: Get measure definitions

In [219]:
# Read the measure definitions (columns B:D, header on the 13th sheet row)
# and discard every row that has a missing value.
industry_model_path = 'Sam Industry Model.xlsx'
with open(industry_model_path, 'rb') as f:
    measure_defs = pd.read_excel(
        f,
        sheet_name='Measure definitions',
        header=12,
        usecols='B:D',
    ).dropna()

# Keep the three definition columns as separate Series for the comparison
# at the end of the notebook.
industry_sectors, industry_process, industry_tech = (
    measure_defs[column]
    for column in ('EE Sector', 'Process', 'Abatement Technology')
)

  for idx, row in parser.parse():


In [220]:
# Read the NZIP sector/process definitions and keep the rows whose
# 'CCC sector' is 'Industry'.
nzip_defs_path = 'NZIP_Sector_Process.xlsx'
with open(nzip_defs_path, 'rb') as f:
    nzip_defs_df = pd.read_excel(f, sheet_name='Sheet1')

is_industry = nzip_defs_df['CCC sector'] == 'Industry'
industry_defs = nzip_defs_df.loc[is_industry]
industry_defs

Unnamed: 0,CCC sector,EE sector,Process
0,Industry,Cement,Kiln - Cement
1,Industry,Cement,Process CO2 - Cement
2,Industry,Cement,Biomass Process
4,Industry,Ethylene,Boiler - Steam (Non BECCS allowed)
5,Industry,Ethylene,Combustion CO2 - Reforming
...,...,...,...
91,Industry,Vehicles,Oven - Vehicles
100,Industry,Construction,Generators
101,Industry,Construction,Lubrication
102,Industry,Ammonia,Combustion CO2 - Reforming


#### Step 2: Load NZIP outputs

In [221]:
# Workbook holding the NZIP model outputs.
# (An earlier version of this cell pointed at 'N-ZIP Model - V1.1.xlsb'.)
nzip_path = 'N-ZIP-Model_version1_2_AG_updated_19_12_2023.xlsb'

# Read columns F:CWV of the 'CCC Outputs' sheet; the header is on the 11th sheet row.
with open(nzip_path, 'rb') as f:
    df = pd.read_excel(
        f,
        sheet_name='CCC Outputs',
        header=10,
        usecols='F:CWV',
    )

In [228]:
# Keep only the NZIP rows whose sector and process both appear in the
# industry definitions.
# NOTE(review): sector and process are matched independently, not as a
# (sector, process) pair — confirm that this is intended.
print('length before:', len(df))
in_industry_sector = df['Element_sector'].isin(industry_defs['EE sector'])
in_industry_process = df['Process'].isin(industry_defs['Process'])
# .copy() makes the filtered frame independent of the frame it was selected
# from, so the column assignments in the following cells do not trigger
# pandas' SettingWithCopyWarning.
df = df.loc[in_industry_sector & in_industry_process].copy()
print('length after:', len(df))

length before: 4414
length after: 4414


In [229]:
# Fix missing values: a missing technology option becomes an empty string.
df['Selected Option'] = df['Selected Option'].fillna('')

# Columns that must be numeric: the per-gas shares, then three cost series per year.
gas_share_cols = ['% CARBON Emissions', '% CH4 Emissions', '% N2O Emissions']
yearly_cost_cols = [
    f'{cost_label} {year}'
    for year in range(START_YEAR, END_YEAR+1)
    for cost_label in ('Total AM costs (£m)', 'AM opex (£m)', 'AM fuel costs (£m)')
]

# Values that cannot be parsed as numbers are coerced to NaN ...
for column in gas_share_cols + yearly_cost_cols:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# ... and every NaN left anywhere in the frame is then replaced by 0.
df = df.fillna(0)



In [310]:
# Search the column names with thefuzz (fuzzy string matching).
# `cols` was not defined by any earlier cell, so this cell failed with a
# NameError on a fresh kernel; it is now derived here.
# NOTE(review): assumes the names being searched are the columns of the NZIP
# outputs frame `df` (the saved output lists such names) — confirm.
cols = [str(column) for column in df.columns]
process.extract('traded', cols, limit=5)

[('Traded / non-traded', 90),
 ('Abatement Rate', 60),
 ('Average direct abatement cost (£/tCO2e) 2016', 60),
 ('Average direct abatement cost (£/tCO2e) 2017', 60),
 ('Average direct abatement cost (£/tCO2e) 2018', 60)]

### Abatement emissions

In [311]:
# NZIP 'Element_sector' names, grouped under the subsector label each one
# is reported as.
_SUBSECTOR_MEMBERS = {
    'Cement and Lime': ['Cement', 'Lime'],
    'Fossil fuel production': [
        'Compressor Station',
        'Gas Terminal',
        'LNG Terminal',
        'Gas Platform',
        'Oil Terminal',
        'Other Fuel Production',
        'Gas Distribution',
        'Oil Platform',
        'Shale Gas',
    ],
    'Chemicals': ['Ethylene', 'Other Chemicals', 'Ammonia'],
    'Food, Drink, Tobacco': ['Food & Drink'],
    'Glass and other minerals': ['Glass', 'Other Minerals'],
    'Iron & Steel': ['Iron (Port Talbot Scunthorpe)', 'Other Iron and Steel'],
    'Non ferrous metals': ['Non ferrous metal'],
    'Other manufacturing and construction': ['Other industry', 'Construction'],
    'Paper, Pulp, Print': ['Paper'],
    'Refining': ['Refining'],
    'Vehicles': ['Vehicles'],
    'Water and waste management': ['Waste Processing'],
    'Coal mines': ['Coal Mine (closed)', 'Coal Mine (open)'],
    'Off-road mobile machinery': ['Off-road mobile machinery'],
}

# Flat lookup (NZIP sector name -> subsector label), applied with Series.map
# in sector_databook_format.
sector_map = {
    ee_sector: subsector
    for subsector, ee_sectors in _SUBSECTOR_MEMBERS.items()
    for ee_sector in ee_sectors
}


In [333]:
def sector_databook_format(df, variable_name, variable_unit):
    """Lay an aggregated frame out in the sector-databook column format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that provides 'Element_sector', 'Process' and 'Selected Option'
        (as index levels or columns) plus every other column listed in
        SD_COLUMNS that is not added here ('Country' and the year columns).
    variable_name : str
        Value written to the 'Measure Variable' column.
    variable_unit : str
        Value written to the 'Variable Unit' column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding exactly the SD_COLUMNS columns, in that order.
        Sectors missing from ``sector_map`` get a NaN 'Subsector'.
    """
    flat = df.reset_index()
    ee_sector = flat['Element_sector']
    process_name = flat['Process']
    technology = flat['Selected Option']

    labelled = flat.assign(**{
        'Sector': 'Industry',
        'Subsector': ee_sector.map(sector_map),
        # The measure name is sector, process and technology joined by underscores.
        'Measure Name': ee_sector + '_' + process_name + '_' + technology,
        'Measure Variable': variable_name,
        'Variable Unit': variable_unit,
        'Category1 EE_sector': ee_sector,
        'Category2 Process': process_name,
        'Category3 Tech': technology,
    })
    return labelled[SD_COLUMNS]

def aggregate_timeseries_country(df, timeseries, variable_name, variable_unit, weight_col=None, country='United Kingdom', scale=None):
    """Sum one yearly time series per measure and format it for the sector databook.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame with the columns 'Element_sector', 'Process' and
        'Selected Option' and one column named '<timeseries> <year>' for
        every year from START_YEAR to END_YEAR. The frame is not modified.
    timeseries : str
        Prefix of the yearly source columns.
    variable_name, variable_unit : str
        Passed through to ``sector_databook_format``.
    weight_col : str, optional
        Name of a column of ``df``; when given, each row's yearly values are
        multiplied by that row's value in this column.
    country : str
        Written to the 'Country' column.
        NOTE(review): this is only a label — the rows of ``df`` are not
        filtered by country, so every country receives the same figures.
        Confirm that this is intended.
    scale : float, optional
        Constant factor applied to all yearly values (after ``weight_col``).

    Returns
    -------
    pandas.DataFrame
        The result of ``sector_databook_format`` for the aggregated values.
    """
    years = list(range(START_YEAR, END_YEAR+1))
    source_cols = [f'{timeseries} {y}' for y in years]

    # Work on a private copy with the columns renamed to plain years.
    # Previously the year columns were written into the caller's frame on
    # every call, silently growing and overwriting shared state.
    values = df[source_cols].copy()
    values.columns = years

    # Multiply by another column and/or scale by a fixed value.
    if weight_col:
        values = values.multiply(df[weight_col], axis=0)
    if scale is not None:  # explicit None check so that a scale of 0 is applied too
        values = values * scale

    # Sum the rows that belong to the same measure; grouping by the key
    # Series yields the same named index levels as grouping by column name.
    measure_keys = [df['Element_sector'], df['Process'], df['Selected Option']]
    agg_emissions_df = values.groupby(measure_keys).sum()

    # Add the country column, then format as sector databook.
    agg_emissions_df['Country'] = country
    return sector_databook_format(agg_emissions_df, variable_name, variable_unit)

def aggregate_timeseries(df, **kwargs):
    """Run ``aggregate_timeseries_country`` for each entry of DEVOLVED_AUTHS and stack the results.

    ``kwargs`` are forwarded unchanged to ``aggregate_timeseries_country``;
    the per-country frames are concatenated in DEVOLVED_AUTHS order.
    """
    per_country = []
    for authority in DEVOLVED_AUTHS:
        per_country.append(aggregate_timeseries_country(df, country=authority, **kwargs))
    return pd.concat(per_country)

In [334]:
# Add 'capex <year>' and 'opex <year>' columns: the abatement-measure (AM)
# value minus the counterfactual value, with fuel costs counted as opex.
for year in range(START_YEAR, END_YEAR+1):
    am_running_costs = df[f'AM opex (£m) {year}'] + df[f'AM fuel costs (£m) {year}']
    counterfactual_running_costs = (
        df[f'Counterfactual opex (£m) {year}'] + df[f'Counterfactual fuel costs (£m) {year}']
    )
    df[f'capex {year}'] = df[f'AM capex (£m) {year}'] - df[f'Counterfactual capex (£m) {year}']
    df[f'opex {year}'] = am_running_costs - counterfactual_running_costs

In [335]:
# Keyword-argument sets for aggregate_timeseries, one per exported measure variable.

# (fuel wording in the source column, fuel wording in the exported variable name)
_DEMAND_SERIES = [
    ("electricity", "electricity"),
    ("natural gas", "gas"),
    ("petroleum", "petroleum"),
    ("solid fuel", "solid fuel"),
    # NOTE(review): the *primary* bioenergy series is exported as *final* bioenergy — confirm.
    ("primary bioenergy", "final bioenergy"),
    ("hydrogen", "hydrogen"),
]

function_calls = [
    # Add total direct and indirect emissions
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        "variable_name": "Abatement total direct",
        "variable_unit": "MtCO2e",
    },
    {
        "timeseries": "Total indirect emissions abated (MtCO2e)",
        "variable_name": "Abatement total indirect",
        "variable_unit": "MtCO2e",
    },

    # Add emissions by gas: the direct abatement weighted by each gas share
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        "variable_name": "Abatement emissions CO2",
        "variable_unit": "MtCO2",
        "weight_col": "% CARBON Emissions",
    },
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        "variable_name": "Abatement emissions CH4",
        "variable_unit": "MtCO2e",
        "weight_col": "% CH4 Emissions",
    },
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        # Was "N20" (digit zero); the gas is N2O, as in the weight column name.
        "variable_name": "Abatement emissions N2O",
        "variable_unit": "MtCO2e",
        "weight_col": "% N2O Emissions",
    },

    # Add demand: source series are in GWh, exported in TWh (hence the 1e-3 scale)
    *[
        {
            "timeseries": f"Change in {source_fuel} use (GWh)",
            "variable_name": f"Additional demand {exported_fuel}",
            "variable_unit": "TWh",
            "scale": 1e-3,
        }
        for source_fuel, exported_fuel in _DEMAND_SERIES
    ],

    # Add capex and opex (the 'capex <year>' / 'opex <year>' columns derived earlier)
    {
        "timeseries": "capex",
        "variable_name": "Additional capital expenditure",
        "variable_unit": "£m",
    },
    {
        "timeseries": "opex",
        "variable_name": "Additional operational expenditure",
        "variable_unit": "£m",
    },
]

In [336]:
# Build the measure-level table: one aggregated frame per entry of
# function_calls, concatenated once. Growing the frame with pd.concat inside
# a loop, seeded with an empty frame, is quadratic and relies on pandas'
# deprecated handling of empty entries when it determines result dtypes.
measure_frames = [aggregate_timeseries(df, **args) for args in function_calls]
# Fall back to an empty, correctly-shaped frame when there is nothing to concatenate.
sd_df = pd.concat(measure_frames) if measure_frames else pd.DataFrame(columns=SD_COLUMNS)

In [337]:
# Preview the first rows of the measure-level table.
# NOTE(review): the saved output below includes a 2020 column although
# START_YEAR is 2021, so it looks stale — re-run before relying on it.
sd_df.head()

Unnamed: 0,Country,Sector,Subsector,Category1 EE_sector,Category2 Process,Category3 Tech,Measure Name,Measure Variable,Variable Unit,2020,...,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,United Kingdom,Industry,Chemicals,Ammonia,Combustion CO2 - Reforming,,Ammonia_Combustion CO2 - Reforming_,Abatement total direct,MtCO2e,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,United Kingdom,Industry,Chemicals,Ammonia,Combustion CO2 - Reforming,CCS - Advanced amines or blends,Ammonia_Combustion CO2 - Reforming_CCS - Advan...,Abatement total direct,MtCO2e,0.0,...,0.245961,0.245905,0.245849,0.245793,0.245738,0.24569,0.245643,0.245595,0.245548,0.2455
2,United Kingdom,Industry,Chemicals,Ammonia,Combustion CO2 - Reforming,CCS - Calcium Looping,Ammonia_Combustion CO2 - Reforming_CCS - Calci...,Abatement total direct,MtCO2e,0.0,...,0.164822,0.164784,0.164747,0.164709,0.164672,0.16464,0.164608,0.164576,0.164545,0.164513
3,United Kingdom,Industry,Chemicals,Ammonia,Process CO2 - Reforming,,Ammonia_Process CO2 - Reforming_,Abatement total direct,MtCO2e,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,United Kingdom,Industry,Chemicals,Ammonia,Process CO2 - Reforming,CCS - Advanced amines or blends,Ammonia_Process CO2 - Reforming_CCS - Advanced...,Abatement total direct,MtCO2e,0.0,...,0.694148,0.694148,0.694148,0.694148,0.694148,0.694148,0.694148,0.694148,0.694148,0.694148


In [338]:
# Write the measure-level table to OUTPUT_FILE as the 'Measure level data'
# sheet, without the index column. Later cells append further sheets to the
# same workbook (mode='a').
sd_df.to_excel(OUTPUT_FILE, index=False, sheet_name='Measure level data')

In [339]:
# Write a sheet containing the measure definitions: for each descriptive
# column, the sorted list of distinct values found in sd_df.
_DEFINITION_SOURCES = {
    # output column -> sd_df column
    'Sector': 'Sector',
    'Subsector': 'Subsector',
    'EE_sector': 'Category1 EE_sector',
    'Process': 'Category2 Process',
    'Tech': 'Category3 Tech',
    'Measure Name': 'Measure Name',
}

# ignore_index=True renumbers each sorted Series 0..k-1. Without it,
# sort_values() keeps the original index labels and the DataFrame constructor
# aligns the columns on those labels, which silently undoes the sort.
# Columns with fewer distinct values are padded with NaN at the bottom.
measure_defs_df = pd.DataFrame({
    out_col: pd.Series(sd_df[src_col].unique()).sort_values(ignore_index=True)
    for out_col, src_col in _DEFINITION_SOURCES.items()
})

# Append to the existing workbook; 'replace' lets this cell be re-run without
# failing because the sheet already exists (same as the 'Aggregate data' cell).
with pd.ExcelWriter(OUTPUT_FILE, mode='a', if_sheet_exists='replace') as writer:
    measure_defs_df.to_excel(writer, index=False, sheet_name='Measure definitions')


### Baseline emissions

In [340]:
# Keyword-argument sets for aggregate_timeseries: the baseline emissions
# series weighted by each gas share.
# (label in the exported variable name, exported unit, gas name in the share column)
_BASELINE_GASES = [
    ("CO2", "MtCO2", "CARBON"),
    ("CH4", "MtCO2e", "CH4"),
    # The exported label was "N20" (digit zero); the gas is N2O, as in the share column name.
    ("N2O", "MtCO2e", "N2O"),
]

bl_cols = [
    {
        "timeseries": "Baseline emissions (MtCO2e)",
        "variable_name": f"Baseline emissions {gas_label}",
        "variable_unit": unit,
        "weight_col": f"% {share_name} Emissions",
    }
    for gas_label, unit, share_name in _BASELINE_GASES
]

In [341]:
# Build the baseline table: one aggregated frame per entry of bl_cols,
# concatenated once. Growing the frame with pd.concat inside a loop, seeded
# with an empty frame, is quadratic and relies on pandas' deprecated handling
# of empty entries when it determines result dtypes.
baseline_frames = [aggregate_timeseries(df, **args) for args in bl_cols]
# Fall back to an empty, correctly-shaped frame when there is nothing to concatenate.
bl_df = pd.concat(baseline_frames) if baseline_frames else pd.DataFrame(columns=SD_COLUMNS)


In [342]:
# Collapse the baseline rows to one row per country / subsector / variable / unit,
# summing the numeric (year) columns.
# NOTE(review): groupby drops rows whose key is NaN by default, so measures
# whose sector is missing from sector_map (NaN 'Subsector') would vanish here.
bl_df = (
    bl_df
    .groupby(['Country', 'Subsector', 'Measure Variable', 'Variable Unit'])
    .sum(numeric_only=True)
    .reset_index()
)

# Append to the existing workbook; 'replace' lets this cell be re-run without
# failing because the sheet already exists (same as the 'Aggregate data' cell).
with pd.ExcelWriter(OUTPUT_FILE, mode='a', if_sheet_exists='replace') as writer:
    bl_df.to_excel(writer, index=False, sheet_name='Baseline data')

### Aggregated outputs

In [343]:
# Empty frame for the aggregate rows: three descriptive columns followed by
# one integer-named column per year.
agg_df = pd.DataFrame(
    columns=[
        'Country',
        'Measure Variable',
        'Variable Unit',
        *range(START_YEAR, END_YEAR+1),
    ]
)

In [344]:
# Yearly UK totals: direct abatement, baseline emissions, and their difference.
is_uk_direct_abatement = (
    (sd_df['Measure Variable'] == 'Abatement total direct')
    & (sd_df['Country'] == 'United Kingdom')
)
total_abatement = sd_df.loc[is_uk_direct_abatement].sum(numeric_only=True)

# The baseline total must cover every gas. Previously only the
# 'Baseline emissions CO2' rows were summed, although the result is exported
# as 'Baseline emissions total' in MtCO2e and the all-gas direct abatement is
# subtracted from it. Matching on the common prefix picks up every per-gas
# baseline variable regardless of its exact suffix.
# NOTE(review): assumes the per-gas shares add up to the whole baseline — confirm.
is_uk_baseline = (
    bl_df['Measure Variable'].str.startswith('Baseline emissions', na=False)
    & (bl_df['Country'] == 'United Kingdom')
)
total_baseline_emissions = bl_df.loc[is_uk_baseline].sum(numeric_only=True)

# Pathway (remaining) emissions = baseline minus direct abatement.
total_pathway_emissions = total_baseline_emissions - total_abatement

In [345]:
# Add one row per aggregate variable: the yearly totals first, then the
# descriptive columns. The row label doubles as the 'Measure Variable' value.
for row_label, yearly_totals in (
    ('Baseline emissions total', total_baseline_emissions),
    ('Direct emissions total', total_pathway_emissions),
):
    agg_df.loc[row_label] = yearly_totals
    agg_df.loc[row_label, 'Country'] = 'United Kingdom'
    agg_df.loc[row_label, 'Measure Variable'] = row_label
    agg_df.loc[row_label, 'Variable Unit'] = 'MtCO2e'


In [349]:
# Append the aggregate rows to the workbook as the 'Aggregate data' sheet,
# replacing that sheet if it is already present.
with pd.ExcelWriter(
    OUTPUT_FILE,
    mode='a',
    if_sheet_exists='replace',
) as writer:
    agg_df.to_excel(writer, sheet_name='Aggregate data', index=False)

## Looking at measure definitions

In [145]:
# Measure names ('<sector>_<process>_<technology>') defined in the industry
# model versus those present in the NZIP outputs.
# The three Series come from the same frame and share its index, so they are
# concatenated element-wise. The previous `series[i] for i in range(len(...))`
# was a label lookup, which raises KeyError (or picks the wrong row) when the
# index is no longer 0..n-1 after dropna().
industry_measures = set(
    industry_sectors.astype(str) + '_' + industry_process.astype(str) + '_' + industry_tech.astype(str)
)
nzip_measures = set(
    df['Element_sector'].astype(str) + '_' + df['Process'].astype(str) + '_' + df['Selected Option'].astype(str)
)

# Measures defined in the industry model that do not occur in the NZIP outputs.
lost = industry_measures - nzip_measures

In [146]:
# Sizes of the two measure-name sets and of the `lost` set difference.
len(industry_measures), len(nzip_measures), len(lost)

(452, 145, 393)

In [15]:
# Save the lost measures, sorted and one per line, to a text file.
# The relative path resolves against the current working directory.
# An explicit encoding keeps the output identical across platforms and avoids
# UnicodeEncodeError where the default codec cannot represent a character.
with open('lost_measures.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(lost)))