In [2]:
import pandas as pd

In [3]:
# First and last year of the exported time series; both ends are inclusive
# (every use below iterates range(START_YEAR, END_YEAR + 1)).
START_YEAR, END_YEAR = 2020, 2050

# Labels written to the 'Country' column of the sector databook output.
DEVOLVED_AUTHS = [
    'United Kingdom',
    'Scotland',
    'Wales',
    'Northern Ireland',
]

# Gas identifiers, spelled as in the '% <gas> Emissions' column names.
GASES = [
    'CARBON',
    'CH4',
    'N2O',
]

### Load data

#### Step 1: Get measure definitions

In [4]:
industry_model_path = 'Sam Industry Model.xlsx'
with open(industry_model_path, 'rb') as f:
    # header=12 is zero-based: the column titles sit on the 13th sheet row.
    # Only sheet columns B:D are read.
    measure_defs = pd.read_excel(f, 'Measure definitions', header=12, usecols='B:D')

# Drop incomplete rows, then renumber the index 0..n-1. Later cells look these
# series up by integer label (`industry_sectors[i]` for i in range(len(...)));
# without the reset, any row removed by dropna() would leave a gap in the
# labels and make those lookups raise KeyError.
measure_defs = measure_defs.dropna().reset_index(drop=True)

# One entry per measure; the three series are aligned row by row.
industry_sectors = measure_defs['EE Sector']
industry_process = measure_defs['Process']
industry_tech = measure_defs['Abatement Technology']

  for idx, row in parser.parse():


#### Step 2: Load NZIP Outputs

In [5]:
nzip_path = 'N-ZIP Model - V1.1.xlsb'

# Only the column labels are needed from this workbook: nrows=0 returns an
# empty frame whose columns are the titles found on the header row.
with open(nzip_path, 'rb') as f:
    nzip_df = pd.read_excel(
        f,
        sheet_name='CCC Outputs',
        header=10,
        usecols='F:CWV',
        nrows=0,
    )

cols = nzip_df.columns

In [6]:
# Read as many columns as there are N-ZIP headers, starting at zero-based
# sheet column 2.
col_idxs = range(2, 2 + len(cols))
with open(industry_model_path, 'rb') as f:
    df = pd.read_excel(
        f,
        sheet_name='DD Outputs',
        header=9,
        usecols=[*col_idxs],
    )

# correct column names: replace the labels read from 'DD Outputs' with the
# N-ZIP ones, matched by position
df.columns = cols

  for idx, row in parser.parse():


In [7]:
# fix missing and incorrect values

# Per-gas share columns: anything that cannot be parsed as a number becomes
# NaN here and is turned into 0 by the final fillna below.
for gas in GASES:
    share_col = f'% {gas} Emissions'
    df[share_col] = pd.to_numeric(df[share_col], errors='coerce')

# Text column: use an empty string for missing options so it is not filled
# with the numeric 0 used for every other column.
df['Selected Option'] = df['Selected Option'].fillna('')

df = df.fillna(0)

### Abatement emissions

In [8]:
def sector_databook_format(df, variable_name, variable_unit):
    """Lay out an aggregated frame in the sector databook column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated values. Must provide 'Process' and 'Selected Option'
        (as index levels or as columns) and a 'Country' column. The caller's
        frame is not modified.
    variable_name : str
        Value written to the 'Measure Variable' column.
    variable_unit : str
        Value written to the 'Variable Unit' column.

    Returns
    -------
    pandas.DataFrame
        The six descriptive columns ('Country', 'Sector', 'Subsector',
        'Measure Name', 'Measure Variable', 'Variable Unit') followed by the
        remaining columns in sorted order. The columns produced by resetting
        the index, and the 'Process' / 'Selected Option' columns used to
        build the measure name, are left out.
    """
    columns_before = set(df.columns)
    df = df.reset_index()
    # Columns that reset_index() just created from the index levels.
    index_cols = [col for col in df.columns if col not in columns_before]

    df['Sector'] = 'Industry'
    df['Subsector'] = 'Manufacturing'
    df['Measure Name'] = df['Sector'] + '_' + df['Process'] + '_' + df['Selected Option']
    df['Measure Variable'] = variable_name
    df['Variable Unit'] = variable_unit

    # reorder columns
    first_cols = ['Country', 'Sector', 'Subsector', 'Measure Name', 'Measure Variable', 'Variable Unit']

    # Remove the helper columns by name. Dropping "the last three columns after
    # sorting" only works while nothing else happens to sort after the year
    # labels; naming them keeps the output correct if extra columns are present.
    helper_cols = set(index_cols) | {'Process', 'Selected Option'}
    value_cols = sorted(set(df.columns) - set(first_cols) - helper_cols)
    return df[first_cols + value_cols]

def aggregate_timeseries_contry(df, timeseries, variable_name, variable_unit, weight_col=None, country='United Kingdom', scale=None):
    """Aggregate one yearly time series per measure and format it for the databook.

    Parameters
    ----------
    df : pandas.DataFrame
        Model output with one '<timeseries> <year>' column per year from
        START_YEAR to END_YEAR (inclusive) plus the 'Element_sector',
        'Process' and 'Selected Option' columns. It is not modified.
    timeseries : str
        Prefix of the yearly columns to aggregate.
    variable_name : str
        Value for the 'Measure Variable' output column.
    variable_unit : str
        Value for the 'Variable Unit' output column.
    weight_col : str, optional
        Name of a column of `df`; each row's yearly values are multiplied by
        that row's entry before aggregation.
    country : str
        Label written to the 'Country' output column.
    scale : number, optional
        Constant factor applied to every yearly value.

    Returns
    -------
    pandas.DataFrame
        One row per measure listed in the industry measure definitions that
        also occurs in `df`, laid out by `sector_databook_format`.

    Raises
    ------
    KeyError
        If a required column is missing from `df`.
    """
    # NOTE(review): `country` only labels the output. Rows are neither filtered
    # nor scaled per country, so every country receives the same totals.
    # Confirm this is intended.
    years = range(START_YEAR, END_YEAR + 1)
    key_cols = ['Element_sector', 'Process', 'Selected Option']

    # get the emissions time series columns
    total_emissions_cols = [f'{timeseries} {y}' for y in years]
    emissions_cols = [str(y) for y in years]

    # Work on a private copy relabelled with plain year names. Writing the year
    # columns back into `df` would mutate the caller's frame on every call.
    values = df[total_emissions_cols].copy()
    values.columns = emissions_cols
    if weight_col:
        values = values.multiply(df[weight_col], axis=0)
    # `is not None` so that an explicit scale of 0 is applied rather than ignored.
    if scale is not None:
        values = values * scale

    # sum rows corresponding to the same measure
    agg_emissions_df = values.groupby([df[col] for col in key_cols]).sum()

    # remove measures not pertaining to the industry model; zip() walks the
    # three definition series by position, so it does not depend on their
    # index labels being exactly 0..n-1
    wanted = zip(industry_sectors, industry_process, industry_tech)
    row_idxs = [key for key in wanted if key in agg_emissions_df.index]
    agg_emissions_df = agg_emissions_df.loc[row_idxs].copy()

    # add country column
    agg_emissions_df['Country'] = country

    # format as sector databook
    return sector_databook_format(agg_emissions_df, variable_name, variable_unit)

def aggregate_timeseries(df, **kwargs):
    """Run `aggregate_timeseries_contry` once per entry of DEVOLVED_AUTHS and stack the results.

    `kwargs` is forwarded unchanged to every call; `country` is supplied here,
    so it must not be passed by the caller. The frames are concatenated in
    DEVOLVED_AUTHS order and keep their own row indices.
    """
    per_country = []
    for authority in DEVOLVED_AUTHS:
        per_country.append(aggregate_timeseries_contry(df, country=authority, **kwargs))
    return pd.concat(per_country)

In [31]:
# Empty sector databook: six descriptive columns followed by one column per
# year from START_YEAR to END_YEAR (inclusive), labelled with the year as text.
sd_cols = [
    'Country',
    'Sector',
    'Subsector',
    'Measure Name',
    'Measure Variable',
    'Variable Unit',
    *(str(year) for year in range(START_YEAR, END_YEAR + 1)),
]
sd_df = pd.DataFrame(columns=sd_cols)

In [32]:
# add total direct and indirect emissions, followed by the direct emissions
# broken down by gas (each row weighted by that gas's share column)
abatement_specs = [
    dict(timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement total direct', variable_unit='MtCO2e'),
    dict(timeseries="Total indirect emissions abated (MtCO2e)", variable_name='Abatement total indirect', variable_unit='MtCO2e'),
    dict(timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement emissions CO2', variable_unit='MtCO2', weight_col='% CARBON Emissions'),
    dict(timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement emissions CH4', variable_unit='MtCO2e', weight_col='% CH4 Emissions'),
    # Label spelled 'N2O' (letter O) to match the '% N2O Emissions' column;
    # it previously read 'N20' with a digit zero.
    dict(timeseries="Total direct emissions abated (MtCO2e)", variable_name='Abatement emissions N2O', variable_unit='MtCO2e', weight_col='% N2O Emissions'),
]

# One concat for the whole cell instead of re-copying sd_df after every series.
sd_df = pd.concat([sd_df] + [aggregate_timeseries(df, **spec) for spec in abatement_specs])

In [33]:
# add demand
# Source columns are in GWh; scale=1e-3 converts them to the TWh unit reported.
demand_series = {
    "Change in electricity use (GWh)": 'Additional demand electricity',
    "Change in natural gas use (GWh)": 'Additional demand gas',
    "Change in petroleum use (GWh)": 'Additional demand petroleum',
    "Change in solid fuel use (GWh)": 'Additional demand solid fuel',
    "Change in primary bioenergy use (GWh)": 'Additional demand final bioenergy',
    "Change in hydrogen use (GWh)": 'Additional demand hydrogen',
}
demand_frames = [
    aggregate_timeseries(df, timeseries=source, variable_name=label, variable_unit='TWh', scale=1e-3)
    for source, label in demand_series.items()
]
sd_df = pd.concat([sd_df, *demand_frames])

In [34]:
# add capex and opex cols
# For every year, store the AM figure minus its counterfactual as
# 'capex <year>' and 'opex <year>' so they can be aggregated like any other
# '<prefix> <year>' series.
for year in range(START_YEAR, END_YEAR + 1):
    am_capex = df[f'AM capex (£m) {year}']
    counterfactual_capex = df[f'Counterfactual capex (£m) {year}']
    df[f'capex {year}'] = am_capex - counterfactual_capex

    # counterfactual running cost = opex + fuel
    counterfactual_running = df[f'Counterfactual opex (£m) {year}'] + df[f'Counterfactual fuel costs (£m) {year}']
    df[f'opex {year}'] = df[f'Total AM costs (£m) {year}'] - counterfactual_running

# note, CB6 sector templates instead use
#df['AM opex (£m) 2020'] + df['AM fuel costs 2020'] - (df['Counterfactual opex (£m) 2020'] + df['Counterfactual fuel costs 2020'])

In [35]:
# Append the additional capital and operational expenditure series built from
# the 'capex <year>' / 'opex <year>' columns.
expenditure_series = [
    ("capex", 'Additional capital expenditure'),
    ("opex", 'Additional operational expenditure'),
]
expenditure_frames = [
    aggregate_timeseries(df, timeseries=prefix, variable_name=label, variable_unit='£m')
    for prefix, label in expenditure_series
]
sd_df = pd.concat([sd_df, *expenditure_frames])

In [37]:
# Write the assembled databook to an Excel workbook, without the row index.
sd_df.to_excel(
    excel_writer='sd-test.xlsx',
    index=False,
)

## Looking at measure definitions

In [13]:
# Measure keys have the form '<sector>_<process>_<technology>' on both sides.
# zip() walks the three definition series by position, so the keys do not
# depend on the series' index labels being exactly 0..n-1.
industry_measures = {
    f'{sector}_{process}_{tech}'
    for sector, process, tech in zip(industry_sectors, industry_process, industry_tech)
}
nzip_measures = set(df['Element_sector'].astype(str) + '_' + df['Process'].astype(str) + '_' + df['Selected Option'].astype(str))

# Measures defined for the industry model that never occur in the model output.
lost = industry_measures - nzip_measures

In [14]:
# Sizes of: defined measures, measures present in the model output, defined-but-absent measures.
tuple(map(len, (industry_measures, nzip_measures, lost)))

(452, 180, 365)

In [15]:
# save list of lost measures to a text file in the same directory as the excel files
# (one measure per line, in sorted order, no trailing newline)
lost_report = '\n'.join(sorted(lost))
with open('lost_measures.txt', 'w') as f:
    f.write(lost_report)