## Introduction

This is a Google Colab notebook for generating a preliminary industry sector databook, based on what was obtained for the first pass on 28th March. Note that this is temporary patch code that was used in the interest of time for the first pass; in addition, the output files require the manual editing that is outlined at the end of the notebook.

Key issues and a description of the code's purpose are outlined here - https://github.com/thecccuk/sector_databook_conversion/tree/main/nzip_industry_patch. These issues should be fixed for the final pass in the original code, which should also be made usable for the Fuel Supply sector - https://github.com/thecccuk/sector_databook_conversion/tree/main/nzip 

Run the cell below to get started which will import required libraries and packages.

In [None]:
colab = True  # change to False if running on your local machine, or True if running on colab

# only run on colab!
if colab:
    # Pull the patch module, the sector map and the requirements file from the repository.
    repo_dir = 'https://raw.githubusercontent.com/thecccuk/sector_databook_conversion/main/nzip_industry_patch'
    for asset in ('nzip.py', 'nzip_model_sector_map.csv', 'requirements.txt'):
        get_ipython().system(f'wget -q {repo_dir}/{asset} -O {asset}')
    # Install the packages listed in the requirements file that was just downloaded.
    get_ipython().system('pip install -q -r requirements.txt')
    
# ignore some junk output
%load_ext autoreload
%autoreload 2
import nzip
import pandas as pd
from openpyxl import load_workbook
import warnings
from google.colab import files
import os
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

## Uploading BP and AAP files

This block uploads and reads the NZIP file in addition to some data preprocessing to get it ready for sector databook format.

1. Before running the code, change the filepath variable to your NZIP file name. Please ensure it finishes with "-BP.xlsb" or "-AAP.xlsb"
2. Run the code. At the bottom, it will ask you to upload the NZIP file. This will run for 15 minutes.
3. Once it finishes, it will download a file "sd-industry-BP.xlsx" or "sd-industry-AAP.xlsx" onto your device.
4. Rerun the code for the other NZIP pathway file. After this, you should have downloaded both the "sd-industry-BP.xlsx" and "sd-industry-AAP.xlsx" files.

Notes: if you wish to retrieve an Excel version of the loaded NZIP file, remove the #'s from the lines "df.to_excel("NZIP_dataframe.xlsx")" and "files.download("NZIP_dataframe.xlsx")".

In [None]:
# Upload the NZIP workbook through the Colab file picker.
# (A stray call to an undefined `upload_files()` used to follow this line and
# stopped the cell with a NameError before anything was loaded.)
uploaded = files.upload()

#Read NZIP and output - please rename filepath to your corresponding path.
filepath = 'N-ZIP-Model_version1_2_ML_updated_25_03_2024-BP.xlsb'

# Fail early with an explicit error (asserts are stripped under `python -O`).
if not os.path.exists(filepath):
    raise FileNotFoundError(f"{filepath} not found.")
nzip_path = filepath
sector_defs_path = 'nzip_model_sector_map.csv'
sector = 'Industry'
# The scenario code is the text between the last '-' and the first following '.',
# e.g. '...-BP.xlsb' -> 'BP'. It is reused in the output file and sheet names.
scenario = filepath.split('-')[-1].split('.')[0]
output_file = f'sd-industry-{scenario}.xlsx'
df = nzip.load_nzip(nzip_path, sector_defs_path, sector, scenario)
df = nzip.add_cols(df.copy())
print('Done!')

#Uncomment lines if you wish to download the dataframe at this point.
#df.to_excel("NZIP_dataframe.xlsx")
#files.download("NZIP_dataframe.xlsx")

# Specifications for the measure-level output variables. Each entry names a
# source "timeseries", the "variable_name" / "variable_unit" to write out, and
# optionally a "weight_col" or a "scale". The list is passed to
# nzip.sd_measure_level and nzip.get_aggregate_df further down; how the keys
# are interpreted is defined in nzip.py, not in this notebook.
measure_level_kwargs = [
    # Add total direct and indirect emissions
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        "variable_name": "Abatement total direct",
        "variable_unit": "MtCO2e",
    },

    {
        "timeseries": "Total indirect emissions abated (MtCO2e)",
        "variable_name": "Abatement total indirect",
        "variable_unit": "MtCO2e",
    },

    # Add emissions by gas
    # (all three read the same direct-abatement series and differ only in "weight_col")
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        "variable_name": "Abatement emissions CO2",
        "variable_unit": "MtCO2",
        "weight_col": "% CARBON Emissions",
    },
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        "variable_name": "Abatement emissions CH4",
        "variable_unit": "MtCO2e",
        "weight_col": "% CH4 Emissions",
    },
    {
        "timeseries": "Total direct emissions abated (MtCO2e)",
        # NOTE: "N20" is spelled with a digit zero here; the combining cell later
        # renames this variable to "Abatement emissions N2O" in the measure-level sheets.
        "variable_name": "Abatement emissions N20",
        "variable_unit": "MtCO2e",
        "weight_col": "% N2O Emissions",
    },

    # Add demand
    # (source series are named in GWh, output unit is TWh, hence "scale": 1e-3)
    {
        "timeseries": "Change in electricity use (GWh)",
        "variable_name": "Additional demand electricity",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Change in natural gas use (GWh)",
        "variable_name": "Additional demand gas",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Change in petroleum use (GWh)",
        "variable_name": "Additional demand petroleum",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Change in solid fuel use (GWh)",
        "variable_name": "Additional demand solid fuel",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Change in primary bioenergy use (GWh)",
        "variable_name": "Additional demand final bioenergy",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Change in hydrogen use (GWh)",
        "variable_name": "Additional demand hydrogen",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        # NOTE(review): unlike the other demand series this name has no "(GWh)"
        # suffix - confirm it matches the series name nzip.py expects.
        "timeseries": "Change in non bio waste",
        "variable_name": "Additional demand final non-bio waste",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },

    # Add capex and opex
    {
        "timeseries": "capex",
        "variable_name": "Additional capital expenditure",
        "variable_unit": "£m",
    },
    {
        "timeseries": "capex annualised",
        "variable_name": "Additional capital expenditure annualised",
        "variable_unit": "£m",
    },
    {
        "timeseries": "capex low carbon",
        "variable_name": "Total capital expenditure low carbon",
        "variable_unit": "£m",
    },
    {
        "timeseries": "opex",
        "variable_name": "Additional operating expenditure",
        "variable_unit": "£m",
    },
    {
        "timeseries": "opex low carbon",
        "variable_name": "Total operating expenditure low carbon",
        "variable_unit": "£m",
    },

    # CCS
    {
        "timeseries": "Tonnes of CO2 captured (MtCO2)",
        "variable_name": "Additional CCS",
        "variable_unit": "MtCO2",
    },

    # these are intermediate variables
    # (output name is identical to the source series name)
    {
        "timeseries": "total emissions abated",
        "variable_name": "total emissions abated",
        "variable_unit": "MtCO2e",
    },
    {
        "timeseries": "cost differential",
        "variable_name": "cost differential",
        "variable_unit": "£m",
    },
    {
        "timeseries": "cum total emissions abated",
        "variable_name": "cum total emissions abated",
        "variable_unit": "MtCO2e",
    },
    {
        "timeseries": "cum cost differential",
        "variable_name": "cum cost differential",
        "variable_unit": "£m",
    },
   
]

# Specifications for the REEE (resource / energy efficiency) variables. Each
# entry pairs a "baseline_col" with a "post_reee_col" and names the output
# variable ("out_col") and its unit; "scale": 1e-3 accompanies GWh inputs with
# a TWh output unit. The list is passed to nzip.sd_measure_level below.
# NOTE(review): presumably the REEE effect is derived from the difference
# between the two columns - confirm against nzip.py.
reee_kwargs = [
    # Both emission entries read the same pair of columns and differ only in
    # the output variable name and unit.
    {
        "baseline_col": "Baseline emissions (MtCO2e)",
        "post_reee_col": "Post REEE baseline emissions (MtCO2e)",
        "out_col": "Abatement emissions CO2",
        "variable_unit": "MtCO2",
    },
    {
        "baseline_col": "Baseline emissions (MtCO2e)",
        "post_reee_col": "Post REEE baseline emissions (MtCO2e)",
        "out_col": "Abatement total direct",
        "variable_unit": "MtCO2e",
    },
    
    # Energy demand entries
    {
        "baseline_col": "Baseline electricity use (GWh)",
        "post_reee_col": "Post REEE baseline electricity use (GWh)",
        "out_col": "Additional demand electricity",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "baseline_col": "Baseline in natural gas use (GWh)",
        "post_reee_col": "Post REEE baseline in natural gas use (GWh)",
        "out_col": "Additional demand gas",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "baseline_col": "Baseline in petroleum use (GWh)",
        "post_reee_col": "Post REEE baseline in petroleum use (GWh)",
        "out_col": "Additional demand petroleum",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "baseline_col": "Baseline in solid fuel use (GWh)",
        "post_reee_col": "Post REEE baseline in solid fuel use (GWh)",
        "out_col": "Additional demand solid fuel",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
]

# Build the measure-level table for this pathway from a copy of the NZIP frame.
sd_df = nzip.sd_measure_level(
    df.copy(),
    measure_level_kwargs,
    scenario,
    reee_kwargs,
    nzip_path=nzip_path,
    baseline=False,
)

# modelling years
START_YEAR, END_YEAR = 2021, 2050
YEARS = [*range(START_YEAR, END_YEAR + 1)]

# Category6 .. Category20 are added as blank columns.
blank_categories = [f'Category{i}' for i in range(6, 21)]
for blank in blank_categories:
    sd_df[blank] = ''

# Column layout of the measure-level sheet: identifiers, categories, names, then one column per year.
column_order = [
    'Measure ID', 'Country', 'Sector', 'Subsector',
    'Category3: Dispersed or Cluster Site', 'Category4: Process',
    'Category5: Selected Option',
    *blank_categories,
    'Measure Name', 'Measure Variable', 'Variable Unit',
    *YEARS,
]
sd_df = sd_df[column_order]

# This first write creates the output workbook; later sheets are appended to it.
sd_df.to_excel(output_file, index=False, sheet_name=f'{scenario} Measure level data')

# write a sheet containing the measure definitions
# Each column lists the sorted unique values of one field. Columns have different
# lengths, so shorter ones are padded with blanks at the bottom.
definition_columns = ['Sector', 'Subsector', 'Measure Name'] + [
    f'Category{i+3}: {category}' for i, category in enumerate(nzip.CATEGORIES)
]
measure_defs_df = pd.DataFrame({
    # ignore_index=True is essential: a plain sort_values() keeps the original
    # index labels, and the DataFrame constructor aligns the columns on those
    # labels, which silently puts the values back in their unsorted order.
    column: pd.Series(sd_df[column].unique()).sort_values(ignore_index=True)
    for column in definition_columns
})
with pd.ExcelWriter(output_file, mode='a', if_sheet_exists='replace') as writer:
    measure_defs_df.to_excel(writer, index=False, sheet_name='Measure definitions')

# Specifications for the baseline variables, using the same keys as the
# measure-level specifications ("timeseries", "variable_name", "variable_unit",
# optional "weight_col" / "scale"). The list is passed to nzip.sd_measure_level
# (with baseline=True) and to nzip.get_aggregate_df below.
baseline_kwargs = [
    # Baseline emissions by gas: same source series, different "weight_col"
    {
        "timeseries": "Baseline emissions (MtCO2e)",
        "variable_name": "Baseline emissions CO2",
        "variable_unit": "MtCO2",
        "weight_col": "% CARBON Emissions",
    },
    {
        "timeseries": "Baseline emissions (MtCO2e)",
        "variable_name": "Baseline emissions CH4",
        "variable_unit": "MtCO2e",
        "weight_col": "% CH4 Emissions",
    },
    {
        "timeseries": "Baseline emissions (MtCO2e)",
        "variable_name": "Baseline emissions N2O",
        "variable_unit": "MtCO2e",
        "weight_col": "% N2O Emissions",
    },
    # Baseline energy demand: GWh source series, TWh output unit, "scale": 1e-3
    {
        "timeseries": "Baseline electricity use (GWh)",
        "variable_name": "Baseline demand electricity",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Baseline in natural gas use (GWh)",
        "variable_name": "Baseline demand gas",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Baseline in petroleum use (GWh)",
        "variable_name": "Baseline demand petroleum",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Baseline in solid fuel use (GWh)",
        "variable_name": "Baseline demand solid fuel",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Baseline in primary bioenergy use (GWh)",
        "variable_name": "Baseline demand final bioenergy",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    {
        "timeseries": "Baseline in hydrogen use (GWh)",
        "variable_name": "Baseline demand hydrogen",
        "variable_unit": "TWh",
        "scale": 1e-3,
    },
    # Counterfactual costs are reported as the baseline expenditure variables
    {
        "timeseries": "Counterfactual capex (£m)",
        "variable_name": "Baseline capital expenditure",
        "variable_unit": "£m",
    },
    {
        "timeseries": "Counterfactual opex (£m)",
        "variable_name": "Baseline operating expenditure",
        "variable_unit": "£m",
    },
]

# Build the baseline rows at measure level, then convert them to the baseline layout.
bl_df = nzip.baseline_from_measure_level(
    nzip.sd_measure_level(df, baseline_kwargs, scenario, baseline=True)
)

# modelling years
START_YEAR, END_YEAR = 2021, 2050
YEARS = [*range(START_YEAR, END_YEAR + 1)]

# Category6 .. Category20 are added as blank columns.
blank_categories = [f'Category{i}' for i in range(6, 21)]
for blank in blank_categories:
    bl_df[blank] = ''

# Column layout of the baseline sheet: identifiers, categories, variable, unit, then years.
column_order = [
    'Country', 'Sector', 'Subsector',
    'Category3: Dispersed or Cluster Site', 'Category4: Process',
    'Category5: Selected Option',
    *blank_categories,
    'Baseline Variable', 'Variable Unit',
    *YEARS,
]
bl_df = bl_df[column_order]

# Append the baseline sheet to the workbook written earlier in this cell.
with pd.ExcelWriter(output_file, mode='a', if_sheet_exists='replace') as writer:
    bl_df.to_excel(writer, index=False, sheet_name='Baseline data')

# The aggregate sheet is computed only after the baseline sheet has been saved.
agg_df = nzip.get_aggregate_df(df, measure_level_kwargs, scenario, baseline_kwargs, sector)
with pd.ExcelWriter(output_file, mode='a', if_sheet_exists='replace') as writer:
    agg_df.to_excel(writer, index=False, sheet_name='Aggregate data')

print('Done!')
files.download(output_file)

## Combining and processing BP and AAP Pathways



1. Ensure from the previous code you have downloaded the "sd-industry-BP.xlsx" and "sd-industry-AAP.xlsx" files.
2. Run the code below and upload BOTH files.
3. The code will run for up to 5 minutes and download a final sd-industry.xlsx


##### The code is doing the following:
- Combines the AAP and BP tabs.
- Subsector occurrences of 'Iron and Steel' are replaced with 'Iron and steel'. This is because the previous outputs contained both 'Iron and Steel' and 'Iron and steel' as subsectors.
- For REEE measures the following columns are cleared as not applicable ["Category3: Dispersed or Cluster Site", "Category4: Process", "Measure ID"]
- For REEE measures and variables, TWh units are corrected for MtCO2e for ['Abatement emissions CO2', 'Abatement total direct'] measure variables
- For REEE measures, only United Kingdom is retained as country.
- For REEE measures, 'Category5: Selected Option' is changed to represent Resource Efficiency or Energy Efficiency
- Unique Measure IDs are created based on combinations of ['Subsector', 'Category4: Process', 'Category5: Selected Option']
- Baseline data is cleaned up by removal of values in ['Category5: Selected Option'] then aggregations are performed for where duplicates are made.
- Measures names are updated where 'Other' values are present to BECCS (Calcium looping), Resource Efficiency (Strong LDAR), and Electrification (Process Change, EAF)
- Corrections are made in the BP/AAP measure level data where the measure variable is 'Abatement emissions N20' (from NZIP); it is corrected to 'Abatement emissions N2O'.
  This is done with `replace_values(bp_measure_level_data, 'Measure Variable', 'Abatement emissions N20', 'Abatement emissions N2O')` and the equivalent call for the AAP data.
- Using the measure IDs a measure attributes tab is generated.

##### Current manual edits required after file download and known issues
- Scenario labelling of "AAP" or "BP" needs to be changed to "Additional Action Pathway" or "Balanced Pathway" this is in the "Aggregate data" tab
- In the "Aggregate data" tab the "Additional demand gas abated" values in "Aggregate Variable" column need be removed/fixed for UK only and all values for the years should be changed to zero.
- From the original NZIP files (you can collect a streamlined Excel file in the first part of the code), the remaining emissions (all and traded) for 2021-2050 need to be collected independently and used to replace the "Direct emissions total" and "Direct traded emissions total" values for 2021-2050 in the "Aggregate data" tab. This is because the current values are incorrect, as there seems to be an issue with accounting for REEE emissions/abatement.
- Measure IDs tab needs to be deleted.
- Deployment+behavioural metrics tab is created and calculated manually
- Issues with how the RE and EE abatements are being calculated in the original code.

In [30]:
# Prompt for both pathway workbooks produced by the previous cell.
files.upload()

bp_file_path = "sd-industry-BP.xlsx"
aap_file_path = "sd-industry-AAP.xlsx"

# Combined file path
combined_file_path = 'sd-industry-combined.xlsx'

# Load sheets from BP file (the baseline sheet is taken from the BP workbook only)
bp_sheets = pd.read_excel(
    bp_file_path,
    sheet_name=['BP Measure level data', 'Baseline data', 'Aggregate data', 'Measure definitions'],
)
bp_measure_level = bp_sheets['BP Measure level data']
baseline = bp_sheets['Baseline data']
bp_aggregate = bp_sheets['Aggregate data']
bp_measure_defs = bp_sheets['Measure definitions']

# Load sheets from AAP file
aap_sheets = pd.read_excel(
    aap_file_path,
    sheet_name=['AAP Measure level data', 'Aggregate data', 'Measure definitions'],
)
aap_measure_level = aap_sheets['AAP Measure level data']
aap_aggregate = aap_sheets['Aggregate data']
aap_measure_defs = aap_sheets['Measure definitions']

# Combine Aggregate sheets, keeping the first of any rows that share the
# identifying columns (this removes the duplicate Baseline rows)
aggregate_key = ['Scenario', 'Country', 'Sector', 'Aggregate Variable', 'Variable Unit']
aggregate_combined = (
    pd.concat([bp_aggregate, aap_aggregate], ignore_index=True)
    .drop_duplicates(subset=aggregate_key, keep='first')
)

# Prefix the Measure Definitions column names with their pathway
bp_measure_defs = bp_measure_defs.add_prefix('BP ')
aap_measure_defs = aap_measure_defs.add_prefix('AAP ')

# Write to a new Excel file; sheets are created in the order listed here
combined_sheets = {
    'BP Measure level data': bp_measure_level,
    'Baseline data': baseline,
    'Aggregate data': aggregate_combined,
    'AAP Measure level data': aap_measure_level,
    'Measure definitions BP': bp_measure_defs,
    'Measure definitions AAP': aap_measure_defs,
}
with pd.ExcelWriter(combined_file_path) as writer:
    for sheet_name, sheet in combined_sheets.items():
        sheet.to_excel(writer, sheet_name=sheet_name, index=False)

#files.download(combined_file_path)

print('Now we are aggregating the REEE measures for each UK-only subsectors...')

# Define the path to your Excel file
excel_file_path = 'sd-industry-combined.xlsx'  # Replace with your actual file path

# Read the two measure-level sheets. The context manager closes the workbook
# handle again, which matters because the same file is reopened for writing
# further down in this cell.
with pd.ExcelFile(excel_file_path) as xl:
    bp_measure_level_data = xl.parse('BP Measure level data')
    aap_measure_level_data = xl.parse('AAP Measure level data')

# Standardize 'Iron and Steel' to 'Iron and steel' in 'Subsector' columns
bp_measure_level_data['Subsector'] = bp_measure_level_data['Subsector'].replace('Iron and Steel', 'Iron and steel')
aap_measure_level_data['Subsector'] = aap_measure_level_data['Subsector'].replace('Iron and Steel', 'Iron and steel')

# Define the year columns for aggregation (2021..2050 inclusive) and the
# columns that identify one aggregated REEE row
year_columns = list(range(2021, 2051))
grouping_columns = ["Country", "Sector", "Subsector", "Measure Variable"]

def process_efficiency(df, term, group_by=None, years=None):
    """Split out and aggregate the efficiency (REEE) rows of a measure-level sheet.

    Rows whose 'Measure Name' contains ``term`` are taken out of the sheet and
    summed over the year columns, one output row per combination of the
    grouping columns plus 'Measure Name'. Only 'United Kingdom' rows of the
    aggregate are kept. The aggregate has no 'Measure ID', 'Category3: ...' or
    'Category4: ...' columns, so these come out blank once it is concatenated
    back onto the sheet.

    Args:
        df: Measure-level DataFrame. It is not modified.
        term: Text to look for in 'Measure Name' (matched literally, not as a
            regular expression; missing names never match).
        group_by: Columns identifying an aggregated row. Defaults to the
            module-level ``grouping_columns``.
        years: Year columns to sum. Defaults to the module-level
            ``year_columns``.

    Returns:
        A tuple ``(remaining, aggregated)``: the rows of ``df`` that do not
        match ``term``, and the aggregated UK-only rows with a
        'Variable Unit' column added.
    """
    if group_by is None:
        group_by = grouping_columns
    if years is None:
        years = year_columns

    is_efficiency = df['Measure Name'].str.contains(term, na=False, regex=False)

    # Aggregate and keep 'Measure Name'
    aggregated_df = (
        df[is_efficiency]
        .groupby(list(group_by) + ['Measure Name'])[list(years)]
        .sum()
        .reset_index()
    )

    # Units follow the variable definitions used when the sheets were produced:
    # CO2-only abatement is MtCO2, total direct abatement is MtCO2e, and every
    # other REEE variable is an energy demand in TWh.
    emission_units = {
        'Abatement emissions CO2': 'MtCO2',
        'Abatement total direct': 'MtCO2e',
    }
    aggregated_df['Variable Unit'] = aggregated_df['Measure Variable'].apply(
        lambda variable: emission_units.get(variable, 'TWh'))

    # REEE measures are reported for the United Kingdom only
    aggregated_df = aggregated_df[aggregated_df['Country'] == 'United Kingdom']

    return df[~is_efficiency], aggregated_df

# Apply efficiency processing to BP and AAP data: for each term the matching
# rows are replaced by their UK-only aggregate, appended at the end of the sheet
efficiency_terms = ['Resource Efficiency', 'Energy Efficiency']
for term in efficiency_terms:
    bp_measure_level_data, bp_aggregated = process_efficiency(bp_measure_level_data, term)
    bp_measure_level_data = pd.concat([bp_measure_level_data, bp_aggregated], ignore_index=True)
    aap_measure_level_data, aap_aggregated = process_efficiency(aap_measure_level_data, term)
    aap_measure_level_data = pd.concat([aap_measure_level_data, aap_aggregated], ignore_index=True)

# Update 'Category5: Selected Option' for both sheets: rows whose measure name
# is exactly an efficiency term get that term as their selected option.
# The loop variable is deliberately not called `df`, so the NZIP DataFrame
# loaded by the earlier cell is not overwritten.
for measure_frame in (bp_measure_level_data, aap_measure_level_data):
    for term in efficiency_terms:
        measure_frame.loc[measure_frame['Measure Name'] == term, 'Category5: Selected Option'] = term

# Write the updated data back into the Excel file, replacing the existing
# sheets of the same name (same mechanism as the other append-mode writes in
# this notebook)
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    bp_measure_level_data.to_excel(writer, sheet_name='BP Measure level data', index=False)
    aap_measure_level_data.to_excel(writer, sheet_name='AAP Measure level data', index=False)

print('We are now aggregating the baseline sheet and doing some data cleaning and creating unique measure IDs!')

# Define the path to the Excel file
combined_file_path = 'sd-industry-combined.xlsx'

# Load the specific sheets into separate DataFrames. The context manager
# closes the workbook handle once all four sheets have been read.
with pd.ExcelFile(combined_file_path) as xl:
    bp_measure_level_data = xl.parse('BP Measure level data')
    aap_measure_level_data = xl.parse('AAP Measure level data')
    baseline_data = xl.parse('Baseline data')
    aggregate_data = xl.parse('Aggregate data')

# Add a 'Source' column to each DataFrame so the pathways can be separated
# again after IDs have been assigned on the combined data
bp_measure_level_data['Source'] = 'BP'
aap_measure_level_data['Source'] = 'AAP'

# Combine BP and AAP measure level data
combined_measure_data = pd.concat([bp_measure_level_data, aap_measure_level_data]).reset_index(drop=True)

# Function to create unique Measure ID
def create_unique_measure_id(df):
    """Assign a 'Measure ID' to every row of ``df`` in place.

    Rows that share the same combination of 'Subsector', 'Category4: Process'
    and 'Category5: Selected Option' (compared as strings joined with '-')
    receive the same ID. Combinations are numbered from 1 in order of first
    appearance and formatted as 'NN_Both', zero-padded to at least two digits.

    Args:
        df: DataFrame containing the three identifying columns; its
            'Measure ID' column is created or overwritten.

    Returns:
        None. ``df`` is modified in place.
    """
    key_columns = ['Subsector', 'Category4: Process', 'Category5: Selected Option']
    combo_keys = df[key_columns].astype(str).apply('-'.join, axis=1)

    # dict.fromkeys keeps first-seen order, which fixes the numbering.
    sequence = {key: position for position, key in enumerate(dict.fromkeys(combo_keys), start=1)}
    df['Measure ID'] = [f"{sequence[key]:02d}_Both" for key in combo_keys]

# Apply the function to the combined data so BP and AAP share one ID numbering
create_unique_measure_id(combined_measure_data)

# Split the combined data back into BP and AAP measure level data
bp_measure_level_data = combined_measure_data[combined_measure_data['Source'] == 'BP'].drop('Source', axis=1)
aap_measure_level_data = combined_measure_data[combined_measure_data['Source'] == 'AAP'].drop('Source', axis=1)

# Clean up the 'Baseline data': the selected option is blanked and the column
# renamed to plain 'Category5'
baseline_data['Category5: Selected Option'] = pd.NA
baseline_data.rename(columns={'Category5: Selected Option': 'Category5'}, inplace=True)

# Specify the columns to check for duplicates
columns_to_check = ['Country', 'Sector', 'Subsector', 'Category3: Dispersed or Cluster Site', 
                    'Category4: Process', 'Baseline Variable']

# Specify the year columns for aggregation (2021..2050 inclusive)
year_columns = list(range(2021, 2051))

# Identify duplicates (every member of a duplicated group is flagged)
baseline_data['is_duplicate'] = baseline_data.duplicated(subset=columns_to_check, keep=False)

# Aggregate and update year columns for duplicates: the first row of each group
# receives the group's yearly sums.
# dropna=False is required because duplicated()/drop_duplicates() treat missing
# key values as equal. Without it, groups with a missing key would be skipped
# here and their extra rows would then be dropped below without being summed.
for _, group in baseline_data[baseline_data['is_duplicate']].groupby(columns_to_check, dropna=False):
    sums = group[year_columns].sum()
    first_index = group.index[0]
    baseline_data.loc[first_index, year_columns] = sums.values

# Remove the remaining duplicates, keeping the first (now summed) row of each group
baseline_data = baseline_data.drop_duplicates(subset=columns_to_check).drop(columns=['is_duplicate'])

# Create Measure IDs sheet from the combined data: one row per distinct ID / key combination
measure_ids_data = combined_measure_data[['Measure ID','Subsector', 
                                          'Category4: Process', 'Category5: Selected Option'
                                          ]].copy()
measure_ids_data = measure_ids_data.drop_duplicates().sort_values(by='Measure ID').reset_index(drop=True)

# Save the data to a new Excel file
output_path = 'sd-industry.xlsx'

# Function to update 'Measure Name' based on 'Category5: Selected Option'
def update_measure_name(df):
    """Overwrite 'Measure Name' in place for a fixed set of selected options.

    Rows are matched on 'Category5: Selected Option':

    * 'BECCS 1 - Calcium Looping' / 'BECCS 2 - Calcium Looping' -> 'BECCS'
    * 'Strong LDAR' -> 'Resource Efficiency'
    * 'Process change' / 'EAF' -> 'Electrification'

    All other rows keep their existing 'Measure Name'.

    Args:
        df: DataFrame with 'Category5: Selected Option' and 'Measure Name'
            columns.

    Returns:
        None. ``df`` is modified in place.
    """
    # Implemented with pandas only: numpy is not imported by this notebook, so
    # the previous np.select call stopped with a NameError. The option sets are
    # disjoint, so the order of the assignments does not matter.
    name_by_options = {
        'BECCS': ['BECCS 1 - Calcium Looping', 'BECCS 2 - Calcium Looping'],
        'Resource Efficiency': ['Strong LDAR'],
        'Electrification': ['Process change', 'EAF'],
    }
    selected_option = df['Category5: Selected Option']
    for measure_name, options in name_by_options.items():
        df.loc[selected_option.isin(options), 'Measure Name'] = measure_name

# Update 'Measure Name' in both dataframes
# (update_measure_name assigns the column on the frame it is given, so the
# result does not need to be captured)
update_measure_name(bp_measure_level_data)
update_measure_name(aap_measure_level_data)

# Function to replace specific values in a column
def replace_values(df, column_name, old_value, new_value):
    """Replace ``old_value`` with ``new_value`` in one column of ``df``, in place.

    Args:
        df: DataFrame to update.
        column_name: Name of the column to rewrite.
        old_value: Value to look for.
        new_value: Value to put in its place.

    Returns:
        None. The column is reassigned on ``df``.
    """
    original = df[column_name]
    updated = original.replace(old_value, new_value)
    df[column_name] = updated

# Correct the 'N20' (digit zero) spelling to 'N2O' in the 'Measure Variable'
# column of both pathway sheets
for pathway_frame in (bp_measure_level_data, aap_measure_level_data):
    replace_values(pathway_frame, 'Measure Variable', 'Abatement emissions N20', 'Abatement emissions N2O')

# Write the final workbook; sheets are created in the order listed here
final_sheets = {
    'BP Measure level data': bp_measure_level_data,
    'AAP Measure level data': aap_measure_level_data,
    'Baseline data': baseline_data,
    'Aggregate data': aggregate_data,
    'Measure IDs': measure_ids_data,
}
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
    for sheet_name, sheet in final_sheets.items():
        sheet.to_excel(writer, sheet_name=sheet_name, index=False)

print('Finally, we now create the measure attributes tab...')

# Step 1: Read the 'Measure IDs' sheet back from the workbook just written
sd_industry_path = 'sd-industry.xlsx'
measure_ids_df = pd.read_excel(sd_industry_path, sheet_name='Measure IDs')

# Step 2: Define the structure of the 'Measure attributes' DataFrame
# The column names and their order are assumed from the original
# 'Measure attributes' sheet; they are placeholders and should be replaced
# with the actual names if those differ.
column_names = (
    [
        "Measure ID", "Scenario", "Sector", "Subsector", "Category3", "Category4: Process",
        "Category5: Selected Option",
    ]
    + [f"Category{number}" for number in range(6, 21)]
    + [
        "Measure Name", "Measure Description",
        "Scotland", "Wales", "Northern Ireland", "Optimism bias", "Cost with optimism bias",
        "Cost of capital (optional)", "Asset lifetime",
        "Reducing demand for carbon-intensive activities",
        "Improved efficiency in use of energy and resources",
        "Expansion of low-carbon energy (hydrogen and electricity)",
        "Take-up of low carbon solutions: electrification",
        "Take up of low carbon solutions: hydrogen and other low-carbon tech",
        "Take up of low carbon solutions: CO2 capture from fossil fuels and industry",
        "Offsetting emissions: Natural carbon storage",
        "Offsetting emissions: engineered greenhouse removals",
        "Investment: private", "Investment: public", "Investment: household",
        "Business supply", "Business adopt", "Business adopt percentage", "Type of choice",
        "Percentage household green choices",
    ]
)

# Create an empty DataFrame with these columns
measure_attributes_df = pd.DataFrame(columns=column_names)

# Populate the DataFrame. 'Measure ID' is assigned first so that the frame has
# its rows before the constant columns are filled in.
measure_attributes_df['Measure ID'] = measure_ids_df['Measure ID']
measure_attributes_df['Scenario'] = "Both Pathways"
measure_attributes_df['Sector'] = "Industry"
for copied_column in ('Subsector', 'Category4: Process', 'Category5: Selected Option'):
    measure_attributes_df[copied_column] = measure_ids_df[copied_column]

# Create a function to handle empty or NaN parts in the Measure Description
def create_description(row):
    """Build a measure description from the identifying fields of ``row``.

    The 'Subsector', 'Category4: Process' and 'Category5: Selected Option'
    values are joined with underscores, in that order, skipping any value that
    is missing (NaN/None) or an empty string.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with the three fields.

    Returns:
        The joined description; an empty string if every field is skipped.
    """
    kept = []
    for field in ('Subsector', 'Category4: Process', 'Category5: Selected Option'):
        value = row[field]
        if pd.isna(value) or value == '':
            continue
        kept.append(value)
    return '_'.join(kept)

# Generate the Measure Description for every row of the Measure IDs sheet
measure_attributes_df['Measure Description'] = measure_ids_df.apply(create_description, axis=1)

# Leave other columns blank or fill them as required
# For example:
# measure_attributes_df['Other Column 1'] = 'Default Value' or ''
# measure_attributes_df['Other Column 2'] = 'Another Value' or ''

# Step 4: Add the DataFrame as a new sheet to 'sd-industry.xlsx', removing any
# sheet of the same name that is already in the workbook
attributes_sheet = 'Measure attributes'
with pd.ExcelWriter(sd_industry_path, engine='openpyxl', mode='a') as writer:
    workbook = writer.book
    if attributes_sheet in workbook.sheetnames:
        workbook.remove(workbook[attributes_sheet])
    measure_attributes_df.to_excel(writer, sheet_name=attributes_sheet, index=False)

print("Done! Note manual edits will be required as mentioned in instructions at the top of the cell.")

files.download(sd_industry_path)

We are combining the BP and AAP pathway files into one...
and... Done!
