<div class="alert alert-block alert-info">
<b>This script extracts data from one (or multiple) Excel files with LCIA results exported from SimaPro</b> <br> </div>

>* Each workbook includes only the activities of one specific sub-category of SimaPro's tree. <br>
>* Each sheet of the workbook includes the LCIA results of one specific LCIA method.

In [1]:
# Namespace-inspection snippets, kept commented out for ad-hoc interactive debugging
# locals()
# globals()
# dir()
# %whos

# Imports

In [2]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import project_path  # makes possible the access to `src` directory using relative path
from src.utils import explore_dir, make_readme_info, progressbar
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

# INPUTS: Identify file(s) and read data to df

In [3]:
# Locate the SimaPro LCIA exports (xlsx files) that are processed further below.
# `inputs_dir` and `files_list` are both returned by the project helper `explore_dir`.
inputs_dir, files_list = explore_dir(
    file_extension="xlsx",
    print_files_list=True,
    path_to_dir=r"..\data\external\LCIAs_from_SimaPro",
)

['Chemicals-Acids (inorganic)-Market.xlsx',
 'Chemicals-Acids (organic)-Market.xlsx',
 'Chemicals-Fertilisers (inorganic)-Market.xlsx',
 'Chemicals-Fertilisers (organic)-Market.xlsx',
 'Chemicals-Gases-Liquified-Market.xlsx',
 'Chemicals-Gases-Market.xlsx',
 'Chemicals-Inorganic-Market.xlsx',
 'Chemicals-Organic-Market.xlsx',
 'Chemicals-Others-Market.xlsx',
 'Chemicals-Pesticides-Market.xlsx',
 'Chemicals-Silicons-Market.xlsx',
 'Chemicals-Washing agents-Auxiliaries-Market.xlsx',
 'Chemicals-Washing agents-Bleaches-Market.xlsx',
 'Chemicals-Washing agents-Builders-Market.xlsx',
 'Chemicals-Washing agents-Tensides-Market.xlsx',
 'Others-Divisions 33to36-Market.xlsx',
 'OthersPartTWO-Divisions 33to36-Market.xlsx',
 'Plastics-Thermoplasts-Market.xlsx']


In [None]:
# TODO
# 1. DONE: ~the tqdm module needs ipywidgets, which could not be installed, so a new `progressbar` was created instead~
# 2. FIX LATER: a regex could be used instead of the function written earlier, `idx_of_found_item`.
# 3. FIX LATER: use a regex to convert the workbook names so that they look like `category` in df_simapro.

In [4]:
%%time
# Process raw data

# wkbRegex = re.compile((r'(-+)'))  FIX LATER: USE REGEX TO CONVERT WKB NAMES LIKE category IN df_simapro!!!
wkb_names = list(
    map(lambda st: str.replace(st, ".xlsx", ""), files_list)
)  # names of the workbooks, to be stored as well

df_lcias_raw = (
    pd.DataFrame()
)  # columns=['Method', 'Impact (or Damage) category', 'Unit'])

lcia_methods_list = []
# for idx_file, file in tqdm(enumerate(files_list), total=len(files_list), desc='Progress: ', unit=' files'):
for idx_file, file in progressbar(
    enumerate(files_list), total=len(files_list), prefix="Progress: ", unit="files"
):
    # Read a workbook
    dict_wbk = r_excel(
        path_to_file=inputs_dir,
        filename=file,
        sheets=None,  # read all sheets of the workbook into a dictionary
        engine="openpyxl",
    )

    lcia_methods_per_wkb = []
    products_per_sheet = []
    appended_sheets = pd.DataFrame(columns=["Activity"])

    for key in dict_wbk.keys():  # loop through the sheets of the workbook
        # Read raw data from the sheet
        df_per_sheet = dict_wbk.get(key)

        # There is one LCIA method per sheet of data, identify and store it.
        idx_method = idx_of_found_item(
            df_per_sheet.iloc[:, 0], "Method:"
        )  # find the index of the 0-th column where "keyword" appears
        name_method = df_per_sheet.iloc[
            idx_method, 1
        ]  # the name of the method is in [idx_method, 1-st column], store it
        lcia_methods_per_wkb.append(name_method)

        # Identify the number of products per sheet. They should be the same...
        # Products are numerated sequentially. Last product is in the cell previous to keyword "Method:"
        num_products = int(
            re.search(r"\d+", df_per_sheet.iloc[idx_method - 1, 0]).group()
        )  # extract the digits only
        products_per_sheet.append(num_products)

        # In every sheet, identify the row where the relevant data starts
        # Drop all the rows before the specified one
        # Store the data range of interest
        idx_category = idx_of_found_item(
            df_per_sheet.iloc[:, 0], "category"
        )  # find the index of the 0-th column where "keyword" appears
        name_category = df_per_sheet.iloc[
            idx_category, 0
        ]  # store the name of the cell [idx_category,0-th column]

        new_df = df_per_sheet.drop(range(idx_category), axis=0)
        new_df.dropna(axis=0, how="all", inplace=True)
        new_df.dropna(axis=1, how="all", inplace=True)
        ## -------------------------------------------------
        new_df.columns = new_df.iloc[0]  # set row 0 as columns label
        new_df.drop(new_df.index[0], inplace=True)
        new_df.insert(
            0,
            column="Method_category_unit",  # insert column with tuple values
            value=list(
                zip(
                    [name_method] * new_df.shape[0],
                    new_df.iloc[:, 0],
                    new_df.iloc[:, 1],
                )
            ),
            allow_duplicates=False,
        )
        new_df.drop(columns=list(new_df.columns[1:3]), inplace=True)

        new_df = (
            new_df.set_index("Method_category_unit")
            .T.rename_axis("Activity")
            .reset_index()
        )
        new_df.columns.name = None
        ## -------------------------------------------------
        appended_sheets = appended_sheets.merge(new_df, how="outer", on="Activity")

    appended_sheets.insert(
        0,
        column="wkbName",  # insert column with wkb_name
        value=wkb_names[idx_file],
        allow_duplicates=False,
    )

    df_lcias_raw = pd.concat(
        [df_lcias_raw, appended_sheets], axis=0, join="outer", ignore_index=True
    )

    lcia_methods_list.extend(lcia_methods_per_wkb)
df_lcia_labels = pd.DataFrame(
    list(set(lcia_methods_list)), columns=["Method"]
)  # create a df of unique methods

print(
    "df of LCIAs from SimaPro ".ljust(40, "."), f" {df_lcias_raw.shape}".rjust(13, ".")
)

# pd.set_option('precision',4)
df_lcias_raw.sample(5)

Progress: [🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩🟩] 18/18 files
df of LCIAs from SimaPro ............... ... (947, 64)
Wall time: 44.2 s


Unnamed: 0,wkbName,Activity,"(PBs-LCIA V0.71 V0.71, Climate change - CO2 concentration, ppm)","(PBs-LCIA V0.71 V0.71, Climate change - Energy imbalance, Wm-2)","(PBs-LCIA V0.71 V0.71, Stratospheric ozone depletion, DU)","(PBs-LCIA V0.71 V0.71, Ocean acidification, Omega Aragon)","(PBs-LCIA V0.71 V0.71, Biogeochemical flows - P, Tg P)","(PBs-LCIA V0.71 V0.71, Biogeochemical flows - N, Tg N)","(PBs-LCIA V0.71 V0.71, Land-system change - Global, %)","(PBs-LCIA V0.71 V0.71, Freshwater use - Global, km3)",...,"(ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A, Human health, DALY)","(ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A, Ecosystems, species.yr)","(ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A, Resources, USD2013)","(IPCC 2013 GWP 100a V1.03, IPCC GWP 100a, kg CO2 eq)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Non renewable, fossil, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Non-renewable, nuclear, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Non-renewable, biomass, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Renewable, biomass, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Renewable, wind, solar, geothe, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Renewable, water, MJ)"
803,Others-Divisions 33to36-Market,"Kerosene {CH}| market for | APOS, U - copy",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1e-06,0.0,0.536784,0.492965,54.721353,0.341231,0.000247,0.041065,0.022186,0.114452
203,Chemicals-Inorganic-Market,"Lime, hydrated, packed {RER}| market for lime,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1e-06,0.0,0.036705,0.879233,3.802496,0.271733,0.000129,0.206404,0.00644,0.269751
578,Chemicals-Organic-Market,"Polyvinylfluoride, film {GLO}| market for | AP...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.7e-05,0.0,1.413906,19.586966,232.60622,34.31616,0.00559,4.073584,2.887982,9.668768
162,Chemicals-Inorganic-Market,"Chemical, inorganic {GLO}| market for chemical...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6e-06,0.0,0.147754,1.990753,22.381175,1.917643,0.001207,0.605285,0.172606,0.804498
458,Chemicals-Organic-Market,Esters of versatic acid {RoW}| market for este...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6e-06,0.0,0.614612,2.777632,75.929309,3.553893,0.000443,0.355641,0.235225,1.337468


# Operations

## Filter/adjust data

<span style="background:red">REDUNDANT COPYING STEP? check later</span> <br>

In [5]:
# Work on an independent (deep) copy so that df_lcias_raw itself is left untouched
df_lcias = df_lcias_raw.copy(deep=True)
print(
    "df of LCIAs from SimaPro ".ljust(40, "."), f" {df_lcias.shape}".rjust(13, ".")
)
df_lcias.sample(5)

df of LCIAs from SimaPro ............... ... (947, 64)


Unnamed: 0,wkbName,Activity,"(PBs-LCIA V0.71 V0.71, Climate change - CO2 concentration, ppm)","(PBs-LCIA V0.71 V0.71, Climate change - Energy imbalance, Wm-2)","(PBs-LCIA V0.71 V0.71, Stratospheric ozone depletion, DU)","(PBs-LCIA V0.71 V0.71, Ocean acidification, Omega Aragon)","(PBs-LCIA V0.71 V0.71, Biogeochemical flows - P, Tg P)","(PBs-LCIA V0.71 V0.71, Biogeochemical flows - N, Tg N)","(PBs-LCIA V0.71 V0.71, Land-system change - Global, %)","(PBs-LCIA V0.71 V0.71, Freshwater use - Global, km3)",...,"(ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A, Human health, DALY)","(ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A, Ecosystems, species.yr)","(ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A, Resources, USD2013)","(IPCC 2013 GWP 100a V1.03, IPCC GWP 100a, kg CO2 eq)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Non renewable, fossil, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Non-renewable, nuclear, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Non-renewable, biomass, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Renewable, biomass, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Renewable, wind, solar, geothe, MJ)","(Cumulative Energy Demand V1.11 / Cumulative energy demand, Renewable, water, MJ)"
170,Chemicals-Inorganic-Market,"Cryolite {GLO}| market for | APOS, S",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.2e-05,0.0,0.292883,2.762516,44.579384,4.745689,0.009049,1.030088,0.366851,1.666231
110,Chemicals-Gases-Market,"Helium, crude stockpiling {GLO}| market for | ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5e-06,0.0,0.765002,1.952384,99.319219,0.998084,0.000737,2.267158,0.077986,0.511914
743,Chemicals-Washing agents-Builders-Market,"Zeolite, slurry, without water, in 50% solutio...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8e-06,0.0,0.168615,2.304393,27.809235,3.034753,0.000949,0.598761,0.266973,1.107703
472,Chemicals-Organic-Market,Ethylene glycol dimethyl ether {GLO}| market f...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5e-06,0.0,0.533892,2.37623,67.962501,3.519132,0.000719,0.442651,0.243415,0.994779
534,Chemicals-Organic-Market,Monochlorobenzene {RER}| market for monochloro...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7e-06,0.0,0.481162,2.66968,68.636948,5.938417,0.001069,0.863479,0.370437,1.153584


In [6]:
# General performance info about the df
# df_lcias.info()

In [7]:
# General info about the memory usage
# df_lcias.memory_usage()

# OUTPUTS: Export data to excel

In [8]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "lcia-results-from-sp910-combined.xlsx"

df_readme = make_readme_info(
    excelName,
    "Sheet1: Multiple LCIA methods results (per category) for ALL chemical markets from SimaPro910. "
    "\ndf_lcia_labels: unique names of the LCIA methods used in Sheet1.",
)

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={"Sheet1": df_lcias, "df_lcia_labels": df_lcia_labels},
    readme_info=("readme", df_readme),
    engine="xlsxwriter",
    ####         ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)

File: lcia-results-from-sp910-combined.xlsx successfully created in 
C:\Users\ViteksPC\Documents\00-ETH_projects\17-AESA_ecoinvent_chemicals\data\interim
Wall time: 1.22 s
