<div class="alert alert-block alert-info">
<b>This script extracts data from one (or multiple) Excel files with LCIA results exported from SimaPro</b> <br> </div>

>*  each workbook includes only activities of a specific sub-category of SimaPro's Tree,  <br>
>* each sheet of the workbook includes LCIA results of one specific LCIA method

In [None]:
# Namespace-inspection helpers -- uncomment one of them while debugging:
# locals()
# globals()
# dir()
# %whos

# Imports

In [None]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import os

import project_path  # makes possible the access to `src` directory using relative path
from src.utils import explore_dir, make_readme_info, progressbar
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

# INPUTS: Identify file(s) and read data to df

In [None]:
# Explore the directory to find the file(s)
# The path is assembled with `os.path.join` so that the separator matches the
# operating system (a hard-coded "..\data\..." string only resolves on Windows).
inputs_dir, files_list = explore_dir(
    path_to_dir=os.path.join("..", "data", "external", "LCIAs_from_SimaPro"),
    file_extension="xlsx",
    print_files_list=True,
)

In [None]:
# TODO
# 1. DONE: the tqdm module needs ipywidgets, which could not be installed, so a custom `progressbar` was created instead.
# 2. FIX LATER: a regex could be used instead of the previously written helper function `idx_of_found_item`.
# 3. FIX LATER: use a regex to convert the workbook names so that they look like `category` in df_simapro.

In [None]:
%%time
# Process raw data
#
# Every workbook is read sheet by sheet (one LCIA method per sheet). Each sheet
# is reshaped into a wide table with one row per activity and one column per
# (method, category, unit) tuple; the sheets of a workbook are merged on
# "Activity", and the per-workbook tables are finally stacked into `df_lcias_raw`.

# TODO (FIX LATER): use a regex to convert workbook names so that they look like
# `category` in df_simapro.
# Names of the workbooks (file name without its extension), to be stored as well.
# Only a trailing ".xlsx" is removed: `str.replace` would also delete that
# substring from the middle of a file name.
wkb_names = [
    st[: -len(".xlsx")] if st.endswith(".xlsx") else st for st in files_list
]

lcias_per_wkb = []  # one wide table per workbook; concatenated once after the loop
lcia_methods_list = []  # method names found in all workbooks (with repetitions)
wkbs_with_uneven_products = {}  # workbook name -> product counts, if sheets disagree

for idx_file, file in progressbar(
    enumerate(files_list), total=len(files_list), prefix="Progress: ", unit="files"
):
    # Read a workbook
    dict_wbk = r_excel(
        path_to_file=inputs_dir,
        filename=file,
        sheets=None,  # read all sheets of the workbook into a dictionary
        engine="openpyxl",
    )

    lcia_methods_per_wkb = []
    products_per_sheet = []
    appended_sheets = pd.DataFrame(columns=["Activity"])

    for key in dict_wbk.keys():  # loop through the sheets of the workbook
        # Read raw data from the sheet
        df_per_sheet = dict_wbk.get(key)

        # There is one LCIA method per sheet of data, identify and store it.
        # The keyword "Method:" is looked up in the 0-th column; the name of the
        # method is stored next to it, in the 1-st column of the same row.
        idx_method = idx_of_found_item(df_per_sheet.iloc[:, 0], "Method:")
        name_method = df_per_sheet.iloc[idx_method, 1]
        lcia_methods_per_wkb.append(name_method)

        # Identify the number of products per sheet. They should be the same...
        # Products are numerated sequentially. Last product is in the cell
        # previous to keyword "Method:"; only its digits are extracted.
        last_product_label = df_per_sheet.iloc[idx_method - 1, 0]
        digits = re.search(r"\d+", str(last_product_label))
        if digits is None:
            # Fail with context instead of an opaque AttributeError on `None.group()`
            raise ValueError(
                f"No product number found in {last_product_label!r} "
                f"(sheet {key!r} of workbook {file!r})"
            )
        num_products = int(digits.group())
        products_per_sheet.append(num_products)

        # In every sheet, identify the row where the relevant data starts
        # (the row whose 0-th column matches the keyword "category"),
        # drop all the rows before it and keep the data range of interest.
        idx_category = idx_of_found_item(df_per_sheet.iloc[:, 0], "category")
        name_category = df_per_sheet.iloc[idx_category, 0]

        new_df = (
            df_per_sheet.drop(range(idx_category), axis=0)
            .dropna(axis=0, how="all")
            .dropna(axis=1, how="all")
        )
        ## -------------------------------------------------
        new_df.columns = new_df.iloc[0]  # set row 0 as columns label
        new_df = new_df.drop(new_df.index[0])
        new_df.insert(
            0,
            column="Method_category_unit",  # insert column with tuple values
            value=list(
                zip(
                    [name_method] * new_df.shape[0],
                    new_df.iloc[:, 0],
                    new_df.iloc[:, 1],
                )
            ),
            allow_duplicates=False,
        )
        # The two columns packed into the tuples above are no longer needed
        new_df = new_df.drop(columns=list(new_df.columns[1:3]))

        # Transpose: one row per activity, one column per (method, category, unit)
        new_df = (
            new_df.set_index("Method_category_unit")
            .T.rename_axis("Activity")
            .reset_index()
        )
        new_df.columns.name = None
        ## -------------------------------------------------
        appended_sheets = appended_sheets.merge(new_df, how="outer", on="Activity")

    # Remember workbooks whose sheets do not report the same number of products
    if len(set(products_per_sheet)) > 1:
        wkbs_with_uneven_products[wkb_names[idx_file]] = products_per_sheet

    appended_sheets.insert(
        0,
        column="wkbName",  # insert column with wkb_name
        value=wkb_names[idx_file],
        allow_duplicates=False,
    )

    lcias_per_wkb.append(appended_sheets)
    lcia_methods_list.extend(lcia_methods_per_wkb)

# Stack all workbooks at once (growing a df inside the loop re-copies it every
# iteration); `pd.concat` rejects an empty list, hence the fallback.
df_lcias_raw = (
    pd.concat(lcias_per_wkb, axis=0, join="outer", ignore_index=True)
    if lcias_per_wkb
    else pd.DataFrame()
)

# df of unique methods, in order of first appearance (iterating over a `set`
# does not guarantee a stable order between runs)
df_lcia_labels = pd.DataFrame(
    list(dict.fromkeys(lcia_methods_list)), columns=["Method"]
)

if wkbs_with_uneven_products:
    print(
        "WARNING: sheets with different numbers of products found in:",
        wkbs_with_uneven_products,
    )

print(
    "df of LCIAs from SimaPro ".ljust(40, "."), f" {df_lcias_raw.shape}".rjust(13, ".")
)

# pd.set_option('precision',4)
# `sample` raises if asked for more rows than available, so cap the preview size
df_lcias_raw.sample(min(5, len(df_lcias_raw)))

# Operations

## Filter/adjust data

In [None]:
# TODO: check later whether the copying step below is redundant

In [None]:
# Work on a deep copy, so that the originally extracted df stays untouched
df_lcias = df_lcias_raw.copy()
print('df of LCIAs from SimaPro '.ljust(40,'.'), f' {df_lcias.shape}'.rjust(13, '.'))
# `sample` raises if asked for more rows than available, so cap the preview size
df_lcias.sample(min(5, len(df_lcias)))

In [None]:
# General info about the df (column dtypes, non-null counts, memory usage)
# df_lcias.info()

In [None]:
# General info about the memory usage
# df_lcias.memory_usage()

# OUTPUTS: Export data to excel

In [None]:
%%time

# Destination directory of the exported workbook (default: `..\data\interim`)
outputs_dir = set_outputs_dir(use_default=True)

# File name of the combined workbook to be written
excelName = "lcia-results-from-sp910-combined.xlsx"

# Description of the exported data, passed to the writer as ("readme", df_readme)
readme_text = (
    "Sheet1: Multiple LCIA methods results (per category) for ALL chemical markets from SimaPro910. "
    "\ndf_lcia_labels: unique names of the LCIA methods used in Sheet1."
)
df_readme = make_readme_info(excelName, readme_text)

# Dataframes to export, keyed by the name under which each one is written
sheets_to_write = {"Sheet1": df_lcias, "df_lcia_labels": df_lcia_labels}

## Export dataframes to excel
w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write=sheets_to_write,
    readme_info=("readme", df_readme),
    engine="xlsxwriter",
)