<div class="alert alert-block alert-info">
This notebook maps <b>LCIA results exported from SP910</b> against the previously mapped <b>DB processes/activities of SP910 and Ecoinvent v3.5 APOS</b>  
    <hr> 
    Note: <br>
    <i><b>Input file(s)' name(s) and metadata</b></i> (if available) are <b>printed out (below 👇🏼) in the 'read data to df' section.</b>
</div>

# Imports

In [None]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import project_path  # makes possible the access to `src` directory using relative path
from src.utils import explore_dir, make_readme_info
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

# INPUTS: Identify file(s) and read data to df

In [None]:
# Locate the Excel inputs: list the .xlsx files found in each source folder.

# --------------- LCIA results file ----------------
inputs_dir_1, files_list_1 = explore_dir(
    path_to_dir=r"..\data\interim",
    file_extension="xlsx",
    print_files_list=True,
)
n_files_1 = len(files_list_1)
print("--> nº of found files in", inputs_dir_1, ": ", n_files_1, "\n")

# --------------- master mapped db file ----------------
inputs_dir_2, files_list_2 = explore_dir(
    path_to_dir=r"..\..\18-BW2Hub\02.Code\Outputs",
    file_extension="xlsx",
    print_files_list=True,
)
n_files_2 = len(files_list_2)
print("--> nº of found files in", inputs_dir_2, ": ", n_files_2)

In [None]:
%%time
# Process raw data

# LCIAs results from SimaPro
df_lcias_raw = r_excel(
    inputs_dir_1, "lcia-results-from-sp910-combined.xlsx", sheets="Sheet1"
)

print(
    "df of LCIAs from SimaPro (raw) ".ljust(40, "."),
    f" {df_lcias_raw.shape}".rjust(13, "."),
)

# master mapped db file
df_master_mapped_db_raw = r_excel(
    inputs_dir_2, "mapped_processes_SP910-EI35APOS.xlsx", sheets="Sheet1"
)
print(
    "df of master mapped db (raw) ".ljust(40, "."),
    f" {df_master_mapped_db_raw.shape}".rjust(13, "."),
)

In [None]:
# Read the sheet with the unique names of the LCIA methods from
# "lcia-results-from-sp910-combined.xlsx".
# This df is copied to the new Excel file generated below.
df_lcia_labels = r_excel(
    inputs_dir_1,
    "lcia-results-from-sp910-combined.xlsx",
    sheets="df_lcia_labels",
    show_readme=False,
)
df_lcia_labels

# Operations

## Filter/adjust data

In [None]:
# Work on a copy so the raw LCIA results stay untouched and this cell can be re-run.
df_lcias = df_lcias_raw.copy()

# Normalise the activity suffix: everything from "APOS, " to the end of the name
# (e.g. "APOS, U - copy") becomes "APOS, S", according to the last added markets.
# The vectorised `.str.replace` needs no intermediate list, and a missing name
# (NaN) passes through as NaN instead of raising a TypeError.
df_lcias["Activity"] = df_lcias["Activity"].str.replace(
    r"APOS, (.*)", "APOS, S", regex=True
)

shape_lcias = df_lcias.shape
print('df of LCIAs from SimaPro '.ljust(40,'.'), f' {shape_lcias}'.rjust(13, '.'))
df_lcias.sample(2)

In [None]:
# Work on a copy so the raw master mapped db stays untouched and this cell can be re-run.
df_master_mapped_db = df_master_mapped_db_raw.copy()

# Substitute "APOS, <word>" (e.g. "APOS, U") with "APOS, S", since the LCIAs are
# calculated with S-system instead of U-unit.
# The vectorised `.str.replace` needs no intermediate list, and a missing name
# (NaN) passes through as NaN instead of raising a TypeError.
df_master_mapped_db["fullName_SimaPro"] = df_master_mapped_db[
    "fullName_SimaPro"
].str.replace(r"APOS, (\w+)", "APOS, S", regex=True)

shape_master_mapped = df_master_mapped_db.shape
print('df of master mapped db'.ljust(40,'.'), f' {shape_master_mapped}'.rjust(13, '.'))

df_master_mapped_db.sample(2)

In [None]:
# Show the column names of the master mapped db.
df_master_mapped_db.columns

## Mapping

In [None]:
# Left-join the LCIA results with the master mapped db, matching
# df_lcias["Activity"] against df_master_mapped_db["fullName_SimaPro"].
# Column names present in both frames get the "_LCIArslts" / "_SimaPro" suffixes.
df_merged = df_lcias.merge(
    df_master_mapped_db,
    how="left",
    left_on=["Activity"],
    right_on=["fullName_SimaPro"],
    suffixes=("_LCIArslts", "_SimaPro"),
)
shape_merged = df_merged.shape

# ---------------- summary table ----------------
rule = "-" * 41
n_lcias = f"{shape_lcias[0]}".center(15, " ")
n_master = f"{shape_master_mapped[0]}".center(18, " ")

print(
    " " * 7
    + "|"
    + "LCIAs results".center(15, " ")
    + "|"
    + "Master mapped db".center(18, " ")
)
print(rule)
print("Total" + "  |" + n_lcias + "|" + n_master)
print(rule)
# NOTE(review): with a left join this is the number of merged rows, matched or not.
print("Mapped |", f"{shape_merged[0]} items".center(30, " "))
print(rule)

print("Merged df ".ljust(15, "."), f" {shape_merged}".rjust(13, "."))
df_merged.sample(3)

### > Find unmatched activities and drop (if any)

    (if there are any NaNs in the columns coming from the df "on the right" of the merge function,
        e.g. 'fullName_SimaPro' or 'shortName_geo',
    this means the activity was not matched, i.e. it DOESN'T EXIST IN THE DF ON THE RIGHT)

In [None]:
print('CAUTION: These columns have at least one NaN entry:\n')

# Count the missing values of every column once, then keep only the columns
# that contain at least one NaN -> {column name: number of NaNs}.
nan_counts = df_merged.isnull().sum()  # .isnull() is exactly the same as .isna()
dict_nans = {
    column: n_missing for column, n_missing in nan_counts.items() if n_missing != 0
}

pprint.pprint(dict_nans)

> <font color=red>'activityName_EI'</font> is a **required** field. It MUST NOT have empty entries!

In [None]:
print('This is a list of activities from SimaPro not matched in Ecoinvent db:\n')
# Rows without an 'activityName_EI' value; show their SimaPro activity names.
unmatched_mask = df_merged["activityName_EI"].isna()
df_merged.loc[unmatched_mask, "Activity"].tolist()

> (!) <span style='background:red'> WARNING:</span> Deinking emulsion, in paper production {GLO} is not in the Ecoinvent database (only RoW and RER are)...<br>
> Drop it from the df.<br>

In [None]:
# Keep only the rows whose required field "activityName_EI" is filled in.
# `dropna` returns a new frame; axis=0, how="any" and inplace=False are its defaults.
required_columns = ["activityName_EI"]
df_merged = df_merged.dropna(subset=required_columns)

### > Check duplicates (if any)

In [None]:
# Number of distinct values in each column of the merged df
header_width = 35
print('nº of unique items per column:'.center(header_width))
print('-' * header_width)
df_merged.nunique()

In [None]:
# Show every row whose "Activity" value occurs more than once
# (keep=False marks all occurrences, not only the repeats).

print('These are the "duplicated" items in column Activity:\n')
is_repeated_activity = df_merged["Activity"].duplicated(keep=False)
df_merged[is_repeated_activity]

## TEMPORARY (inactive)

# OUTPUTS: Export data to excel

In [None]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "mapped-lcia-results.xlsx"

df_readme = make_readme_info(
    excelName,
    "Sheet1: LCIA method results (per category) for ALL chemical markets from SimaPro910 "
    "mapped against metadata from Ecoinvent v3.5 APOS. "
    "\ndf_lcia_labels: unique names of the LCIA methods used in Sheet1.",
)

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={"Sheet1": df_merged, "df_lcia_labels": df_lcia_labels},
    readme_info=("readme", df_readme),
    ####         ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)