<div class="alert alert-block alert-info">
This script maps <b>LCIA results exported from SP910</b> against the previously mapped <b>DB processes/activities of SP910 and Ecoinvent v3.5 APOS</b>  
    <hr> 
    Note: <br>
    <i><b>Input file(s)' name(s) and metadata</b></i> (if available) are <b>printed out (below 👇🏼) in the 'read data to df' section.</b>
</div>

# Imports

In [1]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import project_path  # makes possible the access to `src` directory using relative path
from src.utils import explore_dir, make_readme_info
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

# INPUTS: Identify file(s) and read data to df

In [2]:
# Locate the candidate input workbooks (.xlsx) in the two source directories.
# Each call returns the explored directory together with the list of files found.

# --------------- directory with the LCIA results file ----------------
inputs_dir_1, files_list_1 = explore_dir(
    path_to_dir=r"..\data\interim",
    file_extension="xlsx",
    print_files_list=True,
)
n_files_1 = len(files_list_1)
print("--> nº of found files in", inputs_dir_1, ": ", n_files_1, "\n")

# --------------- directory with the master mapped db file ----------------
inputs_dir_2, files_list_2 = explore_dir(
    path_to_dir=r"..\..\18-BW2Hub\02.Code\Outputs",
    file_extension="xlsx",
    print_files_list=True,
)
n_files_2 = len(files_list_2)
print("--> nº of found files in", inputs_dir_2, ": ", n_files_2)

['CPC33to36_other_chemicals.xlsx',
 'df_chemproperties.xlsx',
 'extended-data-chosen-methods-metadata-pubchem-properties.xlsx',
 'lcia-results-from-sp910-combined.xlsx',
 'list-chemicals-not-shown-on-fig-prices-vs-impacts.xlsx',
 'list-chemicals-not-shown-on-fig3.xlsx',
 'list-outliers.xlsx',
 'mapped-lcia-results.xlsx',
 'pubchem-properties.xlsx',
 'raw-data-chosen-lcia-methods-and-metadata.xlsx',
 'temp-df_base_full_wCAS.xlsx',
 'temp-df_base_full_wCAS_woOutliersRMDk9a5.xlsx',
 'temp-dict_fullMethods.xlsx']
--> nº of found files in C:\Users\ViteksPC\Documents\00-ETH_projects\17-AESA_ecoinvent_chemicals\data\interim :  13 

['ecoinvent_3.5_apos_activities_db-extracted.xlsx',
 'mapped_processes_SP910-EI35APOS.xlsx',
 'RybergMethod_CFs_PB-LCIA_ecoinvent_linkage_V2.037_V2037-extracted.xlsx',
 'SP910_Impact_CFs_PBs-LCIA_(baseline)_V072-extracted.xlsx',
 'SP910_Impact_CFs_ReCiPe_Midpoint_(H)_V113-extracted.xlsx',
 'SP910_processes_db-extracted.xlsx',
 'SP910_substances-extracted.xlsx',
 's

In [3]:
%%time
# Process raw data

# LCIAs results from SimaPro
df_lcias_raw = r_excel(
    inputs_dir_1, "lcia-results-from-sp910-combined.xlsx", sheets="Sheet1"
)

print(
    "df of LCIAs from SimaPro (raw) ".ljust(40, "."),
    f" {df_lcias_raw.shape}".rjust(13, "."),
)

# master mapped db file
df_master_mapped_db_raw = r_excel(
    inputs_dir_2, "mapped_processes_SP910-EI35APOS.xlsx", sheets="Sheet1"
)
print(
    "df of master mapped db (raw) ".ljust(40, "."),
    f" {df_master_mapped_db_raw.shape}".rjust(13, "."),
)


===> Trying to load 'readme' data... ===
File: lcia-results-from-sp910-combined.xlsx from
C:\Users\ViteksPC\Documents\00-ETH_projects\17-AESA_ecoinvent_chemicals\notebooks\0.01-vt-parse-lcia-results-from-sp910.ipynb
Generated on 2021-07-29 (Thursday), 15:03:28 by Tulus, V.
Includes:
<<<
Sheet1: Multiple LCIA methods results (per category) for ALL chemical markets from SimaPro910. 
df_lcia_labels: unique names of the LCIA methods used in Sheet1.
>>>

df of LCIAs from SimaPro (raw) ......... ... (947, 64)

===> Trying to load 'readme' data... ===
File: mapped_processes_SP910-EI35APOS.xlsx from
C:\Users\ViteksPC\Documents\00-ETH_projects\18-BW2Hub\02.Code\05.map_processes_SP910-EI35APOS.ipynb
Generated on 2021-05-12 (Wednesday), 16:44:58 by Tulus, V.
Includes:
<<<
Mapped SimaPro910 (with EI35-APOS) processes against Ecoinvent v3.5 APOS activities. Note: for now only 18 activities from Ecoinvent remain unmapped after automatic mapping. Manual refinement is needed.
>>>

df of master mapped

In [4]:
# Load the unique names of the LCIA methods, stored in the "df_lcia_labels"
# sheet of "lcia-results-from-sp910-combined.xlsx".
# This df is copied to the new Excel file generated below.

df_lcia_labels = r_excel(
    inputs_dir_1,
    "lcia-results-from-sp910-combined.xlsx",
    sheets="df_lcia_labels",
    show_readme=False,
)
df_lcia_labels

Unnamed: 0,Method
0,ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A
1,PBs - Alternative: EF - LANCA V0.70
2,PBs-LCIA (baseline) V0.72
3,ReCiPe 2016 Midpoint (H) V1.03 / World (2010) H
4,PBs-LCIA V0.71 V0.71
5,IPCC 2013 GWP 100a V1.03
6,Cumulative Energy Demand V1.11 / Cumulative en...


# Operations

## Filter/adjust data

In [5]:
# Work on a copy so that the raw LCIA results dataframe stays untouched.
df_lcias = df_lcias_raw.copy()

# Normalise the suffix of the activity names: everything that follows "APOS, "
# (e.g. "U - copy" in the last added markets) is replaced, so the name ends
# with "APOS, S".
# The vectorised `.str.replace` applies the same regular-expression substitution
# as a per-item `re.sub` loop, without the scratch list / compiled-pattern
# globals, and it passes missing values through instead of raising a TypeError.
# `regex=True` is given explicitly because the default differs between pandas versions.
df_lcias["Activity"] = df_lcias["Activity"].str.replace(
    r"APOS, (.*)", "APOS, S", regex=True
)

shape_lcias = df_lcias.shape
print('df of LCIAs from SimaPro '.ljust(40,'.'), f' {shape_lcias}'.rjust(13, '.'))
df_lcias.sample(2)

df of LCIAs from SimaPro ............... ... (947, 64)


Unnamed: 0,wkbName,Activity,"('PBs-LCIA V0.71 V0.71', 'Climate change - CO2 concentration', 'ppm')","('PBs-LCIA V0.71 V0.71', 'Climate change - Energy imbalance', 'Wm-2')","('PBs-LCIA V0.71 V0.71', 'Stratospheric ozone depletion', 'DU')","('PBs-LCIA V0.71 V0.71', 'Ocean acidification', 'Omega Aragon')","('PBs-LCIA V0.71 V0.71', 'Biogeochemical flows - P', 'Tg P')","('PBs-LCIA V0.71 V0.71', 'Biogeochemical flows - N', 'Tg N')","('PBs-LCIA V0.71 V0.71', 'Land-system change - Global', '%')","('PBs-LCIA V0.71 V0.71', 'Freshwater use - Global', 'km3')",...,"('ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A', 'Human health', 'DALY')","('ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A', 'Ecosystems', 'species.yr')","('ReCiPe 2016 Endpoint (H) V1.03 / World (2010) H/A', 'Resources', 'USD2013')","('IPCC 2013 GWP 100a V1.03', 'IPCC GWP 100a', 'kg CO2 eq')","('Cumulative Energy Demand V1.11 / Cumulative energy demand', 'Non renewable, fossil', 'MJ')","('Cumulative Energy Demand V1.11 / Cumulative energy demand', 'Non-renewable, nuclear', 'MJ')","('Cumulative Energy Demand V1.11 / Cumulative energy demand', 'Non-renewable, biomass', 'MJ')","('Cumulative Energy Demand V1.11 / Cumulative energy demand', 'Renewable, biomass', 'MJ')","('Cumulative Energy Demand V1.11 / Cumulative energy demand', 'Renewable, wind, solar, geothe', 'MJ')","('Cumulative Energy Demand V1.11 / Cumulative energy demand', 'Renewable, water', 'MJ')"
158,Chemicals-Inorganic-Market,"Calcium carbonate, precipitated {RoW}| market ...",4.173653e-11,5.596044e-13,5.253571e-15,1.275348e-13,5.889272e-16,9.588936e-13,2.590902e-16,5.363869e-12,...,3e-06,7.02813e-09,0.097966,1.761009,14.908673,1.147431,0.000463,0.409637,0.108292,0.719907
912,Plastics-Thermoplasts-Market,Ortho-phenylene diamine {GLO}| market for | AP...,3.131877e-10,4.544084e-12,1.173624e-12,9.57021e-13,2.28329e-14,1.199126e-10,2.154978e-15,4.027254e-11,...,3.2e-05,6.423948e-08,1.528111,14.834046,211.69909,9.399846,0.044901,2.197984,0.686347,3.076006


In [6]:
# Work on a copy so that the raw master mapped db dataframe stays untouched.
df_master_mapped_db = df_master_mapped_db_raw.copy()

# Replace the word that follows "APOS, " (i.e. "U") with "S", since the LCIAs
# are calculated with S-system instead of U-unit processes.
# The vectorised `.str.replace` applies the same regular-expression substitution
# as a per-item `re.sub` loop, without the scratch list / compiled-pattern
# globals, and it passes missing values through instead of raising a TypeError.
# `regex=True` is given explicitly because the default differs between pandas versions.
df_master_mapped_db["fullName_SimaPro"] = df_master_mapped_db[
    "fullName_SimaPro"
].str.replace(r"APOS, (\w+)", "APOS, S", regex=True)

shape_master_mapped = df_master_mapped_db.shape
print('df of master mapped db'.ljust(40,'.'), f' {shape_master_mapped}'.rjust(13, '.'))

df_master_mapped_db.sample(2)

df of master mapped db.................. . (16027, 29)


Unnamed: 0,activity_comment,type,referenceProduct,shortName_geo,activityName_SP,fullName_SimaPro,unit,amount,allocation_percentage,wasteType,...,referenceProductUnit,referenceProduct_prodVolume,referenceProduct_prodVolumeComment,referenceProduct_price,referenceProduct_priceUnit,referenceProduct_priceComment,referenceProduct_casNumber,referenceProduct_CPCclass,activity_generalComment,sourceFilename
11033,landfill for untreated municipal solid waste[...,Products,"Process-specific burden, sanitary landfill",CH,processing,"Process-specific burden, sanitary landfill {CH...",kg,1,100,not defined,...,kg,0.033668,This is a placeholder value that does not refl...,0.00104,EUR2005,Calculated based on inputs: The price of the p...,,39: Wastes or scraps,landfill for untreated municipal solid waste.\...,24313_7d47ffb3-cc3c-4189-b825-bd92c30dbc83_4a2...
14576,"Estimated based on UN statistics, data on plas...",Waste treatment,Waste polyethylene terephtalate,LT,market for waste polyethylene terephthalate,Waste polyethylene terephtalate {LT}| market f...,kg,1,(blank),All waste types,...,kg,473112.604473,,0.0,EUR2005,Products classifies as Waste are not assigned ...,25038-59-9,"39270: Waste, parings and scrap of plastics","Estimated based on UN statistics, data on plas...",17591_9a28f1f6-337a-58a9-8a6d-ab005792e18c_dcb...


In [7]:
# Display the column labels of the master mapped db dataframe.
df_master_mapped_db.columns

Index(['activity_comment', 'type', 'referenceProduct', 'shortName_geo',
       'activityName_SP', 'fullName_SimaPro', 'unit', 'amount',
       'allocation_percentage', 'wasteType', 'category', 'inline_comment',
       'activityName_EI', 'geo', 'activity_ISICclass',
       'activity_ecoSpold01class', 'technologyLevel', 'referenceProductName',
       'referenceProductAmount', 'referenceProductUnit',
       'referenceProduct_prodVolume', 'referenceProduct_prodVolumeComment',
       'referenceProduct_price', 'referenceProduct_priceUnit',
       'referenceProduct_priceComment', 'referenceProduct_casNumber',
       'referenceProduct_CPCclass', 'activity_generalComment',
       'sourceFilename'],
      dtype='object')

## Mapping

In [8]:
# Left-join the LCIA results with the master mapped db:
# "Activity" (left) is matched against "fullName_SimaPro" (right).
merge_options = dict(
    how='left',
    left_on=['Activity'],
    right_on=['fullName_SimaPro'],
    suffixes=('_LCIArslts', '_SimaPro'),
)
df_merged = df_lcias.merge(df_master_mapped_db, **merge_options)
shape_merged = df_merged.shape

# ---------- summary table ----------
# NOTE: the "Mapped" row shows the number of rows of the merged df; because this
# is a left join, rows of df_lcias without a match on the right are counted too.
separator_row = "".center(41, "-")
header_row = (
    "|".rjust(8, " ")
    + "LCIAs results".center(15, " ")
    + "|"
    + "Master mapped db".center(18, " ")
)
totals_row = (
    "Total"
    + "|".rjust(3, " ")
    + f"{shape_lcias[0]}".center(15, " ")
    + "|"
    + f"{shape_master_mapped[0]}".center(18, " ")
)

print(header_row)
print(separator_row)
print(totals_row)
print(separator_row)
print("Mapped |", f"{shape_merged[0]} items".center(30, " "))
print(separator_row)

print('Merged df '.ljust(15,'.'), f' {shape_merged}'.rjust(13, '.'))
df_merged.sample(3)

       | LCIAs results | Master mapped db 
-----------------------------------------
Total  |      947      |      16027       
-----------------------------------------
Mapped |           947 items           
-----------------------------------------
Merged df ..... ... (947, 93)


Unnamed: 0,wkbName,Activity,"('PBs-LCIA V0.71 V0.71', 'Climate change - CO2 concentration', 'ppm')","('PBs-LCIA V0.71 V0.71', 'Climate change - Energy imbalance', 'Wm-2')","('PBs-LCIA V0.71 V0.71', 'Stratospheric ozone depletion', 'DU')","('PBs-LCIA V0.71 V0.71', 'Ocean acidification', 'Omega Aragon')","('PBs-LCIA V0.71 V0.71', 'Biogeochemical flows - P', 'Tg P')","('PBs-LCIA V0.71 V0.71', 'Biogeochemical flows - N', 'Tg N')","('PBs-LCIA V0.71 V0.71', 'Land-system change - Global', '%')","('PBs-LCIA V0.71 V0.71', 'Freshwater use - Global', 'km3')",...,referenceProductUnit,referenceProduct_prodVolume,referenceProduct_prodVolumeComment,referenceProduct_price,referenceProduct_priceUnit,referenceProduct_priceComment,referenceProduct_casNumber,referenceProduct_CPCclass,activity_generalComment,sourceFilename
66,Chemicals-Fertilisers (organic)-Market,"Green manure, Swiss integrated production, unt...",2.312992e-13,9.689839e-15,2.020841e-14,7.0678e-16,1.427563e-14,7.398807e-15,3.931081e-18,1.913703e-13,...,ha,4.0,,25.0,EUR2005,Calculated based on inputs: The price of the p...,,34659: Fertilizers n.e.c.,,21887_85187640-1ab5-4213-858e-77df40baf038_862...
928,Plastics-Thermoplasts-Market,"Polyethylene, high density, granulate, recycle...",1.34901e-11,1.853787e-13,3.244146e-15,4.122207e-14,6.87493e-16,1.052313e-12,7.693501000000001e-17,1.035177e-12,...,kg,252476100.0,Calculated global production volume (PV) based...,0.7,EUR2005,From UN data (http://data.un.org/Data.aspx?q=p...,,"34710: Polymers of ethylene, in primary forms",This is a constrained market. The justificatio...,26607_a6020df2-6142-4ac2-a914-b40124248704_f3a...
116,Chemicals-Gases-Market,Chlorine dioxide {RoW}| market for chlorine di...,2.207909e-10,3.027347e-12,4.585756e-14,6.746794e-13,8.297465e-15,2.74592e-12,1.090516e-15,2.168859e-11,...,kg,2143281000.0,,1.31,EUR2005,Calculated based on inputs: The price of the p...,10049-04-4,"34110: Hydrocarbons and their halogenated, sul...",This dataset represents the supply of 1 kg of ...,25674_e41b9bc8-de9f-418c-969d-fbedf5f8165f_29f...


### > Find unmatched activities and drop (if any)

    (if there are any NaNs in columns "on the right" in merge function
        e.g. 'fullName_SimaPro', or 'shortName_geo', 
    this means the activity was not matched, i.e. DOESN'T EXIST IN DF ON THE RIGHT)

In [9]:
print('CAUTION: These columns have at least one NaN entry:\n')

# Count the missing values per column and keep only the columns that have any
# (.isnull() is exactly the same as .isna()).
nan_counts = df_merged.isnull().sum()
dict_nans = nan_counts[nan_counts != 0].to_dict()

pprint.pprint(dict_nans)

CAUTION: These columns have at least one NaN entry:

{'activityName_EI': 1,
 'activityName_SP': 1,
 'activity_ISICclass': 1,
 'activity_comment': 1,
 'activity_ecoSpold01class': 408,
 'activity_generalComment': 400,
 'allocation_percentage': 1,
 'amount': 1,
 'category': 1,
 'fullName_SimaPro': 1,
 'geo': 1,
 'inline_comment': 1,
 'referenceProduct': 1,
 'referenceProductAmount': 1,
 'referenceProductName': 1,
 'referenceProductUnit': 1,
 'referenceProduct_CPCclass': 2,
 'referenceProduct_casNumber': 517,
 'referenceProduct_price': 7,
 'referenceProduct_priceComment': 7,
 'referenceProduct_priceUnit': 7,
 'referenceProduct_prodVolume': 1,
 'referenceProduct_prodVolumeComment': 935,
 'shortName_geo': 1,
 'sourceFilename': 1,
 'technologyLevel': 265,
 'type': 1,
 'unit': 1,
 'wasteType': 1}


> <font color=red>'activityName_EI'</font> is a **required** field. MUST not have empty entries!

In [10]:
print('This is a list of activities from SimaPro not matched in Ecoinvent db:\n')
# Rows with no 'activityName_EI' value are the ones the merge could not match.
unmatched_mask = df_merged["activityName_EI"].isnull()
df_merged.loc[unmatched_mask, "Activity"].tolist()

This is a list of activities from SimaPro not matched in Ecoinvent db:



['Deinking emulsion, in paper production {GLO}| market for | APOS, S']

> (!) <span style='background:red'> WARNING:</span> "Deinking emulsion, in paper production {GLO}" is not in the Ecoinvent database (only the RoW and RER datasets exist).<br>
> It is therefore dropped from the df.<br>

In [11]:
# Discard the rows without a value in the required 'activityName_EI' column
# (row-wise drop on missing values is the default behaviour of `dropna`).
df_merged = df_merged.dropna(subset=["activityName_EI"])

### > Check duplicates (if any)

In [12]:
# Number of distinct values in each column of the merged df
table_title = 'nº of unique items per column:'
print(table_title.center(35))
print('-' * 35)
df_merged.nunique()

   nº of unique items per column:  
-----------------------------------


wkbName                                                                   18
Activity                                                                 946
('PBs-LCIA V0.71 V0.71', 'Climate change - CO2 concentration', 'ppm')    938
('PBs-LCIA V0.71 V0.71', 'Climate change - Energy imbalance', 'Wm-2')    943
('PBs-LCIA V0.71 V0.71', 'Stratospheric ozone depletion', 'DU')          939
                                                                        ... 
referenceProduct_priceComment                                            361
referenceProduct_casNumber                                               275
referenceProduct_CPCclass                                                116
activity_generalComment                                                  312
sourceFilename                                                           946
Length: 93, dtype: int64

In [13]:
# Show every row whose "Activity" value occurs more than once
# (keep=False flags all the occurrences, not only the repeated ones).

print('These are the "duplicated" items in column Activity:\n')
is_repeated = df_merged["Activity"].duplicated(keep=False)
df_merged[is_repeated]

These are the "duplicated" items in column Activity:



Unnamed: 0,wkbName,Activity,"('PBs-LCIA V0.71 V0.71', 'Climate change - CO2 concentration', 'ppm')","('PBs-LCIA V0.71 V0.71', 'Climate change - Energy imbalance', 'Wm-2')","('PBs-LCIA V0.71 V0.71', 'Stratospheric ozone depletion', 'DU')","('PBs-LCIA V0.71 V0.71', 'Ocean acidification', 'Omega Aragon')","('PBs-LCIA V0.71 V0.71', 'Biogeochemical flows - P', 'Tg P')","('PBs-LCIA V0.71 V0.71', 'Biogeochemical flows - N', 'Tg N')","('PBs-LCIA V0.71 V0.71', 'Land-system change - Global', '%')","('PBs-LCIA V0.71 V0.71', 'Freshwater use - Global', 'km3')",...,referenceProductUnit,referenceProduct_prodVolume,referenceProduct_prodVolumeComment,referenceProduct_price,referenceProduct_priceUnit,referenceProduct_priceComment,referenceProduct_casNumber,referenceProduct_CPCclass,activity_generalComment,sourceFilename


## TEMPORARY (inactive)

# OUTPUTS: Export data to excel

In [14]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "mapped-lcia-results.xlsx"

df_readme = make_readme_info(
    excelName,
    "Sheet1: LCIA method results (per category) for ALL chemical markets from SimaPro910 "
    "mapped against metadata from Ecoinvent v3.5 APOS. "
    "\ndf_lcia_labels: unique names of the LCIA methods used in Sheet1.",
)

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={"Sheet1": df_merged, "df_lcia_labels": df_lcia_labels},
    readme_info=("readme", df_readme),
    ####         ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)

File: mapped-lcia-results.xlsx successfully created in 
C:\Users\ViteksPC\Documents\00-ETH_projects\17-AESA_ecoinvent_chemicals\data\interim
Wall time: 1.73 s
