## Mapping ecoinvent_database (EIDB)

In [1]:
# dataframe tools
import pandas as pd
import numpy as np
from tqdm import tqdm

# metrics functions
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import r2_score

# custom package
from caml import config
from caml.similarity import MLModel

# interactive input tools
import ipywidgets as widgets
from ipywidgets import VBox

# measure wall time running code
import time

In [2]:
import sys
# Make the helper module importable: '../Module' is one directory up from
# where the notebook runs, then down into the Module folder.
sys.path.append('../Module')  # relative path, so it depends on the working directory
# NOTE(review): the wildcard import hides where names come from. In the cells
# visible here, map_single_lci and map_multiple_lci are the only names used
# that no other import provides — confirm before narrowing this import.
from lci_ml_mod import *

In [3]:
# Default source: a specific version of the ecoinvent overview workbook saved locally.
eidb_df = pd.read_excel(io="data/EIDB_38.xlsx", sheet_name="Cut-Off AO")
# Alternative: download the overview directly from ecoinvent
# (latest version as of 7 Apr 2023):
#url = 'https://ecoinvent.org/wp-content/uploads/2022/12/Database-Overview-for-ecoinvent-v3.9.1.xlsx'
#eidb_df = pd.read_excel(url, sheet_name =  "Cut-Off AO")

In [4]:
import numpy as np
# Unique names for each of the two candidate mapping targets
# (np.unique returns them sorted).
eidb_list = np.unique(eidb_df["Reference Product Name"].values)
eidb_act_list = np.unique(eidb_df["Activity Name"].values)

# One-line summary of the database size and of both candidate lists.
print(
    f"Total N of database is {len(eidb_df)}, "
    f"unique Reference products is {len(eidb_list)}, "
    f"and unique activity is {len(eidb_act_list)}"
)


Total N of database is 19565, unique Reference products is 3292, and unique activity is 7646


In [5]:
# Free-text product descriptions to be matched against the ecoinvent names;
# edit this list to map your own products.
product_list = [
    "softwood, sustainable forest",
    "coffee bean",
    "organic milk",
    "textile production polyester",
]

### Below, choose either the reference product names or the activity names as the mapping list:
- reference product: fewer entries (smaller N), so quicker, but usually less accurate (this can depend on the product)
- activity name: more entries (larger N), so slower, but more accurate because the name itself contains more information, e.g., the production technology

In [6]:
# Wall-clock timing covers model construction plus the similarity scoring.
t_start = time.time()

# To map against reference product names instead, assign eidb_list here and
# pass that same list as mapdb_list in the mapping cells further down.
list2 = eidb_act_list
model = MLModel(config.model_name)
cosine_scores = model.compute_similarity_scores(product_list, list2)
# To inspect the ranking: cosine_scores.sort(dim=1, descending=True)[1]

elapsed_time = time.time() - t_start
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

Execution time: 00:01:40


### Show only the LCI mapped with the highest score

In [7]:
# Single mapped activity per product, scored against the full activity list.
dt = map_single_lci(
    cosine_scores=cosine_scores,
    product_list=product_list,
    mapdb_list=eidb_act_list,
)
dt

Unnamed: 0,your_product,LCI_mapped,cosine_score
0,"softwood, sustainable forest","softwood forestry, pine, sustainable forest management",0.883
1,coffee bean,"coffee green bean production, robusta",0.566
2,organic milk,"skimmed milk, from cow milk to generic market for protein feed",0.612
3,textile production polyester,"polyester fibre production, finished",0.843


### To see the N closest mapped LCIs (up to 20), set `n=` in the call below

In [8]:
# The n=8 closest activities per product, scored against the full activity list.
dtt = map_multiple_lci(
    cosine_scores=cosine_scores,
    n=8,
    product_list=product_list,
    mapdb_list=eidb_act_list,
)
dtt

Unnamed: 0_level_0,Unnamed: 1_level_0,LCI_mapped,ML_score
your_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"softwood, sustainable forest",1,"softwood forestry, pine, sustainable forest management",0.883181
"softwood, sustainable forest",2,"softwood forestry, spruce, sustainable forest management",0.867411
"softwood, sustainable forest",3,"softwood forestry, mixed species, sustainable forest management",0.845838
"softwood, sustainable forest",4,"softwood forestry, mixed species, boreal forest",0.837387
"softwood, sustainable forest",5,"softwood forestry, paraná pine, sustainable forest management",0.816098
"softwood, sustainable forest",6,"hardwood forestry, beech, sustainable forest management",0.738258
"softwood, sustainable forest",7,"hardwood forestry, eucalyptus ssp., sustainable forest management",0.73558
"softwood, sustainable forest",8,"hardwood forestry, oak, sustainable forest management",0.732208
coffee bean,1,"coffee green bean production, robusta",0.565652
coffee bean,2,"coffee green bean production, arabica",0.553562


### Two-stage alternative: first run the model against the ISIC product categories, then run it again on the EIDB entries screened by those categories. This saves a lot of time, but is not accurate for some products

In [9]:
# Distinct ISIC classification labels: the search space for the first-stage model.
isic_column = eidb_df["ISIC Classification"]
isic_list = np.unique(isic_column.values)

In [10]:
# First stage: score every product against the ISIC category labels.
# NOTE: this overwrites cosine_scores from the full-database run above.
t_start = time.time()
list2 = isic_list
model = MLModel(config.model_name)
cosine_scores = model.compute_similarity_scores(product_list, list2)
elapsed_time = time.time() - t_start
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

Execution time: 00:00:03


#### After the first model run, keep the top 20 ISIC product categories for each product; the whole EIDB is then screened against them

In [11]:
# Keep the n=20 closest ISIC categories per product; they drive the screening below.
isic_df = map_multiple_lci(
    cosine_scores=cosine_scores,
    n=20,
    product_list=product_list,
    mapdb_list=isic_list,
)
# Uncomment to inspect the selected categories:
#isic_df

In [12]:
# Screen the database: keep only rows whose ISIC classification was selected
# by the first-stage model.
in_selected_isic = eidb_df['ISIC Classification'].isin(isic_df['LCI_mapped'].values)
cut_eidb = eidb_df.loc[in_selected_isic]
len(cut_eidb)

5793

In [13]:
# Unique activity names that survive the ISIC screening.
screened_activity_names = cut_eidb["Activity Name"].values
eidb_act_list2 = np.unique(screened_activity_names)
len(eidb_act_list2)

2380

In [14]:
# Second stage: score every product against the screened activity names only.
# NOTE: this overwrites cosine_scores from the ISIC run above.
t_start = time.time()
list3 = eidb_act_list2
model = MLModel(config.model_name)
cosine_scores = model.compute_similarity_scores(product_list, list3)
elapsed_time = time.time() - t_start
print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

Execution time: 00:00:26


- Below, softwood does not get the same answer as when using the whole EIDB, because the first-round ISIC category search does not return the right ISIC category, "0220:Logging".

- The results for the other products are mostly the same.

In [15]:
# Single mapped activity per product, scored against the ISIC-screened activity list.
dt1 = map_single_lci(
    cosine_scores=cosine_scores,
    product_list=product_list,
    mapdb_list=eidb_act_list2,
)
dt1

Unnamed: 0,your_product,LCI_mapped,cosine_score
0,"softwood, sustainable forest","hardwood forestry, eucalyptus ssp., planted forest management",0.719
1,coffee bean,"coffee green bean production, robusta",0.566
2,organic milk,"skimmed milk, from cow milk to generic market for protein feed",0.612
3,textile production polyester,"polyester fibre production, finished",0.843


In [16]:
# The n=8 closest activities per product, scored against the ISIC-screened activity list.
ddt2 = map_multiple_lci(
    cosine_scores=cosine_scores,
    n=8,
    product_list=product_list,
    mapdb_list=eidb_act_list2,
)
ddt2

Unnamed: 0_level_0,Unnamed: 1_level_0,LCI_mapped,ML_score
your_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"softwood, sustainable forest",1,"hardwood forestry, eucalyptus ssp., planted forest management",0.719305
"softwood, sustainable forest",2,"market for residual softwood, wet",0.67832
"softwood, sustainable forest",3,"sawnwood production, softwood, raw, dried (u=20%)",0.669071
"softwood, sustainable forest",4,"sawnwood production, softwood, raw, dried (u=10%)",0.666445
"softwood, sustainable forest",5,"sawing, softwood",0.662865
"softwood, sustainable forest",6,"sawnwood production, softwood, dried (u=20%), planed",0.650806
"softwood, sustainable forest",7,"bamboo forestry, sustainable forest management",0.648417
"softwood, sustainable forest",8,"sawnwood production, softwood, dried (u=10%), planed",0.648186
coffee bean,1,"coffee green bean production, robusta",0.565652
coffee bean,2,"coffee green bean production, arabica",0.553562


In [17]:
# Element-wise comparison of the full-database mapping (dt) with the
# ISIC-screened mapping (dt1); False marks cells where the two differ.
# Scores are compared for exact equality.
dt == dt1

Unnamed: 0,your_product,LCI_mapped,cosine_score
0,True,False,False
1,True,True,True
2,True,True,True
3,True,True,True


In [18]:
# Element-wise comparison of the top-8 tables: ISIC-screened (ddt2) versus
# full database (dtt); False marks cells where the two differ.
ddt2 == dtt

Unnamed: 0_level_0,Unnamed: 1_level_0,LCI_mapped,ML_score
your_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"softwood, sustainable forest",1,False,False
"softwood, sustainable forest",2,False,False
"softwood, sustainable forest",3,False,False
"softwood, sustainable forest",4,False,False
"softwood, sustainable forest",5,False,False
"softwood, sustainable forest",6,False,False
"softwood, sustainable forest",7,False,False
"softwood, sustainable forest",8,False,False
coffee bean,1,True,True
coffee bean,2,True,True
