<div class="alert alert-block alert-info">
This script <b>extends the data by adding properties extracted from PubChem</b>.
    <hr> 
    Note: <br>
    <i><b>Input file(s)' name(s) and metadata</b></i> (if available) are <b>printed out (below 👇🏼) in 'read data to df' section.</b>
</div>

In [None]:
# %env
# %who_ls
# %who
# %who int
# %pinfo <var name>

# Imports

In [None]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import os

import project_path  # makes possible the access to `src` directory using relative path
from src.data import filter_dataframe, internal_funcs
from src.utils import explore_dir, make_readme_info
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

# INPUTS: Identify file(s) and read data to df

In [None]:
# Explore the directory to find the file(s).
# The relative path is assembled with os.path.join so that the separator matches
# the host OS; the literal r"..\data\interim" only resolves on Windows.
inputs_dir, files_list = explore_dir(
    path_to_dir=os.path.join(os.pardir, "data", "interim"),
    file_extension="xlsx",
    print_files_list=True,
)

In [None]:
# Workbook that holds the raw data plus the METHODS and METADATA sheets
raw_data_file = "raw-data-chosen-lcia-methods-and-metadata.xlsx"

# Read data
df_raw_data = r_excel(inputs_dir, raw_data_file)
print(
    "df of raw data".ljust(40, "."),
    f"{df_raw_data.shape}\n".rjust(13, "."),
)

# Get list of LCIA methods and list of metadata
# (the sheet read and the column taken from it carry the same name)
methods_sheet = r_excel(inputs_dir, raw_data_file, sheets="METHODS", show_readme=False)
METHODS = methods_sheet["METHODS"].to_list()

metadata_sheet = r_excel(inputs_dir, raw_data_file, sheets="METADATA", show_readme=False)
METADATA = metadata_sheet["METADATA"].to_list()

# Read PubChem properties
df_pubchem_prop = r_excel(inputs_dir, "pubchem-properties.xlsx")
print(
    "df of PubChem chemical properties".ljust(40, "."),
    f"{df_pubchem_prop.shape}\n".rjust(13, "."),
)

# Operations 
- add data with chemical properties (from PubChem) to df_raw_data
<div class="alert alert-block alert-info">
    created: <strong>df_analysis</strong>
</div>

In [None]:
# Order the PubChem table by MW, then by match label, and peek at three random rows
df_pubchem_prop = df_pubchem_prop.sort_values(by=["MW", "pubchem_match"])
df_pubchem_prop.sample(3)

In [None]:
# Share (in %) of referenceProduct entries whose num_matches is 0
has_match = df_pubchem_prop.num_matches != 0
n_matched = df_pubchem_prop[has_match].referenceProduct.count()
n_total = df_pubchem_prop.referenceProduct.count()
percent_not_matched = (1 - n_matched / n_total) * 100
print(f"{percent_not_matched.round(2)}% of referenceProducts had no match...")

## Explore df_pubchem_prop

In [None]:
# df_pubchem_prop[df_pubchem_prop.num_matches == 0]

In [None]:
# Rows for which no PubChem match was recorded (num_matches equal to 0)
df_pubchem_prop.loc[df_pubchem_prop["num_matches"] == 0]

In [None]:
# Components not identified automatically by PubChem
# have to be added manually !
# The names listed here are compared against the `referenceProduct` column
# further down, where their properties are filled in by hand.

COMP_ADDED_MANUALLY = [
    # from Javier's list
    "Liquefied petroleum gas",
    "Petrol, low-sulfur",  # or 'Petrol, unleaded',
    "Diesel",
    "Diesel, low-sulfur",
    "Kerosene",
    # other
    "Xylene",  # o-, m- or p-
]
# Display the list as the cell output
COMP_ADDED_MANUALLY

In [None]:
# Properties added here,
# if desired, specific MW and complexity and other can be added below

# Columns overwritten for every manually matched product
manual_cols = ["pubchem_match", "num_matches", "MW"]

# MW assigned by hand to each product PubChem could not identify
manual_mw_by_product = {
    "Liquefied petroleum gas": 44.097,  # Propane MW used as proxy
    "Petrol, low-sulfur": 105,  # MW taken as average...
    "Diesel": 200,  # MW taken as average...
    "Diesel, low-sulfur": 200,  # MW taken as average...
    "Kerosene": 170,  # MW taken as average...
    "Xylene": 106,
}
dummy_mw = 100  # placeholder MW for listed products without a specific value

# Products with a specific MW are always overwritten with the manual values
for product, mw in manual_mw_by_product.items():
    df_pubchem_prop.loc[
        df_pubchem_prop.referenceProduct == product, manual_cols
    ] = ["manual match", 1, mw]

# Any other product listed in COMP_ADDED_MANUALLY that is still unmatched gets
# the dummy MW. Products absent from the table are reported and skipped instead
# of raising IndexError on the empty selection.
for item in COMP_ADDED_MANUALLY:
    is_item = df_pubchem_prop.referenceProduct == item
    if not is_item.any():
        print(f"'{item}' not found in df_pubchem_prop.referenceProduct, skipped")
        continue
    if df_pubchem_prop.loc[is_item, "pubchem_match"].iloc[0] == "No match":
        df_pubchem_prop.loc[is_item, manual_cols] = ["manual match", 1, dummy_mw]

In [None]:
# Show the rows of the manually completed products (exact name match only)
filter_dataframe(
    col_name="referenceProduct",
    filter_in=COMP_ADDED_MANUALLY,
    exact_match=True,
    print_unique=True,
    df_in=df_pubchem_prop,
)

In [None]:
# Recompute the share (in %) of referenceProduct entries whose num_matches is 0
matched_rows = df_pubchem_prop[df_pubchem_prop.num_matches != 0]
matched_fraction = (
    matched_rows.referenceProduct.count() / df_pubchem_prop.referenceProduct.count()
)
percent_not_matched = (1 - matched_fraction) * 100
print(f"{percent_not_matched.round(2)}% of referenceProducts had no match...")

<div class="alert alert-block alert-warning"> 
❗❗❗ <br>
    The CAS number of "Praseodymium oxide" is not found in the PubChem database, <br>
    but it can be found in Sigma-Aldrich (which refers to a compound name in PubChem -> "Praseodymium (III, IV) oxide"). <br>
    <strong>Change the name to make it searchable in PubChem...</strong> 
</div>

In [None]:
# Reference products for which num_matches is still 0.
# The former `e, *_ = NO_MATCH_PRODS[0].split(", ")` line was removed: its result
# was never used and it raised IndexError whenever the list was empty.
NO_MATCH_PRODS = list(df_pubchem_prop[df_pubchem_prop.num_matches == 0].referenceProduct)
print("{} not matched reference products".format(len(NO_MATCH_PRODS)))
NO_MATCH_PRODS

### Try cirpy module (should be used in combination with pubchempy?)

In [None]:
import cirpy as crp
import pubchempy as pcp
from cirpy import Molecule

In [None]:
# cirpy: resolve the identifier "Anhydrite" into its "iupac_name" representation
# NOTE(review): presumably performs a web request to the resolver service — confirm
crp.resolve("Anhydrite", "iupac_name")

In [None]:
# cirpy: same kind of lookup for "Krypton", using `query` instead of `resolve`
# NOTE(review): how the return value differs from `resolve` is not visible here — check the cirpy docs
crp.query("Krypton", "iupac_name")

In [None]:
# Look up a compound in PubChem, passing an EINECS identifier in the "name" namespace
c = pcp.get_compounds("EINECS 222-037-3", namespace="name", searchtype=None, as_dataframe=False)
# Guard against an empty result so that c[0] cannot raise IndexError
if c:
    print(c[0].molecular_formula)
    print(c[0].molecular_weight)
else:
    print("No PubChem compound found for 'EINECS 222-037-3'")

In [None]:
# Build a cirpy Molecule from the name "Nylon 6/6" and print a few of its attributes
# NOTE(review): each attribute access presumably triggers a remote lookup — confirm
mol = Molecule("Nylon 6/6")
print(mol.cas)
print(mol.formula)
print(mol.mw)
print(mol.image_url)
print(mol.names)

In [None]:
# Look up a compound in PubChem, passing the identifier "52349-42-5" in the "name" namespace
c = pcp.get_compounds("52349-42-5", namespace="name", searchtype=None, as_dataframe=False)
# Guard against an empty result so that c[0] cannot raise IndexError
if c:
    print(c[0].molecular_formula)
    print(c[0].molecular_weight)
else:
    print("No PubChem compound found for '52349-42-5'")

In [None]:
# Row labels of df_raw_data whose referenceProduct is listed in NO_MATCH_PRODS.
# A vectorised `isin` replaces the per-row Python loop with list membership tests.
is_unmatched = df_raw_data.referenceProduct.isin(NO_MATCH_PRODS)
indices_NO_MATCH_PRODS = df_raw_data.index[is_unmatched].to_list()
# The message now names the frame that is actually inspected (df_raw_data)
print("{} entries in df_raw_data with not matched reference products".format(len(indices_NO_MATCH_PRODS)))
# indices_NO_MATCH_PRODS

In [None]:
# List the column labels of the raw data
df_raw_data.columns

In [None]:
# Metadata columns of the raw-data rows whose reference product has no match,
# ordered by production volume and then by category
df_to_explore = df_raw_data.loc[indices_NO_MATCH_PRODS, METADATA].sort_values(
    by=["referenceProduct_prodVolume", "category"]
)

# Collect the Activity of every row kept by filter_by_geo_and_fu (geo="GLO",
# funit="kg") whose CPC class, read as text, starts with 33, 34, 35 or 36
cpc_prefixes = ("33", "34", "35", "36")
glo_kg_rows = internal_funcs.filter_by_geo_and_fu(df_to_explore, geo="GLO", funit="kg")
lst_temp = [
    df_to_explore.Activity[row]
    for row in glo_kg_rows.index
    if str(df_to_explore.referenceProduct_CPCclass[row]).startswith(cpc_prefixes)
]
sorted(lst_temp)

In [None]:
# Number of entries collected in lst_temp
len(lst_temp)

In [None]:
# Reference product whose free-text comment columns are inspected
# find = 'Stone meal'
# find = "Horn meal"
find = "Polyvinylfluoride, dispersion"

# Filter once, then print each comment column as a list followed by a blank line
rows_found = df_raw_data[df_raw_data.referenceProduct == find]
comment_columns = [
    "activity_comment",
    "inline_comment",
    "referenceProduct_prodVolumeComment",
    "referenceProduct_priceComment",
    "activity_generalComment",
]
for column in comment_columns:
    print(list(rows_found[column]), "\n")

rows_found

## Created ``df_analysis``, ``df_metadata`` and ``df_methods``

In [None]:
# Attach the PubChem properties to the raw data (left join on product name and CAS number)
df_analysis = pd.merge(
    df_raw_data,
    df_pubchem_prop,
    how="left",
    on=["referenceProduct", "referenceProduct_casNumber"],
)

# (!) update the list of non-method column labels:
# labels that exist only in the PubChem table are appended to METADATA;
# filtering them out first keeps the list free of duplicates when the cell is re-run
cols_from_pubchem_prop = df_pubchem_prop.columns.difference(df_raw_data.columns)
metadata_without_pubchem = [
    label for label in METADATA if label not in cols_from_pubchem_prop
]
METADATA = metadata_without_pubchem + cols_from_pubchem_prop.to_list()

# Keep only the metadata columns followed by the LCIA method columns
df_analysis = df_analysis.loc[:, METADATA + METHODS]

# Do not truncate the column list when frames are displayed
pd.set_option("display.max_columns", None)

print(f"Created **df_analysis** dataframe is of {df_analysis.shape} shape.\n")
df_analysis.sample(3)

In [None]:
# Wrap the METADATA and METHODS label lists in single-column frames for later export
df_metadata = pd.DataFrame({"METADATA": METADATA})
df_methods = pd.DataFrame({"METHODS": METHODS})

# OUTPUTS: Export data to excel

In [None]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "extended-data-chosen-methods-metadata-pubchem-properties.xlsx"

# One description line per sheet; joined with newlines into the readme text
readme_lines = [
    "Sheet1: Extended data with chosen LCIA methods, important metadata and PubChem properties",
    "METADATA: list of relevant metadata used in Sheet1.",
    "METHODS: list of LCIA methods used in Sheet1.",
    "[METHODS + METADATA have to be the only column labels in Sheet1]",
]
df_readme = make_readme_info(excelName, "\n".join(readme_lines))

# Sheet name -> frame written to that sheet
sheets_to_write = {
    "Sheet1": df_analysis,
    "METADATA": df_metadata,
    "METHODS": df_methods,
}

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write=sheets_to_write,
    readme_info=("readme", df_readme),
)