<div class="alert alert-block alert-info">
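    This script <b>filters out chemicals that were not found in the PubChem database</b>, i.e. it keeps only the rows satisfying <code>pubchem_match != "No match"</code> (implemented below as <code>num_matches != 0</code>).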
    <hr> 
    Note: <br>
    <i><b>Input file name(s) and metadata</b></i> (if available) are <b>printed below 👇🏼, in the 'INPUTS: Identify file(s) and read data to df' section.</b>
</div>

In [None]:
# Optional IPython magics for inspecting the session; intentionally left
# commented out — uncomment a line to run it interactively.
# %env
# %who_ls
# %who
# %who int
# %pinfo <var name>

# Imports

In [None]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import os

import project_path  # makes possible the access to `src` directory using relative path
from src.data import internal_funcs
from src.utils import explore_dir, make_readme_info
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

# INPUTS: Identify file(s) and read data to df

In [None]:
# Explore the directory to find the input file(s).
# The path is assembled with os.path.join rather than a hard-coded Windows
# string: on Windows this yields the same "..\data\interim" as before, while on
# Linux/macOS it yields "../data/interim" (a backslash is not a path separator
# there, so the hard-coded form pointed at a non-existent location).
# `explore_dir` returns two values, unpacked here as the inputs directory and
# the list of files it found (semantics defined in src.utils — see there).
inputs_dir, files_list = explore_dir(
    path_to_dir=os.path.join("..", "data", "interim"),
    file_extension="xlsx",
    print_files_list=True,
)

In [None]:
# Read data: the main table plus three auxiliary sheets, all from one workbook.
input_workbook = "data-extended-added-transgression-pbs.xlsx"

df_base_full = r_excel(inputs_dir, input_workbook)
shape_label = "df_base_full".ljust(40, ".")
shape_value = f"{df_base_full.shape}\n".rjust(13, ".")
print(shape_label, shape_value)


def _read_sheet_as_list(sheet_name):
    """Load sheet `sheet_name` from the input workbook and return its values as a list.

    The object returned by `r_excel` is indexed by `sheet_name` and converted
    with `.values.tolist()`; the readme printout is suppressed.
    """
    loaded = r_excel(
        inputs_dir, input_workbook, sheets=sheet_name, show_readme=False
    )
    return loaded[sheet_name].values.tolist()


# Get list of LCIA methods, TL PBs and list of metadata
METHODS = _read_sheet_as_list("METHODS")
TL_METHODS = _read_sheet_as_list("TL_METHODS")
METADATA = _read_sheet_as_list("METADATA")

# Operations

## Filter out activities not found in PubChem
<div class="alert alert-block alert-info">
    created: <code>df_base_full_in_pubchem</code>  
</div>

In [None]:
# Keep only the rows whose `num_matches` value is different from 0.
# NOTE(review): presumably `num_matches == 0` marks chemicals without a PubChem
# match — confirm against the step that creates this column. Rows where
# `num_matches` is missing (NaN) compare as != 0 and are therefore kept.
df_base_full_in_pubchem = df_base_full[df_base_full["num_matches"] != 0]

print(
    "Created **df_base_full_in_pubchem** dataframe is of {} shape.\n".format(
        df_base_full_in_pubchem.shape
    )
)

# Preview up to two rows. The sample size is capped at the number of available
# rows (DataFrame.sample raises ValueError when asked for more rows than exist),
# and the seed is fixed so "Restart & Run All" always shows the same rows.
df_base_full_in_pubchem.sample(
    n=min(2, len(df_base_full_in_pubchem)), random_state=0
)

In [None]:
# Group by 'category_regrouped' and plot the size of each group on a barh plot
# (done in a single call to the project helper `internal_funcs.plot_categories`).
plot_options = {
    "groupby": "category_regrouped",
    "color": "darkorange",
    "fontsize": 12,
}
internal_funcs.plot_categories(df_in=df_base_full_in_pubchem, **plot_options)

In [None]:
# Wrap METADATA, METHODS and TL_METHODS in DataFrames (one column each, named
# after the list) so they can be exported as separate sheets later on.
df_metadata, df_methods, df_tl_methods = (
    pd.DataFrame(values, columns=[label])
    for label, values in (
        ("METADATA", METADATA),
        ("METHODS", METHODS),
        ("TL_METHODS", TL_METHODS),
    )
)

# OUTPUTS: Export data to excel

In [None]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "data-full-only-in-pubchem.xlsx"

df_readme = make_readme_info(
    excelName,
    "Sheet1: Dataframe with columns as specified below, "
    "filtered only for chemicals found in PubChem database."
    "\nMETADATA: list of relevant metadata used in Sheet1."
    "\nMETHODS: list of LCIA methods used in Sheet1."
    "\nTL_METHODS: list of TL in PBs used in Sheet1."
    "\n[METHODS + METADATA + TL_METHODS have to be the only column labels in Sheet1]",
)

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={
        "Sheet1": df_base_full_in_pubchem,
        "METADATA": df_metadata,
        "METHODS": df_methods,
        "TL_METHODS": df_tl_methods,
    },
    readme_info=("readme", df_readme),
    #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)