> This script is TEMPORARY. It holds steps 0.40 to 0.83 <br>
> Sources:
  
> Input file `data-regrouped-cpc-divisions-into-3-categories.xlsx` containing:
>  * Sheet1: Extended data with chosen LCIA methods, important metadata and PubChem properties
>  * METADATA: list of relevant metadata used in Sheet1.
>  * METHODS: list of LCIA methods used in Sheet1.

Note:
> `data-regrouped-cpc-divisions-into-3-categories.xlsx` file was generated in `0.31-vt-regroup-cpc-divisions-into-3-categories.ipynb`

In [None]:
# %env
# %who_ls
# %who
# %who int
# %pinfo <var name>

# Imports

In [None]:
%config IPCompleter.use_jedi = False # disable jedi autocompleter (https://stackoverflow.com/a/65734178/14485040)

import project_path  # makes possible the access to `src` directory using relative path
from src.data import (
    create_glo_market,
    filter_dataframe,
    internal_funcs,
    outlier_detectors,
)
from src.utils import explore_dir, make_readme_info
from src.utils import read_excel_to_pandas as r_excel
from src.utils import set_outputs_dir
from src.utils import write_pandas_to_excel as w_excel

%run init_nb.ipynb

# INPUTS: Identify file(s) and read data to df

In [None]:
# Explore the directory to find the file(s)
inputs_dir, files_list = explore_dir(
    path_to_dir=r"..\data\interim", file_extension="xlsx", print_files_list=True
)

In [None]:
# Read data
df_cpc33to36 = r_excel(
    inputs_dir, "data-regrouped-cpc-divisions-into-3-categories.xlsx"
)
print(
    "df_cpc33to36".ljust(40, "."),
    f"{df_cpc33to36.shape}\n".rjust(13, "."),
)

# Get list of LCIA methods and list of metadata
METHODS = r_excel(
    inputs_dir,
    "data-regrouped-cpc-divisions-into-3-categories.xlsx",
    sheets="METHODS",
    show_readme=False, 
)["METHODS"].values.tolist()

METADATA = r_excel(
    inputs_dir,
    "data-regrouped-cpc-divisions-into-3-categories.xlsx",
    sheets="METADATA",
    show_readme=False, 
)["METADATA"].values.tolist()

In [None]:
pd.options.display.max_columns = None

# Operations

## Create mass allocated GLO markets from non-GLO markets
<div class="alert alert-block alert-info">
created: <code>df_analysis_extended</code>
</div>

In [None]:
df_analysis_extended = create_glo_market(
    df_in=df_cpc33to36,
    columns_to_allocate=METHODS,
    activity_column="Activity",
    refprod_column="referenceProduct",
    geo_column="geo",
    prodvol_column="referenceProduct_prodVolume",
    comment_column="activity_generalComment",
)

print("Created **df_analysis_extended** dataframe is of {} shape.".format(df_analysis_extended.shape))
df_analysis_extended.tail(2)

internal_funcs.plot_categories(
    df_analysis_extended,
    groupby="category_regrouped",
    cutoff_value=0,
    color="purple",
    fontsize=12,
)

### Checks!

#### -- export to excel (activate if required)

In [None]:
# ## Export dataframe to excel
# excelName = "df_to_analyze_extended_GLOmarkets.xlsx"

# df_readme = readme_data(
#     excelName,
#     "Dataframe similar to df_to_analyze.xlsx [(selected) metadata of chemical markets and scores for multiple LCIA methods], but with additional GLO chemical markets, obtained from mass allocation of respective non-GLO markets.",
# )

# writedf_to_Excel(
#     path_to_file=outputsDir,
#     filename=excelName,
#     sheetname_and_data={"Sheet1": df_analysis_extended}, 
#     readme_data={"readme":df_readme},
# #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
# #     startrow=0
# )

#### Checks

## Created dfs to work with: `df_base_full` and `df_base_full_wCAS`

#### create df_base only with GLO markets and individual FU=kg

In [None]:
# Keep only the global (GLO) markets whose reference product unit is kg.
is_global_market = df_analysis_extended["geo"] == "GLO"
is_per_kg = df_analysis_extended["referenceProductUnit"] == "kg"
df_base = df_analysis_extended[is_global_market & is_per_kg]

print(f"Created **df_base** dataframe is of {df_base.shape} shape.\n")

# Group by 'category_regrouped' and plot the size of each group on a barh plot.
internal_funcs.plot_categories(
    df_in=df_base,
    groupby="category_regrouped",
    cutoff_value=8,
    color="blue",
    fontsize=12,
)

In [None]:
# Products excluded from the analysis
internal_funcs.excluded_products(
    df_raw=df_analysis_extended, # or =df_analysis_extended, (both have the same nº of unique products)
    df_filtered=df_base
)

#### add transgression levels (TLs) to df_base
<div class="alert alert-block alert-warning">
created: <strong>df_base_full and df_base_full_wCAS</strong>  
</div> 

In [None]:
# Column labels of the transgression levels (TLs): "TL in <method>" for every
# entry of `lst_methods` except the first one.
# NOTE(review): `lst_methods` is not defined in the visible cells (only
# `METHODS` is) -- confirm that it is provided by `init_nb.ipynb`.
lst_methods_TLs = ["TL in " + sub for sub in lst_methods[1:]]
lst_methods_TLs

##### create `df_base_full` with ALL activities

In [None]:
# Build `df_base_full`: the columns of `df_base` followed, side by side
# (axis=1), by the columns returned by `calculate_TL_PBs`, each renamed with
# the "TL in " prefix.
# The TLs are requested for every entry of `lst_methods` except the first one.
# NOTE(review): `calculate_TL_PBs` is not imported in the visible cells --
# presumably it is provided by `init_nb.ipynb`; confirm.
# The commented-out keyword arguments are alternative settings kept by the
# author for manual toggling.
df_base_full = pd.concat(
    [
        df_base,
        calculate_TL_PBs(
            df_base,
            method_labels=lst_methods[1:],
            price_column="referenceProduct_price",
            GVA_world = 7.38e13, # in 2018
            correctGVA=None,
#             correctGVA="sales",
#             correctGVA="purchases",
#             share_of_SOS=0.0689, # aggregated shares of 4 sectors (C19-22) using GGG method
#             share_of_SOS=0.0237, # only C20 sector using GGG method
#             share_of_SOS=0.0274, # aggregated shares of 4 sectors (C19-22) using WIOD with L inverse
#             share_of_SOS=0.0076, # only C20 sector using WIOD with L inverse
        ).add_prefix("TL in "),
    ],
    axis=1,
)
# Report the shape and show the last five rows as the cell output.
print("Created **df_base_full** dataframe is of {} shape.\n".format(df_base_full.shape))
df_base_full.tail(5)

In [None]:
# Convert prices from EUR2005 to USD2018
# (the unit in which the TLs were calculated !!!)
PPI_2018 = 104.5  # Producer Price Index from Eurostat
PPI_2005 = 86.0  # Producer Price Index from Eurostat
USD_per_EUR_2018 = 1.1811  # average exchange rate EUR to USD in 2018

# Convert only the rows that are not labelled "USD2018" yet, so that running
# this cell a second time does not apply the conversion factor twice.
# NOTE(review): assumes no row carries the "USD2018" label before the first
# run of this cell -- confirm against the input data.
needs_conversion = df_base_full["referenceProduct_priceUnit"] != "USD2018"
converted_price = (
    df_base_full["referenceProduct_price"] * PPI_2018 / PPI_2005
) * USD_per_EUR_2018
df_base_full["referenceProduct_price"] = converted_price.where(
    needs_conversion, df_base_full["referenceProduct_price"]
)
df_base_full["referenceProduct_priceUnit"] = "USD2018"
df_base_full.sample(2)

In [None]:
# Grouping by 'category_regrouped' and ploting the size of each group on a barh plot (in one line)
internal_funcs.plot_categories(
    df_in=df_base_full,
    groupby="category_regrouped",
    color="green",
    fontsize=12,
)

In [None]:
df_base_full[df_base_full.referenceProduct == 'Cyclic N-compound']

In [None]:
# ## Export dataframe to excel
# excelName = "df_base_full.xlsx"

# df_readme = readme_data(
#     excelName,
#     "Dataframe df_base_full - all GLO markets with PBs and TLs.",
# )

# writedf_to_Excel(
#     path_to_file=outputsDir,
#     filename=excelName,
#     sheetname_and_data={"Sheet1": df_base_full}, 
#     readme_data={"readme":df_readme},
# #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
# #     startrow=0
# )

##### create `df_base_full_wCAS` ONLY with activities detected in PubChem

In [None]:
df_base_full_wCAS = df_base_full[df_base_full.num_matches != 0]

print("Created **df_base_full_wCAS** dataframe is of {} shape.\n".format(df_base_full_wCAS.shape))
df_base_full_wCAS.sample(2)

In [None]:
# Grouping by 'category_regrouped' and ploting the size of each group on a barh plot (in one line)
internal_funcs.plot_categories(
    df_in=df_base_full_wCAS,
    groupby="category_regrouped",
    color="darkorange",
    fontsize=12,
)

## `highlighted_product` list

### research possible highlighted products

In [None]:
lst_known_chemicals = [
    "Toluene",
    "Xylene",
    # Javier's list below
    "Liquefied petroleum gas",  # ok
    "Petrol",  # "Gasoline", # ok
    "Diesel",  # ok
    "Kerosene",  # ok
    "Ethylene",  # ok
    "Propylene",  # ok
    "Benzene",  # ok
    "Synthetic gas",  # FU 1m3
    "Ammonia, liquid",  # ok
    "Methanol",  # ok
    "Sulfuric acid",  # ok
    "Chlorine",  # ok
    "Acetic acid",  # ok
    "Formaldehyde",  # ok
    "Urea",  # ok
    "Ethylene oxide",  # ok
    "Acrylonitrile",  # ok
    "Acetaldehyde",  # ok
    "Polyethylene",  # ok
    "Polypropylene",  # ok
    "Polyvinylchloride",  # ok
    "Hydrogen",  # ok
]

for item in lst_known_chemicals:
    print("Looking for " + item)
    filter_dataframe(
        df_analysis_extended,
        col_name="referenceProduct",
        filter_in=[item],
        print_unique=True,
    )

### selected products

In [None]:
# Exact names only!

highlighted_product = [
    "Sulfuric acid",
    "Kerosene",
    "Diesel, low-sulfur", # or "Diesel",
    "Liquefied petroleum gas",
    "Methanol",
    "Petrol, low-sulfur",
    "Formaldehyde",
    "Chlorine, liquid",
    "Ethylene, average",
    "Propylene",
    "Toluene, liquid",
    "Acetic acid, without water, in 98% solution state",
    "Acetaldehyde",
    "Polyethylene, high density, granulate",
    "Benzene",
    "Ammonia, liquid",
    "Polypropylene, granulate",
    "Ethylene oxide",
    "Polyvinylchloride, bulk polymerised",
    "Hydrogen, liquid",
    "Acrylonitrile",
    "Urea, as N",
    "1-propanol",
    "Acetylene",
    "Chlorotoluron",
    "Methylene diphenyl diisocyanate",
    "Ammonium nitrate, as N",
    "Pyridine",
    "Nylon 6-6",
    "Glyphosate",
    "Para-phenylene diamine",
    "Fluorine, liquid",
    "Adipic acid",
    "Xylene"
]

# Label of the GWP score column, used both for selection and for sorting.
gwp_column = "('IPCC 2013 GWP 100a V1.03', 'IPCC GWP 100a', 'kg CO2 eq')"

# Columns shown for the highlighted products.
# (other candidates: "category", "referenceProduct_prodVolume", "complexity",
#  lst_metadata, lst_methods)
columns_to_show = [
    "Activity",
    "referenceProduct",
    "geo",
    "category_regrouped",
    "referenceProduct_CPCclass",
    gwp_column,
    "MF",
]

# Look the highlighted products up in `df_base_full`
# (swap in `df_base_full_wCAS` to restrict the search to that dataframe).
found_products = internal_funcs.find_chemicals(
    df_base_full,
    highlighted_product,
    colname="referenceProduct",
)
selected = found_products[columns_to_show]

# Show the GLO entries ordered by their GWP score.
selected[selected["geo"] == "GLO"].sort_values(by=gwp_column)

In [None]:
# Make df of highlighted_product for later export
df_highlighted_product = pd.DataFrame(highlighted_product, columns=["highlighted_product"])
# df_highlighted_product

### -- export to excel (activate if required)

In [None]:
# ## Export dataframe to excel
# excelName = "df_GLO_markets.xlsx"

# df_readme = readme_data(
#     excelName,
#     "Filtered dataframe includes only GLO chemical markets with FU = 1kg",
# )

# writedf_to_Excel(
#     path_to_file=outputsDir,
#     filename=excelName,
#     sheetname_and_data={"Sheet1": df_base}, 
#     readme_data={"readme":df_readme},
# #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
# #     startrow=0
# )

## Filtering outliers
<div class="alert alert-block alert-info">
use previously created:  <br>
    <strong>df_base_full</strong> - only GLO, kg markets from CPC 33-36  <br>
    <strong>df_base_full_wCAS</strong> - only GLO, kg markets from CPC 33-36 with identified chemical properties (could be refined!)
</div>

### apply Mahalanobis Distance method to detect outliers

In [None]:
# Dataframe screened for outliers
# (swap in `df_base_full` to screen that dataframe instead).
df_to_detect_MDm = df_base_full_wCAS

# The detector receives every entry of `lst_methods`, every TL column and the price.
columns_for_MDm = lst_methods + lst_methods_TLs + ["referenceProduct_price"]
mahalanobis_result = outlier_detectors.mahalanobis_method(
    df_raw=df_to_detect_MDm[columns_for_MDm],
    alpha=(1 - 0.95),
)
(
    df_clean_metNtlNpr_MDm,
    df_outliers_metNtlNpr_MDm,
    more_metNtlNpr_MDm,
) = mahalanobis_result

print(f"out of {len(df_to_detect_MDm)} items")
print(f"{len(more_metNtlNpr_MDm[0])} outliers detected")

# Show the detected outliers with a few identifying columns.
df_outliers_full_MDm = outlier_detectors.make_full_df_after_outlier_detection_method(
    df_to_detect_MDm, df_outliers_metNtlNpr_MDm
)
df_outliers_full_MDm[
    [
        "referenceProduct",
        lst_methods[0],
        lst_methods_TLs[0],
        "referenceProduct_price",
    ]
]

In [None]:
# df_base_full_wCAS[
#     df_base_full_wCAS["('IPCC 2013 GWP 100a V1.03', 'IPCC GWP 100a', 'kg CO2 eq')"]
#     > 1000
# ]

In [None]:
# Grouping by 'category_regrouped' and ploting the size of each group on a barh plot (in one line)

df_base_full_wCAS_woOutliersMDk20a5 = outlier_detectors.make_full_df_after_outlier_detection_method(
    df_to_detect_MDm, df_clean_metNtlNpr_MDm
)

internal_funcs.plot_categories(
    df_in=df_base_full_wCAS_woOutliersMDk20a5,
    groupby="category_regrouped",
    color="darkorange",
    fontsize=12,
)

### apply Robust Mahalanobis Distance method to detect outliers

In [None]:
# Dataframe screened for outliers
# (swap in `df_base_full` to screen that dataframe instead).
df_to_detect_RMDm = df_base_full_wCAS

# Only the TL columns are passed to the detector here
# (`lst_methods` and "referenceProduct_price" are deliberately left out).
robust_result = outlier_detectors.robust_mahalanobis_method(
    df_to_detect_RMDm[lst_methods_TLs],
    alpha=(1 - 0.95),
    support_fraction=None,
)
(
    df_clean_metNtlNpr_RMDm,
    df_outliers_metNtlNpr_RMDm,
    more_metNtlNpr_RMDm,
) = robust_result

print(f"out of {len(df_to_detect_RMDm)} items")
print(f"{len(more_metNtlNpr_RMDm[0])} outliers detected")

# Show the detected outliers with a few identifying columns.
df_outliers_full_RMDm = outlier_detectors.make_full_df_after_outlier_detection_method(
    df_to_detect_RMDm, df_outliers_metNtlNpr_RMDm
)
df_outliers_full_RMDm[
    [
        "referenceProduct",
        lst_methods[0],
        lst_methods_TLs[0],
        "referenceProduct_price",
    ]
]

In [None]:
# Rows of `df_base_full_wCAS` selected with the first element returned in
# `more_metNtlNpr_RMDm` by the robust Mahalanobis method, restricted to the
# 1st and 5th entries of `lst_metadata` plus all TL columns.
# NOTE(review): `.loc` is label-based -- confirm that `more_metNtlNpr_RMDm[0]`
# holds index labels of `df_base_full_wCAS` and not positional indices.
# NOTE(review): `lst_metadata` is not defined in the visible cells (only
# `METADATA` is) -- confirm that it is provided by `init_nb.ipynb`.
df_outliers = df_base_full_wCAS.loc[more_metNtlNpr_RMDm[0],:][
    lst_metadata[0:1]
    + lst_metadata[4:5]
#     + ["referenceProduct_price"]
#     + lst_methods[0:1]
    + lst_methods_TLs
]
# df_outliers

In [None]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "list-outliers.xlsx"

df_readme = make_readme_info(
    excelName,
    "Sheet1: Dataframe of chemicals detected as outliers using the robust MD method",
)

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={
        "Sheet1": df_outliers,
    },
    readme_info=("readme", df_readme),
    #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)

In [None]:
# Grouping by 'category_regrouped' and ploting the size of each group on a barh plot (in one line)
df_base_full_wCAS_woOutliersRMDk9a5 = outlier_detectors.make_full_df_after_outlier_detection_method(
    df_to_detect_RMDm, df_clean_metNtlNpr_RMDm
)
internal_funcs.plot_categories(
    df_in=df_base_full_wCAS_woOutliersRMDk9a5,
    groupby="category_regrouped",
    color="darkorange",
    fontsize=12,
)

In [None]:
internal_funcs.plot_categories(
#     df_in=df_base_full_wCAS_woOutliersRMDk9a5,
#     df_in=df_analysis,
#         df_in=df_base_full, 
#     df_in=df_base_full_wCAS,
    df_in=df_cpc33to36, 
    groupby="activity_ISICclass",
    cutoff_value=10,
    color="gray",
    fontsize=12,
)

In [None]:
# df_cpc33to36[df_cpc33to36.activity_ISICclass=="1920:Manufacture of refined petroleum products"]

In [None]:
# sorted(df_base_full_wCAS_woOutliersRMDk9a5.activity_ISICclass.unique())
# sorted(df_base_full_wCAS.activity_ISICclass.unique())
# sorted(df_analysis.activity_ISICclass.unique())
sorted(df_base_full.activity_ISICclass.unique())

In [None]:
# df_base_full_wCAS_woOutliersRMDk9a5.shape
df_base_full_wCAS.shape

In [None]:
# df_base_full_wCAS_woOutliersRMDk9a5[
#     df_base_full_wCAS_woOutliersRMDk9a5.activity_ISICclass.isin(
# df_base_full_wCAS[
#     df_base_full_wCAS.activity_ISICclass.isin(
df_base_full[
    df_base_full.activity_ISICclass.isin(
        [
            '0891:Mining of chemical and fertilizer minerals',
#             "2011:Manufacture of basic chemicals", # 361
#             "2011a: Manufacture of nuclear fuels", # 5
#             "2012:Manufacture of fertilizers and nitrogen compounds", # 11
#             "2013:Manufacture of plastics and synthetic rubber in primary forms", # 37
#             "2021:Manufacture of pesticides and other agrochemical products", # 20
#             "2023:Manufacture of soap and detergents, cleaning and polishing preparations, pe", # 7
#             "2029:Manufacture of other chemical products n.e.c.", # 4
#             "20:Manufacture of chemicals and chemical products", # 2
        ]
    )
]#.shape

### apply Tukey method to detect outliers (univariate)

In [None]:
# Dataframe screened for outliers
# (swap in `df_base_full` to screen that dataframe instead).
df_to_detect_Tm = df_base_full_wCAS

columns_for_Tm = lst_methods + lst_methods_TLs + ["referenceProduct_price"]
df_clean_tukey, df_outliers_tukey = outlier_detectors.tukey_method_bulk(
    df_to_detect_Tm[columns_for_Tm],
    outlier_detection_fence="tight",
)

print(f"out of {len(df_to_detect_Tm)} items")
print("Tukey univariate method detected:")
# One line per screened column: number of non-missing entries in the outlier table.
for column, n_outliers in df_outliers_tukey.count().items():
    print(f"in  {column} >>>>>  {n_outliers} outliers.")

### apply Mahalanobis Distance method to detect outliers depending on CATEGORY

In [None]:
# chemical_cat = "Organic chemical"
chemical_cat = "Inorganic chemical"
# chemical_cat = "Other chemical"
# df_to_detect_MDmCAT = df_base_full[df_base_full.category_regrouped==chemical_cat]
df_to_detect_MDmCAT = df_base_full_wCAS[
    df_base_full_wCAS.category_regrouped == chemical_cat
]

(
    df_clean_metNtlNpr_MDmCAT,
    df_outliers_metNtlNpr_MDmCAT,
    more_metNtlNpr_MDmCAT,
) = outlier_detectors.mahalanobis_method(
    df_to_detect_MDmCAT[lst_methods + lst_methods_TLs + ["referenceProduct_price"]],
    alpha=(1 - 0.95),
)
print("out of", df_to_detect_MDmCAT.shape[0], "items")
print(len(more_metNtlNpr_MDmCAT[0]), "outliers detected")

outlier_detectors.make_full_df_after_outlier_detection_method(
    df_to_detect_MDmCAT, df_outliers_metNtlNpr_MDmCAT
)[
    ["referenceProduct"]
    + [lst_methods[0]]
    + [lst_methods_TLs[0]]
    + ["referenceProduct_price"]
]

# df_clean_metNtlNpr_MDmCAT.sort_values(by="('IPCC 2013 GWP 100a V1.03', 'IPCC GWP 100a', 'kg CO2 eq')").tail(5)

In [None]:
df_base_full_wCAS.loc[162:163]

# Insight from the data

In [None]:
df_for_insights = df_base_full_wCAS_woOutliersRMDk9a5

In [None]:
lst_methods_TLs

In [None]:
num_chemicals_tot = df_for_insights.shape[0]
num_chemicals_tot

#### * functions

In [None]:
def ecdf(ar):
    """Create df with empirical CDF data.

    eCDF - empirical Cumulative Distribution Function

    Parameters
    ----------
    ar: 1-D array-like
        Input data. Must expose a ``name`` attribute (e.g. a pandas Series),
        which is used as the label of the values column.

    Returns
    -------
    df_out: DataFrame
        One row per unique value of ``ar``, in ascending order. The index holds
        the 0-based position of the first occurrence of each unique value in
        ``ar`` (as returned by ``np.unique``), NOT the index label of the input.
        Columns:
        - column, named as the passed data, containing the sorted unique items,
        - column, named "counts", containing the number of times each unique item appears,
        - column, named "cumsum", containing the cumulative sum of counts,
        - column, named "Probability", containing the cumulative probability of occurrence of each item.
        An empty input yields an empty DataFrame with the same columns.
    """
    x, indices, counts = np.unique(ar, return_index=True, return_counts=True)
    cusum = np.cumsum(counts)
    # Guard the empty case: `cusum[-1]` would raise IndexError without data.
    prob = cusum / cusum[-1] if cusum.size else cusum.astype(float)
    df_out = pd.DataFrame(
        {ar.name: x, "counts": counts, "cumsum": cusum, "Probability": prob},
        index=indices,
    )

    return df_out

In [None]:
import functools
def conjunction(*conditions):
    """Combine conditions element-wise with logical AND (all met at the same time).

    Parameters
    ----------
    *conditions: array-like of bool
        One or more boolean masks; they are folded left to right with
        ``np.logical_and``. A single condition is returned unchanged.

    Returns
    -------
    The combined boolean mask.

    Raises
    ------
    TypeError
        If no condition is passed.
    """
    if not conditions:
        # Same exception type as the bare `reduce` call, with a clearer message.
        raise TypeError("conjunction() requires at least one condition")
    return functools.reduce(np.logical_and, conditions)

def disjunction(*conditions):
    """Combine conditions element-wise with logical OR (any condition met).

    Parameters
    ----------
    *conditions: array-like of bool
        One or more boolean masks; they are folded left to right with
        ``np.logical_or``. A single condition is returned unchanged.

    Returns
    -------
    The combined boolean mask.

    Raises
    ------
    TypeError
        If no condition is passed.
    """
    if not conditions:
        # Same exception type as the bare `reduce` call, with a clearer message.
        raise TypeError("disjunction() requires at least one condition")
    return functools.reduce(np.logical_or, conditions)

## % of chemicals transgressing at least one PB?

In [None]:
# A chemical transgresses a PB when its transgression level (TL) is above 1.
# All TL columns are evaluated at once, so the check follows `lst_methods_TLs`
# whatever its length instead of relying on nine hard-coded conditions.
transgresses_any_PB = (df_for_insights[lst_methods_TLs] > 1).any(axis=1)

num_chemicals_trans_at_least_onePB = int(transgresses_any_PB.sum())
print(
    "{} of the chemicals transgress at least one PB, i.e., {}% of the dataset.".format(
        num_chemicals_trans_at_least_onePB,
        round(num_chemicals_trans_at_least_onePB / num_chemicals_tot * 100, 2),
    )
)

# df_for_insights[transgresses_any_PB]

## % of chemicals that are absolutely sustainable?

In [None]:
# A chemical is absolute sustainable when none of its transgression levels
# (TLs) is above 1. All TL columns are evaluated at once, so the check follows
# `lst_methods_TLs` whatever its length instead of nine hard-coded conditions.
within_all_PBs = (df_for_insights[lst_methods_TLs] <= 1).all(axis=1)

num_chemicals_abs_sustainable = int(within_all_PBs.sum())
print(
    "{} of the chemicals are absolute sustainable ({}% of the dataset),"
    " i.e., they don't transgress any of the PBs.".format(
        num_chemicals_abs_sustainable,
        round(num_chemicals_abs_sustainable / num_chemicals_tot * 100, 2),
    )
)

df_for_insights[within_all_PBs]

## % of chemicals transgressing all the PBs at the same time?

In [None]:
# Chemicals whose transgression level (TL) is above 1 in every PB. All TL
# columns are evaluated at once, so the check follows `lst_methods_TLs`
# whatever its length instead of relying on nine hard-coded conditions.
transgresses_all_PBs = (df_for_insights[lst_methods_TLs] > 1).all(axis=1)

num_chemicals_bad_in_allPBs = int(transgresses_all_PBs.sum())
print(
    "{} of the chemicals transgress all the PBs simultaneously ({}% of the dataset)".format(
        num_chemicals_bad_in_allPBs,
        round(num_chemicals_bad_in_allPBs / num_chemicals_tot * 100, 2),
    )
)

df_for_insights[transgresses_all_PBs]

## % of chemicals transgressing each PB? (or above/below any value of TL)

In [None]:
cat_of_interest = lst_methods_TLs[5]
df = df_for_insights[cat_of_interest]  # a Series, despite the name

df_out = ecdf(df)

# Both lists hold index labels of df_out, not of df.
# `ecdf` returns the unique scores in ascending order, so the rows within the
# boundary (TL <= 1) are the leading rows of df_out.
stored_indices = df_out.index[(df_out[cat_of_interest] <= 1).to_numpy()].tolist()
indices_duplicated_scores = df_out.index[(df_out["counts"] != 1).to_numpy()].tolist()

if stored_indices:
    # Last stored row: largest TL within the boundary and its cumulative probability.
    print_prob = df_out.loc[stored_indices[-1], "Probability"]
    print_item = df_out.loc[stored_indices[-1], cat_of_interest]
else:
    # No score within the boundary: define the values explicitly instead of
    # leaving them undefined (or stale from an earlier run of a similar cell).
    print_prob = 0.0
    print_item = np.nan

# find the chemicals in the original df (translate indices from df_out to df)
# for stored_indices:
df_from_stored_indices = df_for_insights[
    df_for_insights[cat_of_interest].isin(df_out.loc[stored_indices, cat_of_interest])
]

# for indices_duplicated_scores:
df_from_duplicated_scores = df_for_insights[
    df_for_insights[cat_of_interest].isin(
        df_out.loc[indices_duplicated_scores, cat_of_interest]
    )
]

print(
    "set of chemicals not transgressing the PB: {} of {}".format(
        df_from_stored_indices.shape[0], df_for_insights.shape[0]
    )
)
print("Probability of the set:", np.round(print_prob, 3))
print("Max value of TL included in set, max(TL)=", print_item)

print(
    "{} % of chemicals are transgressed in {}".format(
        np.round((1 - print_prob) * 100, 2), df.name
    )
)

# visualize the data (uncomment)
# df_from_duplicated_scores
df_from_stored_indices

## maximum/minimum TL for each PB

In [None]:
df_for_insights[lst_methods_TLs].max()

In [None]:
df_for_insights[lst_methods_TLs].min()

In [None]:
df_for_insights[lst_methods_TLs].max()/df_for_insights[lst_methods_TLs].min()

In [None]:
df_for_insights[df_for_insights.referenceProduct.isin(highlighted_product)][lst_methods_TLs].min()

## sort by specific TL for each PB

In [None]:
df_for_insights.sort_values(by=lst_methods_TLs[4]).tail(24)

## For any PB: which chemicals transgress? To which category do they belong?

In [None]:
# df_for_insights[df_for_insights.referenceProduct.isin(highlighted_product)]

In [None]:
# Planetary boundary (TL column) to inspect. The counts noted next to each
# option were written down by the author from an earlier run.
# ctg = lst_methods_TLs[6] # Land-system change - Global !!!! NO TRANSGRESSED CHEMICALS
# ctg = lst_methods_TLs[7] # Freshwater use - Global !!!! 7 transgressed
# ctg = lst_methods_TLs[2] # Stratospheric ozone depletion !!!! 33 transgressed
# ctg = lst_methods_TLs[4] # Biogeochemical flows - P !!!! 23 transgressed
ctg = lst_methods_TLs[5] # Biogeochemical flows - N !!!! 126 transgressed

# Chemicals whose TL is above 1 for the chosen PB (computed once, reused below).
df_transgressing = df_for_insights[df_for_insights[ctg] > 1]

print(ctg, "\n")
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df_transgressing.category_regrouped.unique())
print(df_transgressing.activity_ISICclass.unique())
# sorted(df_transgressing.referenceProduct_CPCclass.unique())

internal_funcs.plot_categories(
    df_transgressing,  # or: df_for_insights[df_for_insights[ctg].between(1, 73.8)]
#     groupby="referenceProduct_CPCclass",
    groupby="category_regrouped",
    cutoff_value=0,
)

# df_for_insights[df_for_insights[ctg].between(1, 73.8)].sort_values(by=ctg) # .tail(10)

# how many fixate N in their formula?
# Nitrogen is an "N" that is NOT followed by a lowercase letter (which would
# make it another element symbol, e.g. Na, Ni, Ne). The negative lookahead also
# covers an N followed by a charge sign or bracket (e.g. "H4N+"), which the
# previous pattern r"(.*N[A-Z0-9].*)|(.*N$)" missed.
# NOTE(review): percentages noted from earlier runs may change slightly if
# such formulas are present in the data.
N_regex = re.compile(r"N(?![a-z])")

formulas_with_N = [
    mf
    for mf in df_transgressing.sort_values(by=ctg).MF
    if N_regex.search(str(mf))
]

if len(df_transgressing) > 0:
    print(
        round(len(formulas_with_N) / len(df_transgressing) * 100, 2),
        "% fixate N directly",
    )
else:
    # Avoid a ZeroDivisionError when no chemical transgresses the chosen PB.
    print("No chemical transgresses", ctg)

In [None]:
# 38.1 % fixate N directly (48 chemicals in total)
df_trnNflow = df_for_insights[df_for_insights[ctg] > 1]
# to which classification they belong?
df_trnNflow_fixateNdirectly = df_trnNflow[df_trnNflow.MF.isin(formulas_with_N)]
df_trnNflow_fixateNdirectly
# df_trnNflow_fixateNdirectly.referenceProduct_CPCclass.unique()
# # how many of them belong to Fertilisers and pesticides (CPC: 346)?
# df_trnNflow_fixateNdirectly[
#     df_trnNflow_fixateNdirectly.referenceProduct_CPCclass.isin(
#        [ "34663: Herbicides, anti-sprouting products and plant-growth regulators", 
#         '34653: Ammonium chloride; nitrites',
#         '34662: Fungicides',]
#     )
# ]

In [None]:
df_for_insights[df_for_insights[ctg] < 0].sort_values(by=ctg)

In [None]:
# `Series.between` includes both bounds by default. The boolean form
# `inclusive=True` is deprecated since pandas 1.3 and rejected by pandas 2.x,
# so the argument is simply omitted (same result on every pandas version).
df_for_insights[df_for_insights[ctg].between(1, 73.8)].sort_values(by="category_regrouped").tail(31)

In [None]:
# list of Fertilizers and pesticides, directly related to agriculture and their TLs in BGC flows
# Group 346 of CPC: Fertilizers and pesticides.
# Vectorized selection; `na=False` skips rows without a CPC class instead of
# failing on them (a missing value has no `startswith`).
is_cpc_346 = df_for_insights["referenceProduct_CPCclass"].str.startswith(
    "346", na=False
)
df_cpc_346 = df_for_insights.loc[
    is_cpc_346,
    [
        "referenceProduct_CPCclass",
        "referenceProduct",
        "MF",
        lst_methods_TLs[5],
        lst_methods_TLs[4],
    ],
].copy()
df_cpc_346.columns = ["CPC", "refProduct", "MF", "TL in N flow", "TL in P flow"]
df_cpc_346.sort_values(by="TL in N flow")

## probability TL(EPC) > TL(GF) ?

In [None]:
TL_GF = {
    "TL in ('PBs-LCIA (baseline) V0.72', 'Climate change - CO2 concentration', 'ppm')": 15.069444444444445,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Climate change - Energy imbalance', 'Wm-2')": 14.8,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Stratospheric ozone depletion', 'DU')": 0.4827586206896552,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Ocean acidification', 'Omega Aragon')": 4.811594202898551,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Biogeochemical flows - P', 'Tg P')": 2.1111111111111107,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Biogeochemical flows - N', 'Tg N')": 2.4193548387096775,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Land-system change - Global', '%')": 1.52,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Freshwater use - Global', 'km3')": 0.65,
    "TL in ('PBs-LCIA (baseline) V0.72', 'Change in biosphere integrity - BII loss', '% BII loss')": 2.68,
}

In [None]:
cat_of_interest = lst_methods_TLs[8]
df = df_for_insights[cat_of_interest]  # a Series, despite the name

# Reference level (TL_GF) for the chosen PB; a KeyError here means that
# `cat_of_interest` has no entry in `TL_GF`.
threshold_TL_GF = TL_GF[cat_of_interest]

df_out = ecdf(df)

# Both lists hold index labels of df_out, not of df.
# `ecdf` returns the unique scores in ascending order, so the rows at or below
# the threshold are the leading rows of df_out.
stored_indices = df_out.index[
    (df_out[cat_of_interest] <= threshold_TL_GF).to_numpy()
].tolist()
indices_duplicated_scores = df_out.index[(df_out["counts"] != 1).to_numpy()].tolist()

if stored_indices:
    # Last stored row: largest TL_EPC not above TL_GF and its cumulative probability.
    print_prob = df_out.loc[stored_indices[-1], "Probability"]
    print_item = df_out.loc[stored_indices[-1], cat_of_interest]
else:
    # No score at or below TL_GF: define the values explicitly instead of
    # leaving them undefined (or stale from an earlier run of a similar cell).
    print_prob = 0.0
    print_item = np.nan

# find the chemicals in the original df (translate indices from df_out to df)
# for stored_indices:
df_from_stored_indices = df_for_insights[
    df_for_insights[cat_of_interest].isin(df_out.loc[stored_indices, cat_of_interest])
]

# for indices_duplicated_scores:
df_from_duplicated_scores = df_for_insights[
    df_for_insights[cat_of_interest].isin(
        df_out.loc[indices_duplicated_scores, cat_of_interest]
    )
]

print(
    "set of chemicals with TL_EPC lower than TL_GF ({}): {} of {}".format(
        threshold_TL_GF,
        df_from_stored_indices.shape[0],
        df_for_insights.shape[0],
    )
)
print("Probability of the set:", np.round(print_prob, 3))
print("Max value of TL_EPC included in set, max(TL_EPC)=", print_item)

print(
    "{}% of chemicals with TL_EPC > TL_GF in {}".format(
        np.round((1 - print_prob) * 100, 2),
        df.name,
    )
)

# visualize the data (uncomment)
# df_from_duplicated_scores
# df_from_stored_indices

## d

In [None]:
df_for_insights[df_for_insights.referenceProduct.isin(highlighted_product)]

In [None]:
len(highlighted_product)

# OUTPUTS: Export data to excel

In [None]:
# TEMPORAL
# Make df of lst_metadata, lst_methods_TLs and lst_methods for later export

df_metadata = pd.DataFrame(lst_metadata, columns=["lst_metadata"])
df_methods_TLs = pd.DataFrame(lst_methods_TLs, columns=["lst_methods_TLs"]) 
df_methods = pd.DataFrame(lst_methods, columns=["lst_methods"])


In [None]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "temp-df_base_full_wCAS_woOutliersRMDk9a5.xlsx"

df_readme = make_readme_info(
    excelName,
    "Temporal output from 0.30-to-0.83.ipynb. \n"
    "Is used for plotting\n"
    "Will have to be split later...",
)

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={
        "Sheet1": df_base_full_wCAS_woOutliersRMDk9a5,
        "highlighted_product": df_highlighted_product,
        "lst_metadata" : df_metadata,
        "lst_methods_TLs": df_methods_TLs,
        "lst_methods": df_methods,
    },
    readme_info=("readme", df_readme),
    #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)

In [None]:
%%time

# Set output directory
outputs_dir = set_outputs_dir(use_default=True)  # default `..\data\interim`

## Export dataframe to excel
excelName = "temp-df_base_full_wCAS.xlsx"

df_readme = make_readme_info(
    excelName,
    "Temporal output from 0.30-to-0.83.ipynb. \n"
    "Is used for plotting\n"
    "Will have to be split later...",
)

w_excel(
    path_to_file=outputs_dir,
    filename=excelName,
    dict_data_to_write={
        "Sheet1": df_base_full_wCAS,
        "highlighted_product": df_highlighted_product,
        "lst_metadata" : df_metadata,
        "lst_methods_TLs": df_methods_TLs,
        "lst_methods": df_methods,        
    },
    readme_info=("readme", df_readme),
    #     ExcelWriter_kwargs={"engine": "openpyxl", "encoding": "UTF-8"}
    #     startrow=0
)