## dataframe generation
generating data used in other plots

#### parameters

In [1]:
# Notebook parameters. Each name keeps whatever value is already bound in the
# namespace when this cell runs and only falls back to the default below when
# it is not bound yet.
try:
    pbx_E_cutoff
except NameError:
    pbx_E_cutoff = 0.5

try:
    ads_model
except NameError:
    ads_model = "gemnet_relax_90"

try:
    showfigs
except NameError:
    showfigs = True

try:
    use_premade
except NameError:
    use_premade = False

# `display` is only imported when figures are going to be shown.
if showfigs:
    from IPython.display import display

#### imports

In [2]:
import numpy as np
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns

from DOE_HER import load_data, params

from pymatgen.ext.matproj import MPRester
from tqdm import tqdm

In [4]:
from matplotlib.pyplot import rc

# Default font: sans-serif family, with Arial as the sans-serif face.
rc("font", family="sans-serif", **{"sans-serif": ["Arial"]})

In [5]:
def _unpickle_block(values, placement, ndim):
    """Build a pandas internal block from ``values``, ``placement`` and ``ndim``.

    Work around a pandas bug: this function is installed below in place of
    ``pd._libs.internals._unpickle_block`` and delegates to
    ``pandas.core.internals.blocks.new_block``.
    """
    # Imported inside the function, as in the original workaround.
    from pandas.core.internals import blocks as pd_blocks

    block = pd_blocks.new_block(values, placement, ndim=ndim)
    return block


# Install the replacement on pandas' internals module.
pd._libs.internals._unpickle_block = _unpickle_block

#### load experimental data

In [6]:
# Load the experimental data using the loader's default arguments.
# Options that were tried earlier, kept for reference:
#   expt_type='Water', drop_desc=False
exp_df = load_data.experimental_data()

In [7]:
# Natural log of MaxRate. Floating-point warnings are silenced while taking
# the log, matching what Series.apply does when handed a NumPy ufunc.
with np.errstate(all="ignore"):
    exp_df["LogRate"] = np.log(exp_df["MaxRate"])

# Keep only rows whose log-rate is finite (drops -inf, +inf and NaN).
exp_df = exp_df[np.isfinite(exp_df["LogRate"])]

#### load adsorption data

In [8]:
# Load adsorption energies with min_energy=True.
# NOTE(review): presumably min_energy keeps only the lowest-energy entry per
# material -- confirm against load_data.adsorption_energies.
# Earlier alternative, kept for reference:
#   .min_adsorption_energies(origin='gemnet_relax_90')
lc_ads_df = load_data.adsorption_energies(
    min_energy=True
)

#### load stability data

In [9]:
# Binary electrochemical stabilities at a single condition:
# V_range holds only -1.2 and pH_range holds only 8.5.
PDF_full = load_data.binary_echem_stabilities(V_range=[-1.2], pH_range=[8.5])

# Single-element systems are padded with the placeholder "X" so that every
# entry in "Components" has two members; longer chemsys values pass through.
PDF_full["Components"] = PDF_full["chemsys"].apply(
    lambda chemsys: (chemsys[0], "X") if len(chemsys) == 1 else chemsys
)
PDF_full["mpid"] = PDF_full["entry_id"]

# Solid entries only, with the now-constant "type" column removed, then
# restricted to energies below the pbx_E_cutoff parameter.
stab_df = (
    PDF_full.loc[PDF_full["type"] == "solid"]
    .drop(columns="type")
    .loc[lambda solids: solids["energy"] < pbx_E_cutoff]
)

100%|██████████| 253/253 [00:01<00:00, 241.54it/s]


#### match mpids with compositions and choose best H binding energy for each composition

In [10]:
# One row per composition, with that composition's mpids spread over the
# columns mpid_0, mpid_1, ... (shorter lists are padded with missing values).
comp_stab_df = stab_df.groupby("Components").mpid.aggregate(list)
n_mpids = comp_stab_df.apply(len).max()
comp_stab_df = pd.DataFrame(
    comp_stab_df.tolist(),
    columns=[("mpid%2i" % i).replace(" ", "_") for i in range(n_mpids)],
    index=comp_stab_df.index,
).reset_index()
# Non-null cells per row, minus one for the "Components" column itself,
# i.e. the number of mpids listed for the composition.
comp_stab_df["mat_ct"] = comp_stab_df.count(axis=1) - 1

# Attach adsorption rows to each composition once per mpid slot, then stack
# the per-slot results. Each merged frame drops its own slot column; the
# remaining slot columns are removed after the concat.
mpid_cols = [c for c in comp_stab_df.columns if "mpid" in c]
lc_ads_df = pd.concat(
    [
        comp_stab_df.merge(
            lc_ads_df,
            left_on=mpid_col,
            right_on="mpid",
            how="left",
        ).drop(mpid_col, axis=1)
        for mpid_col in mpid_cols
    ]
    # errors="ignore": with a single slot column it has already been dropped
    # from the only merged frame, and a strict drop would raise KeyError.
).drop(mpid_cols, axis=1, errors="ignore")

# Discard columns that are entirely missing, then any row with a missing value.
lc_ads_df = (
    lc_ads_df.drop(lc_ads_df.columns[lc_ads_df.isna().all()], axis=1)
    .dropna(axis=0)
    .reset_index(drop=True)
)

# A composition is flagged when the number of distinct mpids that survived the
# merge differs from the number of mpids listed for it (mat_ct). The count is
# computed per group and compared row by row, instead of returning a
# frame-length Series from inside the group-wise transform.
n_found = lc_ads_df.groupby("Components").mpid.transform("nunique")
lc_ads_df["miss_mat"] = n_found != lc_ads_df.mat_ct
lc_ads_df = lc_ads_df.drop("mat_ct", axis=1)

In [11]:
def choose_best(x, xopt=-0.24):
    """Return the element of ``x`` that lies closest to ``xopt``.

    Parameters
    ----------
    x : array-like of numbers
        Candidate values (a list, NumPy array or pandas Series).
    xopt : float, optional
        Target value; defaults to -0.24.

    Returns
    -------
    scalar
        The entry of ``x`` with the smallest ``|xopt - x|``. On ties the
        first such entry is returned. NaN entries are ignored.

    Raises
    ------
    ValueError
        If ``x`` is empty or contains only NaN.
    """
    values = np.asarray(x)
    distance = np.abs(xopt - values)
    # nanargmin skips NaN distances, so the result no longer depends on where
    # a NaN happens to sit in the input (the built-in min() did).
    return values[np.nanargmin(distance)]

In [12]:
# lc_ads_df['adsorption_energy_H'] = lc_ads_df.adsorption_energy

In [13]:
# For every composition, broadcast the adsorption energy picked by
# choose_best to all of that composition's rows ...
_best_E_H = lc_ads_df.groupby(["Components"])["adsorption_energy_H"].transform(
    choose_best
)
# ... and keep only the row(s) that carry exactly that energy.
stab_ads_df = lc_ads_df.loc[
    lc_ads_df["adsorption_energy_H"] == _best_E_H
].reset_index(drop=True)

#### Average the LogRate for experiments run at the same composition, then choose the best composition for every metal system

In [14]:
# NOTE(review): the section heading mentions LogRate, but the aggregation
# below works on MaxRate and the LogRate column is not carried over -- confirm
# which one is intended.

# Mean MaxRate over experiments sharing the same (Concentrations, Components).
_mean_rate = (
    exp_df.groupby(["Concentrations", "Components"])["MaxRate"].mean().reset_index()
)
# Highest of those mean rates within each Components group.
_best_rate = _mean_rate.groupby("Components")["MaxRate"].max().reset_index()
# Attach the stability/adsorption columns; merge's default inner join keeps
# only Components present in both frames.
exp_df = _best_rate.merge(stab_ads_df, on="Components")

In [15]:
# True for systems with at least one component listed in params.noble_metals.
exp_df["has_noble"] = exp_df["Components"].apply(
    lambda components: any(
        element in params.noble_metals for element in components
    )
)

#### define functions for plotting

In [17]:
def order_materials(EDF_in, comp_col, rate_col, ct_cutoff=5):
    """Rank elements by the rate of their pure-material row.

    A row is treated as a pure material when its components contain the
    placeholder ``"X"``; its element is the first component that is not
    ``"X"``. Elements are ordered by descending ``rate_col`` of their pure
    row, and only elements that occur in more than ``ct_cutoff`` rows of
    ``EDF_in`` (pure or not) are kept.

    Parameters
    ----------
    EDF_in : pandas.DataFrame
        Table with one iterable of component labels per row.
    comp_col : str
        Column holding the components. If it is not a column, the index is
        reset first so that index levels become columns.
    rate_col : str
        Column used to rank the pure-material rows.
    ct_cutoff : int, optional
        An element is kept only if it appears in more than this many rows.

    Returns
    -------
    list
        Element labels, best pure-material rate first.
    """
    if comp_col not in EDF_in.columns:
        EDF_in = EDF_in.reset_index()

    components = EDF_in[comp_col]

    # Pure-material rows, best rate first. The element list is built directly
    # from the rows, so a single pure row works just like several.
    is_pure = components.apply(lambda comps: "X" in comps).astype(bool)
    pure_ranked = EDF_in[is_pure].sort_values(rate_col, ascending=False)
    elem_ordering = [
        [elem for elem in comps if elem != "X"][0]
        for comps in pure_ranked[comp_col]
    ]

    # Number of rows each element appears in, ignoring the "X" placeholder.
    elem_counts = pd.Series(
        [elem for comps in components for elem in comps if elem != "X"],
        dtype=object,
    ).value_counts()
    frequent = set(elem_counts[elem_counts > ct_cutoff].index)

    return [elem for elem in elem_ordering if elem in frequent]


def color_materials(
    EDF_in,
    elem_ordering=None,
    EDF_color=None,
    comp_col="Components",
    rate_col="LogRate",
    elems=None,
    activity_quantile_cutoff=0.45,
    ct_cutoff=5,
    return_order=False,
):
    """Label every row of ``EDF_in`` with its highest-priority element.

    Each row receives the first element of ``elem_ordering`` found among its
    components, or ``"other"`` when none matches. The labels are written to
    ``EDF_in["color"]`` (the input frame is modified in place) and returned.

    Parameters
    ----------
    EDF_in : pandas.DataFrame
        Rows to label; gains or overwrites a ``"color"`` column.
    elem_ordering : iterable, optional
        Element labels in priority order. When omitted, the ordering is
        derived with ``order_materials`` from the rows of ``EDF_color`` whose
        ``rate_col`` exceeds its ``activity_quantile_cutoff`` quantile, and
        then restricted to ``elems`` if that is given. When supplied,
        ``EDF_color``, ``rate_col``, ``elems``, ``activity_quantile_cutoff``
        and ``ct_cutoff`` are not used. The caller's object is not modified.
    EDF_color : pandas.DataFrame, optional
        Frame used to derive the ordering; defaults to ``EDF_in``.
    comp_col : str, optional
        Column holding the components of each row.
    rate_col : str, optional
        Column used when deriving the ordering.
    elems : container, optional
        Allowed elements for a derived ordering.
    activity_quantile_cutoff : float, optional
        Quantile of ``rate_col`` above which rows feed the derived ordering.
    ct_cutoff : int, optional
        Passed through to ``order_materials``.
    return_order : bool, optional
        If true, also return the ordering that was used, with ``"other"``
        appended.

    Returns
    -------
    pandas.Series or (pandas.Series, list)
        The ``"color"`` column, optionally followed by the ordering list.
    """
    if EDF_color is None:
        EDF_color = EDF_in

    if elem_ordering is None:
        threshold = EDF_color[rate_col].quantile(activity_quantile_cutoff)
        elem_ordering = order_materials(
            EDF_color[EDF_color[rate_col] > threshold],
            comp_col=comp_col,
            rate_col=rate_col,
            ct_cutoff=ct_cutoff,
        )
        if elems is not None:
            elem_ordering = [e for e in elem_ordering if e in elems]
    else:
        # Work on a private copy so the append below never alters the
        # caller's list.
        elem_ordering = list(elem_ordering)

    EDF_in["color"] = "other"
    for elem in elem_ordering:
        # Earlier elements win: only rows still labelled "other" are eligible.
        unassigned = EDF_in["color"] == "other"
        contains_elem = (
            EDF_in[comp_col].apply(lambda comps: elem in comps).astype(bool)
        )
        EDF_in.loc[unassigned & contains_elem, "color"] = elem

    if return_order:
        # "other" is appended unconditionally (unchanged behaviour), so an
        # ordering that already lists "other" will contain it twice.
        elem_ordering.append("other")
        return EDF_in["color"], elem_ordering
    return EDF_in["color"]

In [18]:
# Named colour for each metal label, plus the catch-all "other".
metal_color_palette = dict(
    Au="darkgoldenrod",
    Cu="darkorange",
    Pd="forestgreen",
    Ni="violet",
    Pt="grey",
    Ag="blue",
    other="steelblue",
)

In [19]:
# Label every system with the first metal of the explicit priority list found
# among its components. A copy of exp_df is passed because color_materials
# writes a "color" column into the frame it receives.
# NOTE: since elem_ordering is given explicitly, color_materials does not
# derive an ordering, so rate_col, ct_cutoff, elems and
# activity_quantile_cutoff have no effect on this call.
dominant_metal, metal_ordering = color_materials(
    exp_df.copy(),
    elem_ordering=["Pt", "Pd", "Au", "Ag", "Ni", "Cu", "other"],
    rate_col="MaxRate",
    ct_cutoff=3,
    return_order=True,
    elems=metal_color_palette.keys(),
    activity_quantile_cutoff=0.45,
)
exp_df["dominant_metal"] = dominant_metal
# Reassign instead of calling fillna(inplace=True) on the column selection:
# the chained in-place form is deprecated and does not update exp_df when
# pandas Copy-on-Write is active.
exp_df["dominant_metal"] = exp_df["dominant_metal"].fillna("other")