See the [FireWorks documentation](https://github.com/materialsproject/fireworks/blob/e37cd8ceef1b4696f4ea821ae7e209b93532f080/fw_tutorials/worker/my_launchpad.yaml#L4) for an example of how the `my_launchpad.yaml` file is set up.

The key piece here is obtaining a MongoDB client that exposes the atomate2 outputs as a collection.

In [None]:
from pymongo import MongoClient
import yaml
from monty.serialization import loadfn, dumpfn
from pathlib import Path
import os
import plotly.graph_objects as go

from seaborn import color_palette

# Connect to MongoDB using the keyword arguments stored in a FireWorks-style
# launchpad YAML file.
# NOTE(review): yaml.Loader can construct arbitrary Python objects, so only
# point this at a launchpad file you trust.
with open("/path/to/fireworks/like/my_launchpad.yaml", "r") as launchpad_fh:
    client = MongoClient(**yaml.load(launchpad_fh, Loader=yaml.Loader))

# Database and collection handles used by every query below.
db = client["mp_matgen"]
wf_collec = db["mof_diff_data"]

# Directory that receives the per-CIF-identifier dumps; created on first use.
output_dir = Path("mof_workflows")
if not output_dir.exists():
    output_dir.mkdir(parents=True, exist_ok=True)

The first few steps obtain the range of valid CIF identifiers used in the calculations.

In [None]:
# Select documents tagged with this job_info that carry at least one of the
# two output fields (a structure or an is_mof flag).
query = {
    "metadata.job_info": "mof discovery v2",
    "$or": [
        {f"output.{field}": {"$exists": True}} for field in ("structure", "is_mof")
    ],
}

# Projection: return only the job name, its metadata, and the listed output fields.
projections = dict.fromkeys(
    ["name", "metadata"]
    + [
        f"output.{field}"
        for field in ("structure", "is_mof", "energy", "N2", "H2O", "CO2")
    ],
    1,
)

In [None]:
# Collect the distinct MOF names, workflow-step names and CIF identifiers
# across every matching document, then cache them on disk.
mofs = set()
wf_steps = set()
cif_collec = set()
for job_doc in wf_collec.find(query, {"_id": 1, "name": 1, "metadata.cif_ext": 1}):
    full_name = job_doc["name"]
    # The job name is read as "<MOF name> <step name>": the first
    # whitespace-delimited token is the MOF, the remainder is the step.
    mof_name = full_name.split()[0]
    wf_step = full_name.split(f"{mof_name} ")[-1]

    mofs.add(mof_name)
    wf_steps.add(wf_step)
    cif_collec.add(job_doc["metadata"]["cif_ext"])

dumpfn(
    {
        "cif_ext": list(cif_collec),
        "wf_steps": list(wf_steps),
        "mof_ids": list(mofs),
    },
    "wf_meta.json.gz",
)

In [None]:
# Load the cached workflow metadata from disk; the cells below read its
# "cif_ext" and "wf_steps" entries.
wf_meta = loadfn("wf_meta.json.gz")

In [None]:
# Write one JSON dump per CIF identifier, mapping
# MOF -> {workflow step (or "<step>_<n>" for repeats) -> job output}.
out_path = Path("mof_workflows/by_cif_ext")
out_path.mkdir(exist_ok=True, parents=True)

for cif_collec in wf_meta["cif_ext"]:
    print(cif_collec)
    collec_docs = {}
    for doc in wf_collec.find({**query, "metadata.cif_ext": cif_collec}, projections):
        mof = doc["metadata"]["MOF"]

        # Identify the workflow step as the first known step name contained
        # in the job name.
        job_name = None
        for wf_step in wf_meta["wf_steps"]:
            if wf_step in doc["name"]:
                job_name = wf_step
                break

        if job_name is None:
            # Previously such a document was silently stored under whichever
            # step happened to be last in the list; report and skip instead.
            print(f"Skipping {doc['name']!r}: no known workflow step in its name")
            continue

        mof_entries = collec_docs.setdefault(mof, {})

        # If this step was already recorded for the MOF, store the repeat
        # under the first free "<step>_<n>" key rather than overwriting.
        idfr = job_name
        if idfr in mof_entries:
            for idx in range(len(mof_entries)):
                if (new_idfr := f"{job_name}_{idx}") not in mof_entries:
                    idfr = new_idfr
                    break
        mof_entries[idfr] = doc["output"]

    dumpfn(collec_docs, out_path / f"{cif_collec}.json.gz")

Now we query MongoDB for each step of the workflow

In [None]:
# Workflow steps in pipeline order: job-name fragment -> display name.
# Insertion order is relied on when the steps are iterated later.
ordered_wf_steps = {
    "zeo++ input structure": "Zeo++ on diffusion structure",
    "MACE relax": "MACE(-MP-0) relaxation",
    "zeo++ mace-relaxed structure": "Zeo++ on MACE-relaxed structure",
    "gfn-xtb relax": "GFN1-xTB relaxation",
}

In [None]:
# Build (or reload from cache) a mapping of workflow step -> list of MOF
# indices that have an entry for that step.
# NOTE: if the cache file exists it is used as-is; delete it to force a rebuild.
if (mof_wf_comp_file := Path("mof_wf_step_completion.json.gz")).exists():
    mof_completion = loadfn(mof_wf_comp_file)
else:
    mof_idxs = set()
    mof_completion = {wf_step.replace("<br>", " "): [] for wf_step in ordered_wf_steps}

    dump_suffix = ".json.gz"
    # Process the per-CIF-identifier dumps from smallest to largest file.
    for f in sorted(
        Path(output_dir / "by_cif_ext").glob(f"*{dump_suffix}"), key=os.path.getsize
    ):
        # Strip only the known ".json.gz" ending so that an identifier that
        # itself contains a dot is kept whole.
        cif_ext = f.name[: -len(dump_suffix)]
        for mof, entries in loadfn(f, cls=None).items():
            # Make the "<cif_ext>.<mof>" index unique by appending the first
            # free ".<i>" counter when it has been seen before.
            mof_idx = f"{cif_ext}.{mof}"
            if mof_idx in mof_idxs:
                for i in range(len(mof_idxs)):
                    if (new_idx := f"{mof_idx}.{i}") not in mof_idxs:
                        mof_idx = new_idx
                        break
            mof_idxs.add(mof_idx)
            for wf_step in entries:
                # Dumps may hold step keys outside ordered_wf_steps (e.g.
                # repeats stored as "<step>_<n>"); record them under their
                # own key instead of failing with a KeyError.
                mof_completion.setdefault(wf_step, []).append(mof_idx)
    dumpfn(mof_completion, mof_wf_comp_file)

The following cells create a plot of the number of MOF structures that have successfully completed each step of the screening pipeline.

In [None]:
# Plot labels for each workflow step ("<br>" is a line break in Plotly text).
labels = {
    "zeo++ input structure": "Zeo++ on<br>diffusion structure",
    "MACE relax": "MACE(-MP-0)<br>relaxation",
    "zeo++ mace-relaxed structure": "Zeo++ on<br>MACE-relaxed<br>structure",
    "gfn-xtb relax": "GFN1-xTB relaxation",
}

# One [label, number of MOFs recorded for the step] pair per step, in pipeline order.
data = [[labels[step], len(mof_completion[step])] for step in ordered_wf_steps]

# Labels come from every step but the last, percentages from every step but
# the first (relative to the first step's count), so each label is paired with
# the count of the step that follows it.
# NOTE(review): presumably "reached the next step" is taken to mean "passed
# this step" - confirm.
lbls = [label for label, _ in data[:-1]]
pctgs = [100 * count / data[0][1] for _, count in data[1:]]

In [None]:
# Bar chart of the percentage of structures per workflow step, written to a PDF.

# Font settings shared by both axes.
base_axis_opts = {
    "title_font_size": 32,
    "title_font_color": "black",
    "tickfont_color": "black",
    "tickfont_size": 28,
    "title_font_family": "Arial",
}

# seaborn returns RGB channels as floats in [0, 1], whereas "rgb(r,g,b)"
# strings are read on a 0-255 scale, so rescale each channel before formatting.
colors = [
    "rgb({})".format(",".join(str(int(round(255 * channel))) for channel in rgb))
    for rgb in color_palette("colorblind")
]


fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=lbls,
        y=pctgs,
        marker_color=colors[0],
        # Only the first bar also states the reference number of structures.
        text=[
            f"{v:.1f}%" + ("" if i > 0 else f"<br>of {data[0][1]}<br>structures")
            for i, v in enumerate(pctgs)
        ],
        textfont_size=32,
        textfont_family="Arial",
    )
)

h = 800
aspect_ratio = 7 / 5
fig.update_layout(
    height=h,
    width=h * aspect_ratio,
    yaxis={
        "title_text": "Percentage of Structures (%)",
        "showline": True,
        "linecolor": "black",
        "linewidth": 2,
        # "type": "log",
        **base_axis_opts,
    },
    xaxis={
        "showline": True,
        "linecolor": "black",
        "linewidth": 2,
        "tickangle": 0,
        **base_axis_opts,
    },
    margin={"t": 0, "r": 0, "b": 0, "l": 0},
    bargroupgap=0.1,
    bargap=0.0,
    # Fully transparent plot and paper backgrounds.
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
)
# fig.show()
fig.write_image("workflow_completion.pdf", scale=3)