In [None]:
import os

def get_preamble() -> str:
    """Return the header written at the top of every generated shell script.

    The header assigns the shell variables ``REPO_ROOT``, ``MIMIC_CXR_DIR``,
    ``CHEXPERTPLUS_DIR``, ``LABEL_DIR`` and ``BASE_OUTPUT_DIR``, then a blank
    line, then ``set -e``.
    """
    # Keep this as ONE single-line literal: written as a multi-line string it
    # is flagged by detect-secrets, and a false positive cannot be marked for
    # an individual line inside a multi-line string.
    preamble = "REPO_ROOT=/opt/gpudata/steven/label-boosted-RAG-for-RRG\nMIMIC_CXR_DIR=/opt/gpudata/mimic-cxr\nCHEXPERTPLUS_DIR=/opt/gpudata/chexpertplus\nLABEL_DIR=/opt/gpudata/cxr-derived\nBASE_OUTPUT_DIR=/opt/gpudata/labrag\n\nset -e\n"
    return preamble

def get_exp_str(**kwargs):
    """Render the shell commands for one generate-then-evaluate experiment.

    Keyword arguments fill the placeholders of the command template:
    ``llm``, ``filter``, ``prompt``, ``section``, ``k``, ``split_csv``,
    ``metadata_csv``, ``true_label_csv``, ``pred_label_csv``, ``report_csv``,
    ``emb_h5`` and ``output_dir``. Two further fields are derived here and
    override any caller-supplied value of the same name:

    * ``label_type`` -- ``"true"`` when ``pred_label_csv`` is one of the
      "no prediction" markers, otherwise the base name of ``emb_h5`` with
      ``.h5`` removed, followed by ``-pred``.
    * ``model_name`` -- the base name of ``llm``.

    Returns:
        A string that starts with a newline and contains a ``generate.py``
        invocation followed by an ``eval.py`` invocation whose CSV paths are
        built from ``output_dir`` and the fields above.

    Raises:
        KeyError: if a field needed by the template is missing.
    """
    # Spellings of "no predicted-label file"; these select true labels.
    no_prediction_markers = ("None", "none", "''", '""')

    if kwargs["pred_label_csv"] in no_prediction_markers:
        label_type = "true"
    else:
        feature_stem = os.path.basename(kwargs["emb_h5"].replace(".h5", ""))
        label_type = f"{feature_stem}-pred"

    fields = dict(
        kwargs,
        label_type=label_type,
        model_name=os.path.basename(kwargs["llm"]),
    )

    template = """
python $REPO_ROOT/rrg/generate.py \\
--model {llm} \\
--filter_type {filter} \\
--prompt_type {prompt} \\
--section_type {section} \\
--k {k} \\
--batch_size 32 \\
--prompt_yaml $REPO_ROOT/rrg/prompts.yaml \\
--split_csv {split_csv} \\
--metadata_csv {metadata_csv} \\
--true_label_csv {true_label_csv} \\
--predicted_label_csv {pred_label_csv} \\
--report_csv {report_csv} \\
--feature_h5 {emb_h5} \\
--output_dir {output_dir}

python $REPO_ROOT/rrg/eval.py \\
--report_csv {output_dir}/{section}_top-{k}_{label_type}-label_{filter}_{prompt}_{model_name}.csv \\
--output_csv {output_dir}/{section}_top-{k}_{label_type}-label_{filter}_{prompt}_{model_name}_METRICS.csv
"""
    return template.format(**fields)

In [None]:
# Baseline settings shared by every experiment. Each sweep below overrides one
# of them (the core grid overrides two) and leaves the rest at these values.
DEFAULT_LLM = "mistralai/Mistral-7B-Instruct-v0.3"
DEFAULT_FILTER = "exact"
DEFAULT_PROMPT = "simple"
DEFAULT_K = 5

# Write one shell script per (dataset, section) pair into the current working
# directory. Every script starts with get_preamble() and then lists the
# generate/eval command pairs produced by get_exp_str() for each sweep.
for dataset in ["mimic", "chexpertplus"]:
    if dataset == "mimic":
        split_csv = "$MIMIC_CXR_DIR/mimic-cxr-2.0.0-split.csv"
        metadata_csv = "$MIMIC_CXR_DIR/mimic-cxr-2.0.0-metadata.csv"
        report_csv = "$MIMIC_CXR_DIR/mimic_cxr_sectioned.csv"
        emb_model = "biovilt"
        emb_prefix = "mimic-cxr"
    elif dataset == "chexpertplus":
        split_csv = "$CHEXPERTPLUS_DIR/split.csv"
        metadata_csv = "$CHEXPERTPLUS_DIR/metadata.csv"
        report_csv = "$CHEXPERTPLUS_DIR/report.csv"
        emb_model = "gloria"
        emb_prefix = "chexpertplus"
    else:
        raise ValueError(f"Unknown dataset: {dataset}")

    # Feature files are named "<emb_prefix>-<embedding model>.h5"; the prefix
    # is dataset-specific and is reused by the embedding sweep below.
    emb_h5 = f"$BASE_OUTPUT_DIR/{emb_prefix}-{emb_model}.h5"

    for section in ["findings", "impression"]:
        script_file = f"3-run-generate-eval-{dataset}-{section}.sh"
        true_label_csv = f"$LABEL_DIR/{dataset}-{section}-labels.csv"
        pred_label_csv = f"$BASE_OUTPUT_DIR/{dataset}-{section}-{emb_model}-classifiers/pred_pr.csv"
        output_dir = f"$BASE_OUTPUT_DIR/exp-{dataset}/exp-{section}"

        # "output_dir" is filled in per sweep so each sweep writes to its own
        # sub-directory.
        base_kwargs = {
            "llm": DEFAULT_LLM,
            "filter": DEFAULT_FILTER,
            "prompt": DEFAULT_PROMPT,
            "k": DEFAULT_K,
            "section": section,
            "split_csv": split_csv,
            "metadata_csv": metadata_csv,
            "report_csv": report_csv,
            "true_label_csv": true_label_csv,
            "pred_label_csv": pred_label_csv,
            "emb_h5": emb_h5,
        }

        # Entries starting with "#" are section headers; all other entries are
        # command strings from get_exp_str() (which start with a newline).
        cmds = []

        cmds.append("# Core Experiments")
        base_kwargs["output_dir"] = f"{output_dir}/exp-core"
        for filter_type, prompt_type in [("no-filter", "naive"), ("exact", "naive"), ("no-filter", "simple"), ("exact", "simple")]:
            kwargs = base_kwargs.copy()
            kwargs["filter"] = filter_type
            kwargs["prompt"] = prompt_type
            cmds.append(get_exp_str(**kwargs))

        cmds.append("# Filter Experiments")
        base_kwargs["output_dir"] = f"{output_dir}/exp-filter"
        for filter_type in ["no-filter", "exact", "partial"]:
            kwargs = base_kwargs.copy()
            kwargs["filter"] = filter_type
            cmds.append(get_exp_str(**kwargs))

        cmds.append("# Prompt Experiments")
        base_kwargs["output_dir"] = f"{output_dir}/exp-prompt"
        for prompt_type in ["naive", "simple", "verbose", "instruct"]:
            kwargs = base_kwargs.copy()
            kwargs["prompt"] = prompt_type
            cmds.append(get_exp_str(**kwargs))

        cmds.append("# LLM Experiments")
        base_kwargs["output_dir"] = f"{output_dir}/exp-llm"
        for llm_name in ["mistralai/Mistral-7B-Instruct-v0.3", "mistralai/Mistral-7B-Instruct-v0.1", "BioMistral/BioMistral-7B"]:
            kwargs = base_kwargs.copy()
            kwargs["llm"] = llm_name
            cmds.append(get_exp_str(**kwargs))

        cmds.append("# Embedding Experiments")
        base_kwargs["output_dir"] = f"{output_dir}/exp-embedding"
        for emb_name in [emb_model, "resnet50"]:
            kwargs = base_kwargs.copy()
            # Use the current dataset's feature-file prefix. This path used to
            # be hard-coded to "chexpertplus-", which pointed the MIMIC scripts
            # at the CheXpert Plus feature files.
            kwargs["emb_h5"] = f"$BASE_OUTPUT_DIR/{emb_prefix}-{emb_name}.h5"
            kwargs["pred_label_csv"] = f"$BASE_OUTPUT_DIR/{dataset}-{section}-{emb_name}-classifiers/pred_pr.csv"
            cmds.append(get_exp_str(**kwargs))

        cmds.append("# True Label Experiments")
        base_kwargs["output_dir"] = f"{output_dir}/exp-true-label"
        # "None" makes get_exp_str() label the run as using true labels.
        for label_csv in [pred_label_csv, "None"]:
            kwargs = base_kwargs.copy()
            kwargs["pred_label_csv"] = label_csv
            cmds.append(get_exp_str(**kwargs))

        cmds.append("# Top K Experiments")
        base_kwargs["output_dir"] = f"{output_dir}/exp-top-k"
        for top_k in [3, 5, 10]:
            kwargs = base_kwargs.copy()
            kwargs["k"] = top_k
            cmds.append(get_exp_str(**kwargs))

        with open(script_file, "w", encoding="utf-8") as f:
            f.write(get_preamble())
            for cmd in cmds:
                if cmd.startswith("#"):
                    # Section header: blank line, then the title framed by
                    # two comment rules.
                    f.write("\n")
                    f.write("# =================================\n")
                    f.write(cmd + "\n")
                    f.write("# =================================\n")
                else:
                    f.write(cmd)