## Inspect Prompts

In [None]:
import yaml
import difflib

In [None]:
# Load the prompt templates; later cells look up the "naive", "simple",
# "verbose" and "instruct" entries.
# The encoding is given explicitly so parsing does not depend on the
# platform's locale encoding.
with open("rrg/prompts.yaml", encoding="utf-8") as f:
    prompts = yaml.safe_load(f)

def diff(a: str, b: str):
    """Print a unified diff that turns text ``a`` into text ``b``.

    Both texts are split into lines with their line endings kept, so the
    generated diff lines can be concatenated without adding separators.
    Nothing is returned; the diff is written to standard output.
    """
    old_lines, new_lines = (text.splitlines(keepends=True) for text in (a, b))
    delta = difflib.unified_diff(old_lines, new_lines)
    print("".join(delta))

In [None]:
# Print what changes when going from the "naive" prompt to the "simple" prompt.
diff(prompts["naive"], prompts["simple"])

In [None]:
# Print what changes when going from the "simple" prompt to the "verbose" prompt.
diff(prompts["simple"], prompts["verbose"])

In [None]:
# Print what changes when going from the "verbose" prompt to the "instruct" prompt.
diff(prompts["verbose"], prompts["instruct"])

## Evaluate Runs

In [None]:
# Install from source while waiting for merge of https://github.com/trevismd/statannotations/pull/155
# !pip install https://github.com/getzze/statannotations/archive/compat-seaborn-13.zip

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statannotations.Annotator import Annotator
from collections import defaultdict

In [None]:
# Metric CSVs for every experiment, keyed by figure title. Each value is a list
# of (legend label, CSV path) pairs; list order is the order used for plotting.
_DATA_ROOT = "/opt/gpudata/rrg-data-2"
_DEFAULT_MODEL = "Mistral-7B-Instruct-v0.3"


def _metrics_csv(folder, target, *, model=_DEFAULT_MODEL, filt="exact", label="pred", prompt="simple"):
    """Return the path of the METRICS.csv of one run stored under ``folder``."""
    stem = f"{model}_{filt}_{label}-label_{prompt}_top-5_{target}"
    return f"{_DATA_ROOT}/{folder}/{stem}_METRICS.csv"


def _ablations(section):
    """Return the five experiment groups of one report section ("findings" or "impression")."""
    title = section.capitalize()

    def run(ablation, **overrides):
        # Every run of a section lives in exp-<section>/exp-<ablation>/.
        return _metrics_csv(f"exp-{section}/exp-{ablation}", section, **overrides)

    return {
        f"{title} - Filter": [
            ("No-filter", run("filter", filt="no-filter")),
            ("Exact", run("filter", filt="exact")),
            ("Partial", run("filter", filt="partial")),
        ],
        f"{title} - Prompt": [
            (prompt.capitalize(), run("prompt", prompt=prompt))
            for prompt in ("naive", "simple", "verbose", "instruct")
        ],
        f"{title} - Model": [
            ("Mistral-v3", run("model", model="Mistral-7B-Instruct-v0.3")),
            ("Mistral-v1", run("model", model="Mistral-7B-Instruct-v0.1")),
            ("BioMistral", run("model", model="BioMistral-7B")),
        ],
        f"{title} - Label": [
            ("True", run("label", label="true")),
            ("Predicted", run("label", label="pred")),
        ],
        f"{title} - Redundancy": [
            (f"{filter_name}, {prompt.capitalize()}-prompt", run("redundancy", filt=filt, prompt=prompt))
            for filter_name, filt in (("No-filter", "no-filter"), ("Exact-filter", "exact"))
            for prompt in ("naive", "simple")
        ],
    }


experiments = {
    **_ablations("findings"),
    **_ablations("impression"),
    # The section comparison has no per-ablation subfolder.
    "Section": [
        (name, _metrics_csv("exp-section", name.lower()))
        for name in ("Both", "Findings-Intersect", "Impression-Intersect")
    ],
}

## Check duplicate runs are equivalent

In [None]:
# Group every listed CSV path by its file name, so a run that is listed in
# several experiment folders ends up under a single key.
count = defaultdict(list)
for listed_runs in experiments.values():
    for _, csv_path in listed_runs:
        count[os.path.basename(csv_path)].append(csv_path)

In [None]:
# Number of distinct metrics file names.
len(count)

In [None]:
# Total number of listed runs, counting repeated file names every time.
sum(len(paths) for paths in count.values())

In [None]:
# Keep only the file names that are listed at two or more locations.
dupes = {fname: paths for fname, paths in count.items() if len(paths) >= 2}
print(len(dupes))
dupes

In [None]:
# Every copy of a run that is listed at several locations must hold the same
# metrics as the first listed copy (within np.isclose's default tolerances).
for group, runs in dupes.items():
    ref_path, *other_paths = runs
    ref = pd.read_csv(ref_path)
    for path in other_paths:
        df = pd.read_csv(path)
        # Compare shapes first: np.isclose broadcasts its arguments, so frames
        # of different sizes could raise an opaque error or be compared wrongly.
        assert ref.shape == df.shape, (
            f"{group}: {path} has shape {df.shape}, but {ref_path} has shape {ref.shape}"
        )
        # equal_nan=True: a missing value at the same position in both files
        # is not a difference (by default NaN never compares close to NaN).
        assert np.isclose(ref, df, equal_nan=True).all(), (
            f"{group}: {path} differs from {ref_path}"
        )

In [None]:
# map colors to experiments
list(count)

In [None]:
import seaborn as sns

In [None]:
# Palette the per-run colors are picked from; entries 0 through 11 are used.
cmap = sns.color_palette("Set3")

In [None]:
# Show the palette as this cell's output.
cmap

In [None]:
def _per_section(stem):
    """Return the findings and impression metrics file names of the run ``stem``."""
    return [f"{stem}_{section}_METRICS.csv" for section in ("findings", "impression")]


# Palette color -> metrics file names drawn in that color. The same run
# configuration gets the same color for both report sections.
temp = {
    cmap[0]: _per_section("Mistral-7B-Instruct-v0.3_no-filter_pred-label_simple_top-5"),
    cmap[1]: _per_section("Mistral-7B-Instruct-v0.3_partial_pred-label_simple_top-5"),
    cmap[4]: _per_section("Mistral-7B-Instruct-v0.3_exact_pred-label_simple_top-5")
    + ["Mistral-7B-Instruct-v0.3_exact_pred-label_simple_top-5_both_METRICS.csv"],
    cmap[2]: _per_section("Mistral-7B-Instruct-v0.3_exact_pred-label_naive_top-5"),
    cmap[3]: _per_section("Mistral-7B-Instruct-v0.3_exact_pred-label_verbose_top-5"),
    cmap[5]: _per_section("Mistral-7B-Instruct-v0.3_exact_pred-label_instruct_top-5"),
    cmap[6]: _per_section("Mistral-7B-Instruct-v0.1_exact_pred-label_simple_top-5"),
    cmap[7]: _per_section("BioMistral-7B_exact_pred-label_simple_top-5"),
    cmap[8]: _per_section("Mistral-7B-Instruct-v0.3_exact_true-label_simple_top-5"),
    cmap[9]: _per_section("Mistral-7B-Instruct-v0.3_no-filter_pred-label_naive_top-5"),
    cmap[10]: ["Mistral-7B-Instruct-v0.3_exact_pred-label_simple_top-5_findings-intersect_METRICS.csv"],
    cmap[11]: ["Mistral-7B-Instruct-v0.3_exact_pred-label_simple_top-5_impression-intersect_METRICS.csv"],
}
# Invert the mapping: metrics file name -> color.
colors = {fname: color for color, fnames in temp.items() for fname in fnames}

## Figures

In [None]:
# Metric names shown on the x-axis of every figure, in this order.
metrics = ["bleu4", "rougeL", "bertscore", "f1radgraph", "f1chexbert"]

In [None]:
# Create the output folders up front; saving a figure into a missing
# directory fails instead of creating it.
for out_dir in ("figs/pngs/legends", "figs/pdfs/legends"):
    os.makedirs(out_dir, exist_ok=True)

# One grouped box plot with significance annotations, plus a separate legend
# image, per experiment group.
for group, runs in experiments.items():
    print("\n\n\n\n")
    print(group)

    # Long format: one row per (study_id, metric) value, with the run's legend
    # label stored in a column named after the group.
    group_results = []
    for name, run in runs:
        results = pd.read_csv(run).melt(id_vars="study_id", var_name="metric")
        results[group] = name
        group_results.append(results)
    df = pd.concat(group_results, ignore_index=True)

    x = "metric"
    y = "value"
    hue = group
    hue_order = [n for n, _ in runs]
    # Look colors up by file name so a run keeps its color in every figure.
    palette = [colors[os.path.basename(fp)] for _, fp in runs]
    order = metrics
    # Every pair of runs is compared within each metric.
    pairs = [
        ((metric, n1), (metric, n2))
        for metric in metrics
        for i, n1 in enumerate(hue_order)
        for n2 in hue_order[i+1:]
    ]

    fig, ax = plt.subplots(figsize=(5, 5))
    sns.boxplot(
        df,
        x=x,
        y=y,
        order=order,
        hue=hue,
        hue_order=hue_order,
        palette=palette,
        ax=ax,
        fliersize=0.1,
        showmeans=True,
        meanprops={
            "markersize": 5,
            "markeredgecolor": "black",
            "marker": "+",
            # "marker": "P",
            # "markerfacecolor": "black",
            # "markeredgecolor": "darkgray",
            # "markeredgewidth": 1,
        },
    )
    annot = Annotator(
        ax,
        pairs,
        data=df,
        x=x,
        y=y,
        order=order,
        hue=hue,
        hue_order=hue_order,
        palette=palette,
    )
    # NOTE(review): the paired test presumably relies on every run of a group
    # listing the same studies in the same row order - confirm, in particular
    # for "Section" (see the commented-out alternative below).
    # test = "t-test_paired" if group not in ["Section", "Section-true"] else "t-test_ind"
    test = "t-test_paired"
    annot.configure(
        test=test,
        comparisons_correction="Bonferroni",
        hide_non_significant=True,
        loc="outside",
    )
    annot.apply_test().annotate()

    # The y-limit extends well above the 1.0 tick, leaving empty space at the
    # top of the axes.
    ax.set_ylim([-0.05, 1.65])
    ax.set_yticks([0, 0.2, 0.4, 0.6, 0.8, 1.0])
    ax.grid(which="major", axis="y")
    ax.set_title(group)
    # Drop the in-axes legend; it is drawn as its own figure below.
    legend = ax.legend(title=None, loc="upper left")
    legend.remove()
    # Apply the layout before showing, so the displayed and saved figures match.
    fig.tight_layout()
    fig.show()
    fig.savefig(f"figs/pngs/{group}.png", dpi=300)
    fig.savefig(f"figs/pdfs/{group}.pdf")

    # Standalone legend: reuse the box plot's handles on an axis-free figure.
    fig2, ax2 = plt.subplots(figsize=(3, 1))
    handles, labels = ax.get_legend_handles_labels()
    ax2.legend(handles, labels, loc="center")
    ax2.axis("off")
    fig2.savefig(f"figs/pngs/legends/{group}-legend.png", dpi=300)
    fig2.savefig(f"figs/pdfs/legends/{group}-legend.pdf")