<a href="https://colab.research.google.com/github/tomheston/fragility-metrics/blob/main/notebooks/diagnostic_2x2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# @title
# Fragility Metrics Toolkit: Diagnostic Metrics (Benchmark Framework)
# 20-NOV-2025
# Fully aligned with FRAGILITY_METRICS.md v9.5 §3.4 DFQ and §4 DNB
#
# Input: TP, FN, FP, TN + chosen metric + benchmark p₀ + claim direction
# Output: p (display p-value), fr (DFI/DFQ), nb (DNB)
#
# Logic identical to the released proportion_vs_benchmark.ipynb
#
# IF YOU USE THIS CALCULATOR PLEASE CITE:
# Heston, T. F. (2025). Fragility Metrics Toolkit [Software]. Zenodo. https://doi.org/10.5281/zenodo.17254763
#
# © Thomas F. Heston 2025. CC-BY 4.0

try:
    from scipy.stats import binomtest
    from math import log, sqrt
    import numpy as np
except ImportError:
    import subprocess, sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "scipy"])
    from scipy.stats import binomtest
    from math import log, sqrt
    import numpy as np

# Significance level: the DFI/DFQ search and the narrative compare one-sided
# binomial-test p-values against this threshold.
ALPHA = 0.05

# ---------- DNB (v9.5 exact, with documented Haldane–Anscombe 0.5 correction) ----------
def compute_dnb(tp, fn, fp, tn):
    """Return the DNB value for a 2x2 diagnostic table.

    Every cell is first increased by 0.5 (Haldane–Anscombe correction).
    With L = |ln(DOR)| of the corrected diagnostic odds ratio and SE the
    square root of the summed reciprocals of the corrected cells, the
    result is L / (L + SE), or 0.0 when L is exactly zero.
    """
    tp_adj = tp + 0.5
    fn_adj = fn + 0.5
    fp_adj = fp + 0.5
    tn_adj = tn + 0.5

    odds_ratio = (tp_adj * tn_adj) / (fn_adj * fp_adj)
    std_err = sqrt(1/tp_adj + 1/fn_adj + 1/fp_adj + 1/tn_adj)
    distance = abs(log(odds_ratio))

    # An odds ratio of exactly 1 sits on the neutrality boundary.
    if distance == 0:
        return 0.0
    return distance / (distance + std_err)

# ---------- Metric → (k, n_relevant) ----------
def get_diagnostic_params(tp, fn, fp, tn, metric):
    """Map a metric name to its (numerator, denominator) pair of counts.

    The metric name is matched case-insensitively. Supported names are
    sensitivity, specificity, ppv, npv and accuracy.

    Raises:
        ValueError: if ``metric`` is not one of the supported names.
    """
    # Lambdas keep the arithmetic lazy: only the selected metric is computed.
    selectors = {
        "sensitivity": lambda: (tp, tp + fn),
        "specificity": lambda: (tn, fp + tn),
        "ppv": lambda: (tp, tp + fp),
        "npv": lambda: (tn, fn + tn),
        "accuracy": lambda: (tp + tn, tp + fn + fp + tn),
    }
    selector = selectors.get(metric.lower())
    if selector is None:
        raise ValueError("metric must be sensitivity, specificity, ppv, npv, or accuracy")
    return selector()

# ---------- Core BFI/BFQ logic (identical to proportion_vs_benchmark.ipynb) ----------
def compute_dfi_dfq(k: int, n: int, p0: float, alternative: str = "greater", alpha: float = ALPHA):
    """Find the smallest change in successes that flips the claim's status.

    The claim ("≥ p0" when ``alternative`` is "greater", otherwise "≤ p0")
    counts as refuted when the one-sided binomial test in the opposite
    direction has p-value <= ``alpha``. Starting from ``k`` successes out of
    ``n``, successes are added or removed one at a time until that refuted /
    not-refuted status changes.

    Returns:
        ``(d, d / n)`` for the first flipping step count ``d``, or
        ``(None, None)`` when no step within ``0..n`` successes flips it.
    """
    claim_is_lower_bound = alternative == "greater"
    refuting_tail = "less" if claim_is_lower_bound else "greater"

    def refuted(successes):
        # True when the opposite-direction test is significant at alpha.
        return binomtest(successes, n, p0, alternative=refuting_tail).pvalue <= alpha

    already_refuted = refuted(k)

    # A refuted "≥" claim and a supported "≤" claim are both probed by adding
    # successes; the two remaining cases are probed by removing them.
    step = 1 if already_refuted == claim_is_lower_bound else -1
    max_flips = n - k if step == 1 else k

    for flips in range(1, max_flips + 1):
        if refuted(k + step * flips) != already_refuted:
            return flips, flips / n
    return None, None

# ---------- Display p-value (context-aware, same as proportion_vs_benchmark) ----------
def display_pvalue(k, n, p0, alternative):
    """Return the one-sided binomial p-value shown to the user.

    For a "greater" claim the upper-tail test is always used. For any other
    claim the tail follows the data: upper tail when the observed proportion
    k/n exceeds p0, lower tail otherwise.
    """
    if alternative == "greater":
        tail = "greater"
    else:
        # The observed proportion is only needed for the non-"greater" claim.
        tail = "greater" if k / n > p0 else "less"
    return binomtest(k, n, p0, alternative=tail).pvalue

# ---------- High-level ----------
def diagnostic_2x2(tp: int, fn: int, fp: int, tn: int,
                   metric: str = "accuracy",
                   p0: float = 0.50,
                   alternative: str = "greater"):
    """Compute the p / fr / nb triplet for one metric of a 2x2 diagnostic table.

    Args:
        tp, fn, fp, tn: cell counts of the 2x2 table.
        metric: sensitivity, specificity, ppv, npv or accuracy
            (case-insensitive).
        p0: benchmark proportion the metric is compared against.
        alternative: claim direction, "greater" (metric ≥ p0) or "less"
            (metric ≤ p0); matched case-insensitively.

    Returns:
        A dict with the keys ``metric``, ``p0``, ``alternative``, ``k``,
        ``n_relevant``, ``observed``, ``p_display``, ``fr`` (holding ``DFI``
        and ``DFQ``) and ``nb`` (holding ``DNB``).

    Raises:
        ValueError: if the metric is unknown, its denominator is zero, or
            ``alternative`` is neither "greater" nor "less".
    """
    k, n_rel = get_diagnostic_params(tp, fn, fp, tn, metric)
    if n_rel == 0:
        raise ValueError(f"{metric.capitalize()} undefined (denominator = 0)")

    # The helpers treat every value other than exactly "greater" as "less", so
    # a typo or different capitalisation would silently flip the claim.
    # Normalise and reject anything else up front.
    alternative = alternative.lower()
    if alternative not in ("greater", "less"):
        raise ValueError("alternative must be 'greater' or 'less'")

    p_disp = display_pvalue(k, n_rel, p0, alternative)
    dfi, dfq = compute_dfi_dfq(k, n_rel, p0, alternative, ALPHA)
    dnb = compute_dnb(tp, fn, fp, tn)

    return {
        "metric": metric,
        "p0": p0,
        "alternative": alternative,
        "k": k,
        "n_relevant": n_rel,
        "observed": k / n_rel,
        "p_display": p_disp,
        "fr": {"DFI": dfi, "DFQ": dfq},
        "nb": {"DNB": dnb}
    }

# ---------- Narrative (aligned with proportion_vs_benchmark) ----------
def generate_narrative(res, tp, fn, fp, tn):
    """Build the multi-line plain-text interpretation of a result dict.

    Args:
        res: dict with the keys read below (``metric``, ``p0``,
            ``alternative``, ``k``, ``n_relevant``, ``observed``,
            ``p_display``, ``fr``, ``nb``), as returned by ``diagnostic_2x2``.
        tp, fn, fp, tn: the original cell counts, echoed in the first line.

    Returns:
        The narrative as one newline-joined string.
    """
    claim_is_lower_bound = res["alternative"] == "greater"
    opp_alt = "less" if claim_is_lower_bound else "greater"
    # The claim is rejected when the opposite-direction test is significant.
    opposite_proven = binomtest(res["k"], res["n_relevant"], res["p0"], alternative=opp_alt).pvalue <= ALPHA

    claim_state = "rejected" if opposite_proven else "NOT rejected"
    claim_symbol = "≥" if claim_is_lower_bound else "≤"
    dfi = res["fr"]["DFI"]
    dfq = res["fr"]["DFQ"]
    dnb = res["nb"]["DNB"]

    lines = [
        f"Diagnostic 2×2: TP={tp}, FN={fn}, FP={fp}, TN={tn}",
        f"Metric: {res['metric'].upper()}  (benchmark p₀ = {res['p0']})",
        f"Observed = {res['observed']:.4f} ({res['k']}/{res['n_relevant']})",
        # Report the threshold actually used in the test above.
        f"Claim '{claim_symbol} p₀' is {claim_state} (α = {ALPHA})",
        f"Displayed p-value = {res['p_display']:.6f}",
    ]

    if dfi is not None:
        if opposite_proven:
            frag_txt = f"DFI = {dfi}: toggling {dfi} event(s) means the claim can no longer be rejected."
        else:
            frag_txt = f"DFI = {dfi}: toggling {dfi} event(s) would reject the claim."

        # Exclusive upper bounds, checked in ascending order.
        stability_bands = (
            (0.01, "extremely fragile"),
            (0.05, "very fragile"),
            (0.10, "fragile"),
            (0.25, "moderately stable"),
        )
        stability = next((label for bound, label in stability_bands if dfq < bound),
                         "very stable")
        lines += [frag_txt, f"DFQ = {dfq:.6f} → {stability}"]
    else:
        # No flip was found within the available events.
        lines.append("DFQ = 1.000000 (maximally stable)")

    # Exclusive upper bounds, checked in ascending order.
    separation_bands = (
        (0.05, "at neutrality boundary"),
        (0.10, "near neutrality"),
        (0.25, "moderately separated"),
        (0.50, "clearly separated"),
    )
    sep = next((label for bound, label in separation_bands if dnb < bound),
               "far from neutrality")
    lines.append(f"DNB = {dnb:.6f} → diagnostic is {sep}")

    return "\n".join(lines)

# ---------- CLI ----------
def main():
    """Interactive command-line front end for the diagnostic 2x2 calculator.

    Prompts for the four cell counts, the metric, the benchmark p₀ and the
    claim direction, then prints the p–fr–nb summary and the narrative.
    Menu prompts are repeated until a listed option (or an empty reply,
    which selects the default) is entered.
    """
    def ask_choice(prompt, options, default):
        # Re-prompt instead of crashing with KeyError or silently falling
        # back to an unintended option.
        while True:
            reply = input(prompt).strip() or default
            if reply in options:
                return options[reply]
            print(f"Please enter one of: {', '.join(options)}")

    print("Diagnostic 2x2 Calculator – v9.5 compliant\n")
    tp = int(input("TP: ").strip())
    fn = int(input("FN: ").strip())
    fp = int(input("FP: ").strip())
    tn = int(input("TN: ").strip())

    print("\nMetric: 1=sensitivity 2=specificity 3=ppv 4=npv 5=accuracy")
    metrics = {"1": "sensitivity", "2": "specificity", "3": "ppv", "4": "npv", "5": "accuracy"}
    metric = ask_choice("Choose (1-5) [default 5]: ", metrics, "5")

    p0 = float(input("Benchmark p₀ [default 0.50]: ").strip() or "0.50")

    print("\nClaim:")
    print("1. Metric ≥ p₀ (at or above benchmark)")
    print("2. Metric ≤ p₀ (at or below benchmark)")
    alternative = ask_choice("Enter 1 or 2 [default 1]: ", {"1": "greater", "2": "less"}, "1")

    res = diagnostic_2x2(tp, fn, fp, tn, metric, p0, alternative)

    print("\n================ p–fr–nb ================")
    print(f"Displayed p-value = {res['p_display']:.6f}")
    print(f"DFI = {res['fr']['DFI']}")
    # A missing DFQ (no flip found) is displayed as 1.0.
    print(f"DFQ = {1.0 if res['fr']['DFQ'] is None else res['fr']['DFQ']:.6f}")
    print(f"DNB = {res['nb']['DNB']:.6f}")
    print("=========================================\n")

    print("Interpretation:")
    print(generate_narrative(res, tp, fn, fp, tn))

# Start the interactive calculator only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Diagnostic 2x2 Calculator – v9.5 compliant

TP: 15
FN: 5
FP: 3
TN: 8

Metric: 1=sensitivity 2=specificity 3=ppv 4=npv 5=accuracy
Choose (1-5) [default 5]: 1
Benchmark p₀ [default 0.50]: 0.7

Claim:
1. Metric ≥ p₀ (at or above benchmark)
2. Metric ≤ p₀ (at or below benchmark)
Enter 1 or 2 [default 1]: 1

Displayed p-value = 0.416371
DFI = 5
DFQ = 0.250000
DNB = 0.704687

Interpretation:
Diagnostic 2×2: TP=15, FN=5, FP=3, TN=8
Metric: SENSITIVITY  (benchmark p₀ = 0.7)
Observed = 0.7500 (15/20)
Claim '≥ p₀' is NOT rejected (α = 0.05)
Displayed p-value = 0.416371
DFI = 5: toggling 5 event(s) would reject the claim.
DFQ = 0.250000 → very stable
DNB = 0.704687 → diagnostic is far from neutrality
