# Pipeline End-to-End Test Notebook

This notebook walks through **every stage** of the Biomedical Semantic Leakage Detection pipeline interactively, showing real output at each step.

| Stage | What it tests |
|---|---|
| 0 | Environment setup & imports |
| 1 | CoT Generator — live LLM call |
| 2 | Concept Extractor — UMLS concept linking |
| 3 | Hybrid NLI Entailment Checker |
| 4 | Guard Signal Derivation |
| 5 | Full Pipeline (3 questions end-to-end) |
| 6 | Results Summary & Visualisation |

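Run cells top-to-bottom: later cells reuse variables defined by earlier ones (e.g. `STEPS`, `CONCEPTS`, `PAIRS`), so they cannot be run in isolation. Each cell prints what it produced.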

## Stage 0 — Environment Setup

Clones the repo, installs dependencies, configures API keys. **Run this first.**

In [None]:
# ── 0a. Clone / pull the repo and configure paths ─────────────────────────
import os, subprocess, sys
from pathlib import Path

REPO_URL  = 'https://github.com/varchanaiyer/biomedical-semantic-leakage-detection'
REPO_DIR  = 'biomedical-semantic-leakage-detection'


def _run_git(*args):
    """Run ``git <args>`` without a shell and report failures.

    Returns True when git exits with status 0, False otherwise (including
    when the git executable cannot be found).
    """
    try:
        proc = subprocess.run(['git', *args], capture_output=True, text=True)
    except FileNotFoundError:
        print("  ✗ git executable not found on PATH")
        return False
    if proc.returncode != 0:
        print(f"  ✗ git {' '.join(args)} failed (exit {proc.returncode})")
        if proc.stderr.strip():
            print(f"    {proc.stderr.strip()}")
        return False
    return True


_cwd = Path(os.getcwd())

# This cell ends with os.chdir(PROJECT_ROOT), so on a re-run the working
# directory is already the checkout. Cloning again from there would create a
# second copy nested inside the repo, so skip the clone in that case.
_in_checkout = (_cwd / 'utils').exists() or (_cwd.parent / 'utils').exists()

if (_cwd / REPO_DIR).exists():
    _run_git('-C', REPO_DIR, 'pull', '--quiet')
elif not _in_checkout:
    _run_git('clone', REPO_URL)

# Pick the first directory that actually contains the `utils` package.
if (_cwd / REPO_DIR / 'utils').exists():
    PROJECT_ROOT = str(_cwd / REPO_DIR)
elif (_cwd / 'utils').exists():
    PROJECT_ROOT = str(_cwd)
elif (_cwd.parent / 'utils').exists():
    PROJECT_ROOT = str(_cwd.parent)
else:
    PROJECT_ROOT = str(_cwd / REPO_DIR)

# Fail with an actionable message instead of a bare os.chdir() error.
if not Path(PROJECT_ROOT).is_dir():
    raise FileNotFoundError(
        f"Project root {PROJECT_ROOT!r} does not exist — the git clone above "
        "probably failed; check network access and REPO_URL."
    )

if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

print(f"PROJECT_ROOT : {PROJECT_ROOT}")
print(f"Python       : {sys.version.split()[0]}")
print(f"Working dir  : {os.getcwd()}")

In [None]:
# ── 0b. Install dependencies ───────────────────────────────────────────────
!pip install openai numpy pandas scipy scikit-learn matplotlib seaborn requests --quiet ipywidgets
print("Dependencies installed.")

In [None]:
# ── 0c. OpenRouter API Key ───────────────────────────────────────────────────
# Stores the key in os.environ["OPENROUTER_API_KEY"]. The key itself is never
# printed — only its length.
import os, importlib.util
from IPython.display import display, clear_output, HTML

_HAS_WIDGETS = importlib.util.find_spec("ipywidgets") is not None

# A key that is already in the environment (exported before the kernel
# started, or set by an earlier run of this cell) is reported up front so that
# "Run All" users know no click is needed.
_existing_key = os.environ.get("OPENROUTER_API_KEY", "").strip()

if _HAS_WIDGETS:
    import ipywidgets as widgets

    _key_box = widgets.Password(
        placeholder="sk-or-v1-…  (get yours free at openrouter.ai)",
        layout=widgets.Layout(width="520px"),
    )
    _btn = widgets.Button(
        description="Set Key", button_style="primary",
        icon="check", layout=widgets.Layout(width="110px"),
    )
    _out = widgets.Output()

    def _apply(_b):
        """Button callback: copy the pasted key into OPENROUTER_API_KEY."""
        with _out:
            clear_output()
            key = _key_box.value.strip()
            if key:
                os.environ["OPENROUTER_API_KEY"] = key
                print(f"  ✓ OpenRouter key set ({len(key)} chars)")
            else:
                print("  ✗ Paste your OpenRouter key above, then click Set Key.")

    _btn.on_click(_apply)
    display(HTML("<b>🔑 OpenRouter API Key</b>"))
    display(widgets.HBox([_key_box, _btn]))
    display(_out)
    display(HTML(
        "<small>Get a free key at "
        "<a href=\"https://openrouter.ai\" target=\"_blank\">openrouter.ai</a>"
        " — the notebooks will automatically run across all configured models.</small>"
    ))
    if _existing_key:
        with _out:
            print(f"  ✓ OpenRouter key already set in environment ({len(_existing_key)} chars)")
else:
    os.environ.setdefault("OPENROUTER_API_KEY", "")
    if _existing_key:
        print(f"ipywidgets not found — using OPENROUTER_API_KEY from the environment ({len(_existing_key)} chars)")
    else:
        print("ipywidgets not found — set key with:")
        print("  os.environ[\"OPENROUTER_API_KEY\"] = \"sk-or-v1-...\"")


In [None]:
# ── 0d. Import all pipeline modules ────────────────────────────────────────
import importlib, os, warnings, json, time, traceback
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Silences every warning for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

# Pre-flight report: for each pipeline module, check that it imports AND that
# it exposes the symbol the notebook needs. Results are kept in _ok; the real
# imports below still raise if anything is missing.
_ok = {}
for mod, sym in [
    ('utils.cot_generator',    'generate'),
    ('utils.concept_extractor','extract_concepts'),
    ('utils.hybrid_checker',   'build_entailment_records'),
    ('utils.guards',           'derive_guards'),
    ('utils.umls_api_linker',  'is_configured'),
]:
    try:
        m = importlib.import_module(mod)
        if not hasattr(m, sym):
            raise ImportError(f"module has no attribute {sym!r}")
        _ok[mod] = True
        print(f"  ✓ {mod}")
    except Exception as e:
        _ok[mod] = False
        print(f"  ✗ {mod}: {e}")

# NOTE(review): OPENROUTER_READY / ANTHROPIC_READY are bound once, here. If the
# module computes them at import time, a key entered in the widget after this
# cell has run will not be reflected — confirm, and re-run this cell after
# setting the key.
from utils.cot_generator    import generate as generate_cot, OPENROUTER_READY, ANTHROPIC_READY
from utils.concept_extractor import extract_concepts
from utils.hybrid_checker    import build_entailment_records
from utils.guards            import derive_guards, GuardConfig, lexical_jaccard
from utils.umls_api_linker   import is_configured as umls_configured

# Default guard thresholds, shared by Stage 4 and Stage 5.
GUARD_CFG = GuardConfig()

print()
print(f"  OpenRouter ready : {OPENROUTER_READY}")
print(f"  Anthropic ready  : {ANTHROPIC_READY}")
print(f"  UMLS configured  : {umls_configured()}")
print(f"  Heuristic NLI    : {os.environ.get('FORCE_HEURISTIC_NLI') == '1'}")

## Stage 1 — CoT Generator

Calls an LLM to produce numbered reasoning steps for a biomedical question.  
If no API key is set, falls back to 5 generic template steps (provider = `local`).

In [None]:
# ── 1a. Configuration ───────────────────────────────────────────────────────
# Provider and model used by the LLM calls below. Swap MODEL for any other
# OpenRouter slug (examples kept as comments) to try a different model.
PREFER = 'openrouter'
MODEL  = 'anthropic/claude-haiku-4-5'   # any OpenRouter slug
# MODEL = 'openai/gpt-4o-mini'
# MODEL = 'google/gemini-flash-1.5'
# MODEL = 'meta-llama/llama-3.3-70b-instruct'

# Single question used by Stages 1–4.
TEST_QUESTION = "Does aspirin reduce the risk of myocardial infarction in patients with cardiovascular disease?"

print(
    f"Question : {TEST_QUESTION}\n"
    f"Model    : {MODEL} via {PREFER}"
)

In [None]:
# ── 1b. Run CoT generation ──────────────────────────────────────────────────
# One generate_cot() call, timed with the wall clock.
t0      = time.time()
cot     = generate_cot(TEST_QUESTION, prefer=PREFER, model=MODEL)
elapsed = round(time.time() - t0, 2)

# Fields the later stages rely on.
STEPS    = cot['steps']
PROVIDER = cot['provider']
MODEL_ID = cot['model']

print(
    f"Provider : {PROVIDER}  |  Model : {MODEL_ID}  |  Time : {elapsed}s\n"
    f"Steps    : {len(STEPS)}\n"
)

# Numbered listing of the reasoning steps (1-based for readability).
for number, text in enumerate(STEPS, start=1):
    print(f"  Step {number:2d}: {text}")

# A 'local' provider is the fallback path, so flag it.
if PROVIDER == 'local':
    print(
        "\n"
        "⚠  provider='local' means all API calls failed.\n"
        "   Set OPENROUTER_API_KEY in cell 0c and re-run."
    )

In [None]:
# ── 1c. Validate step quality ───────────────────────────────────────────────
# Each entry maps a human-readable check name to its boolean outcome.
checks = {
    'At least 3 steps returned':         len(STEPS) >= 3,
    'All steps non-empty strings':        all(isinstance(s, str) and len(s.strip()) > 0 for s in STEPS),
    'All steps > 15 chars (not trivial)': all(len(s) > 15 for s in STEPS),
    'Real LLM was called (not fallback)': PROVIDER != 'local',
}

for name, result in checks.items():
    print(f"  {'✓' if result else '✗'}  {name}")

# The stage passes only when every individual check passed.
all_pass = all(checks.values())

print()
print("Stage 1:", "PASS ✓" if all_pass else "WARN — check API key")

## Stage 2 — Concept Extractor

Extracts biomedical surface candidates (n-grams, acronyms) from each step  
and links them to UMLS concepts (CUIs) if the UMLS API is configured.

In [None]:
# ── 2a. Extract concepts ────────────────────────────────────────────────────
# CONCEPTS holds one list of candidate dicts per reasoning step.
t0       = time.time()
CONCEPTS = extract_concepts(STEPS, scispacy_when='never', top_k=5)
elapsed  = round(time.time() - t0, 2)

total_cands = sum(len(c) for c in CONCEPTS)
valid_cands = sum(1 for sc in CONCEPTS for c in sc if c.get('valid'))
_umls_ready = umls_configured()   # queried once, reused below

print(f"Elapsed         : {elapsed}s")
print(f"Total candidates: {total_cands}  (across {len(STEPS)} steps)")
print(f"Valid (UMLS CUI): {valid_cands}")
print(f"UMLS configured : {_umls_ready}")
if not _umls_ready:
    print()
    print("  ℹ  UMLS not configured — concept candidates will have no CUI.")
    # No cell in this notebook sets UMLS_API_KEY (cell 0c only handles the
    # OpenRouter key), so point the user at the environment instead.
    print("     Set the UMLS_API_KEY environment variable, then restart the")
    print("     kernel and re-run for full concept linking.")

In [None]:
# ── 2b. Show concepts per step ──────────────────────────────────────────────
# One table row per candidate, limited to the first three candidates of a step.
rows = []
for i, (step, cands) in enumerate(zip(STEPS, CONCEPTS)):
    step_text = step[:55] + '...' if len(step) > 55 else step
    for c in cands[:3]:   # top-3 per step
        scores = c.get('scores') or {}
        rows.append({
            'Step':       i + 1,
            'Step text':  step_text,
            'Surface':    c.get('surface', '?'),
            'Name':       c.get('name') or c.get('surface', '?'),
            # `or` also covers a 'cui' / 'confidence' key that is present but None.
            'CUI':        c.get('cui') or '—',
            'Confidence': round(float(scores.get('confidence') or 0), 3),
            'Valid':      c.get('valid', False),
        })

if rows:
    df_concepts = pd.DataFrame(rows)
    # Pass the DataFrame itself: display() on the to_string() result would show
    # the string's repr (quotes and literal "\n") instead of a table.
    display(df_concepts)
else:
    print("No concept candidates returned.")
    print("This is expected when UMLS_API_KEY is not set.")

## Stage 3 — Hybrid NLI Entailment Checker

Scores each adjacent step-pair for entailment / neutral / contradiction.

- With `FORCE_HEURISTIC_NLI=1` (default): uses token-overlap heuristic (fast, no download)
- With `FORCE_HEURISTIC_NLI=0`: downloads and runs PubMedBERT-BioNLI-LoRA (~420MB)

In [None]:
# ── 3a. Run NLI on step pairs ───────────────────────────────────────────────
# PAIRS holds one record per adjacent step pair.
t0      = time.time()
PAIRS   = build_entailment_records(STEPS, CONCEPTS)
elapsed = round(time.time() - t0, 2)

# Tally how often each final label occurs.
label_counts = Counter()
for record in PAIRS:
    label_counts[record['final_label']] += 1

# The NLI source is read from the first record's metadata, when there is one.
if PAIRS:
    nli_source = (PAIRS[0].get('meta') or {}).get('nli_source', '?')
else:
    nli_source = 'n/a'

print(f"Elapsed        : {elapsed}s")
print(f"Adjacent pairs : {len(PAIRS)}  (= {len(STEPS)} steps - 1)")
print(f"Label counts   : {dict(label_counts)}")
print(f"NLI source     : {nli_source}")

In [None]:
# ── 3b. Display pair probabilities ──────────────────────────────────────────
def _clip(text, limit=50):
    """Return ``text`` cut to ``limit`` characters, with '...' appended when cut."""
    return text[:limit] + '...' if len(text) > limit else text


rows = []
for p in PAIRS:
    i, j   = p['step_pair']
    probs  = p.get('probs', {})
    rows.append({
        'Pair':          f'{i}→{j}',
        'Premise':       _clip(STEPS[i]),
        'Hypothesis':    _clip(STEPS[j]),
        'P(entail)':     round(probs.get('entailment',   0), 3),
        'P(neutral)':    round(probs.get('neutral',      0), 3),
        'P(contra)':     round(probs.get('contradiction',0), 3),
        'Final label':   p.get('final_label', '?'),
    })

df_nli = pd.DataFrame(rows)


def colour_label(val):
    """Return the CSS background rule for one 'Final label' cell."""
    colours = {'contradiction':'#ffcccc','entailment':'#ccffcc','neutral':'#e8e8e8'}
    return f"background-color: {colours.get(val, 'white')}"


if df_nli.empty:
    # With no rows the frame has no columns, so subset=['Final label'] would raise.
    print("No step pairs to display.")
else:
    _styler = df_nli.style
    # Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map;
    # use whichever the installed pandas provides.
    _style_cells = getattr(_styler, 'map', None) or _styler.applymap
    display(_style_cells(colour_label, subset=['Final label'])
            .format({'P(entail)':'{:.3f}','P(neutral)':'{:.3f}','P(contra)':'{:.3f}'}))

In [None]:
# ── 3c. Validate NLI output ─────────────────────────────────────────────────
valid_labels   = {'entailment', 'neutral', 'contradiction'}
expected_pairs = len(STEPS) - 1   # one record per adjacent pair

# Check name → boolean outcome.
checks = {
    f'Returns {expected_pairs} pairs (N-1)': len(PAIRS) == expected_pairs,
    'All probs sum to ≈ 1.0': all(
        abs(sum(rec['probs'].values()) - 1.0) < 0.05 for rec in PAIRS
    ),
    'All final_labels are valid': all(
        rec.get('final_label') in valid_labels for rec in PAIRS
    ),
}

for title, outcome in checks.items():
    mark = '✓' if outcome else '✗'
    print(f"  {mark}  {title}")
print()

stage_ok = all(checks.values())
print("Stage 3:", "PASS ✓" if stage_ok else "FAIL ✗")

## Stage 4 — Guard Signal Derivation

Guard signals are qualitative tags computed **on top of** NLI probabilities.

| Guard | Fires when |
|---|---|
| `lexical_duplicate` | Adjacent steps are ≥ 90% lexically identical (wasted reasoning) |
| `caution_band` | Top two label probabilities are very close (the model is uncertain) |
| `direction_conflict` | NLI is asymmetric: A→B entails but B→A contradicts (requires bidirectional NLI) |

In [None]:
# ── 4a. Compute guard signals for each pair ─────────────────────────────────
def _reverse_probs(steps, concepts, i, j):
    try:
        rev = build_entailment_records([steps[j], steps[i]],
                                       [concepts[j] if j < len(concepts) else [],
                                        concepts[i] if i < len(concepts) else []])
        return rev[0]['probs'] if rev else None
    except Exception:
        return None

# Attach guard tags (and the reverse-direction probabilities they were derived
# from) to a copy of every Stage 3 pair record.
GUARDED_PAIRS = []
for p in PAIRS:
    i, j      = p['step_pair']
    rev_probs = _reverse_probs(STEPS, CONCEPTS, i, j)
    # Out-of-range indices fall back to an empty text.
    premise_text    = STEPS[i] if i < len(STEPS) else ''
    hypothesis_text = STEPS[j] if j < len(STEPS) else ''
    guards = derive_guards(
        premise=premise_text,
        hypothesis=hypothesis_text,
        probs=p['probs'],
        reverse_probs=rev_probs,
        config=GUARD_CFG,
    )
    enriched = dict(p)
    enriched['guards'] = guards
    enriched['reverse_probs'] = rev_probs
    GUARDED_PAIRS.append(enriched)

# Flat list of every guard tag, in pair order.
all_guards = []
for enriched in GUARDED_PAIRS:
    all_guards.extend(enriched['guards'])

print(f"Total guards fired : {len(all_guards)}")
print(f"Guard breakdown    : {dict(Counter(all_guards)) or 'none'}")

In [None]:
# ── 4b. Display guard signals per pair ──────────────────────────────────────
# One row per guarded pair: forward contradiction probability, reverse
# entailment probability ('—' when no reverse result exists) and the guard tags.
rows = []
for p in GUARDED_PAIRS:
    i, j   = p['step_pair']
    probs  = p['probs']
    rprobs = p.get('reverse_probs') or {}
    rows.append({
        'Pair':           f'{i}→{j}',
        'Label':          p['final_label'],
        'P(contra) fwd':  round(probs.get('contradiction', 0), 3),
        'P(entail) rev':  round(rprobs.get('entailment', 0), 3) if rprobs else '—',
        'Guards':         ', '.join(p['guards']) or 'none',
    })

df_guards = pd.DataFrame(rows)
# Pass the DataFrame itself: display() on the to_string() result would show
# the string's repr (quotes and literal "\n") instead of a table.
display(df_guards)

print()
print("Guard explanations:")
print("  caution_band      — uncertain; top 2 NLI labels are very close in probability")
print("  lexical_duplicate — steps are nearly identical; probably not advancing reasoning")
print("  direction_conflict— A entails B but B contradicts A (asymmetric relationship)")

## Stage 5 — Full Pipeline (3 Questions)

Runs all four stages in sequence for 3 different biomedical questions  
and collects structured output.

In [None]:
# ── 5a. Define 3 test questions ─────────────────────────────────────────────
# One efficacy question and two mechanism questions; Stage 5 runs each of them
# through the whole pipeline.
PIPELINE_QUESTIONS = [
    (
        "Does aspirin reduce the risk of myocardial infarction "
        "in patients with cardiovascular disease?"
    ),
    (
        "What is the mechanism by which metformin lowers blood glucose "
        "in type 2 diabetes?"
    ),
    (
        "How do statins reduce LDL cholesterol "
        "and lower cardiovascular risk?"
    ),
]

In [None]:
# ── 5b. Run the full pipeline on each question ─────────────────────────────
def run_pipeline(question, prefer=PREFER, model=MODEL):
    """Run CoT generation → concept extraction → NLI → guards for one question.

    Parameters
    ----------
    question : str
        The biomedical question to send to the LLM.
    prefer, model :
        Forwarded to ``generate_cot``. The defaults are the values PREFER and
        MODEL had when this cell was executed; re-run the cell after changing
        them.

    Returns
    -------
    dict
        Keys: 'question', 'provider', 'model', 'steps', 'concepts',
        'pairs' (each NLI record extended with 'guards' and 'reverse_probs',
        matching the Stage 4 records) and 'elapsed_s' (wall-clock seconds,
        rounded to 2 decimals).

    Relies on ``_reverse_probs`` defined in cell 4a.
    """
    t0 = time.time()
    cot      = generate_cot(question, prefer=prefer, model=model)
    # `or []` also covers a response whose 'steps' key is present but None.
    steps    = cot.get('steps') or []
    concepts = extract_concepts(steps, scispacy_when='never', top_k=3)
    pairs    = build_entailment_records(steps, concepts)

    guarded = []
    for p in pairs:
        i, j      = p['step_pair']
        rev_probs = _reverse_probs(steps, concepts, i, j)
        guards    = derive_guards(
            premise=steps[i] if i < len(steps) else '',
            hypothesis=steps[j] if j < len(steps) else '',
            probs=p['probs'],
            reverse_probs=rev_probs,
            config=GUARD_CFG,
        )
        # Keep reverse_probs on the record, as the Stage 4 loop does.
        guarded.append({**p, 'guards': guards, 'reverse_probs': rev_probs})

    return {
        'question':  question,
        'provider':  cot.get('provider'),
        'model':     cot.get('model'),
        'steps':     steps,
        'concepts':  concepts,
        'pairs':     guarded,
        'elapsed_s': round(time.time() - t0, 2),
    }

# Run every question; a failure is reported with its traceback and skipped so
# the remaining questions still run.
PIPELINE_RESULTS = []
for qi, q in enumerate(PIPELINE_QUESTIONS, 1):
    print(f"Q{qi}: {q[:65]}...")
    try:
        r = run_pipeline(q)
        n_contra = len([p for p in r['pairs'] if p['final_label'] == 'contradiction'])
        guard_tally = Counter(g for p in r['pairs'] for g in p['guards'])
        print(f"     provider={r['provider']}  steps={len(r['steps'])}  "
              f"pairs={len(r['pairs'])}  contradictions={n_contra}  "
              f"guards={dict(guard_tally) or 'none'}  ({r['elapsed_s']}s)")
        PIPELINE_RESULTS.append(r)
    except Exception as e:
        print(f"     ERROR: {e}")
        traceback.print_exc()
    # Short pause between questions.
    time.sleep(0.5)

print(f"\nCompleted {len(PIPELINE_RESULTS)}/{len(PIPELINE_QUESTIONS)} questions.")

## Stage 6 — Results Summary & Visualisation

Aggregates all pipeline results into a summary table and plots.

In [None]:
# ── 6a. Build summary table ─────────────────────────────────────────────────
def _summarise_result(r):
    """Flatten one Stage 5 result dict into a single summary-table row."""
    pair_records   = r['pairs']
    pair_count     = len(pair_records)
    label_tally    = Counter(p['final_label'] for p in pair_records)
    guard_tally    = Counter(g for p in pair_records for g in p['guards'])
    all_candidates = [c for per_step in r['concepts'] for c in per_step]
    n_contra       = label_tally['contradiction']   # Counter yields 0 when absent

    return {
        'Question':            r['question'][:60] + '...',
        'Provider':            r['provider'],
        'Model':               r['model'],
        'Steps':               len(r['steps']),
        'Pairs':               pair_count,
        'Entailment':          label_tally['entailment'],
        'Neutral':             label_tally['neutral'],
        'Contradiction':       n_contra,
        # Guard against division by zero when a question produced no pairs.
        'Contradiction Rate':  round(n_contra / pair_count, 3) if pair_count else 0,
        'UMLS Concepts':       len(all_candidates),
        'Valid Concepts':      sum(1 for c in all_candidates if c.get('valid')),
        'caution_band':        guard_tally['caution_band'],
        'lexical_duplicate':   guard_tally['lexical_duplicate'],
        'direction_conflict':  guard_tally['direction_conflict'],
        'Time (s)':            r['elapsed_s'],
    }


summary_rows = [_summarise_result(r) for r in PIPELINE_RESULTS]

# Transposed so each question becomes a column and each metric a row.
df_summary = pd.DataFrame(summary_rows)
display(df_summary.T)

In [None]:
# ── 6b. Contradiction details ────────────────────────────────────────────────
print("=== Detected Contradictions ===\n")

# Every (result, pair) combination whose final label is 'contradiction'.
contradictions = [
    (r, p)
    for r in PIPELINE_RESULTS
    for p in r['pairs']
    if p['final_label'] == 'contradiction'
]
any_found = bool(contradictions)

for r, p in contradictions:
    i, j  = p['step_pair']
    probs = p['probs']
    print(f"Question : {r['question'][:70]}")
    print(f"Pair     : step {i} → step {j}")
    print(f"  Step {i}: {r['steps'][i]}")
    print(f"  Step {j}: {r['steps'][j]}")
    print(f"  P(contradiction)={probs.get('contradiction',0):.3f}  "
          f"P(entailment)={probs.get('entailment',0):.3f}  "
          f"P(neutral)={probs.get('neutral',0):.3f}")
    print(f"  Guards: {p['guards'] or 'none'}")
    print()

if not any_found:
    print("No contradictions detected in these 3 questions.")
    print("Try running with FORCE_HEURISTIC_NLI=0 (real transformer model)")
    print("or try more adversarial questions.")

In [None]:
# ── 6c. NLI probability heatmap across steps ────────────────────────────────
# One panel per question: cell (premise i, hypothesis j) shows P(contradiction)
# for that pair; cells without a scored pair stay NaN (blank).
if not PIPELINE_RESULTS:
    # plt.subplots(1, 0) raises, and there would be no image for the colorbar.
    print("No pipeline results to plot.")
else:
    n_plots = len(PIPELINE_RESULTS)
    # squeeze=False always returns a 2-D array of Axes, so a single result
    # needs no special-casing.
    fig, axes = plt.subplots(1, n_plots, figsize=(5 * n_plots, 4), squeeze=False)
    axes = axes[0]

    for ax, r in zip(axes, PIPELINE_RESULTS):
        pairs = r['pairs']
        n     = len(r['steps'])

        # Build NxN probability matrix for P(contradiction)
        mat = np.full((n, n), np.nan)
        for p in pairs:
            i, j = p['step_pair']
            mat[i, j] = p['probs'].get('contradiction', 0)

        im = ax.imshow(mat, vmin=0, vmax=1, cmap='RdYlGn_r', aspect='auto')
        ax.set_title(f"{r['question'][:40]}...", fontsize=8)
        ax.set_xlabel('Hypothesis step')
        ax.set_ylabel('Premise step')
        ax.set_xticks(range(n)); ax.set_yticks(range(n))
        # Tick labels are 1-based to match the step numbering printed in Stage 1.
        ax.set_xticklabels(range(1, n+1)); ax.set_yticklabels(range(1, n+1))

        for p in pairs:
            i, j  = p['step_pair']
            val   = mat[i, j]
            label = p['final_label'][0].upper()  # E / N / C
            # White text on the dark (high-probability) end of the colormap.
            color = 'white' if val > 0.5 else 'black'
            ax.text(j, i, f"{label}\n{val:.2f}", ha='center', va='center',
                    fontsize=7, color=color)

    # All panels share vmin/vmax, so one colorbar next to the last panel suffices.
    plt.colorbar(im, ax=axes[-1], label='P(contradiction)')
    plt.suptitle('P(contradiction) Heatmap per Step-Pair', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()

In [None]:
# ── 6d. Summary bar chart ────────────────────────────────────────────────────
# Grouped bars: one group per question, one bar per NLI label, with the counts
# taken from the summary rows built in cell 6a.
fig, ax = plt.subplots(figsize=(max(6, len(PIPELINE_RESULTS) * 3), 4))
x      = np.arange(len(PIPELINE_RESULTS))
width  = 0.25
labels = ['Entailment', 'Neutral', 'Contradiction']
colors = ['#4C72B0', '#8172B2', '#C44E52']
keys   = ['Entailment', 'Neutral', 'Contradiction']

for ki, key in enumerate(keys):
    heights = [row[key] for row in summary_rows]
    # Each label's bars are shifted by one bar width within the group.
    ax.bar(x + ki * width, heights, width, label=key, color=colors[ki], alpha=0.85)

# Centre the tick under the middle bar of each group.
tick_text = [r['question'][:30] + '...' for r in PIPELINE_RESULTS]
ax.set_xticks(x + width)
ax.set_xticklabels(tick_text, fontsize=8, rotation=10, ha='right')
ax.set_ylabel('# Pairs')
ax.set_title('NLI Label Distribution per Question')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# ── 6e. Final pass/fail summary ─────────────────────────────────────────────
print("=" * 60)
print("PIPELINE TEST SUMMARY")
print("=" * 60)

total = len(PIPELINE_QUESTIONS)
ok    = len(PIPELINE_RESULTS)
# all() over an empty sequence is True, so without this flag every per-result
# check would pass when no question completed.
has_results = ok > 0

# (check name, outcome, detail shown next to it)
passed = [
    ('All questions completed',     ok == total,                 f'{ok}/{total}'),
    ('All used real LLM (not local)',
     has_results and all(r['provider'] != 'local' for r in PIPELINE_RESULTS),
     # str() because a provider may be None, which str.join would reject.
     ', '.join(str(r['provider']) for r in PIPELINE_RESULTS)),
    ('All returned ≥ 3 steps',
     has_results and all(len(r['steps']) >= 3 for r in PIPELINE_RESULTS),
     ', '.join(str(len(r['steps'])) for r in PIPELINE_RESULTS)),
    ('All NLI pairs valid',
     has_results and all(p.get('final_label') in {'entailment','neutral','contradiction'}
                         for r in PIPELINE_RESULTS for p in r['pairs']),
     'ok'),
    ('Guard signals computed',
     has_results and all('guards' in p for r in PIPELINE_RESULTS for p in r['pairs']),
     'ok'),
]

for name, result, detail in passed:
    icon = '✓' if result else '✗'
    print(f"  {icon}  {name:<40s}  {detail}")

all_ok = all(result for _, result, _ in passed)

print()
print("Overall:", "✓ ALL PASS" if all_ok else "✗ SOME CHECKS FAILED")
print()
if not all_ok:
    print("If provider='local': set OPENROUTER_API_KEY in Stage 0 and re-run.")
    print("If steps < 3: API likely returned an error — check key and model slug.")