# Demo Pipeline (Manuscript-aligned)

This notebook demonstrates the **end-to-end, evidence-centered QA workflow** described in the manuscript using **synthetic demonstration data**.

Pipeline steps:
1. Load the demo scenarios (queries) and a small synthetic demonstration corpus (document-level texts + metadata)
2. Build a TF–IDF document index and run cosine-similarity retrieval (top-*k*)
3. Perform **query-aware extractive summarization** (one sentence per retrieved document; TF–IDF cosine similarity)
4. Compute **qualitative uncertainty** (min–max normalize → mean → thresholds → Low/Medium/High)
5. (Optional) Run the **demo evaluation** workflow using synthetic CSVs


In [None]:
from pathlib import Path

import pandas as pd

# Local modules (repository code)
from retrieval import build_tfidf_index_from_dataframe, retrieve_top_k
from summarization import summarize_retrieved
from uncertainty import ConfidenceConfig, compute_overall_confidence_from_retrieval

from evaluation import (
    parse_relevance_list,
    add_precision_at_k,
    aggregate_expert_scores,
    compute_expert_confidence_majority,
    compute_uncertainty_alignment,
    build_report_table,
    compute_summary_stats,
    pairwise_cohens_kappa,
    export_outputs,
)


In [None]:
# Derive every data location from the repository root so later cells never
# hard-code a path. Assumes the kernel's working directory is notebooks/.
REPO_ROOT = Path("..").resolve()
DATA_ROOT = REPO_ROOT.joinpath("data")
DEMO_CORPUS_DIR = DATA_ROOT.joinpath("demo_corpus")    # synthetic study text files
DEMO_EVAL_DIR = DATA_ROOT.joinpath("demo_evaluation")  # scenario table + evaluation CSVs

# Show the two directories the rest of the notebook reads from.
(DEMO_CORPUS_DIR, DEMO_EVAL_DIR)


## 1) Load scenarios

In [None]:
# Scenario table. Later cells read its `scenario_id`, `query`, `pollutant`
# and `outcome` columns.
scenarios_csv = DEMO_EVAL_DIR / "scenarios.csv"
scenarios = pd.read_csv(scenarios_csv)
scenarios

## 2) Load synthetic demonstration corpus

This repo stores simplified/synthetic study text files to avoid redistributing copyrighted full texts.
We assemble a small document table with:
- `doc_id`
- `title`
- `text` (synthetic)
- metadata columns copied from the scenario table (`pollutant`, `outcome`, `scenario_id`)


In [None]:
# Map scenario_id -> demo text filename (convention used in this repo).
# Adjust here if your filenames differ.
scenario_to_file = {
    "S1": "pm25_copd_example.txt",
    "S2": "no2_asthma_example.txt",
    "S3": "lead_neurodevelopment_example.txt",
    "S4": "o3_mortality_example.txt",
    "S5": "vocs_respiratory_example.txt",
    "S6": "cadmium_cvd_example.txt",
}

# Fail fast with a readable message. Without these checks a scenario id that is
# absent from scenarios.csv surfaces as an opaque "positional indexer is
# out-of-bounds" IndexError, and a missing file only fails mid-loop.
unknown_ids = sorted(set(scenario_to_file) - set(scenarios["scenario_id"]))
if unknown_ids:
    raise KeyError(f"scenario_id(s) not found in scenarios.csv: {unknown_ids}")

missing_files = sorted(
    fname
    for fname in scenario_to_file.values()
    if not (DEMO_CORPUS_DIR / fname).is_file()
)
if missing_files:
    raise FileNotFoundError(
        f"Demo corpus file(s) missing from {DEMO_CORPUS_DIR}: {missing_files}"
    )

# Index the scenario metadata once instead of re-scanning the frame with a
# boolean mask per document. Keeping the first row per scenario_id matches the
# previous `.iloc[0]` behaviour if the CSV ever contains duplicates.
scenario_meta = scenarios.drop_duplicates("scenario_id").set_index("scenario_id")

docs = []
for doc_sid, fname in scenario_to_file.items():
    meta = scenario_meta.loc[doc_sid].to_dict()
    docs.append({
        "doc_id": doc_sid,  # keep doc_id aligned with scenario_id for demo clarity
        "title": f"{meta['pollutant']} → {meta['outcome']} (demo study)",
        "text": (DEMO_CORPUS_DIR / fname).read_text(encoding="utf-8"),
        "pollutant": meta["pollutant"],
        "outcome": meta["outcome"],
        "scenario_id": doc_sid,
    })

# One row per demo document; this frame is the input to the TF–IDF index.
doc_df = pd.DataFrame(docs)
doc_df

## 3) Build TF–IDF index (document-level)

In [None]:
# Manuscript alignment: the indexed document text is title + abstract/key findings.
# In the demo corpus the synthetic file text already carries the structured study
# information, so the plain `title` and `text` columns are handed to the helper.
# If the helper supports custom text construction, keep its defaults consistent
# with your implementation.
index_columns = {
    "doc_id_col": "doc_id",
    "title_col": "title",
    "text_col": "text",
}

index = build_tfidf_index_from_dataframe(doc_df, **index_columns)
index

## 4) Run the QA pipeline for a single scenario

In [None]:
# Scenario used for the single-query walkthrough in the next cells.
sid = "S1"

# Take the query text from the first scenario row whose id matches.
is_selected = scenarios["scenario_id"] == sid
query = scenarios.loc[is_selected, "query"].iloc[0]
query

In [None]:
# Top-k retrieval for the selected query. `include_text=True` is requested
# because the retrieved frame is passed to the summarizer in the next cell.
k = 3
retrieved = retrieve_top_k(
    query,
    index,
    k=k,
    include_text=True,
)
retrieved

In [None]:
# Summarization (one sentence per retrieved document; TF–IDF cosine similarity).
summary_text, trace_df = summarize_retrieved(query, retrieved)

# Use display() rather than a trailing tuple: inside a tuple the trace table
# falls back to its plain-text repr instead of rendering as a rich table.
display(summary_text, trace_df)

In [None]:
# Qualitative uncertainty, manuscript-aligned:
# min–max normalize → mean → thresholds (t1, t2).
# `conf_cfg` is reused unchanged by the all-scenarios loop further down.
conf_cfg = ConfidenceConfig(
    normalize=True,
    t1=0.33,
    t2=0.67,
)
support, conf_label = compute_overall_confidence_from_retrieval(
    retrieved,
    config=conf_cfg,
)
(support, conf_label)

## 5) Run the pipeline across all demo scenarios

In [None]:
# Run retrieval → summarization → uncertainty for every scenario.
#
# Loop-local names carry a `batch_` prefix so this cell does not overwrite the
# single-scenario results (`sid`, `retrieved`, `summary_text`, `trace_df`,
# `support`, `conf_label`) shown above. The retrieval depth reuses `k` from the
# single-scenario walkthrough instead of a second hard-coded 3, so both parts of
# the notebook always use the same top-k.
rows = []
for batch_sid, batch_query in zip(scenarios["scenario_id"], scenarios["query"]):
    batch_retrieved = retrieve_top_k(batch_query, index, k=k, include_text=True)
    batch_summary, batch_trace = summarize_retrieved(batch_query, batch_retrieved)
    batch_support, batch_label = compute_overall_confidence_from_retrieval(
        batch_retrieved, config=conf_cfg
    )

    rows.append({
        "scenario_id": batch_sid,
        "query": batch_query,
        "summary": batch_summary,
        "support_indicator": batch_support,
        "system_confidence": batch_label,
    })

# Build the frame once from the collected records, ordered by scenario id.
demo_outputs = pd.DataFrame(rows).sort_values("scenario_id")
demo_outputs

In [None]:
# Optional export: write the batch results under <repo>/outputs so the
# system-confidence CSV can be generated programmatically.
out_dir = REPO_ROOT / "outputs"
out_dir.mkdir(parents=True, exist_ok=True)

confidence_csv = out_dir / "system_confidence_generated.csv"
pipeline_csv = out_dir / "demo_pipeline_outputs.csv"

# Narrow two-column file (scenario id + label) and the full per-scenario table.
demo_outputs.loc[:, ["scenario_id", "system_confidence"]].to_csv(confidence_csv, index=False)
demo_outputs.to_csv(pipeline_csv, index=False)

out_dir

## 6) Demo evaluation workflow (synthetic ratings)

In [None]:
# Load the three synthetic evaluation inputs from the demo evaluation directory.
system_conf = pd.read_csv(DEMO_EVAL_DIR / "system_confidence.csv")
expert_ratings = pd.read_csv(DEMO_EVAL_DIR / "expert_ratings.csv")
retrieval_rel = pd.read_csv(DEMO_EVAL_DIR / "retrieval_relevance.csv")

# display() renders each frame as its own rich table; a trailing tuple of
# DataFrames would be shown as one plain-text repr.
display(system_conf, expert_ratings.head(), retrieval_rel.head())

In [None]:
# Turn relevance_list strings such as "1|0|1" into list[int], then add the
# precision@k column. Written as a single chain so `rel` is assigned exactly
# once and `retrieval_rel` stays untouched.
rel = (
    retrieval_rel
    .assign(relevance_list=lambda frame: frame["relevance_list"].apply(parse_relevance_list))
    .pipe(add_precision_at_k, relevance_col="relevance_list", out_col="precision_at_k")
)
rel

In [None]:
# Aggregate expert factuality & interpretability (means per scenario).
# The result is left-joined on `scenario_id` into the report table below.
# NOTE(review): the summary-stats cell reads `factuality_mean` and
# `interpretability_mean`; presumably those columns are produced here —
# confirm against `aggregate_expert_scores`.
agg_scores = aggregate_expert_scores(expert_ratings)
agg_scores

In [None]:
# Expert majority confidence, derived from the expert ratings table.
# The result feeds the system–expert alignment check in the next cell and is
# left-joined on `scenario_id` into the report table below.
expert_maj = compute_expert_confidence_majority(expert_ratings)
expert_maj

In [None]:
# System–expert uncertainty alignment (0/1): compares the system confidence
# table with the expert-majority table. The `aligned` column of the result is
# joined into the report table in the next cell.
ua = compute_uncertainty_alignment(system_confidence_df=system_conf, expert_majority_df=expert_maj)
ua

In [None]:
# Build a Table-1-like report: start from the scenario table and left-join each
# evaluation frame on `scenario_id`, so every scenario row is retained.
frames_to_join = (
    rel[["scenario_id", "k", "precision_at_k"]],
    agg_scores,
    system_conf,
    expert_maj,
    ua[["scenario_id", "aligned"]],
)

results = scenarios
for frame in frames_to_join:
    results = results.merge(frame, on="scenario_id", how="left")

report = build_report_table(results)
report

In [None]:
# Inter-rater agreement (pairwise Cohen's κ).
# Note: κ is most appropriate for categorical labels; it is computed separately
# for each rating dimension, factuality first and interpretability second.
kappa_f, kappa_i = (
    pairwise_cohens_kappa(expert_ratings, label_col=rating_col)
    for rating_col in ("factuality", "interpretability")
)
(kappa_f, kappa_i)

In [None]:
# Summary stats (mean ± SD) and alignment rate, with optional κ means.
#
# `results` is passed as-is: it already contains the `aligned` column from the
# report-table merge. Merging `ua` onto it a second time would make pandas
# suffix the overlapping columns (`aligned_x` / `aligned_y`), so the
# `alignment_col="aligned"` lookup below would no longer find its column.
summary = compute_summary_stats(
    results=results,
    kappa_factuality=kappa_f,
    kappa_interpretability=kappa_i,
    precision_col="precision_at_k",
    factuality_col="factuality_mean",
    interpretability_col="interpretability_mean",
    alignment_col="aligned",
)
summary

In [None]:
# Export evaluation outputs (optional).
# `export_outputs` is imported together with the other `evaluation` helpers in
# the imports cell at the top, so all dependencies are visible in one place.
# Files are written to the same `out_dir` used for the pipeline exports.
export_outputs(
    out_dir=out_dir,
    report_table=report,
    summary_stats=summary,
    kappa_factuality=kappa_f,
    kappa_interpretability=kappa_i,
)

# List the CSV files now present in the output directory as a quick check.
sorted(p.name for p in out_dir.glob("*.csv"))