# Sanity Checks for Medical RAG
This notebook demonstrates a minimal end-to-end pipeline:
1. Load a MedQuAD question (held-out test set)
2. Normalize the text
3. Generate a dense embedding
4. Retrieve top-*k* answers from FAISS
5. Build a prompt and query the OpenAI chat model named in the config (GPT-4)
6. Compute a quick token-level F1 against the reference answer

In [None]:
# Environment & imports
import json
import os
from collections import Counter

import faiss
import nltk
import yaml
from dotenv import load_dotenv
from openai import OpenAI
from sentence_transformers import SentenceTransformer

from preprocess import normalize_text

# Tokenizer data for nltk.word_tokenize. Older NLTK releases look up the
# "punkt" resource; NLTK 3.9+ looks up "punkt_tab" instead. Fetch both so the
# F1 cell works regardless of the installed NLTK version.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

In [None]:
# Load configuration & initialize clients/models
# Read the YAML explicitly as UTF-8 so parsing does not depend on the
# platform's default text encoding.
with open("../config.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Load variables from a local .env file (if any) into the environment, then
# hand the API key to the OpenAI client.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Config paths are resolved relative to the parent directory of this notebook.
index = faiss.read_index(f"../{cfg['paths']['index_path']}")
# NOTE(review): presumably this must be the same embedding model that was used
# to build the FAISS index — confirm against the indexing script.
model = SentenceTransformer(cfg["models"]["embedding_model"])

def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of the
    file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# Load held-out test set and retrieval corpus (answers)
test_questions = [x["question"] for x in load_jsonl(f"../{cfg['paths']['test_questions_path']}")]
test_answers   = [x["answer"]   for x in load_jsonl(f"../{cfg['paths']['test_answers_path']}")]
train_answers  = [x["answer"]   for x in load_jsonl(f"../{cfg['paths']['train_answers_path']}")]

# Questions and reference answers come from separate files and are paired
# purely by position, so fail fast if the files are empty or out of step
# instead of silently scoring against the wrong reference later on.
assert test_questions, "test questions file contains no records"
assert len(test_questions) == len(test_answers), (
    f"test questions ({len(test_questions)}) and test answers "
    f"({len(test_answers)}) are not aligned"
)

print("Test size:", len(test_questions))
print("Train (index) answers:", len(train_answers))
print("Sample question:", test_questions[0])
# Only the first 300 characters of the reference are shown to keep output short.
print("Reference answer:", test_answers[0][:300], "...")

In [None]:
# Embed query & retrieve top-k
q = normalize_text(test_questions[0])
q_vec = model.encode([q])  # one-element batch: index.search expects a 2-D array
k = cfg["retrieval"]["k"]
D, I = index.search(q_vec, k)

# FAISS pads the id row with -1 when it finds fewer than k neighbours. Without
# this filter, train_answers[-1] would silently return the *last* answer as if
# it had been retrieved.
# NOTE(review): assumes index ids are positions in train_answers — confirm
# against the script that built the index.
retrieved = [train_answers[i] for i in I[0] if i >= 0]

print("Retrieved passages:")
for r in retrieved:
    print("-", r[:200], "...")

In [None]:
# Build prompt & query GPT-4
def build_prompt(q, passages):
    """Assemble the prompt text for the chat model.

    The retrieved passages are joined with a blank line between them and
    placed under a ``Context:`` header, followed by the question and a
    trailing ``Answer:`` cue for the model to complete.

    Args:
        q: The question text.
        passages: Iterable of passage strings to use as context.

    Returns:
        str: The fully formatted prompt.
    """
    template = "Context:\n{context}\n\nQuestion: {question}\nAnswer:"
    return template.format(context="\n\n".join(passages), question=q)

prompt = build_prompt(q, retrieved)

# Single-turn request: the whole prompt (context + question) goes in one user
# message; model name and sampling settings come from the config.
resp = client.chat.completions.create(
    model=cfg["models"]["openai_model"],
    messages=[{"role":"user","content":prompt}],
    temperature=cfg["evaluation"]["temperature"],
    max_tokens=cfg["evaluation"]["max_output_tokens"]
)

# message.content is Optional in the OpenAI SDK (it can be None, e.g. for a
# refusal); normalise to "" so downstream string handling does not crash.
pred_answer = resp.choices[0].message.content or ""
print("GPT-4 Answer:\n", pred_answer)

In [None]:
# Quick token-level F1
def token_f1(pred, ref):
    """Token-level F1 between a predicted answer and a reference answer.

    Both strings are lower-cased and tokenized with ``nltk.word_tokenize``.
    The overlap is counted as a multiset intersection: a token that occurs
    ``n`` times in the prediction and ``m`` times in the reference contributes
    ``min(n, m)`` matches. This keeps the numerator consistent with the
    denominators, which count every token occurrence; counting the overlap
    over sets instead would score identical answers containing a repeated word
    below 1.0.

    Args:
        pred: Predicted answer text.
        ref: Reference (gold) answer text.

    Returns:
        float: F1 in ``[0.0, 1.0]``; ``0.0`` when either side has no tokens or
        the two share no token.
    """
    pred_tokens = nltk.word_tokenize(pred.lower())
    ref_tokens = nltk.word_tokenize(ref.lower())
    if not pred_tokens or not ref_tokens:
        return 0.0

    # Counter & Counter keeps the minimum count per token (multiset overlap).
    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)

# Score the generated answer against the gold answer of the same test item
# (index 0, matching the question that was embedded and sent to the model).
ref_answer = test_answers[0]
print("Token-level F1:", token_f1(pred=pred_answer, ref=ref_answer))