# MedDSL-Lite 3-Arm Evaluation

This notebook implements the 3-arm evaluation comparing LLM-only, Rules-only, and Hybrid approaches for diabetic retinopathy triage.


In [None]:
import sys
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import random
from typing import Dict, List, Any

# Add the parent directory to the path to import mddsl
sys.path.append(str(Path.cwd().parent))

from mddsl import execute, explain, SnippetRetriever
from mddsl.interpreter import load_dsl_from_file


## Setup and Data Loading


In [None]:
# Load DSL rules and the snippet retriever (paths are resolved against the
# current working directory)
dsl = load_dsl_from_file("../rules/dr_triage_v1.yaml")
retriever = SnippetRetriever("../snippets")

# Load test cases: one JSON object per non-blank line (JSON Lines).
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default locale encoding.
cases = []
with open("../data/cases_small.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            cases.append(json.loads(line))

print(f"Loaded {len(cases)} test cases")

# Set random seeds for reproducibility (both the stdlib and NumPy generators)
random.seed(42)
np.random.seed(42)


## LLM-Only Arm (Simulated)


In [None]:
def simulate_llm_only(case: Dict[str, Any], k: int = 5) -> List[Dict[str, Any]]:
    """
    Simulate ``k`` LLM-only responses for a single case, with some variability.

    In practice, this would call an actual LLM API. Here a fixed baseline
    decision is derived from the case features, and each simulated run has a
    20% chance of swapping that decision for a different one drawn from a
    fixed list of alternatives. The rationale always describes the baseline
    decision, even when the decision itself was swapped.

    Args:
        case: Case record. Keys read: ``dr_grade``, ``vision_reduced``,
            ``macula["edema_prob"]``, ``qc["fundus_pass"]`` and
            ``qc["macula_view"]``. Missing keys are tolerated, and a
            ``macula`` or ``qc`` value of ``None`` is treated as absent.
        k: Number of simulated responses to generate.

    Returns:
        A list of ``k`` dicts with keys ``decision``, ``rationale``,
        ``thresholds_mentioned`` and ``citations``.

    Note:
        Draws from the module-level ``random`` generator, so results are
        reproducible only if the caller seeds it.
    """
    # `or {}` also covers an explicit null (e.g. `"macula": null` in the
    # JSONL), which `.get(key, {})` would pass through as None and crash on.
    macula = case.get("macula") or {}
    qc = case.get("qc") or {}

    # Extract case features
    dr_grade = case.get("dr_grade")
    edema_prob = macula.get("edema_prob")
    vision_reduced = case.get("vision_reduced", False)
    qc_pass = qc.get("fundus_pass", False) and qc.get("macula_view", False)

    # The baseline decision depends only on the case, so compute it once
    # instead of once per simulated run.
    if not qc_pass:
        base_decision = "abstain"
        rationale = "QC failure - insufficient image quality"
    elif dr_grade in ["pdr", "severe_npdr"]:
        base_decision = "retina referral (urgent)"
        rationale = f"Severe DR grade {dr_grade} requires urgent retina specialist evaluation"
    elif dr_grade == "moderate_npdr":
        base_decision = "retina referral (2-4 weeks)"
        rationale = "Moderate NPDR requires referral for ophthalmic evaluation"
    elif edema_prob and edema_prob >= 0.70:
        base_decision = "retina referral (2-4 weeks)"
        rationale = f"High edema probability {edema_prob} suggests DME, requires OCT and referral"
    elif vision_reduced:
        base_decision = "follow-up (3m)"
        rationale = "Vision reduction requires earlier follow-up"
    else:
        base_decision = "follow-up (12m)"
        rationale = "Mild or no DR, routine follow-up appropriate"

    # NOTE(review): none of the rationales above contains the literal text
    # "edema_prob ≥ 0.70", so only the severe-grade rationale ("... DR grade
    # ...") sets this flag. Confirm that is the intended simulation.
    thresholds_mentioned = "edema_prob ≥ 0.70" in rationale or "DR grade" in rationale
    cites_guideline = "DR" in rationale

    alternatives = ["retina referral (urgent)", "retina referral (2-4 weeks)", "follow-up (6m)", "follow-up (12m)"]

    results = []
    for _ in range(k):
        decision = base_decision

        # Add some variability (simulate LLM inconsistency). The random draw
        # happens on every run, even when the decision cannot be swapped, so
        # the sequence of random numbers consumed is the same for every case.
        if random.random() < 0.2:  # 20% chance of different decision
            # "abstain" and "follow-up (3m)" are not in the list and are
            # therefore never swapped.
            if decision in alternatives:
                decision = random.choice([alt for alt in alternatives if alt != decision])

        results.append({
            "decision": decision,
            "rationale": rationale,
            "thresholds_mentioned": thresholds_mentioned,
            # Fresh list per result so callers can mutate one safely
            "citations": ["AAO PPP: Diabetic Retinopathy"] if cites_guideline else []
        })

    return results

# Run the LLM-only arm: five simulated responses per case, flattened into one
# list of records tagged with the case index, the run index and the arm name.
llm_results = []
for i, case in enumerate(cases):
    case_results = simulate_llm_only(case, k=5)
    for j, result in enumerate(case_results):
        record = dict(case_id=i, run_id=j, arm="LLM-only")
        # Response fields are merged last, matching the original key order
        record.update(result)
        llm_results.append(record)

print(f"Generated {len(llm_results)} LLM-only responses")
