## Imports

In [4]:
import asyncio
import concurrent.futures
import json
import re
from random import Random

from openai import AsyncOpenAI, OpenAI
from tqdm import tqdm

from virtual_lab.agent import Agent
from virtual_lab.constants import CONSISTENT_TEMPERATURE, DEFAULT_FINETUNING_EPOCHS
from virtual_lab.run_meeting import run_meeting
from virtual_lab.utils import (
    async_get_messages,
    compute_finetuning_cost,
    compute_token_cost,
    count_tokens,
    get_pubmed_central_article,
)

from interpretability_constants import (
    background_prompt,
    project_specific_prompt,
    discussions_phase_to_dir,
    model as base_model,
    model_mini as base_model_mini,
    generic_agent,
    computational_linguist,
    clinical_informatics_specialist,
    data_visualization_expert,
)

## Setup

In [5]:
# Constants
finetuning_dir = discussions_phase_to_dir["finetuning"]
papers_dir = finetuning_dir / "papers"
summaries_dir = finetuning_dir / "summaries"
qa_dir = finetuning_dir / "qa_pairs"

for dir_path in [finetuning_dir, papers_dir, summaries_dir, qa_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

client = OpenAI()
async_client = AsyncOpenAI()
num_concurrent = 10
max_tokens = 250000

# Topic to agent mapping
topic_to_agent = {
    # Computational Linguist topics (for BioBERT/ClinicalBERT + SHAP/LIME)
    "BioBERT fine-tuning for clinical concept extraction": computational_linguist,
    "ClinicalBERT performance on phenotype definition tasks": computational_linguist,
    "Interpretability techniques in biomedical NLP": computational_linguist,
    "Token-level attribution with SHAP and LIME in medical LLMs": computational_linguist,
    "Few-shot learning and domain adaptation in clinical NLP": computational_linguist,

    # Clinical Informatics Specialist topics (for SNOMED CT)
    "SNOMED CT integration in phenotyping pipelines": clinical_informatics_specialist,
    "Mapping free-text clinical concepts to standardized vocabularies": clinical_informatics_specialist,
    "Use of terminology servers for phenotype grounding": clinical_informatics_specialist,
    "Semantic drift and disambiguation in EHR concept mapping": clinical_informatics_specialist,
    "Interoperability of phenotyping tools with EHR standards": clinical_informatics_specialist,

    # Data Visualization Expert topics (for Plotly/Dash)
    "Visual analytics for clinical decision support": data_visualization_expert,
    "Dashboard design for LLM interpretability in healthcare": data_visualization_expert,
    "Visualization of token attribution in NLP models": data_visualization_expert,
    "Human-AI interaction through visual explanation tools": data_visualization_expert,
    "Customizing interpretability interfaces for clinician roles": data_visualization_expert,
}

# Agent to topics mapping
agent_to_topics = {agent: [] for agent in topic_to_agent.values()}

for topic, agent in topic_to_agent.items():
    agent_to_topics[agent].append(topic)

%autoawait asyncio

async def run_query(
    semaphore: asyncio.Semaphore, agent: Agent, query: str, model: str = base_model
) -> str:
    """Run a query using the model.

    :param semaphore: Semaphore to limit the number of concurrent requests.
    :param agent: Agent to use for the query.
    :param query: Query to run.
    :param model: Model to use for the query.
    :return: Response from the model.
    """
    async with semaphore:
        assistant = await async_client.beta.assistants.create(
            name=agent.title, instructions=agent.prompt, model=model
        )
        thread = await async_client.beta.threads.create()
        await async_client.beta.threads.messages.create(
            thread_id=thread.id, role="user", content=query
        )
        await async_client.beta.threads.runs.create_and_poll(
            thread_id=thread.id,
            assistant_id=assistant.id,
            model=model,
            temperature=CONSISTENT_TEMPERATURE,
        )
        messages = await async_get_messages(client=async_client, thread_id=thread.id)
        response = messages[-1]["content"][0]["text"]["value"]
    return response

## PubMed Central search queries

Use the agents to generate PubMed Central search queries by topic.

In [14]:
for topic, agent in topic_to_agent.items():
    save_name = f"{topic.replace(' ', '_')}_queries"
    save_path = finetuning_dir / f"{save_name}.json"

    if save_path.exists():
        continue

    try:
        print(f"🟡 Starting meeting {save_name}")
        run_meeting(
            meeting_type="individual",
            team_member=agent,
            agenda=f"""{background_prompt} {project_specific_prompt}
                You are responsible for understanding the topic {topic} in the context of designing an LLM-based interpretability pipeline for electronic phenotype definition.
                You need to fine-tune yourself on the relevant literature on {topic} to improve your ability to contribute effectively to building a transparent, clinically grounded, and visually intuitive interpretability tool.
                Please write out a series of five distinct search queries that you want to run to find relevant scientific papers on {topic}. Include both general queries about {topic} and queries that specifically relate {topic} to LLM interpretability, phenotype definition, clinical applications, and clinician trust.
                Please provide the queries in Python syntax as a list of double-quoted strings.""",
            agenda_questions=(
                f"What are the queries that you want to perform to identify the relevant literature on {topic} (as a list of double-quoted strings in Python syntax)?",
            ),
            save_dir=finetuning_dir,
            save_name=save_name,
            temperature=CONSISTENT_TEMPERATURE,
        )
        print(f"✅ Finished meeting {save_name}")
    except Exception as e:
        print(f"❌ Meeting {save_name} failed with error: {e}")


#extract query from agent responses

# Set up regex pattern for extracting queries
query_list_pattern = re.compile(r'\[\s*(".*?"\s*(,\s*".*?"\s*)*)?,?\s*\]')

topic_to_queries = {}

for topic, agent in topic_to_agent.items():
    # Get query path for topic
    query_path = finetuning_dir / f"{topic.replace(' ', '_')}_queries.json"

    # Load query discussion
    with open(query_path) as f:
        query_discussion = json.load(f)

    # Extract queries
    query_message = query_discussion[-1]["message"]
    pattern_result = query_list_pattern.search(query_message)

    # Check if pattern is matched
    if pattern_result is None:
        print(f"No queries found for {query_path}")
        continue

    # Extract queries
    queries = json.loads(pattern_result.group())
    topic_to_queries[topic] = queries

🟡 Starting meeting BioBERT_fine-tuning_for_clinical_concept_extraction_queries
DEBUGGING: Individual meeting members = [Computational Linguist, Scientific Critic]


Team:   0%|          | 0/2 [00:07<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:07<00:00,  7.56s/it]


Input token count: 370
Output token count: 218
Tool token count: 0
Max token length: 588
Cost: $0.00
Time: 0:09
✅ Finished meeting BioBERT_fine-tuning_for_clinical_concept_extraction_queries
🟡 Starting meeting ClinicalBERT_performance_on_phenotype_definition_tasks_queries
DEBUGGING: Individual meeting members = [Computational Linguist, Scientific Critic]


Team:   0%|          | 0/2 [00:08<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:08<00:00,  8.04s/it]


Input token count: 358
Output token count: 216
Tool token count: 0
Max token length: 574
Cost: $0.00
Time: 0:09
✅ Finished meeting ClinicalBERT_performance_on_phenotype_definition_tasks_queries
🟡 Starting meeting Interpretability_techniques_in_biomedical_NLP_queries
DEBUGGING: Individual meeting members = [Computational Linguist, Scientific Critic]


Team:   0%|          | 0/2 [00:09<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:09<00:00,  9.11s/it]


Input token count: 358
Output token count: 178
Tool token count: 0
Max token length: 536
Cost: $0.00
Time: 0:10
✅ Finished meeting Interpretability_techniques_in_biomedical_NLP_queries
🟡 Starting meeting Token-level_attribution_with_SHAP_and_LIME_in_medical_LLMs_queries
DEBUGGING: Individual meeting members = [Computational Linguist, Scientific Critic]


Team:   0%|          | 0/2 [00:10<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:10<00:00, 10.01s/it]


Input token count: 400
Output token count: 223
Tool token count: 0
Max token length: 623
Cost: $0.00
Time: 0:11
✅ Finished meeting Token-level_attribution_with_SHAP_and_LIME_in_medical_LLMs_queries
🟡 Starting meeting Few-shot_learning_and_domain_adaptation_in_clinical_NLP_queries
DEBUGGING: Individual meeting members = [Computational Linguist, Scientific Critic]


Team:   0%|          | 0/2 [00:06<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:06<00:00,  6.96s/it]


Input token count: 376
Output token count: 204
Tool token count: 0
Max token length: 580
Cost: $0.00
Time: 0:08
✅ Finished meeting Few-shot_learning_and_domain_adaptation_in_clinical_NLP_queries
🟡 Starting meeting SNOMED_CT_integration_in_phenotyping_pipelines_queries
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Team:   0%|          | 0/2 [00:08<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:08<00:00,  8.22s/it]


Input token count: 371
Output token count: 217
Tool token count: 0
Max token length: 588
Cost: $0.00
Time: 0:09
✅ Finished meeting SNOMED_CT_integration_in_phenotyping_pipelines_queries
🟡 Starting meeting Mapping_free-text_clinical_concepts_to_standardized_vocabularies_queries
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Team:   0%|          | 0/2 [00:08<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:08<00:00,  8.21s/it]


Input token count: 377
Output token count: 188
Tool token count: 0
Max token length: 565
Cost: $0.00
Time: 0:09
✅ Finished meeting Mapping_free-text_clinical_concepts_to_standardized_vocabularies_queries
🟡 Starting meeting Use_of_terminology_servers_for_phenotype_grounding_queries
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Team:   0%|          | 0/2 [00:08<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:08<00:00,  8.71s/it]


Input token count: 359
Output token count: 204
Tool token count: 0
Max token length: 563
Cost: $0.00
Time: 0:11
✅ Finished meeting Use_of_terminology_servers_for_phenotype_grounding_queries
🟡 Starting meeting Semantic_drift_and_disambiguation_in_EHR_concept_mapping_queries
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Team:   0%|          | 0/2 [00:07<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:07<00:00,  7.09s/it]


Input token count: 389
Output token count: 197
Tool token count: 0
Max token length: 586
Cost: $0.00
Time: 0:09
✅ Finished meeting Semantic_drift_and_disambiguation_in_EHR_concept_mapping_queries
🟡 Starting meeting Interoperability_of_phenotyping_tools_with_EHR_standards_queries
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Team:   0%|          | 0/2 [00:06<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:06<00:00,  6.97s/it]


Input token count: 383
Output token count: 211
Tool token count: 0
Max token length: 594
Cost: $0.00
Time: 0:08
✅ Finished meeting Interoperability_of_phenotyping_tools_with_EHR_standards_queries
🟡 Starting meeting Visual_analytics_for_clinical_decision_support_queries
DEBUGGING: Individual meeting members = [Data Visualization Expert, Scientific Critic]


Team:   0%|          | 0/2 [00:07<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:07<00:00,  7.04s/it]


Input token count: 351
Output token count: 173
Tool token count: 0
Max token length: 524
Cost: $0.00
Time: 0:08
✅ Finished meeting Visual_analytics_for_clinical_decision_support_queries
🟡 Starting meeting Dashboard_design_for_LLM_interpretability_in_healthcare_queries
DEBUGGING: Individual meeting members = [Data Visualization Expert, Scientific Critic]


Team:   0%|          | 0/2 [00:08<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:08<00:00,  8.25s/it]


Input token count: 369
Output token count: 245
Tool token count: 0
Max token length: 614
Cost: $0.00
Time: 0:09
✅ Finished meeting Dashboard_design_for_LLM_interpretability_in_healthcare_queries
🟡 Starting meeting Visualization_of_token_attribution_in_NLP_models_queries
DEBUGGING: Individual meeting members = [Data Visualization Expert, Scientific Critic]


Team:   0%|          | 0/2 [00:07<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:07<00:00,  7.62s/it]


Input token count: 363
Output token count: 201
Tool token count: 0
Max token length: 564
Cost: $0.00
Time: 0:08
✅ Finished meeting Visualization_of_token_attribution_in_NLP_models_queries
🟡 Starting meeting Human-AI_interaction_through_visual_explanation_tools_queries
DEBUGGING: Individual meeting members = [Data Visualization Expert, Scientific Critic]


Team:   0%|          | 0/2 [00:07<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:07<00:00,  7.96s/it]


Input token count: 363
Output token count: 183
Tool token count: 0
Max token length: 546
Cost: $0.00
Time: 0:08
✅ Finished meeting Human-AI_interaction_through_visual_explanation_tools_queries
🟡 Starting meeting Customizing_interpretability_interfaces_for_clinician_roles_queries
DEBUGGING: Individual meeting members = [Data Visualization Expert, Scientific Critic]


Team:   0%|          | 0/2 [00:08<?, ?it/s]1 [00:00<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:08<00:00,  8.63s/it]

Input token count: 369
Output token count: 209
Tool token count: 0
Max token length: 578
Cost: $0.00
Time: 0:09
✅ Finished meeting Customizing_interpretability_interfaces_for_clinician_roles_queries





## PubMed Central papers

Have the agents find papers on PubMed Central using the search queries (100 papers per query) and evaluate them based on their abstracts.

In [16]:
for topic, agent in topic_to_agent.items():
    for query_num, query in enumerate(topic_to_queries[topic]):
        save_name = f"{topic.replace(' ', '_')}_papers_{query_num + 1}"
        save_path = discussions_phase_to_dir["finetuning"] / f"{save_name}.json"

        if save_path.exists():
            continue

        try:
            print(f"🟡 Starting meeting {save_name}")
            run_meeting(
                meeting_type="individual",
                team_member=agent,
                agenda=f"""{background_prompt} {project_specific_prompt}
                You are responsible for understanding the topic {topic} in the context of designing an LLM-based interpretability pipeline for electronic phenotype definition.
                You need to fine-tune yourself on the relevant literature on {topic} to improve your ability to contribute effectively to building a transparent, clinically grounded, and visually intuitive interpretability tool.
                Please write out a series of five distinct search queries that you want to run to find relevant scientific papers on {topic}. Include both general queries about {topic} and queries that specifically relate {topic} to LLM interpretability, phenotype definition, clinical applications, and clinician trust.
                Please provide the queries in Python syntax as a list of double-quoted strings.""",
                agenda_questions=(
                    "What are the PMCIDs and titles of the papers you wish to fine-tune yourself on (as a Python dictionary mapping PMCID as a double-quoted string to title as double-quoted string)?",
                ),
                save_dir=discussions_phase_to_dir["finetuning"],
                save_name=save_name,
                temperature=CONSISTENT_TEMPERATURE,
                pubmed_search=True,
            )
            print(f"✅ Finished meeting {save_name}")
        except Exception as e:
            print(f"❌ Meeting {save_name} failed with error: {e}")

# Set up regex pattern for extracting queries
pmcid_to_title_pattern = re.compile(
    r'\{\s*(".*?"\s*:\s*".*?"\s*(,\s*".*?"\s*:\s*".*?"\s*)*)?\}'
)

for topic, agent in topic_to_agent.items():
    # Set up title to PMC ID dictionary
    title_to_pmcid = {}
    titles_lower, pmcids = set(), set()
    topic_name = topic.replace(" ", "_")

    # Get all paper paths for a topic
    paper_paths = sorted(finetuning_dir.glob(f"{topic_name}_papers_*.json"))

    # Check if all papers results are present
    if len(paper_paths) != 5:
        print(f"Missing papers for {topic}")
        continue

    # Extract PMC IDs and titles from each papers file
    for paper_path in paper_paths:
        # Load paper discussion
        with open(paper_path) as f:
            paper_discussion = json.load(f)

        # Extract PMC IDs and titles dictionary
        paper_message = paper_discussion[1]["message"]
        pattern_result = pmcid_to_title_pattern.search(paper_message)

        # Check if pattern is matched
        if pattern_result is None:
            print(f"No papers found for {paper_path}")
            continue

        # Extract PMC IDs and titles dictionary
        pmcid_to_title = json.loads(pattern_result.group())

        # Add PMC IDs and titles to dictionary, avoiding duplicates
        for pmcid, title in pmcid_to_title.items():
            # Replace en dash and em dash with a hyphen and convert to lowercase
            title = title.replace("–", "-").replace("—", "-")
            title_lower = title.lower()

            if title_lower not in titles_lower and pmcid not in pmcids:
                title_to_pmcid[title] = pmcid
                titles_lower.add(title_lower)
                pmcids.add(pmcid)

    print(f"Number of papers found for {topic}: {len(title_to_pmcid):,}")

    # Save title to PMC ID dictionary
    with open(finetuning_dir / f"{topic_name}_title_to_pmcid.json", "w") as f:
        json.dump(title_to_pmcid, f, indent=4, sort_keys=True)


🟡 Starting meeting Interpretability_techniques_in_biomedical_NLP_papers_2
DEBUGGING: Individual meeting members = [Computational Linguist, Scientific Critic]


Rounds (+ Final Round):   0%|          | 0/1 [00:00<?, ?it/s]

Searching PubMed Central for 5 articles (abstracts) with query: "Interpretability techniques in biomedical NLP"
Found 5 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "LLM interpretability in electronic phenotype definition"
Found 5 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "Interpretability techniques in clinical NLP applications"
Found 5 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "Building clinician trust with NLP interpretability"
Found 5 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "SNOMED CT integration in NLP interpretability"
Found 5 articles on PubMed Central


Team:   0%|          | 0/2 [00:28<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:28<00:00, 28.37s/it]


Input token count: 363
Output token count: 327
Tool token count: 10,146
Max token length: 690
Cost: $0.03
Time: 0:30
✅ Finished meeting Interpretability_techniques_in_biomedical_NLP_papers_2
🟡 Starting meeting Use_of_terminology_servers_for_phenotype_grounding_papers_2
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Rounds (+ Final Round):   0%|          | 0/1 [00:00<?, ?it/s]

Searching PubMed Central for 5 articles (abstracts) with query: "Use of terminology servers for phenotype grounding"
Found 5 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "Terminology servers and LLM interpretability in phenotype definition"
Found 1 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "Phenotype grounding using SNOMED CT in clinical applications"
Found 5 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "Building clinician trust through visual interpretability tools in phenotype definitions"
Found 3 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: "Machine learning and terminology servers for electronic phenotype definitions"
Found 5 articles on PubMed Central


Team:   0%|          | 0/2 [00:44<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:44<00:00, 44.83s/it]


Input token count: 912
Output token count: 688
Tool token count: 5,791
Max token length: 1,052
Cost: $0.02
Time: 0:46
✅ Finished meeting Use_of_terminology_servers_for_phenotype_grounding_papers_2
🟡 Starting meeting Use_of_terminology_servers_for_phenotype_grounding_papers_4
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Rounds (+ Final Round):   0%|          | 0/1 [00:00<?, ?it/s]

Searching PubMed Central for 5 articles (abstracts) with query: ""terminology servers" AND "phenotype grounding""
Found 0 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: ""terminology servers" AND "LLM interpretability" AND "phenotype definition""
Found 0 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: ""terminology servers" AND "clinical applications" AND "phenotype grounding""
Found 0 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: ""terminology servers" AND "clinician trust" AND "phenotype grounding""


Team:   0%|          | 0/2 [00:06<?, ?it/s]
Rounds (+ Final Round):   0%|          | 0/1 [00:06<?, ?it/s]


❌ Meeting Use_of_terminology_servers_for_phenotype_grounding_papers_4 failed with error: 429 Client Error: Too Many Requests for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=%22terminology+servers%22+AND+%22clinician+trust%22+AND+%22phenotype+grounding%22&retmax=10&retmode=json&sort=relevance
🟡 Starting meeting Semantic_drift_and_disambiguation_in_EHR_concept_mapping_papers_5
DEBUGGING: Individual meeting members = [Clinical Informatics Specialist, Scientific Critic]


Rounds (+ Final Round):   0%|          | 0/1 [00:00<?, ?it/s]

Searching PubMed Central for 2 articles (abstracts) with query: "Semantic drift in EHR concept mapping"
Found 2 articles on PubMed Central
Searching PubMed Central for 2 articles (abstracts) with query: "Disambiguation in electronic health records"
Found 2 articles on PubMed Central
Searching PubMed Central for 2 articles (abstracts) with query: "Semantic drift and LLM interpretability in phenotype definition"
Found 2 articles on PubMed Central
Searching PubMed Central for 2 articles (abstracts) with query: "EHR concept mapping and clinical applications"
Found 2 articles on PubMed Central
Searching PubMed Central for 2 articles (abstracts) with query: "Building clinician trust in LLM-based phenotype tools"
Found 1 articles on PubMed Central


Team:   0%|          | 0/2 [00:25<?, ?it/s]
Rounds (+ Final Round): 100%|██████████| 1/1 [00:25<00:00, 25.99s/it]


Input token count: 962
Output token count: 506
Tool token count: 3,646
Max token length: 895
Cost: $0.02
Time: 0:27
✅ Finished meeting Semantic_drift_and_disambiguation_in_EHR_concept_mapping_papers_5
🟡 Starting meeting Visual_analytics_for_clinical_decision_support_papers_4
DEBUGGING: Individual meeting members = [Data Visualization Expert, Scientific Critic]


Rounds (+ Final Round):   0%|          | 0/1 [00:00<?, ?it/s]

Searching PubMed Central for 5 articles (abstracts) with query: ""Visual analytics for clinical decision support""
Found 0 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: ""Visual analytics for clinical decision support" AND "LLM interpretability""
Found 0 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: ""Visual analytics for clinical decision support" AND "phenotype definition""
Found 0 articles on PubMed Central
Searching PubMed Central for 5 articles (abstracts) with query: ""Visual analytics for clinical decision support" AND "clinical applications""


Team:   0%|          | 0/2 [00:06<?, ?it/s]
Rounds (+ Final Round):   0%|          | 0/1 [00:06<?, ?it/s]

❌ Meeting Visual_analytics_for_clinical_decision_support_papers_4 failed with error: 429 Client Error: Too Many Requests for url: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=%22Visual+analytics+for+clinical+decision+support%22+AND+%22clinical+applications%22&retmax=10&retmode=json&sort=relevance
No papers found for discussions/finetuning/BioBERT_fine-tuning_for_clinical_concept_extraction_papers_4.json
Number of papers found for BioBERT fine-tuning for clinical concept extraction: 24
No papers found for discussions/finetuning/ClinicalBERT_performance_on_phenotype_definition_tasks_papers_1.json
No papers found for discussions/finetuning/ClinicalBERT_performance_on_phenotype_definition_tasks_papers_2.json
No papers found for discussions/finetuning/ClinicalBERT_performance_on_phenotype_definition_tasks_papers_3.json
No papers found for discussions/finetuning/ClinicalBERT_performance_on_phenotype_definition_tasks_papers_4.json
No papers found for discussions/fine




In [6]:
import json
from pathlib import Path
from tqdm import tqdm

# Assume these directories are already defined
# finetuning_dir = Path("...")  # directory where _title_to_pmcid.json files live
# papers_dir = Path("...")      # directory where downloaded papers should be saved

pmcids = set()

# Load all PMCIDs from available topic response files
for topic in topic_to_agent:
    topic_name = topic.replace(" ", "_")
    file_path = finetuning_dir / f"{topic_name}_title_to_pmcid.json"

    if not file_path.exists():
        print(f"⚠️ Skipping: {file_path} does not exist.")
        continue

    try:
        with open(file_path) as f:
            title_to_pmcid = json.load(f)
            pmcids.update(title_to_pmcid.values())
    except Exception as e:
        print(f"❌ Error reading {file_path}: {e}")
        continue

print(f"📄 Number of unique PMCIDs: {len(pmcids):,}")

# Download the papers
paper_count = 0

for pmcid in tqdm(sorted(pmcids), desc="Downloading papers"):
    try:
        title, content = get_pubmed_central_article(pmcid=pmcid)
        if not title:
            print(f"⚠️ Skipping PMCID {pmcid}: title is None or empty.")
            continue

        with open(papers_dir / f"{pmcid}.json", "w") as f:
            json.dump({"title": title, "content": content}, f, indent=4, sort_keys=True)

        paper_count += 1

    except Exception as e:
        print(f"❌ Failed to download or save PMCID {pmcid}: {e}")
        continue

print(f"✅ Number of papers downloaded: {paper_count:,}")


⚠️ Skipping: discussions/finetuning/Use_of_terminology_servers_for_phenotype_grounding_title_to_pmcid.json does not exist.
⚠️ Skipping: discussions/finetuning/Visual_analytics_for_clinical_decision_support_title_to_pmcid.json does not exist.
📄 Number of unique PMCIDs: 147


Downloading papers: 100%|██████████| 147/147 [01:12<00:00,  2.02it/s]

✅ Number of papers downloaded: 147





## Summarize papers

Define a function to using an agent to summarize a paper.

In [None]:
async def summarize_paper(
    semaphore: asyncio.Semaphore,
    agent: Agent,
    topic: str,
    pmcid: str,
    title: str,
    content: list[str],
) -> tuple[str, str, str]:
    """Summarize a paper using the model.

    :param semaphore: Semaphore to limit the number of concurrent requests.
    :param agent: Agent to use for summarization.
    :param topic: Topic of interest.
    :param pmcid: PMC ID of the paper.
    :param title: Title of the paper.
    :param content: Content of the paper.
    :return: Tuple of PMC ID, title, and summary of the paper.
    """
    # Set up query with paper
    query = "\n\n".join(
    [
        f'Please summarize in extreme detail the following paper titled "{title}". Focus especially on summarizing key insights about the topic "{topic}" as it relates to building an LLM-based interpretability pipeline for electronic phenotype definition.'
    ]
    + content
    )

    # Run query to get summary
    summary = await run_query(
        semaphore=semaphore,
        agent=agent,
        query=query,
    )

    return pmcid, title, summary

#Use the agents to summarize each paper in parallel.
for topic, agent in topic_to_agent.items():
    topic_name = topic.replace(" ", "_")

    # Create save directory
    topic_summary_dir = summaries_dir / topic_name
    topic_summary_dir.mkdir(parents=True, exist_ok=True)

    # Load title to PMC ID dictionary
    with open(finetuning_dir / f"{topic_name}_title_to_pmcid.json") as f:
        title_to_pmcid: dict[str, str] = json.load(f)

    # Get unique PMC IDs
    pmcids = sorted(set((title_to_pmcid.values())))

    # Load papers
    pmcid_to_paper = {}
    for pmcid in pmcids:
        paper_path = papers_dir / f"{pmcid}.json"

        if paper_path.exists():
            with open(paper_path) as f:
                paper: dict[str, str | list[str]] = json.load(f)
                pmcid_to_paper[pmcid] = paper

    print(f"Number of papers loaded for {topic}: {len(pmcid_to_paper):,}")

    # Limit papers by length
    pmcid_to_paper = {
        pmcid: paper
        for pmcid, paper in pmcid_to_paper.items()
        if sum(count_tokens(paragraph) for paragraph in paper["content"]) <= max_tokens
    }

    print(
        f"Number of papers after token limit of {max_tokens:,} for {topic}: {len(pmcid_to_paper):,}"
    )

    # Compute input token cost
    input_token_count = sum(
        count_tokens(paragraph)
        for paper in pmcid_to_paper.values()
        for paragraph in [paper["title"]] + paper["content"]
    )

    input_token_cost = compute_token_cost(
        model=base_model,
        input_token_count=input_token_count,
        output_token_count=0,
    )

    print(f"Approximate input token cost for {topic} = ${input_token_cost:.2f}")

    # Set up semaphore with the number of concurrent requests
    semaphore = asyncio.Semaphore(num_concurrent)

    # Create tasks for each paper
    tasks = [
        asyncio.create_task(
            summarize_paper(
                semaphore=semaphore,
                agent=agent,
                topic=topic,
                pmcid=pmcid,
                title=paper["title"],
                content=paper["content"],
            )
        )
        for pmcid, paper in pmcid_to_paper.items()
    ]

    # Run agent summary for each paper
    results = [
        (await task) for task in tqdm(asyncio.as_completed(tasks), total=len(tasks))
    ]

    # Save summaries
    for pmcid, title, summary in results:
        with open(topic_summary_dir / f"{pmcid}.json", "w") as f:
            json.dump(
                {"pmcid": pmcid, "title": title, "summary": summary},
                f,
                indent=4,
                sort_keys=True,
            )

#Convert the summaries to the format required for fine-tuning, aggregated by topic.
for topic, agent in topic_to_agent.items():
    # Convert summaries to training data format
    training_data = []

    topic_name = topic.replace(" ", "_")

    # Get summary paths
    topic_summary_dir = summaries_dir / topic_name
    summary_paths = sorted(topic_summary_dir.glob("*.json"))

    for summary_path in summary_paths:
        # Load paper summary data
        with open(summary_path) as f:
            summary_data = json.load(f)

        # Extract title and summary
        title, summary = summary_data["title"], summary_data["summary"]

        # Add example to training data
        training_data.append(
        {
            "messages": [
                {"role": "system", "content": agent.prompt},
                {
                    "role": "user",
                    "content": f'Please tell me about the paper "{title}" and its insights into "{topic}" in relation to designing an LLM-based interpretability pipeline for electronic phenotype definition.',
                },
                {"role": "assistant", "content": summary},
            ]
        }
        )

    # Shuffle training data
    random = Random(0)
    random.shuffle(training_data)

    # Count tokens
    token_count = sum(
        count_tokens(message["content"])
        for data in training_data
        for message in data["messages"]
    )

    # Determine finetuning cost
    finetuning_cost = compute_finetuning_cost(
        model=base_model_mini,
        token_count=token_count,
        num_epochs=DEFAULT_FINETUNING_EPOCHS,
    )

    # Print stats
    print(f"Number of paper examples for {topic}: {len(training_data):,}")
    print(f"Token count for {topic}: {token_count:,}")
    print(f"Finetuning cost for {topic} using {base_model_mini}: ${finetuning_cost:.2f}")
    print()

    # Save training data in jsonl format
    with open(
        finetuning_dir / f"{topic.replace(' ', '_')}_training_data.jsonl", "w"
    ) as f:
        f.write("\n".join(json.dumps(example) for example in training_data))

#Convert the summaries to the format required for fine-tuning, aggregated by agent.
for agent, topics in agent_to_topics.items():
    # Convert summaries to training data format
    training_data = []

    for topic in topics:
        topic_name = topic.replace(" ", "_")

        # Get summary paths
        topic_summary_dir = summaries_dir / topic_name
        summary_paths = sorted(topic_summary_dir.glob("*.json"))

        for summary_path in summary_paths:
            # Load paper summary data
            with open(summary_path) as f:
                summary_data = json.load(f)

            # Extract title and summary
            title, summary = summary_data["title"], summary_data["summary"]

            # Add example to training data
            training_data.append(
                {
                    "messages": [
                        {"role": "system", "content": agent.prompt},
                        {
                            "role": "user",
                            "content": f'Please tell me about the paper "{title}" and its insights into "{topic}" in relation to designing SARS-CoV-2 nanobody binders.',
                        },
                        {"role": "assistant", "content": summary},
                    ]
                }
            )

    # Shuffle training data
    random = Random(0)
    random.shuffle(training_data)

    # Count tokens
    token_count = sum(
        count_tokens(message["content"])
        for data in training_data
        for message in data["messages"]
    )

    # Determine finetuning cost
    finetuning_cost = compute_finetuning_cost(
        model=base_model_mini,
        token_count=token_count,
        num_epochs=DEFAULT_FINETUNING_EPOCHS,
    )

    # Print stats
    print(f"Number of paper examples for {agent.title}: {len(training_data):,}")
    print(f"Token count for {agent.title}: {token_count:,}")
    print(f"Finetuning cost for {agent.title} using {base_model_mini}: ${finetuning_cost:.2f}")
    print()

    # Save training data in jsonl format
    with open(
        finetuning_dir / f"{agent.title.replace(' ', '_')}_training_data.jsonl", "w"
    ) as f:
        f.write("\n".join(json.dumps(example) for example in training_data))

#Convert the summaries to the format required for fine-tuning, aggregated across all topics.
# Convert summaries to training data format
training_data = []

for topic in topic_to_agent:
    topic_name = topic.replace(" ", "_")

    # Get summary paths
    topic_summary_dir = summaries_dir / topic_name
    summary_paths = sorted(topic_summary_dir.glob("*.json"))

    for summary_path in summary_paths:
        # Load paper summary data
        with open(summary_path) as f:
            summary_data = json.load(f)

        # Extract title and summary
        title, summary = summary_data["title"], summary_data["summary"]

        # Add example to training data
        training_data.append(
            {
                "messages": [
                    {"role": "system", "content": generic_agent.prompt},
                    {
                        "role": "user",
                        "content": f'Please tell me about the paper "{title}" and its insights into "{topic}" in relation to designing SARS-CoV-2 nanobody binders.',
                    },
                    {"role": "assistant", "content": summary},
                ]
            }
        )

# Shuffle training data
random = Random(0)
random.shuffle(training_data)

# Count tokens
token_count = sum(
    count_tokens(message["content"])
    for data in training_data
    for message in data["messages"]
)

# Determine finetuning cost
finetuning_cost = compute_finetuning_cost(
    model=base_model_mini,
    token_count=token_count,
    num_epochs=DEFAULT_FINETUNING_EPOCHS,
)

# Print stats
print(f"Number of paper examples: {len(training_data):,}")
print(f"Token count: {token_count:,}")
print(f"Finetuning cost using {base_model_mini}: ${finetuning_cost:.2f}")
print()

# Save training data in jsonl format
with open(
    finetuning_dir / "all_generic_training_data.jsonl", "w"
) as f:
    f.write("\n".join(json.dumps(example) for example in training_data))