In [1]:
import asyncio
import csv
import os
import random
import zipfile
from typing import Literal

import dotenv
import instructor
import openai
from pydantic import BaseModel, Field

In [2]:
# Load settings via python-dotenv (presumably from a local .env file) before reading the key below.
dotenv.load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is unset, so a missing key is not
# reported by this cell.
openai.api_key = os.getenv("OPENAI_API_KEY")

## Setting up for Extraction

### Properties to extract

- What is the study design?
- What hypotheses were being tested?
- Methods for identifying follow up recommendations (including by a human and/or AI)
- Methods for communicating follow up recommendations
- Methods for facilitating ordering / scheduling / completion of recommended follow up?
- Methods for assessing if follow up recommendations are followed
- Methods for assessing outcomes for patients for whom follow up was recommended
- Assessment of recommendation format on follow up being performed

Exclusion criteria
- Only a guideline
- Enumeration of follow up recommendation / incidental findings (incidence / prevalence)
- Opinion only (editorial that does not describe a system)

In [3]:
# Structured screening result for one article, used as `response_model` in the
# LLM calls in this notebook.
#
# Every active flag is tri-state: True, False, or the literal string 'unknown'.
# The Field descriptions are runtime text, so edit them with the same care as
# the prompt itself.
#
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose -- pydantic publishes a model's docstring as the schema description,
# which would presumably change what the model is sent. Confirm before adding one.
class ArticleInfo(BaseModel):
    # Identification fields, filled in by the model from the article text.
    pmid: str = Field(description="PMID of the article")
    title: str = Field(description="Title of the article")
    # Screening flags; each corresponds to one question in PROMPT.
    about_imaging_followup_recommendations : bool | Literal['unknown'] = Field(description="Does the article discuss imaging follow-up recommendations? True or False; if you can't tell from the information, say 'unknown'")
    has_abstract: bool | Literal['unknown'] = Field(description="Does the article have an abstract? True or False; if you can't tell from the information, say 'unknown'")
    about_breast_imaging_followup: bool | Literal['unknown'] = Field(description="Does the article discuss breast imaging follow-up recommendations? True or False; if you can't tell from the information, say 'unknown'")
    followup_guideline: bool | Literal['unknown'] = Field(description="Does the article discuss a follow-up guideline? True or False; if you can't tell from the information, say 'unknown'")
    management_guideline: bool | Literal['unknown'] = Field(description="Does the article discuss a management guideline? True or False; if you can't tell from the information, say 'unknown'")
    followup_detection: bool | Literal['unknown'] = Field(description="Does the article discuss follow-up detection? True or False; if you can't tell from the information, say 'unknown'")
    system_or_program: bool | Literal['unknown'] = Field(description="Does the article discuss a system or program to improve follow-up? True or False; if you can't tell from the information, say 'unknown'")
    # Disabled fields, kept for reference; they are not part of the current schema.
    # followup_description: Literal["guideline", "measurement", "detection", "system or program"] | None = Field(default=None, description="If the article discusses follow-up, what is its focus? 'guideline': describes appropriate imaging follow-up; 'measurement': measures how often follow-up is obtained; 'detection': methods for detecting whether follow-up has been obtained; 'system or program': describes a system or program to improve follow-up for a particular imaging finding. If you can't tell from the information or the article isn't about imaging follow-up, leave it blank.")
    # study_design: str | Literal['unknown'] = Field(description="Study design of the article; if you can't tell from the information, say 'unknown'")
    # hypothesis: str  | Literal['unknown'] = Field(description="Hypothesis of the article; if you can't tell from the information, say 'unknown'")
    # identifying_recommendations_method: str | Literal['unknown'] = Field(description="How did the study identify recommendations; if you can't tell from the information, say 'unknown'")
    # communication_recommendations_method: str | Literal['unknown'] = Field(description="How did the program described communicate recommendations; if you can't tell from the information, say 'unknown'")
    # incidence_only: bool | Literal['unknown']= Field(description="Is the article only measuring the incidence of a particular recommendation? True or False; if you can't tell from the information, say 'unknown'")
    # guideline_only: bool | Literal['unknown'] = Field(description="Is the article only describing a guideline? True or False; if you can't tell from the information, say 'unknown'")
    # opinion_only: bool | Literal['unknown'] = Field(description="Is the article only an opinion piece or editorial? True or False; if you can't tell from the information, say 'unknown'")
    # review_only: bool | Literal['unknown'] = Field(description="Is the article only a review? True or False; if you can't tell from the information, say 'unknown'")


In [4]:
# Maps an identifier derived from each archive member name to that member's
# decoded UTF-8 text.
articles: dict[str, str] = {}

# Open the archive in a `with` block so the file handle is released once every
# member has been read (previously the ZipFile was never closed).
with zipfile.ZipFile("data/entries_with_abstracts.zip", "r") as articles_zip:
    for filename in articles_zip.namelist():
        # Drop the last four characters of the member name to get the key.
        # NOTE(review): assumes names look like "<pmid>.txt" -- confirm.
        pmid = filename[0:-4]
        with articles_zip.open(filename) as f:
            articles[pmid] = f.read().decode("utf-8")

In [5]:
# Snapshot of the keys of `articles`, taken once so sampling does not rebuild the list.
pmids = list(articles.keys())


def get_random_article() -> str:
    """Return the text of one article chosen uniformly at random.

    Returns:
        The value stored in `articles` for a randomly selected key from `pmids`.

    Raises:
        IndexError: if `pmids` is empty.
    """
    # random.choice replaces the manual randint(0, len - 1) index arithmetic.
    return articles[random.choice(pmids)]


In [6]:
# Async OpenAI client wrapped by instructor; the calls in this notebook pass
# `response_model=ArticleInfo` to it.
llm = instructor.from_openai(openai.AsyncOpenAI())

In [7]:
# Screening questions sent to the model ahead of the article text.
# The follow-up-detection question previously read "how do to identify a a
# follow up recommendation"; the wording is repaired here because this text is
# the instruction the model acts on.
PROMPT = """You are a medical expert. You are given information about an article in a medical journal. 
Based on the information given, please answer the following questions: 
- Is the study about imaging follow-up recommendations, including the identification, communication, tracking, management, and outcomes of these recommendations? Note that this includes follow-up for incidental findings in radiology or other "clinically significant" findings/results in imaging exams. 
- Does the article have an abstract? 
- Is the article primarily about follow-up in breast imaging, such as mammography, breast ultrasound, or breast MRI? 
- If the article is about imaging follow-up, please answer the following questions: 
  - *follow-up guideline*: Does the article describe what the appropriate follow-up is for a particular imaging finding? 
  - *management guideline*: Does the article describe what the appropriate follow-up is for a particular diagnosis or following a particular therapy? 
  - *follow-up detection*: Does the article describe how to identify a follow-up recommendation in a radiology report? 
  - *system or program*: Does the article describe a system or program to improve adherence to follow-up recommendations from radiology reports?
"""

# Parked questions: this bare string literal is not assigned to any name and is
# not part of the prompt built by generate_prompt. It lines up with the
# commented-out ArticleInfo fields.
"""
- What is the study design of the article? (e.g., cohort study, case-control study, randomized controlled trial, etc.)
- What is the hypothesis of the article?
- How did the study identify recommendations?
- How did the program described communicate recommendations?
- Is the article only measuring the incidence of a particular recommendation? True or False
- Is the article only describing a guideline? True or False
- Is the article only an opinion piece or editorial? True or False
- Is the article only a review? True or False
- Is the article only measuring the incidence of a particular recommendation? True or False
- Is the article only describing a guideline? True or False
"""

# Closing instruction placed after the article text in the final prompt.
INSTRUCTIONS = """Answer the questions based ONLY on the given information. If you can't tell from the information, say 'unknown'."""

In [8]:
def generate_prompt(info: str) -> str:
    """Assemble the complete prompt for a single article.

    The pieces, separated by blank lines, are: the question list (PROMPT), an
    "Article Information:" heading, the article text, the closing
    INSTRUCTIONS, and a trailing "Answer:" cue.

    Args:
        info: Text describing the article.

    Returns:
        The prompt string to send as the user message.
    """
    sections = (
        PROMPT,
        "Article Information:",
        f"{info}",
        INSTRUCTIONS,
        "Answer:",
    )
    return "\n\n".join(sections)

async def extract_study_info(info: str, semaphore: asyncio.Semaphore) -> ArticleInfo:
    """Classify one article with the LLM and return the structured answer.

    Args:
        info: Text describing the article; it is embedded in the prompt.
        semaphore: Held for the duration of the request, so callers can bound
            how many requests run at once by sharing one semaphore.

    Returns:
        The response from the instructor-wrapped client, requested with
        `response_model=ArticleInfo`.
    """
    async with semaphore:
        user_message = {"role": "user", "content": generate_prompt(info)}
        return await llm.chat.completions.create(
            model="o4-mini",
            messages=[user_message],
            response_model=ArticleInfo,
        )


In [16]:
# Smoke test: classify a single randomly chosen article; Semaphore(1) is only
# there to satisfy the helper's signature.
result = await extract_study_info(get_random_article(), asyncio.Semaphore(1))

In [17]:
# Show the structured answer from the smoke test as indented JSON.
print(result.model_dump_json(indent=2))

{
  "pmid": "33810834",
  "title": "Strategies to optimize management of incidental radiographic findings in the primary care setting: A mixed methods study.",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true,
  "about_breast_imaging_followup": false,
  "followup_guideline": false,
  "management_guideline": false,
  "followup_detection": false,
  "system_or_program": true
}


In [18]:
async def process_articles(infos: list[str], max_concurrency: int = 20) -> list[ArticleInfo]:
    """Classify a batch of articles concurrently.

    Args:
        infos: Article texts to classify.
        max_concurrency: Upper bound on requests in flight at the same time.
            Defaults to 20, the value that was previously hard-coded.

    Returns:
        One result per input, in the same order as `infos`.

    Raises:
        ValueError: if `max_concurrency` is less than 1 (a semaphore created
            with 0 permits would make every request wait forever).
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")
    # A single semaphore is shared by every task so the limit applies batch-wide.
    semaphore = asyncio.Semaphore(max_concurrency)
    tasks = [extract_study_info(info, semaphore) for info in infos]
    return await asyncio.gather(*tasks)

In [19]:
all_articles = list(articles.values())
all_infos: list[ArticleInfo] = []
for start_index in range(0, len(all_articles), 200):
    end_index = min(start_index + 200, len(all_articles))
    print(f"Processing articles {start_index} to {end_index}...", end=' ')
    batch = all_articles[start_index:end_index]
    results = await process_articles(batch)
    all_infos.extend(results)
    print("done.")

Processing articles 0 to 200... done.
Processing articles 200 to 400... done.
Processing articles 400 to 600... done.
Processing articles 600 to 800... done.
Processing articles 800 to 1000... done.
Processing articles 1000 to 1200... done.
Processing articles 1200 to 1400... done.
Processing articles 1400 to 1600... done.
Processing articles 1600 to 1800... done.
Processing articles 1800 to 2000... done.
Processing articles 2000 to 2117... done.


In [21]:
# Spot-check: print the first few results as indented JSON.
preview_count = 5
for info in all_infos[:preview_count]:
    print(info.model_dump_json(indent=2))

{
  "pmid": "100459",
  "title": "Follow-up investigation of reconstruction of the alveolar process in the atrophic mandible",
  "about_imaging_followup_recommendations": false,
  "has_abstract": "unknown",
  "about_breast_imaging_followup": false,
  "followup_guideline": false,
  "management_guideline": false,
  "followup_detection": false,
  "system_or_program": false
}
{
  "pmid": "10063862",
  "title": "Methods of compliance with Mammography Quality Standards Act regulations for tracking positive mammograms: survey results",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true,
  "about_breast_imaging_followup": true,
  "followup_guideline": false,
  "management_guideline": false,
  "followup_detection": false,
  "system_or_program": true
}
{
  "pmid": "10071638",
  "title": "Rights and responsibilities of the radiologist in determining indications and applying radiologic procedures",
  "about_imaging_followup_recommendations": false,
  "has_abstract": "unknown"

In [20]:
# Export every classification result to CSV, one row per article.
# The header is taken from the model class rather than from all_infos[0], so
# the export still produces a header-only file when all_infos is empty
# instead of raising IndexError.
fieldnames = list(ArticleInfo.model_fields)

with open("data/entries_with_abstracts_all.csv", "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(info.model_dump() for info in all_infos)

In [15]:
# PMIDs to re-run individually, one per line.
# NOTE(review): presumably a hand-curated list of relevant articles (the
# results are later written to data/likely_entries.csv) -- confirm provenance.
s="""38147905
36922265
36650302
35752517
33607066
29728325
24294680
37879972
37302681
29305076
28434846
36063362
30779667
36564264
36922106
23548405
32220539
30860895
22195225
31452006
37073901
27832518
18647895
19703870
20884911
29179912
34076452
36287625
37236842
31206047
28742377
36759382
37820835
36521629
36411090
36922107
36792996
37000450
35867062
35773813
35788428
35186516
32827469
34374592
32857982
33478839
33278340
33984286
32294771
20308458"""
# Split the block above into a list of PMID strings.
good_pmids = s.splitlines()

In [None]:
good_infos: list[ArticleInfo] = []
for pmid in good_pmids:
    if pmid not in articles:
        print(f"PMID {pmid} not found in articles.")
        continue
    response = await llm.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": generate_prompt(pmid),
            }
        ],
        model="o4-mini",
        response_model=ArticleInfo,
    )

    print(response.model_dump_json(indent=2))
    good_infos.append(response)

{
  "pmid": "38147905",
  "title": "Financial Impact of a Radiology Safety Net Program for Resolution of Clinically Necessary Follow-up Imaging Recommendations",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "36922265",
  "title": "White Paper: Best Practices in the Communication and Management of Actionable Incidental Findings in Emergency Department Imaging.",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "36650302",
  "title": "Implementing a Streamlined Radiology Workflow to Close the Loop on Incidental Imaging Findings in the Emergency Department",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "35752517",
  "title": "Catching Those Who Fall Through the Cracks: Integrating a Follow-Up Process for Emergency Department Patients with Incidental Radiologic Findings",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "33607066",


In [17]:
# Export the results for the good_pmids subset to CSV, one row per article.
# The header is taken from the model class rather than from good_infos[0], so
# an empty good_infos yields a header-only file instead of raising IndexError.
with open("data/likely_entries.csv", "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(ArticleInfo.model_fields))
    writer.writeheader()
    writer.writerows(info.model_dump() for info in good_infos)

In [None]:
recheck_pmids = ["24294680", "19703870", "35788428", "34374592"]
for pmid in recheck_pmids:
    response = await llm.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": generate_prompt(pmid),
            }
        ],
        model="o4-mini",
        response_model=ArticleInfo,
    )

    print(response.model_dump_json(indent=2))

{
  "pmid": "24294680",
  "title": "An initiative to improve the management of clinically significant test results in a large health care network",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "19703870",
  "title": "Important imaging finding e-mail alert system: experience after 3 years of implementation.",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "35788428",
  "title": "Automated Notification of Relevant Expected or Incidental Findings in Imaging Exams in a Verticalized Healthcare System",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "34374592",
  "title": "Electronic Health Record Closed-Loop Communication Program for Unexpected Nonemergent Findings",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
