In [1]:
import csv
import os
from typing import Literal
import zipfile

import dotenv
import instructor
import openai
from pydantic import BaseModel, Field

In [2]:
# Let python-dotenv populate the environment first, then hand the key to the
# openai module. The value is None when OPENAI_API_KEY is not set.
dotenv.load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

## Setting up for Extraction

### Properties to extract

- What is the study design?
- What hypotheses were being tested?
- Methods for identifying follow up recommendations (including by a human and/or AI)
- Methods for communicating follow up recommendations
- Methods for facilitating ordering / scheduling / completion of recommended follow up?
- Methods for assessing if follow up recommendations are followed
- Methods for assessing outcomes for patients for whom follow up was recommended
- Assessment of recommendation format on follow up being performed

Exclusion criteria
- Only a guideline
- Enumeration of follow up recommendation / incidental findings (incidence / prevalence)
- Opinion only (editorial that does not describe a system)

In [3]:
# Schema the LLM answer is parsed into; it is passed as `response_model` to the
# instructor client in the extraction cells.
# NOTE(review): documented with `#` comments on purpose. Pydantic is understood
# to surface a class docstring as the schema description, which would change
# what the model is shown -- confirm before converting this into a docstring.
class ArticleInfo(BaseModel):
    # Identification of the article, as reported back by the model.
    pmid: str = Field(
        description="PMID of the article",
    )
    title: str = Field(
        description="Title of the article",
    )

    # Screening answers: True / False, or the literal string "unknown".
    about_imaging_followup_recommendations: bool | Literal["unknown"] = Field(
        description="Does the article discuss imaging follow-up recommendations? True or False; if you can't tell from the information, say 'unknown'",
    )
    has_abstract: bool | Literal["unknown"] = Field(
        description="Does the article have an abstract? True or False; if you can't tell from the information, say 'unknown'",
    )

    # Further extraction fields, currently disabled:
    # study_design: str | Literal['unknown'] = Field(description="Study design of the article; if you can't tell from the information, say 'unknown'")
    # hypothesis: str  | Literal['unknown'] = Field(description="Hypothesis of the article; if you can't tell from the information, say 'unknown'")
    # identifying_recommendations_method: str | Literal['unknown'] = Field(description="How did the study identify recommendations; if you can't tell from the information, say 'unknown'")
    # communication_recommendations_method: str | Literal['unknown'] = Field(description="How did the program described communicate recommendations; if you can't tell from the information, say 'unknown'")
    # incidence_only: bool | Literal['unknown']= Field(description="Is the article only measuring the incidence of a particular recommendation? True or False; if you can't tell from the information, say 'unknown'")
    # guideline_only: bool | Literal['unknown'] = Field(description="Is the article only describing a guideline? True or False; if you can't tell from the information, say 'unknown'")
    # opinion_only: bool | Literal['unknown'] = Field(description="Is the article only an opinion piece or editorial? True or False; if you can't tell from the information, say 'unknown'")
    # review_only: bool | Literal['unknown'] = Field(description="Is the article only a review? True or False; if you can't tell from the information, say 'unknown'")


In [4]:
# Read every entry of the archive into memory as UTF-8 text, keyed by the
# member name without its extension (presumably "<pmid>.txt" -- the keys
# printed by the preview cell are bare PMIDs).
articles = {}

# The context manager closes the archive handle even when a member fails to
# read or decode; the original left the ZipFile open for the kernel's lifetime.
with zipfile.ZipFile("data/entries_with_abstracts.zip", "r") as articles_zip:
    for filename in articles_zip.namelist():
        if filename.endswith("/"):
            # Directory entries carry no article text.
            continue
        # splitext drops whatever extension is present instead of assuming it
        # is exactly four characters long.
        pmid = os.path.splitext(filename)[0]
        with articles_zip.open(filename) as f:
            articles[pmid] = f.read().decode("utf-8")

In [5]:
# Sanity check: show the first five keys and the raw text stored for the first.
pmids = list(articles)[:5]
print(pmids)
print(articles[pmids[0]])

['100459', '10063862', '10071638', '10073588', '10077038']
456. Int J Oral Surg. 1978 Aug;7(4):400-4. doi: 10.1016/s0300-9785(78)80115-x.

Follow-up investigation of reconstruction of the alveolar process in the 
atrophic mandible.

Fazili M, von Overvest-Eerdmans GR, Vernooy AM, Visser WJ, von Waas MA.

In this article the results are presented of the reconstruction of the alveolar 
process with iliac crest bone grafts in 14 patients. In all cases in a second 
operation a vestibuloplasty and a floor-of-mouth plasty was done. After an 
average follow-up period of 39 months, almost complete resorption of the bone 
grafts was observed. The clinical results varied. To obtain better results and 
avoid complications like pain at the donor and graft sites, mental nerve 
disturbances and other major problems associated with bone grafting, we prefer 
to perform the visor osteotomy in the future.

DOI: 10.1016/s0300-9785(78)80115-x
PMID: 100459 [Indexed for MEDLINE]


In [9]:
# Wrap a default OpenAI client with instructor; the extraction cells call
# llm.chat.completions.create(..., response_model=ArticleInfo) on the result.
_openai_client = openai.OpenAI()
llm = instructor.from_openai(_openai_client)

In [18]:
# Task description placed at the top of every request. generate_prompt sends
# the whole stored entry (citation, title, authors, abstract), so the wording
# refers to the entry rather than to the title alone -- a title-only framing
# makes the "has an abstract?" question unanswerable.
PROMPT = """You are a medical expert. You are given the PubMed entry (citation, title, authors, and the abstract when one is available) of a medical article about tracking follow-up recommendations in radiology.
Based on this entry, you need to answer the following questions:
- Is the study about imaging follow-up recommendations? Note that this includes the idea of management of incidental findings 
  in radiology or other "clinically significant" findings/results in imaging exams.
- Does the article have an abstract? 
"""

# Questions that are not sent to the model yet; they line up with the
# commented-out ArticleInfo fields. (Previously a bare string literal, i.e. a
# statement with no effect, that listed two of them twice.)
# - What is the study design of the article? (e.g., cohort study, case-control study, randomized controlled trial, etc.)
# - What is the hypothesis of the article?
# - How did the study identify recommendations?
# - How did the program described communicate recommendations?
# - Is the article only measuring the incidence of a particular recommendation? True or False
# - Is the article only describing a guideline? True or False
# - Is the article only an opinion piece or editorial? True or False
# - Is the article only a review? True or False

# Closing instruction appended after the article text by generate_prompt.
INSTRUCTIONS = """Answer the questions based ONLY on the given information. If you can't tell from the information, say 'unknown'."""

In [7]:
def generate_prompt(pmid: str) -> str:
    """Assemble the user message sent to the model for one article.

    Args:
        pmid: Key into the notebook-level ``articles`` dict.

    Returns:
        ``PROMPT``, an "Article Information:" heading, the stored article
        text, ``INSTRUCTIONS`` and a final "Answer:" cue, each separated by a
        blank line.

    Raises:
        KeyError: If ``pmid`` is not a key of ``articles``.
    """
    sections = (
        PROMPT,
        "Article Information:",
        articles[pmid],
        INSTRUCTIONS,
        "Answer:",
    )
    return "\n\n".join(sections)

In [10]:
# Classify the first 100 loaded articles and keep the parsed responses.
pmids = list(articles.keys())[:100]
infos: list[ArticleInfo] = []
for pmid in pmids:
    response = llm.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": generate_prompt(pmid),
            }
        ],
        model="o4-mini",
        response_model=ArticleInfo,
    )

    # The model re-types the PMID from the article text, so it can come back
    # altered. The key used to index `articles` is the identifier we actually
    # queried with; it wins on disagreement and the mismatch is reported.
    if response.pmid != pmid:
        print(f"PMID mismatch: model returned {response.pmid!r} for article {pmid}; keeping {pmid}.")
        response.pmid = pmid

    print(response.model_dump_json(indent=2))
    infos.append(response)

{
  "pmid": "100459",
  "title": "Follow-up investigation of reconstruction of the alveolar process in the atrophic mandible.",
  "about_imaging_followup_recommendations": false,
  "has_abstract": true
}
{
  "pmid": "10063862",
  "title": "Methods of compliance with Mammography Quality Standards Act regulations for tracking positive mammograms: survey results",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "10071638",
  "title": "[Rights and responsibilities of the radiologist in determining indications and applying radiologic procedures]",
  "about_imaging_followup_recommendations": false,
  "has_abstract": true
}
{
  "pmid": "10073588",
  "title": "Primary hybrid total hip replacement, performed with insertion of the acetabular component without cement and a precoat femoral component with cement. An average ten-year follow-up study.",
  "about_imaging_followup_recommendations": false,
  "has_abstract": "unknown"
}
{
  "pmid": "10077038",
  "ti

In [11]:
# Persist the batch as JSON Lines (one serialized ArticleInfo per line).
# The encoding is explicit, as in the CSV export, so titles with non-ASCII
# characters are written the same way regardless of the platform default.
with open("data/entries_with_abstracts_0_100.jsonl", "w", encoding="utf-8") as f:
    for info in infos:
        f.write(info.model_dump_json() + "\n")

In [12]:
# Export the batch as CSV, one row per article.
# Column names are taken from the ArticleInfo schema instead of infos[0], so
# an empty result list produces a header-only file rather than an IndexError.
# (`csv` is imported with the other modules in the first cell.)
with open("data/entries_with_abstracts_0_100.csv", "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(ArticleInfo.model_fields))
    writer.writeheader()
    writer.writerows(info.model_dump() for info in infos)

In [15]:
# PMIDs to run through the classification prompt, one per line.
# NOTE(review): the name `good_pmids` and the "likely_entries.csv" output
# suggest these are hand-picked relevant articles -- confirm their provenance.
s="""38147905
36922265
36650302
35752517
33607066
29728325
24294680
37879972
37302681
29305076
28434846
36063362
30779667
36564264
36922106
23548405
32220539
30860895
22195225
31452006
37073901
27832518
18647895
19703870
20884911
29179912
34076452
36287625
37236842
31206047
28742377
36759382
37820835
36521629
36411090
36922107
36792996
37000450
35867062
35773813
35788428
35186516
32827469
34374592
32857982
33478839
33278340
33984286
32294771
20308458"""
# The literal has no trailing newline, so splitlines() yields exactly one
# string per PMID with no empty last item.
good_pmids = s.splitlines()

In [16]:
# Classify the curated PMIDs, skipping any that are absent from the archive.
good_infos: list[ArticleInfo] = []
for pmid in good_pmids:
    if pmid not in articles:
        print(f"PMID {pmid} not found in articles.")
        continue
    response = llm.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": generate_prompt(pmid),
            }
        ],
        model="o4-mini",
        response_model=ArticleInfo,
    )

    # The model re-types the PMID from the article text, so it can come back
    # altered. The key used to index `articles` is the identifier we actually
    # queried with; it wins on disagreement and the mismatch is reported.
    if response.pmid != pmid:
        print(f"PMID mismatch: model returned {response.pmid!r} for article {pmid}; keeping {pmid}.")
        response.pmid = pmid

    print(response.model_dump_json(indent=2))
    good_infos.append(response)

{
  "pmid": "38147905",
  "title": "Financial Impact of a Radiology Safety Net Program for Resolution of Clinically Necessary Follow-up Imaging Recommendations",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "36922265",
  "title": "White Paper: Best Practices in the Communication and Management of Actionable Incidental Findings in Emergency Department Imaging.",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "36650302",
  "title": "Implementing a Streamlined Radiology Workflow to Close the Loop on Incidental Imaging Findings in the Emergency Department",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "35752517",
  "title": "Catching Those Who Fall Through the Cracks: Integrating a Follow-Up Process for Emergency Department Patients with Incidental Radiologic Findings",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "33607066",


In [17]:
# Export the curated-PMID results as CSV, one row per article.
# Column names are taken from the ArticleInfo schema instead of good_infos[0]:
# the list is empty when every curated PMID was skipped as "not found", and
# indexing it would raise IndexError before anything is written.
with open("data/likely_entries.csv", "w", newline='', encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(ArticleInfo.model_fields))
    writer.writeheader()
    writer.writerows(info.model_dump() for info in good_infos)

In [19]:
# Re-run the prompt for a few PMIDs and print the parsed answers for manual
# inspection; nothing is stored.
recheck_pmids = ["24294680", "19703870", "35788428", "34374592"]
for pmid in recheck_pmids:
    # Same guard as the curated-PMID loop: generate_prompt raises KeyError for
    # a PMID that is not in the archive, which would abort the remaining checks.
    if pmid not in articles:
        print(f"PMID {pmid} not found in articles.")
        continue
    response = llm.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": generate_prompt(pmid),
            }
        ],
        model="o4-mini",
        response_model=ArticleInfo,
    )

    print(response.model_dump_json(indent=2))

{
  "pmid": "24294680",
  "title": "An initiative to improve the management of clinically significant test results in a large health care network",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "19703870",
  "title": "Important imaging finding e-mail alert system: experience after 3 years of implementation.",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "35788428",
  "title": "Automated Notification of Relevant Expected or Incidental Findings in Imaging Exams in a Verticalized Healthcare System",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
{
  "pmid": "34374592",
  "title": "Electronic Health Record Closed-Loop Communication Program for Unexpected Nonemergent Findings",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true
}
