In [1]:
import os
from typing import Literal
import zipfile

import dotenv
import instructor
import openai
from pydantic import BaseModel, Field

In [4]:
# Let python-dotenv populate the environment first, then pass whatever value
# OPENAI_API_KEY holds (None when it is unset) to the openai module.
dotenv.load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

## Setting up for Extraction

### Properties to extract

- What is the study design?
- What hypotheses were being tested?
- Methods for identifying follow up recommendations (including by a human and/or AI)
- Methods for communicating follow up recommendations
- Methods for facilitating ordering / scheduling / completion of recommended follow up
- Methods for assessing if follow up recommendations are followed
- Methods for assessing outcomes for patients for whom follow up was recommended
- Assessment of recommendation format on follow up being performed

Exclusion criteria
- Only a guideline
- Enumeration of follow up recommendations / incidental findings (incidence / prevalence)
- Opinion only (editorial that does not describe a system)

In [19]:
class ArticleInfo(BaseModel):
    # Structured answers for one article record. Every field except `pmid` and
    # `title` may take the sentinel value 'unknown'.
    # Documented with comments rather than a class docstring so the schema
    # generated from this model stays exactly as it was.

    # --- identification ---
    pmid: str = Field(
        description="PMID of the article",
    )
    title: str = Field(
        description="Title of the article",
    )

    # --- relevance / availability ---
    about_imaging_followup_recommendations: bool | Literal["unknown"] = Field(
        description=(
            "Does the article discuss imaging follow-up recommendations? "
            "True or False; if you can't tell from the information, say 'unknown'"
        ),
    )
    has_abstract: bool | Literal["unknown"] = Field(
        description=(
            "Does the article have an abstract? "
            "True or False; if you can't tell from the information, say 'unknown'"
        ),
    )

    # --- free-text properties ---
    study_design: str | Literal["unknown"] = Field(
        description=(
            "Study design of the article; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    hypothesis: str | Literal["unknown"] = Field(
        description=(
            "Hypothesis of the article; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    identifying_recommendations_method: str | Literal["unknown"] = Field(
        description=(
            "How did the study identify recommendations; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    communication_recommendations_method: str | Literal["unknown"] = Field(
        description=(
            "How did the program described communicate recommendations; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )

    # --- "only X" flags ---
    incidence_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only measuring the incidence of a particular recommendation? "
            "True or False; if you can't tell from the information, say 'unknown'"
        ),
    )
    guideline_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only describing a guideline? "
            "True or False; if you can't tell from the information, say 'unknown'"
        ),
    )
    opinion_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only an opinion piece or editorial? "
            "True or False; if you can't tell from the information, say 'unknown'"
        ),
    )
    review_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only a review? "
            "True or False; if you can't tell from the information, say 'unknown'"
        ),
    )


In [6]:
# Maps each archive member's name (minus its extension) to the decoded text of
# that member; the preview output shows keys such as '100459'.
articles: dict[str, str] = {}

# Open the archive in a context manager so its file handle is released even
# when reading or decoding a member raises.
with zipfile.ZipFile("data/entries_with_abstracts.zip", "r") as articles_zip:
    for filename in articles_zip.namelist():
        # Directory entries hold no article text; skip them rather than
        # storing an empty string under a mangled key.
        if filename.endswith("/"):
            continue
        # Strip the real extension instead of assuming it is 4 characters long.
        pmid = os.path.splitext(filename)[0]
        articles[pmid] = articles_zip.read(filename).decode("utf-8")

In [12]:
# Sanity check: list the first five keys and show the raw text stored under
# the first one.
pmids = list(articles)[:5]
print(pmids)
first_pmid = pmids[0]
print(articles[first_pmid])

['100459', '10063862', '10071638', '10073588', '10077038']
456. Int J Oral Surg. 1978 Aug;7(4):400-4. doi: 10.1016/s0300-9785(78)80115-x.

Follow-up investigation of reconstruction of the alveolar process in the 
atrophic mandible.

Fazili M, von Overvest-Eerdmans GR, Vernooy AM, Visser WJ, von Waas MA.

In this article the results are presented of the reconstruction of the alveolar 
process with iliac crest bone grafts in 14 patients. In all cases in a second 
operation a vestibuloplasty and a floor-of-mouth plasty was done. After an 
average follow-up period of 39 months, almost complete resorption of the bone 
grafts was observed. The clinical results varied. To obtain better results and 
avoid complications like pain at the donor and graft sites, mental nerve 
disturbances and other major problems associated with bone grafting, we prefer 
to perform the visor osteotomy in the future.

DOI: 10.1016/s0300-9785(78)80115-x
PMID: 100459 [Indexed for MEDLINE]


In [13]:
# Build an OpenAI client with default settings and wrap it with instructor;
# the extraction loop calls llm.chat.completions.create(..., response_model=...).
openai_client = openai.OpenAI()
llm = instructor.from_openai(openai_client)

In [14]:
# Question list placed ahead of each article record by generate_prompt().
# The wording refers to the article information as a whole because the record
# passed in is the full citation text (title, authors, abstract), not only the
# title. Each question appears once.
PROMPT = """You are a medical expert. You are given information (citation, title, and, when available, abstract) on a medical article about tracking follow-up recommendations in radiology.
Based on this information, you need to answer the following questions:
- Is the study about imaging follow-up recommendations?
- Does the article have an abstract?
- What is the study design of the article? (e.g., cohort study, case-control study, randomized controlled trial, etc.)
- What is the hypothesis of the article?
- How did the study identify recommendations?
- How did the program described communicate recommendations?
- Is the article only measuring the incidence of a particular recommendation? True or False
- Is the article only describing a guideline? True or False
- Is the article only an opinion piece or editorial? True or False
- Is the article only a review? True or False
"""

# Closing instruction appended after the article text: restricts answers to
# the supplied information and names the 'unknown' fallback value.
INSTRUCTIONS = """Answer the questions based ONLY on the given information. If you can't tell from the information, say 'unknown'."""

In [16]:
def generate_prompt(pmid: str) -> str:
    """Assemble the complete prompt text for one article.

    Args:
        pmid: Key into the module-level ``articles`` mapping.

    Returns:
        ``PROMPT``, an "Article Information:" header, the stored article text,
        ``INSTRUCTIONS`` and a trailing "Answer:" cue, each separated by a
        blank line.

    Raises:
        KeyError: If ``pmid`` is not a key of ``articles``.
    """
    sections = [
        PROMPT,
        "Article Information:",
        articles[pmid],
        INSTRUCTIONS,
        "Answer:",
    ]
    return "\n\n".join(sections)

In [25]:
# Limit this run to the first 100 keys of `articles`.
pmids = list(articles)[:100]
infos: list[ArticleInfo] = []
for pmid in pmids:
    # Single user message carrying the assembled prompt for this article.
    user_message = {"role": "user", "content": generate_prompt(pmid)}
    response = llm.chat.completions.create(
        model="o4-mini",
        response_model=ArticleInfo,
        messages=[user_message],
    )

    # Echo each result as it arrives, then keep it; results collected so far
    # remain in `infos` if the loop is interrupted.
    print(response.model_dump_json(indent=2))
    infos.append(response)

{
  "pmid": "100459",
  "title": "Follow-up investigation of reconstruction of the alveolar process in the atrophic mandible.",
  "about_imaging_followup_recommendations": false,
  "has_abstract": true,
  "study_design": "case series",
  "hypothesis": "unknown",
  "identifying_recommendations_method": "unknown",
  "communication_recommendations_method": "unknown",
  "incidence_only": false,
  "guideline_only": false,
  "opinion_only": false,
  "review_only": false
}
{
  "pmid": "10063862",
  "title": "Methods of compliance with Mammography Quality Standards Act regulations for tracking positive mammograms: survey results.",
  "about_imaging_followup_recommendations": true,
  "has_abstract": true,
  "study_design": "cross-sectional survey study",
  "hypothesis": "unknown",
  "identifying_recommendations_method": "questionnaire survey sent to Society of Breast Imaging fellows",
  "communication_recommendations_method": "tracked by radiologists, technologists, other personnel or combinati

KeyboardInterrupt: 