In [12]:
import enum
import os
from typing import Literal

import dotenv
import instructor
import openai
import polars as pl
from pydantic import BaseModel, Field

In [7]:
# Run dotenv's loader first, then copy OPENAI_API_KEY from the process
# environment onto the openai module. The value is None when the variable
# is not set.
dotenv.load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

## Setting up for Extraction

### Properties to extract

- What is the study design?
- What hypotheses were being tested?
- Methods for identifying follow up recommendations (including by a human and/or AI)
- Methods for communicating follow up recommendations
- Methods for facilitating ordering / scheduling / completion of recommended follow up
- Methods for assessing if follow up recommendations are followed
- Methods for assessing outcomes for patients for whom follow up was recommended
- Assessment of recommendation format on follow up being performed

Exclusion criteria
- Only a guideline
- Enumeration of follow up recommendations / incidental findings (incidence / prevalence)
- Opinion only (editorial that does not describe a system)

In [None]:
# Structured record extracted for one article. Every field accepts the
# literal string "unknown" as an escape hatch for when the supplied
# information is insufficient.
# NOTE(review): documented with comments rather than a class docstring on
# purpose -- Pydantic is understood to copy a model docstring into the
# generated JSON schema, which would alter what the LLM sees. Confirm before
# converting this to a docstring.
class ArticleInfo(BaseModel):
    # --- Free-text properties ---
    study_design: str | Literal["unknown"] = Field(
        description=(
            "Study design of the article; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    hypothesis: str | Literal["unknown"] = Field(
        description=(
            "Hypothesis of the article; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    identifying_recommendations_method: str | Literal["unknown"] = Field(
        description=(
            "How did the study identify recommendations; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    communication_recommendations_method: str | Literal["unknown"] = Field(
        description=(
            "How did the program described communicate recommendations; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )

    # --- Boolean "only" flags ---
    incidence_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only measuring the incidence of a particular recommendation? True or False; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    guideline_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only describing a guideline? True or False; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    opinion_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only an opinion piece or editorial? True or False; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )
    review_only: bool | Literal["unknown"] = Field(
        description=(
            "Is the article only a review? True or False; "
            "if you can't tell from the information, say 'unknown'"
        ),
    )


In [16]:
# Read the tab-separated search export, decoding the file as latin1.
articles_df = pl.read_csv(
    "data/PM_EB_searches.tsv",
    separator="\t",
    encoding="latin1",
)
# Plain-text preview of the first rows.
print(articles_df.head())

shape: (5, 12)
┌──────────┬────────────┬───────────┬───────────┬───┬───────────┬──────────┬───────────┬───────────┐
│ PMID     ┆ Title      ┆ Authors   ┆ Citation  ┆ … ┆ PMCID     ┆ NIHMS ID ┆ DOI       ┆ InEmbaseR │
│ ---      ┆ ---        ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---      ┆ ---       ┆ esults?   │
│ i64      ┆ str        ┆ str       ┆ str       ┆   ┆ str       ┆ str      ┆ str       ┆ ---       │
│          ┆            ┆           ┆           ┆   ┆           ┆          ┆           ┆ bool      │
╞══════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪══════════╪═══════════╪═══════════╡
│ 37934099 ┆ A 20-year  ┆ Henschke  ┆ Radiology ┆ … ┆ PMC106985 ┆ null     ┆ 10.1148/r ┆ true      │
│          ┆ Follow-up  ┆ CI, Yip   ┆ . 2023    ┆   ┆ 00        ┆          ┆ adiol.231 ┆           │
│          ┆ of the     ┆ R, Shaham ┆ Nov;309(2 ┆   ┆           ┆          ┆ 988       ┆           │
│          ┆ Int…       ┆ D, …      ┆ ):e23…    ┆   ┆           ┆          ┆

In [29]:
# Titles of the first five articles, as a plain Python list.
article_titles = articles_df["Title"].head(5).to_list()

In [23]:
# Wrap an OpenAI client (constructed with no explicit arguments) using
# instructor; the wrapped client is what later receives `response_model=`.
llm = instructor.from_openai(openai.OpenAI())

In [33]:
# Task framing plus the list of questions put to the model. Each question
# appears exactly once: four free-text questions followed by four
# True/False questions. (The incidence and guideline questions were
# previously listed twice.)
PROMPT = """You are a medical expert. You are given the title of a medical article about tracking follow-up recommendations in radiology.
Based on the title, you need to answer the following questions:
- What is the study design of the article? (e.g., cohort study, case-control study, randomized controlled trial, etc.)
- What is the hypothesis of the article?
- How did the study identify recommendations?
- How did the program described communicate recommendations?
- Is the article only measuring the incidence of a particular recommendation? True or False
- Is the article only describing a guideline? True or False
- Is the article only an opinion piece or editorial? True or False
- Is the article only a review? True or False
"""

# Guard-rail appended after the article title: answer only from what was
# given, and fall back to 'unknown' otherwise.
INSTRUCTIONS = """Answer the questions based ONLY on the given information. If you can't tell from the information, say 'unknown'."""

In [34]:
def generate_prompt(title: str) -> str:
    """Assemble the complete prompt text for a single article title.

    The result is PROMPT, an "Article Title: <title>" line, INSTRUCTIONS and
    a closing "Answer:" cue, in that order, joined by two newlines.

    Args:
        title: Title of the article to ask about.

    Returns:
        The assembled prompt string.
    """
    sections = (PROMPT, f"Article Title: {title}", INSTRUCTIONS, "Answer:")
    return "\n\n".join(sections)

In [35]:
# Show the first title, which is the one sent to the model in the request below.
print(article_titles[0])

A 20-year Follow-up of the International Early Lung Cancer Action Program (I-ELCAP)


In [36]:
# Single-turn request about the first title only. `response_model` asks the
# instructor-wrapped client for the answer as an ArticleInfo.
response = llm.chat.completions.create(
    model="o4-mini",
    response_model=ArticleInfo,
    messages=[{"role": "user", "content": generate_prompt(article_titles[0])}],
)

In [37]:
# Print the extracted fields as JSON with 2-space indentation.
print(response.model_dump_json(indent=2))

{
  "study_design": "cohort study",
  "hypothesis": "unknown",
  "identifying_recommendations_method": "unknown",
  "communication_recommendations_method": "unknown",
  "incidence_only": false,
  "guideline_only": false
}
