In [1]:
import json
import pathlib

import pandas as pd
import polars as pl
from openai import OpenAI
import tqdm.notebook as tqdm

from undina_llm.models import DrugLabel, Prompt, Response, SystemPrompt
from undina_llm.db import SessionManager
from undina_llm.llm import query

In [2]:
# Database location, written as a sqlite:/// URL with a path relative to this
# notebook's working directory (despite the name, this is a URL, not a bare file path).
SQLITE_FILE = "sqlite:///../data/project.db"

# Single manager instance through which every later cell looks up and registers
# drug labels, prompts and responses.
manager = SessionManager(SQLITE_FILE)

In [4]:
# Load every label JSON file, validate it, and make sure it is stored in the database.
root = pathlib.Path("../data/latest_labels_ingredients/")
files = sorted(root.glob("*.json"))

print(len(files), "files")

# Fail with an actionable message instead of an opaque concat error on an empty list.
if not files:
    raise FileNotFoundError(f"No *.json label files found under {root}")

# One frame per file; "diagonal" concatenation lets files carry different section columns.
labels_df = pl.concat(
    [pl.read_json(file).unnest("sections") for file in files],
    how="diagonal",
)

# Validated label models built from the raw rows (not yet looked up in the database).
parsed_labels = [DrugLabel.model_validate(row) for row in labels_df.to_dicts()]

for parsed_label in parsed_labels:
    try:
        manager.get_drug_label(parsed_label)
    except Exception:
        # A failed lookup is treated as "not stored yet" (same convention as the prompt
        # cells). Catching Exception rather than using a bare `except:` keeps
        # KeyboardInterrupt/SystemExit working, so interrupting the kernel stops the
        # loop instead of triggering a registration.
        manager.register_drug_label(parsed_label)

# Re-fetch so that `labels` holds the objects returned by the manager.
labels = [manager.get_drug_label(parsed_label) for parsed_label in parsed_labels]

labels[0]

1468 files




# Inspect data

* DI: Drug interactions
* CO: Contraindications

In [4]:
# Show the first five contraindication ("CO") texts as one string, each followed by a
# blank line.
# Exploratory steps previously tried on this selection, kept for reference:
#   .filter(~pl.col("CO").str.starts_with("CONTRAINDICATIONS"))
#   .filter(~pl.col("CO").str.starts_with("4 CONTRAINDICATIONS"))
#   .filter(~pl.col("CO").str.starts_with("4  CONTRAINDICATIONS"))
#   .filter(~pl.col("CO").str.contains("CONTRAINDICATION|contraindication|Contraindication"))
#   ["CO"].to_pandas()
print(
    labels_df.select("spl_version", "CO", "DI")
    .get_column("CO")
    .head(5)
    .to_pandas()
    .add("\n\n")
    .sum()
)

CONTRAINDICATIONS

Contraindicated in the presence of an anatomically narrow angle or in narrow-angle glaucoma or in persons who have shown hypersensitivity to any component of this preparation.

CONTRAINDICATIONS

There are no known contraindications to oral use when administered in recommended doses.
Isoxsuprine Hydrochloride, USP should not be given immediately postpartum or in the presence of arterial bleeding.

4 CONTRAINDICATIONS

None.





None. (4)

4 CONTRAINDICATIONS

None.





None (4)

4 CONTRAINDICATIONS

None.





None (4)




# Extract data

In [5]:
# Credentials are read from a config file next to the project rather than written
# into the notebook. JSON is decoded as UTF-8 explicitly so the result does not
# depend on the platform's default locale encoding.
with open('../config.json', encoding="utf-8") as f:
    config = json.load(f)

# Expected layout: {"OpenAI": {"openai_api_key": "..."}}
api_key = config["OpenAI"]['openai_api_key']
client = OpenAI(api_key=api_key)

In [6]:
# Settings passed to every `query` call in this notebook.
model = "gpt-4o-2024-05-13"
temperature, seed = 0, 0

# System prompt text; it is looked up / registered with the manager in the next cell.
system_prompt = (
    "You are an expert in medical natural language processing, "
    "adverse drug reactions, pharmacology, and clinical trials."
)

In [7]:
# Get-or-create: reuse the stored system prompt if the lookup succeeds, otherwise
# register it. NOTE(review): presumably `get_system_prompt` raises when the text is
# not stored yet; because any Exception is caught, an unrelated failure (e.g. a
# database error) also falls through to registration - confirm this is intended.
try:
    system_prompt_obj = manager.get_system_prompt(system_prompt)
except Exception:
    system_prompt_obj = manager.register_system_prompt(system_prompt)

## 1. Contraindications

In [8]:
# Prompt template for the contraindications ("CO") section. The `{}` inside the
# triple quotes is a placeholder, presumably filled with the label's section text
# by `query` (confirm in undina_llm.llm). This exact string is what gets looked up /
# registered with the manager, so edits to it (possibly even whitespace-only ones -
# confirm how the manager matches prompts) are likely to be treated as a new prompt.
prompt = """
Contraindications designate circumstances in which a drug should not be administered to a patient.
You are to extract all contraindications from the structured product label. 
Return these values in a comma separated list.
If a contraindication is negated, extract it and prepend a <negated> tag.
Note that negating a contraindication means that the condition is not contraindicated, representing a double negative.
You should only extract the portions that list the specific conditions that contraindicate drug administration. 
Exclude the name of the drug and other contextual information beyond the conditions themselves.
If no contraindications are listed, simply return an empty string.

The structured product label for you to analyze is triple quoted below:
\"\"\"{}\"\"\"
"""

In [9]:
# Get-or-create for the current `prompt` text: reuse the stored prompt if the lookup
# succeeds, otherwise register it. NOTE(review): presumably `get_prompt` raises when
# the text is not stored yet; any other Exception also triggers registration.
try:
    prompt_obj = manager.get_prompt(prompt)
except Exception:
    prompt_obj = manager.register_prompt(prompt)

In [10]:
# Query the model for the contraindications section of every label that has no
# stored response yet, so re-running the cell only fills in what is missing.
section = "CO"   # section code used for both the lookup template and the query
errors = list()  # labels whose query or registration raised

for label in tqdm.tqdm(labels):
    # Lookup key for an existing response; `response` is left empty here.
    # NOTE(review): presumably `get_response` matches on the other fields - confirm.
    response_template = Response(
        system_prompt=system_prompt_obj,
        prompt=prompt_obj,
        drug_label=label,
        section=section,
        model=model,
        temperature=temperature,
        response=""
    )
    response = manager.get_response(response_template)
    if response is not None:
        continue  # already answered for this label/prompt/model combination

    try:
        result = query(client, system_prompt_obj, prompt_obj, label, section, model, temperature, seed)
        manager.register_response(result)
    except Exception as exc:
        # Keep going with the remaining labels, but report why this one failed;
        # without the exception text the failure cannot be diagnosed afterwards.
        print(f"Error processing {label}: {exc!r}")
        errors.append(label)

  0%|          | 0/1468 [00:00<?, ?it/s]



## 2. Drug interactions

In [11]:
# Prompt template for the drug interactions ("DI") section; rebinds `prompt`, which
# previously held the contraindications template. The `{}` inside the triple quotes
# is a placeholder, presumably filled with the label's section text by `query`
# (confirm in undina_llm.llm). This exact string is what gets looked up / registered
# with the manager, so edits to it are likely to be treated as a new prompt.
prompt = """
Drug interactions designate drugs which, when taken at the same time as the drug of interest, could lead to adverse reactions for the patient.
You are to extract all drug interactions from the structured product label. 
Return these values in a comma separated list.
You should only extract the portions that list the specific drug or product names. 
Exclude the names of conditions, adverse reactions, and other contextual information beyond the drugs or products themselves.
Exclude the name of the drug whose label is given.
If no drug interactions are listed, simply return an empty string.
Do not return anything but a comma separated list of drug, product, and ingredient names.
Ensure that all drug names are separated from one another using a comma only.
Do not delimit drug names by 'and' or '/' or anything other than a comma.
Do not include any additional text whatsoever.

The structured product label for you to analyze is triple quoted below:
\"\"\"{}\"\"\"
"""

In [12]:
# Get-or-create for the current `prompt` text; rebinds `prompt_obj` to the prompt
# used by the following extraction loop. NOTE(review): presumably `get_prompt` raises
# when the text is not stored yet; any other Exception also triggers registration.
try:
    prompt_obj = manager.get_prompt(prompt)
except Exception:
    prompt_obj = manager.register_prompt(prompt)

In [13]:
# Query the model for the drug interactions section of every label that has no
# stored response yet, so re-running the cell only fills in what is missing.
section = "DI"   # section code used for both the lookup template and the query
errors = list()  # labels whose query or registration raised (replaces any earlier list)

for label in tqdm.tqdm(labels):
    # Lookup key for an existing response; `response` is left empty here.
    # NOTE(review): presumably `get_response` matches on the other fields - confirm.
    response_template = Response(
        system_prompt=system_prompt_obj,
        prompt=prompt_obj,
        drug_label=label,
        section=section,
        model=model,
        temperature=temperature,
        response=""
    )
    response = manager.get_response(response_template)
    if response is not None:
        continue  # already answered for this label/prompt/model combination

    try:
        result = query(client, system_prompt_obj, prompt_obj, label, section, model, temperature, seed)
        manager.register_response(result)
    except Exception as exc:
        # Keep going with the remaining labels, but report why this one failed;
        # without the exception text the failure cannot be diagnosed afterwards.
        print(f"Error processing {label}: {exc!r}")
        errors.append(label)

  0%|          | 0/1468 [00:00<?, ?it/s]