In [1]:
# Destination CSV for the per-service prompts and generated SOAP notes (written by the last cell).
output = "sample_output2.csv"

# SOAP Note Generation

This notebook performs step 1 of the Brief Hospital Course pipeline: for each service (ward stay) in an admission, we assemble a list of the patient's information for that service and pass it to a GPT-3.5 model to generate a SOAP note.

In [34]:
%load_ext autoreload
%autoreload 2

In [35]:
import pandas as pd
import ast
import numpy as np

from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()

import os
import openai

In [36]:
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

True

In [37]:
from preprocessing import *

In [38]:
# NOTE(review): N is not referenced by any other cell visible in this notebook — confirm it is still needed.
N = 5

## Read in Radiology Reports

In [6]:
# Radiology reports from the DischargeMe public/train directory.
# NOTE(review): `radiology` is not used by any other cell visible here — confirm it needs to be loaded.
radiology = pd.read_csv("/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/train/radiology.csv.gz")

## Read in Structured Data

In [7]:
# Load the structured tables. The first three come from the DischargeMe
# public/train directory, the rest from the mimiciv/hosp directory.
# NOTE(review): the absolute cluster path is repeated in every call; a single
# configurable root would make the notebook portable.

# discharge summaries (one row is sampled later to build prompts)
discharges = pd.read_csv("/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/train/discharge.csv.gz")

# ed stays (read as a notebook-level global inside the prompt-building functions)
edstays = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/train/edstays.csv.gz')

# triage
triage = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/train/triage.csv.gz')

# ward transfers
transfers = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/transfers.csv.gz')

# higher-level services (ICU, CARD, etc)
services = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/services.csv.gz')

# get patient info
pts = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/patients.csv.gz')

# admission demographics
admissions = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/admissions.csv.gz')

# procedures (events) and the ICD procedure dictionary used to look up long titles
procs = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/procedures_icd.csv.gz')
procs_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_procedures.csv.gz')

# diagnoses (events) and the ICD diagnosis dictionary used to look up long titles
diags = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/diagnoses_icd.csv.gz')
diags_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_diagnoses.csv.gz')

# meds (emar table)
med_orders = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/emar.csv.gz')


In [8]:

# # procedures
# procs = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/procedures_icd.csv.gz')
# procs_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_procedures.csv.gz')

# # diagnoses
# diags = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/diagnoses_icd.csv.gz')
# diags_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_diagnoses.csv.gz')


In [9]:
# Prescriptions, lab events and microbiology events from the mimiciv/hosp directory.
# NOTE(review): the recorded output of this cell echoes the prescriptions and
# microbio read_csv lines, which looks like pandas warnings (possibly DtypeWarning
# for mixed-type columns) — confirm, and consider passing explicit dtype= values.
prescriptions = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/prescriptions.csv.gz')
labs = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/labevents.csv.gz')
microbio = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/microbiologyevents.csv.gz')

  prescriptions = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/prescriptions.csv.gz')
  microbio = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/microbiologyevents.csv.gz')


In [10]:
# Lab item dictionary (d_labitems); it is joined onto the lab events by itemid in the following cell.
labs_icd = pd.read_csv("/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_labitems.csv.gz")

In [11]:
# Attach the lab item dictionary columns to every lab event (left join keeps events with no dictionary match).
# NOTE(review): this rebinds `labs`, so re-running the cell without reloading merges the
# dictionary a second time and pandas will suffix the overlapping columns.
labs = labs.merge(labs_icd, on="itemid", how="left")

### Clean up/type cast data

In [12]:
# Keep only medication rows that have a chart time, a medication name and an event description.
med_orders = med_orders.dropna(subset=['charttime', 'medication', 'event_txt'])

In [13]:
# Cast the timestamp columns of procedures, medication orders and discharge
# summaries to datetime64[ns].
procs = procs.astype({"chartdate": "datetime64[ns]"})
med_orders = med_orders.astype(
    dict.fromkeys(["charttime", "scheduletime", "storetime"], "datetime64[ns]")
)
discharges = discharges.astype(
    dict.fromkeys(["charttime", "storetime"], "datetime64[ns]")
)

In [14]:
# Cast the timestamp columns of prescriptions, labs and microbiology events to
# datetime64[ns].
prescriptions = prescriptions.astype({"starttime": "datetime64[ns]",
                                      "stoptime": "datetime64[ns]"})
labs = labs.astype({"charttime": "datetime64[ns]",
                    "storetime": "datetime64[ns]"})

# Each date/time column is listed exactly once. A repeated dict key silently
# collapses into a single entry, so "storedate" is spelled out next to
# "storetime" instead of naming "charttime" twice.
microbio = microbio.astype({"chartdate": "datetime64[ns]",
                            "charttime": "datetime64[ns]",
                            "storedate": "datetime64[ns]",
                            "storetime": "datetime64[ns]"})

In [15]:
# Collapse the ICD dictionaries to one row per (icd_code, icd_version) so the
# joins that follow cannot multiply event rows. groupby(...).first() keeps the
# first non-null value of every other column within each key.
procs_icd = procs_icd.groupby(["icd_code", "icd_version"], as_index=False).first()
diags_icd = diags_icd.groupby(["icd_code", "icd_version"], as_index=False).first()

In [16]:
# Look up the ICD dictionary columns (including long_title) for every procedure
# and diagnosis event; a left join keeps events whose code has no dictionary entry.
procs = pd.merge(procs, procs_icd, how="left", on=["icd_code", "icd_version"])
diags = pd.merge(diags, diags_icd, how="left", on=["icd_code", "icd_version"])


### Define Extraction Functions

In [17]:
# generate free text sections of tables concatenating columns
# admin_text reads "<medication> at <Month day, Year, 12-hour time>".
# NOTE(review): admin_text is not read by any cell visible here; its only apparent
# consumer (get_med_orders_within_service) is commented out in the prompt builders.
med_orders['admin_text'] = med_orders['medication'] + " at " + med_orders['charttime'].dt.strftime('%B %d, %Y, %r')
# TODO SORAYA: Add the right information in here that we actually want for the meds
# text is "<drug> <prod_strength>"; string concatenation propagates missing values,
# so the result is NaN whenever either column is missing.
prescriptions['text'] = prescriptions['drug'] + " " + prescriptions['prod_strength']


In [18]:
# Spot-check two random prescription rows with every column shown (including the new text column).
with pd.option_context("display.max_columns", None):
    display(prescriptions.sample(2))

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,formulary_drug_cd,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route,text
6026403,13901573,26791489,64120137,13901573-666,666.0,P30P24,2162-09-02 21:00:00,2162-09-03 22:00:00,MAIN,Glucose Gel,GLUCOSEGEL,1781,38396060000.0,15 g Tube,,15.0,g,1,TUBE,,PO,Glucose Gel 15 g Tube
1016827,10680429,20834032,27094241,10680429-145,145.0,P363MT,2172-10-06 08:00:00,2172-10-06 18:00:00,MAIN,Amlodipine,AMLO25,16925,51079050000.0,2.5mg Tablet,,2.5,mg,1,TAB,1.0,PO/NG,Amlodipine 2.5mg Tablet


In [19]:
# TODO THOMAS: figure out how to better encapsulate lab values
# Build "<category> <fluid> <label>: <value> <unit>". String concatenation
# propagates missing values, so a single NaN component makes the whole text NaN.
# The unit is treated as optional: when valueuom is missing the text simply ends
# after the value instead of becoming NaN (and being filtered out later).
labs['text'] = (
    labs['category'] + " " + labs['fluid'] + " " + labs['label'] + ": "
    + labs['value'] + (" " + labs['valueuom']).fillna("")
)

In [20]:
# TODO SORAYA: check for those patients that don't have comments (they might not have useful information in the test)
# text is "<test_name>: <comments>".
# NOTE(review): text is NaN wherever comments is NaN (the isna() summary in the following
# cell reports the same count for both columns), and no visible cell filters those rows
# out before the prompt builders join microbio['text'] — confirm the get_microbio_* helper does.
microbio['text'] = microbio['test_name'] + ": " + microbio['comments']

In [21]:
# Count missing values per microbiology column.
microbio.isna().sum()

microevent_id                0
subject_id                   0
hadm_id                1830396
micro_specimen_id            0
order_provider_id      2272201
chartdate                    0
charttime               257699
spec_itemid                  0
spec_type_desc               1
test_seq                     0
storedate                12251
storetime                25186
test_itemid                  0
test_name                    0
org_itemid             1942044
org_name               1942044
isolate_num            1942044
quantity               3228683
ab_itemid              2121435
ab_name                2121435
dilution_text          2148780
dilution_comparison    2149420
dilution_value         2149420
interpretation         2121435
comments                990931
text                    990931
dtype: int64

In [42]:
def create_pt_prompt_per_service(discharge_row, demos_in, triage_in, transfers_in, diags_in, procs_in, prescriptions_in, labs_in, microbio_in, edstays_in=None):
    """Build one GPT prompt per ward transfer ("service") of a single admission.

    Parameters
    ----------
    discharge_row
        Row of the discharge table; only ``subject_id`` and ``hadm_id`` are read.
    demos_in, triage_in, transfers_in, diags_in, procs_in, prescriptions_in, labs_in, microbio_in
        Source tables, forwarded unchanged to the matching ``get_*`` helper.
    edstays_in
        Optional ED-stays table with ``hadm_id`` and ``stay_id`` columns. When
        omitted, the notebook-level ``edstays`` frame is used, which keeps
        existing nine-argument calls working.

    Returns
    -------
    A copy of the frame returned by ``get_transfers`` with an added
    ``service_prompts`` column: an empty string for rows whose ``eventtype`` is
    "discharge", the full prompt text for every other row.
    """

    def _join(values):
        # str.join raises TypeError on NaN, so missing entries are dropped first.
        return ', '.join(pd.Series(values).dropna().astype(str))

    demos = get_demos(discharge_row['subject_id'], demos_in)
    if demos.empty:
        age = "[UNKNOWN AGE]"
        sex = "[UNKNOWN SEX]"
    else:
        age = demos['anchor_age']
        sex = demos['gender']

    # Choose pronouns from the recorded sex. Anything other than "M"/"F"
    # (including the unknown placeholder, which is a truthy string) gets the
    # neutral form rather than silently defaulting to she/her.
    if sex == "M":
        subj, poss, verb = "he", "his", "was"
    elif sex == "F":
        subj, poss, verb = "she", "her", "was"
    else:
        subj, poss, verb = "they", "their", "were"

    ed_table = edstays if edstays_in is None else edstays_in
    pt_edstays = ed_table[ed_table['hadm_id'] == discharge_row['hadm_id']]

    # Chief complaints of every ED stay tied to this admission; stays without a
    # triage row or with a missing complaint contribute nothing.
    ccs = []
    for stay_id in pt_edstays['stay_id'].tolist():
        triage_info = get_triage_info(stay_id, triage_in)
        ccs.extend(pd.Series(triage_info['chiefcomplaint']).dropna().astype(str))
    chief_complaints = ", ".join(ccs)

    # transfers with dates; copied so the new column is never written onto a
    # slice of the caller's table
    transfers = get_transfers(discharge_row['hadm_id'], transfers_in).copy()

    # get stay admission diagnoses
    diags = get_diags(discharge_row['hadm_id'], diags_in)

    init_prompt = (
        f"___ is a {age} year old {sex} that initially presented to the ED with {chief_complaints}. "
        f"By the end of {poss} hospital stay, {subj} {verb} given the following diagnoses: "
        f"{_join(diags['long_title'])} in order of importance to this admission. "
    )

    transfer_service_prompts = []
    for _, row in transfers.iterrows():
        if row['eventtype'] == "discharge":
            # No note is generated for the discharge event itself.
            transfer_service_prompts.append("")
            continue

        svc_procs = get_procs_within_service(discharge_row['hadm_id'], row, procs_in)
        # med_orders = get_med_orders_within_service(discharge_row['hadm_id'], row)
        svc_rx = get_prescriptions_within_service(discharge_row['hadm_id'], row, prescriptions_in)
        svc_labs = get_labs_within_service(discharge_row['hadm_id'], row, labs_in)
        svc_micro = get_microbio_within_service(discharge_row['hadm_id'], row, microbio_in)

        within_service_prompt = (
            f"The patient was transferred and stayed in the {row['careunit']} ward between {row['intime']} and {row['outtime']}. "
            f"___ received the following procedures (ordered by priority): {_join(svc_procs['long_title'])}. "
            "\n------------------------\n"
            f"{subj} also received the following medications (ordered chronologically) during the service: {_join(svc_rx['text'])}. "
            "\n------------------------\n"
            f"The following labs were also drawn during the service: {_join(svc_labs['text'])}. "
            "\n------------------------\n"
            f"The physician also ordered the following microbiology cultures during the service: {_join(svc_micro['text'])}. "
        )

        closing_instruction = (
            "Given this information, please generate a progress note for this patient for their care "
            f"during this part of their hospital course staying in the {row['careunit']} ward. "
            "Be SPECIFIC to the conditions, any abnormal labs, vitals, or procedures that were conducted, "
            "and significant medications as they relate to the hospital course."
        )
        transfer_service_prompts.append(init_prompt + within_service_prompt + closing_instruction)

    transfers['service_prompts'] = transfer_service_prompts

    return transfers
    


In [29]:
def create_pt_prompt_per_day(discharge_row, demos_in, triage_in, transfers_in, diags_in, procs_in, prescriptions_in, labs_in, microbio_in, edstays_in=None):
    """Build prompts for one admission, intended to become one prompt per day.

    The per-day split is not implemented yet (see the TODO in the body), so the
    prompts produced here still cover a whole ward stay each.

    Parameters
    ----------
    discharge_row
        Row of the discharge table; only ``subject_id`` and ``hadm_id`` are read.
    demos_in, triage_in, transfers_in, diags_in, procs_in, prescriptions_in, labs_in, microbio_in
        Source tables, forwarded to the matching ``get_*`` helper in the same
        way ``create_pt_prompt_per_service`` forwards them.
    edstays_in
        Optional ED-stays table with ``hadm_id`` and ``stay_id`` columns. When
        omitted, the notebook-level ``edstays`` frame is used.

    Returns
    -------
    A copy of the frame returned by ``get_transfers`` with an added
    ``service_prompts`` column: an empty string for rows whose ``eventtype`` is
    "discharge", the full prompt text for every other row.
    """

    def _join(values):
        # str.join raises TypeError on NaN, so missing entries are dropped first.
        return ', '.join(pd.Series(values).dropna().astype(str))

    demos = get_demos(discharge_row['subject_id'], demos_in)
    if demos.empty:
        age = "[UNKNOWN AGE]"
        sex = "[UNKNOWN SEX]"
    else:
        age = demos['anchor_age']
        sex = demos['gender']

    # Choose pronouns from the recorded sex. Anything other than "M"/"F"
    # (including the unknown placeholder, which is a truthy string) gets the
    # neutral form rather than silently defaulting to she/her.
    if sex == "M":
        subj, poss, verb = "he", "his", "was"
    elif sex == "F":
        subj, poss, verb = "she", "her", "was"
    else:
        subj, poss, verb = "they", "their", "were"

    ed_table = edstays if edstays_in is None else edstays_in
    pt_edstays = ed_table[ed_table['hadm_id'] == discharge_row['hadm_id']]

    # Chief complaints of every ED stay tied to this admission; stays without a
    # triage row or with a missing complaint contribute nothing.
    ccs = []
    for stay_id in pt_edstays['stay_id'].tolist():
        triage_info = get_triage_info(stay_id, triage_in)
        ccs.extend(pd.Series(triage_info['chiefcomplaint']).dropna().astype(str))
    chief_complaints = ", ".join(ccs)

    # transfers with dates; copied so the new column is never written onto a
    # slice of the caller's table
    transfers = get_transfers(discharge_row['hadm_id'], transfers_in).copy()

    # get stay admission diagnoses
    diags = get_diags(discharge_row['hadm_id'], diags_in)

    init_prompt = (
        f"___ is a {age} year old {sex} that initially presented to the ED with {chief_complaints}. "
        f"By the end of {poss} hospital stay, {subj} {verb} given the following diagnoses: "
        f"{_join(diags['long_title'])} in order of importance to this admission. "
    )

    transfer_service_prompts = []
    for _, row in transfers.iterrows():
        if row['eventtype'] == "discharge":
            # No note is generated for the discharge event itself.
            transfer_service_prompts.append("")
            continue

        # TODO VIMIG: finish creating the per-day SOAP note. The time in this
        # ward (row['intime'] .. row['outtime']) still has to be split into
        # days; until then a single prompt is built for the whole ward stay.

        svc_procs = get_procs_within_service(discharge_row['hadm_id'], row, procs_in)
        # med_orders = get_med_orders_within_service(discharge_row['hadm_id'], row)
        svc_rx = get_prescriptions_within_service(discharge_row['hadm_id'], row, prescriptions_in)
        svc_labs = get_labs_within_service(discharge_row['hadm_id'], row, labs_in)
        svc_micro = get_microbio_within_service(discharge_row['hadm_id'], row, microbio_in)

        within_service_prompt = (
            f"The patient was transferred and stayed in the {row['careunit']} ward between {row['intime']} and {row['outtime']}. "
            f"___ received the following procedures (ordered by priority): {_join(svc_procs['long_title'])}."
            "\n------------------------\n"
            f"{subj} also received the following medications (ordered chronologically) during the service: {_join(svc_rx['text'])}. "
            "\n------------------------\n"
            f"The following labs were also drawn during the service: {_join(svc_labs['text'])}. "
            "\n------------------------\n"
            f"The physician also ordered the following microbiology cultures during the service: {_join(svc_micro['text'])}. "
        )

        closing_instruction = (
            "Given this information, please generate a progress note for this patient for their care "
            f"during this part of their hospital course staying in the {row['careunit']} ward. "
            "Be SPECIFIC to the conditions, any abnormal labs, vitals, or procedures that were conducted, "
            "and significant medications as they relate to the hospital course."
        )
        transfer_service_prompts.append(init_prompt + within_service_prompt + closing_instruction)

    transfers['service_prompts'] = transfer_service_prompts

    return transfers
    


## Generate Input Data List Prompt

In [None]:
# TODO THOMAS: why are there so many NAs?: 22680492
# Discard lab rows whose free-text description is missing (NaN) so they never reach a prompt.
labs = labs.dropna(subset=['text'])

In [45]:
# Build the per-service prompts for one randomly sampled discharge summary.
# NOTE(review): sample(1) is unseeded, so a different admission is drawn on every run;
# pass random_state= if the recorded output is meant to be reproducible.
service_prompts = create_pt_prompt_per_service(discharges.sample(1).squeeze(), pts, triage, transfers, diags, procs, prescriptions, labs, microbio)

In [46]:
# Show the prompt of the second transfer row; .iloc[1] raises IndexError if the sampled
# admission has fewer than two transfer rows.
print(service_prompts['service_prompts'].iloc[1])

___ is a 38 year old F that initially presented to the ED with RLQ abdominal pain, R Leg swelling. By the end of her hospital stay, she was given the following diagnoses: Localized edema, Pain in right ankle and joints of right foot, Pain in right lower leg, Malignant neoplasm of unspecified ovary, Personal history of pulmonary embolism, Long term (current) use of anticoagulants, Localized enlarged lymph nodes, Urinary tract infection, site not specified, Unspecified osteoarthritis, unspecified site in order of importance to this admission. The patient was transferred and stayed in the Hematology/Oncology ward between 2193-02-01 00:26:00 and 2193-02-04 15:00:02. ___ received the following procedures (ordered by priority): . 
------------------------
she also received the following medications (ordered chronologically) during the service: Gabapentin 400mg Capsule, Acetaminophen 500mg Tablet, Heparin Flush (10 units/ml) 10 Units/mL - 5 mL Syringe, Morphine Sulfate IR 15mg Tab, Docusate S

## Create SOAP notes from GPT API

In [None]:
# Point the openai client at an Azure endpoint. The endpoint and key are read from the
# environment (OPENAI_API_BASE / OPENAI_API_KEY), never hard-coded.
# NOTE(review): module-level configuration plus openai.ChatCompletion (used further down)
# looks like the pre-1.0 openai package interface — confirm the pinned version.
import openai
openai.api_type = "azure"
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("OPENAI_API_KEY")
# Passed as engine= to ChatCompletion.create when the notes are generated.
engine = "decile-gpt-35-turbo-16k"


In [None]:
# drop all eventtypes of type: discharge, since we don't need a note for that.
# .copy() makes the filtered result an independent frame, so the gpt_SOAP_note column
# assigned to it later is not written onto a slice of the unfiltered frame
# (which triggers pandas' SettingWithCopyWarning).
service_prompts = service_prompts[service_prompts['eventtype'] != "discharge"].copy()

In [None]:
# One chat transcript per service row: a system message naming the ward, followed by
# the service prompt as the user message.
gpt_inputs = [
    [
        {"role": "system", "content": f"You are a physician that is reviewing a patient's medical record during their stay in your ward: ({row['careunit']}) and writing a SOAP Note based on this information."},
        {"role": "user", "content": row['service_prompts']},
    ]
    for _, row in service_prompts.iterrows()
]

In [None]:
# Request one chat completion per prepared transcript and keep only the text of the
# first choice. Results are appended in the same order as gpt_inputs.
# The loop variable is called `messages` so the builtin input() is not shadowed at
# notebook scope.
completions = []
for messages in gpt_inputs:
    completion = openai.ChatCompletion.create(
        engine=engine,
        messages=messages,
    )
    completions.append(completion['choices'][0]['message']['content'])

In [None]:
# Attach each generated note to its service row. This relies on completions being in the
# same order as the rows of service_prompts, which holds because both lists were built by
# iterating it in order.
service_prompts['gpt_SOAP_note'] = completions

In [None]:
# Write the transfers, prompts and generated notes to the CSV path held in `output`.
service_prompts.to_csv(output)