In [1]:
import pandas as pd 
import numpy as np
import openai
import humanize 
import json 

# import backoff


################################################################################
################################################################################

### (1) Gather relevant notes 

# Current All Note Types
# Current EKGs
# Current/Prior All Note Types
# Current/Prior All Note Types (with Text dividers b/w current and prior)

################################################################################

# (1a.) Current note: Gather time stamp, note type, and provider type, for all 
#         notes related to the current encounter

# prior_notes: these are all the notes associated with a patient. There are no encounter_IDs for the encounters these note are from so we have to order the notes by time and select the most recent one. => PAT_ENC_CSN_ID, ArrivalInstant, visit_occurrence_id, _CreationInstant, Type, TEXT
# ekg_notes: these are all EKGs from a particular CPC encounter. => PAT_ENC_CSN_ID, ORD_VALUE, Comment, RESULT_TIME | ORD_VALUE = was the ECG abnormal or not, Comment = ECG read description
# cpc_notes: these are the notes from a particular CPC encounter. => PAT_ENC_CSN_ID, visit_occurrence_id, ArrivalInstant, person_id, Type, TEXT | note_id == person_id

# DeID is just a unique identifier for all patients or encounters; it can be used to right notes for a specific CPC encounter (including patient-level PMH) across all the datasets

# Read the csv spreadsheet into a dataframe called "allNotes"
prior_notes = pd.read_csv("/Users/vsocrates/Documents/Yale/Heart_Score/NOTES_PRIOR_IDENTIFIED_postprocessed.csv")
ekg_notes = pd.read_csv("/Users/vsocrates/Documents/Yale/Heart_Score/EKG_HEART_IDENTIFIED_postprocessed.csv")
# we are only using CPC notes to define our cohort and not passing them to GPT, since they contain the gold standard HEART score and may lead to data leakage
cpc_notes = pd.read_csv("/Users/vsocrates/Documents/Yale/Heart_Score/NOTES_CPC_IDENTIFIED_postprocessed.csv")

prior_notes = prior_notes.rename({
                                  "PAT_ENC_CSN_ID":"CPC_PAT_ENC_CSN_ID",
                                  "Document_Time":"_CreationInstant"}, axis=1)
cpc_notes = cpc_notes.rename({"note_id":"DeID"}, axis=1)

# Cast to datetime
prior_notes["_CreationInstant"] = pd.to_datetime(prior_notes['_CreationInstant']).apply(lambda t: t.replace(tzinfo=None))
ekg_notes["RESULT_TIME"] = pd.to_datetime(ekg_notes['RESULT_TIME']).apply(lambda t: t.replace(tzinfo=None))
cpc_notes["ArrivalInstant"] = pd.to_datetime(cpc_notes['ArrivalInstant']).apply(lambda t: t.replace(tzinfo=None))

# we need the ArrivalInstant to the CPC Encounter because we can't use the _CreationInstant to need to create a time delta
prior_notes = prior_notes.merge(cpc_notes[['DeID', "ArrivalInstant"]], on="DeID").rename({"ArrivalInstant":"CPC_ArrivalInstant"}, axis=1)
prior_notes['TIME_SINCE_CPC_ARRIVAL'] = (prior_notes['CPC_ArrivalInstant'] - prior_notes['_CreationInstant']).apply(humanize.naturaldelta)

ekg_notes = ekg_notes.merge(cpc_notes[['DeID', "ArrivalInstant"]], on="DeID").rename({"ArrivalInstant":"CPC_ArrivalInstant"}, axis=1)
ekg_notes['TIME_SINCE_CPC_ARRIVAL'] = (ekg_notes['CPC_ArrivalInstant'] - ekg_notes['RESULT_TIME']).apply(humanize.naturaldelta)

# Create compiledText column with all variables
# we don't have provider info so we're not including it
prior_notes['compiledText'] = ("Time Since ED Arrival: " + prior_notes["TIME_SINCE_CPC_ARRIVAL"] + 
                                           "\nType: " + prior_notes["Type"] + 
                                           "\nProvider: " + prior_notes["Service"].fillna("N/A") + 
                                           "\n\n\n" + prior_notes["deid_text"])
ekg_notes['compiledText'] = ("Time Since ED Arrival: " + ekg_notes["TIME_SINCE_CPC_ARRIVAL"] + 
                                           "\nType: " + "EKG" + 
                                           "\n\n\n" + ekg_notes["deid_text"])
cpc_notes['compiledText'] = ("ED Arrival Time: " + cpc_notes["ArrivalInstant"].astype(str) + 
                                           "\nType: " + "Chest Pain Center Note" +
                                           "\n\n\n" + cpc_notes["deid_text"])

# Filter down so that there's only unique rows (i.e. no duplicates)
prior_notes = prior_notes.drop_duplicates()
ekg_notes = ekg_notes.drop_duplicates()
cpc_notes = cpc_notes.drop_duplicates()

# sort notes by ENC_ID and date/time
prior_notes = prior_notes.sort_values(["DeID", "_CreationInstant"], ascending=False)
ekg_notes = ekg_notes.sort_values(["DeID", "RESULT_TIME"], ascending=True)

# create note index within each patient
prior_notes['note_num'] = prior_notes.groupby("DeID").cumcount()+1
ekg_notes['note_num'] = ekg_notes.groupby("DeID").cumcount()+1


In [104]:
def get_notes_by_enc_ID(row):
    
    # get all prior notes associated with a CPC encounter
    prior_notes_row = prior_notes[(prior_notes['DeID'] == row['DeID'])]
    
    # just confirm that they're sorted and get most recent note
    prior_notes_row = prior_notes_row.sort_values("_CreationInstant", ascending=False)
    current_note = prior_notes_row.iloc[0]

    # if the most recent "prior" note isn't an ED provider note or it was created before the current encounter, we drop those patients
    if (current_note['Type'] != "ED Provider Notes") or (current_note['_CreationInstant'] < row['ArrivalInstant']):
        return np.nan, np.nan, np.nan
    
    previous_prior_notes_row = prior_notes_row.iloc[1:]
    
    current_ekg_notes_row = ekg_notes[(ekg_notes['DeID'] == row['DeID'])]
    current_ekg_notes_row = current_ekg_notes_row.sort_values("RESULT_TIME", ascending=True)
    # display(previous_prior_notes_row)

    # add in the number of the note into the text so we can concatenate them easily later
    # previous_prior_notes_row['compiledText'] = ("#####################################\nENCOUNTER NOTE #" + previous_prior_notes_row['note_num'].astype(str) + 
    #                         "\n\n\n" + previous_prior_notes_row['compiledText']
    #                         )

# prior_notes['compiledText'] = ()
# ekg_notes['compiledText'] = ()
# cpc_notes['compiledText'] = ("ED Arrival Time: " + cpc_notes["ArrivalInstant"].astype(str) + 
#                                            "\nType: " + "Chest Pain Center Note" +
#                                            "\n\n\n" + cpc_notes["deid_text"])

    current_note_txt = ("#####################################\nCURRENT ED PROVIDER NOTE: \n\n\n" +
                                           "\nType: " + current_note["Type"] + 
                                           "\nProvider: " + ("N/A" if pd.isnull(current_note["Service"]) else current_note["Service"]) + 
                                           "\n\n\n" + current_note["deid_text"]
                        )

    if prior_notes_row.shape[0] > 1:
        previous_prior_notes_row['compiledText'] = ("#####################################\nPAST ENCOUNTER NOTE #" + (previous_prior_notes_row['note_num'] - 1).astype(str) + 
                                "\n\n\n" + "Time Before ED Arrival: " + previous_prior_notes_row["TIME_SINCE_CPC_ARRIVAL"] + 
                                           "\nType: " + previous_prior_notes_row["Type"] + 
                                           "\nProvider: " + previous_prior_notes_row["Service"].fillna("N/A") + 
                                           "\n\n\n" + previous_prior_notes_row["deid_text"]
                                )

    # we don't have person_IDs for EKGs so we can't use prior EKGs from before the CPC encounter
    # previous_ekg_notes_row['compiledText'] = ("#####################################\nEKG NOTE #" + previous_ekg_notes_row['note_num'].astype(str) + 
    #                         "\n\n\n" + previous_ekg_notes_row['compiledText']
    #                         )

    current_ekg_notes_row['compiledText'] = ("#####################################\nEKG IMPRESSION NOTE [#" + current_ekg_notes_row['note_num'].astype(str) + 
                            "] IN CURRENT ENCOUNTER\n\n\n" + "Time Since ED Arrival: " + current_ekg_notes_row["TIME_SINCE_CPC_ARRIVAL"] + 
                                           "\nType: " + "EKG" + 
                                           "\n\n\n" + current_ekg_notes_row["deid_text"]
                            )

    if prior_notes_row.shape[0] > 1:
        previous_prior_notes_txt = "\n\n\n".join(previous_prior_notes_row['compiledText'].tolist())
    else:
        previous_prior_notes_txt = ""

    current_ekg_notes_txt = "\n\n\n".join(current_ekg_notes_row['compiledText'].tolist())

    ## we need 3 things for the HEART Score so we create them below
    # History, Age, Troponin: Current Encounter Note (ED Provider Notes)
    # EKGs: All Current EKGs
    # Risk Factors: Current/Prior All Note Types

    # we don't have person_IDs for EKGs so we can't use prior EKGs from before the CPC encounter
    # all_notes_txt = previous_prior_notes_txt + "\n\n\n" + previous_ekg_notes_txt + "\n\n\n" + current_prior_notes_txt + "\n\n\n" + current_ekg_notes_txt
    all_notes_txt = current_note_txt + "\n\n\n" + current_ekg_notes_txt + "\n\n\n" + previous_prior_notes_txt 

    return current_note_txt, current_ekg_notes_txt, all_notes_txt


output = cpc_notes.apply(get_notes_by_enc_ID, axis=1, result_type="expand")

In [105]:
output

Unnamed: 0,0,1,2
0,#####################################\nCURRENT...,#####################################\nEKG IMP...,#####################################\nCURRENT...
1,#####################################\nCURRENT...,#####################################\nEKG IMP...,#####################################\nCURRENT...
2,#####################################\nCURRENT...,#####################################\nEKG IMP...,#####################################\nCURRENT...
3,,,
4,#####################################\nCURRENT...,#####################################\nEKG IMP...,#####################################\nCURRENT...
...,...,...,...
1155,#####################################\nCURRENT...,#####################################\nEKG IMP...,#####################################\nCURRENT...
1156,#####################################\nCURRENT...,#####################################\nEKG IMP...,#####################################\nCURRENT...
1157,#####################################\nCURRENT...,#####################################\nEKG IMP...,#####################################\nCURRENT...
1158,,,


In [106]:
cpc_notes.shape

(1160, 7)

In [107]:
(~output[0].isna()).sum()

936

In [108]:
N = 103
cpc_notes.iloc[N]

text              Emergency Department - Chest Pain Center\n----...
spans                                                            []
deid_text         Emergency Department - <<REDACTED>>\n---------...
ArrivalInstant                                  2022-06-21 13:50:00
PAT_ENC_CSN_ID                                            290352583
DeID                                                            107
compiledText      ED Arrival Time: 2022-06-21 13:50:00\nType: Ch...
Name: 103, dtype: object

In [109]:
print(output.iloc[N,2])

#####################################
CURRENT ED PROVIDER NOTE: 



Type: ED Provider Notes
Provider: Stroke


History 
Chief Complaint 
Patient presents with 
 Chest Pain 
 
 
HPI 
 
<<REDACTED>> is a 60 y.o. female who presents with constant chest pain and pressure onset 3 days ago, states she feels like she can't take a deep breath. Different from usual GERD, similar to anxiety attack- only difference is Ativan didn't relieve sx as it usually does with anxiety attack. Feels like her "heart is pounding," feels like she has to keep swallowing. Also notes burning L arm and L leg pain onset 2 weeks ago, Occasional L leg swelling over past 6 mo- went away with Ibuprofen 200mg. 
 
Took Ativan and Famotidine medication today. + tingling in L arm earlier today- resolved with ASA.Has never had anything like this before. 
 
Husband had MI w/stent placement 12 days ago, got CABG <<DATE>>. Stressed and tearful when mentioning him. Lots 4 lbs over past week. 
 
Saw primary care provider 6 days a

In [110]:
# 224 patients dropped
# 79 don't have the most recent previous encounter note be ED Provider Note
# 145 have the most recent ED Provider Note from before this current encounter's ArrivalInstant
# Final cohort = 1160 - (145 + 79)

In [111]:
gpt_input = output.rename({0:"Current_Note", 
                              1:"Current_EKG",
                              2:"All_Notes"}, axis=1)

cpc_notes_processed = pd.concat([cpc_notes, gpt_input], axis=1)

# dropping the ones that don't have any text because they didn't have the right notes
print(f"Before drop missing patients size: {cpc_notes_processed.shape[0]}")
cpc_notes_processed = cpc_notes_processed[~cpc_notes_processed['Current_Note'].isna()]
print(f"After drop missing patients size: {cpc_notes_processed.shape[0]}")


Before drop missing patients size: 1160
After drop missing patients size: 936


# GPT Code

In [23]:
prompts = pd.read_csv("/Users/vsocrates/Documents/Yale/Heart_Score/prompt_iteration_for_GPT.csv")


In [24]:
system_role_prompt = prompts[prompts['Step'] == "INSTRUCTIONAL_PHRASE"]['Prompt'].squeeze()
prompts

Unnamed: 0,Step,Relevant_Notes,Prompt,VERSION
0,INSTRUCTIONAL_PHRASE,,You are a cardiologist consulted by the Emerge...,V10
1,History,Current_Notes,\nPROMPT:\nYou will be given clinical encounte...,
2,EKG,Current_EKG,\nPROMPT:\nYou will be given clinical encounte...,
3,Risk_Factors,All_Notes,\nPROMPT:\nYou will be given clinical encounte...,
4,OnePass_Prompt,All_Notes,\nPROMPT:\nYou will be given clinical encounte...,


In [27]:
history_prompt = prompts[prompts['Step'] == "History"]['Prompt'].squeeze()
ekg_prompt = prompts[prompts['Step'] == "EKG"]['Prompt'].squeeze()
risks_prompt = prompts[prompts['Step'] == "Risk_Factors"]['Prompt'].squeeze()
onepass_prompt = prompts[prompts['Step'] == "OnePass_Prompt"]['Prompt'].squeeze()


In [133]:
cpc_notes_processed_sample = cpc_notes_processed.sample(10)

In [127]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff



prompts = pd.read_csv("/Users/vsocrates/Documents/Yale/Heart_Score/prompt_iteration_for_GPT.csv")

system_role_prompt = prompts[prompts['Step'] == "INSTRUCTIONAL_PHRASE"]['Prompt'].squeeze()
history_prompt = prompts[prompts['Step'] == "History"]['Prompt'].squeeze()
ekg_prompt = prompts[prompts['Step'] == "EKG"]['Prompt'].squeeze()
risks_prompt = prompts[prompts['Step'] == "Risk_Factors"]['Prompt'].squeeze()
onepass_prompt = prompts[prompts['Step'] == "OnePass_Prompt"]['Prompt'].squeeze()


openai.api_type = "azure"
openai.api_base = "https://decile-openai-llm.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
openai.api_key = "02582884a0604debb3f53092fbaf630d" #os.getenv("OPENAI_API_KEY")

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)


def get_classification_from_completion(completion):
   matches = list(re.finditer(r"\[.+?\]", completion))
   if matches:
      bracketed_info = matches[-1].group(0)
      return bracketed_info
   else:
       return "ERROR - No bracketed phrase"

def get_subscore_from_completion(completion):
   bracketed_info = get_classification_from_completion(completion)
   if re.match(r"\[Not enough information", bracketed_info):
      return "ERROR - Not enough info"

   subscore_match = re.search(r"\((\d)\)", bracketed_info)
   try:
       return int(subscore_match.group(1))
   except TypeError:
       return "ERROR - No number found"
    #    raise Exception(f"Could not cast subscore as an int: {subscore_match.group(1)}")

# completion = completion_with_backoff(engine="decile-gpt-35-turbo-16k",
#                                      messages = message_text,
#                                     #   temperature=0.7,
#                                     #   max_tokens=800,
#                                     #   top_p=0.95,
#                                     #   frequency_penalty=0,
#                                     #   presence_penalty=0,
#                                     #   stop=None
#                                     )

N = 3
history_df = []
ekg_df = []
risks_df = []
onepass_df = []

for idx, row in cpc_notes_processed_sample.iterrows():

    # history
    history_message_text = [{"role":"system","content":system_role_prompt},
                    {"role":"user","content":history_prompt[:history_prompt.rfind("{")] + row['Current_Note'] + '"""'}]
    # ekg
    ekg_message_text = [{"role":"system","content":system_role_prompt},
                    {"role":"user","content":ekg_prompt[:ekg_prompt.rfind("{")] + row['Current_EKG'] + '"""'}]
    # risk factors
    risks_message_text = [{"role":"system","content":system_role_prompt},
                    {"role":"user","content":risks_prompt[:risks_prompt.rfind("{")] + row['All_Notes'] + '"""'}]
    # one pass
    onepass_message_text = [{"role":"system","content":system_role_prompt},
                    {"role":"user","content":onepass_prompt[:onepass_prompt.rfind("{")] + row['All_Notes'] + '"""'}]


    for category, df, prompt in zip(["history", "ekg", "risk_factors", "one_pass"],
                            [history_df, ekg_df, risks_df, onepass_df],
                            [history_message_text, ekg_message_text, risks_message_text, onepass_message_text]):
        for trial in range(N):
            output = {}
            completion = completion_with_backoff(engine="decile-gpt-35-turbo-16k",
                                    messages=prompt)
            print(completion)

            output['DeID'] = row['DeID']
            output['completion'] = json.dumps(completion)
            output['attempt_number'] = trial
            output['section'] = category
            output['subscore'] = get_subscore_from_completion(completion['choices'][0]['message']['content'])

            df.append(output)
            
            print(df)
            break

    break

#####################################
CURRENT ED PROVIDER NOTE: 



Type: ED Provider Notes
Provider: Emergency Medicine


History 
Chief Complaint 
Patient presents with 
 Chest Pain 
 
 
The history is provided by the patient and medical records. 
Other 
This is a new problem. The current episode started 3 to 5 hours ago. The problem occurs constantly. The problem has been gradually improving. Associated symptoms include chest pain. Pertinent negatives include no abdominal pain, no headaches and no shortness of breath. The symptoms are aggravated by exertion. The symptoms are relieved by rest. He has tried ASA for the symptoms. The treatment provided moderate relief. 
 
 
Past Medical History: 
Diagnosis Date 
 AF (atrial fibrillation) (HC Code) (HC CODE) 
 Atrial fibrillation (HC Code) (HC CODE) 
 BPH (benign prostatic hyperplasia) 
 Cancer (HC Code) (HC CODE) 
 basil cell 
 Dyslipidemia 
 Hyperlipidemia 
 Macular degeneration 
 Pulmonary emboli (HC CODE) 
 
 
No past surgical histo

In [128]:
history_df

[{'DeID': 1,
  'completion': '{"id": "chatcmpl-8nUVb4bXeuj6gCMyRLG48ODvayw3c", "object": "chat.completion", "created": 1706806039, "model": "gpt-35-turbo-16k", "prompt_filter_results": [{"prompt_index": 0, "content_filter_results": {"hate": {"filtered": false, "severity": "safe"}, "self_harm": {"filtered": false, "severity": "safe"}, "sexual": {"filtered": false, "severity": "safe"}, "violence": {"filtered": false, "severity": "safe"}}}], "choices": [{"finish_reason": "stop", "index": 0, "message": {"role": "assistant", "content": "The patient presented to the Emergency Department with chest pain that started 3 to 5 hours ago and has been gradually improving. The pain is aggravated by exertion and relieved by rest. The patient has tried ASA for symptom relief, which provided moderate relief. These features are consistent with a moderately suspicious history of myocardial infarction (MI) [Moderately suspicious (1)]."}, "content_filter_results": {"hate": {"filtered": false, "severity": "

In [129]:
ekg_df

[{'DeID': 1,
  'completion': '{"id": "chatcmpl-8nUVdgekLy8RzCqGOo9bp48Fd3wR9", "object": "chat.completion", "created": 1706806041, "model": "gpt-35-turbo-16k", "prompt_filter_results": [{"prompt_index": 0, "content_filter_results": {"hate": {"filtered": false, "severity": "safe"}, "self_harm": {"filtered": false, "severity": "safe"}, "sexual": {"filtered": false, "severity": "safe"}, "violence": {"filtered": false, "severity": "safe"}}}], "choices": [{"finish_reason": "stop", "index": 0, "message": {"role": "assistant", "content": "Based on the information provided in the encounter notes, the patient\'s EKG interpretation is not clear. There is mention of atrial fibrillation, right bundle branch block, sinus rhythm, atrial premature complexes, incomplete right bundle branch block, and prolonged PR interval. However, there is no specific mention of significant ST deviations or elevations.\\n\\nTherefore, the EKG subscore is [Not enough information - Pt EKG interpretation not present]."}

In [131]:
pd.DataFrame.from_records(risks_df)

Unnamed: 0,DeID,completion,attempt_number,section,subscore
0,1,"{""id"": ""chatcmpl-8nUVeOHPXrnohHYAYG6tuLrUoSKhB...",0,risk_factors,1


In [136]:
x = [1,2,3,4,5,5,6,7]
x.reverse()
x[:20]

[7, 6, 5, 5, 4, 3, 2, 1]

In [141]:
print("The patient's chief complaint is sudden onset epigastric pain traveling to the back. The pain is rated 8/10 and has worsened since its onset. The patient also reports a previous episode with similar pain. Although the patient initially attributed the pain to intermittent fasting, it has become significantly worse. The pain is associated with nausea but no vomiting or diarrhea. \n\nBased on the patient's symptoms, the pain characteristics, and the absence of specific chest pain patterns, triggers, localization, related symptoms, and reactions to sublingual nitrates, the patient's history is [Slightly suspicious (0)].")

The patient's chief complaint is sudden onset epigastric pain traveling to the back. The pain is rated 8/10 and has worsened since its onset. The patient also reports a previous episode with similar pain. Although the patient initially attributed the pain to intermittent fasting, it has become significantly worse. The pain is associated with nausea but no vomiting or diarrhea. 

Based on the patient's symptoms, the pain characteristics, and the absence of specific chest pain patterns, triggers, localization, related symptoms, and reactions to sublingual nitrates, the patient's history is [Slightly suspicious (0)].
