In [11]:
import pandas as pd

discharge_sections_df = pd.read_csv('discharge_sections.csv')
radiology_sections_df = pd.read_csv('radiology_sections.csv')

# NLP Pipeline

In [12]:
discharge_sections_df.head(2)

Unnamed: 0.1,Unnamed: 0,subject_id,hadm_id,CC,Service,Major Surgical Procedure,HPI,PMH,SOC,FH,...,Problem List,Physical Exam,Medication Lists,Pertinent Results,BHC,Transitional Issues,Disposition,Discharge Instructions,Followup Instructions,Discharge Diagnosis
0,0,10001884,24962904,Shortness of Breath,"MEDICINE\n \nAllergies: \nIV Dye, Iodine Conta...",,Ms. ___ is a ___ female with history of \nCOPD...,- COPD/Asthma on home 2L O2\n- Atypical Chest ...,___,Mother with asthma and hypertension. Father wi...,...,,ADMISSION PHYSICAL EXAM:\n====================...,is accurate and complete.\n1. Acetaminophen 32...,ADMISSION LABS: \n=========================\n_...,Ms. ___ is a ___ female with history of \nCOPD...,==========================\n[] For pt's contin...,Extended Care\n \nFacility:\n___\n \n___ Diagn...,"Dear Ms. ___,\n\nYou were admitted to ___ afte...",,PRIMARY:\nCOPD Exacerbation\n\nSECONDARY:\nAfi...
1,1,10003019,22774359,fever,MEDICINE\n \nAllergies: \nRagweed / morphine /...,none,Mr ___ is a ___ with h/o stage IV Hodgkins c1d...,"1. Sarcoidosis, dx skin bx: intestinal & pulmo...",___,"Mother: ___, cardiac disease. \nFather: diver...",...,,ADMISSION EXAM\nVitals: 124/67 on neosynephrin...,The Preadmission Medication list is accurate a...,ADMISSION LABS\n___ 10:40AM BLOOD WBC-0.2* RBC...,___ male with h/o Hodgkin's lymphoma C1D17 ABV...,,Home With Service\n \nFacility:\n___\n \nDisch...,"Dear Mr. ___,\n\nIt has been our pleasure to b...",,"Primary Diagnosis\nNeutropenic Fever, no sourc..."


In [6]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM
import pandas as pd

# Load the model and tokenizer
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

def query_with_llama(question, dataframes, column_lists, hadm_id, output_column):
    
    context_data = ""

    # Iterate through each DataFrame and its corresponding columns to create the context
    for df, cols in zip(dataframes, column_lists):
        # Filter the dataframe for the specific hadm_id
        target_df = df[df['hadm_id'] == hadm_id]
        if not target_df.empty:
            # Create a context string from the specified columns
            context_part = target_df[cols].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')
            context_data += " " + context_part
    
    if not context_data:
        dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = "No records found for the given HADM ID."
        return dataframes[0]

    # Combine the question with the aggregated context data
    inputs = tokenizer(question + " " + context_data, return_tensors="pt", truncation=True, max_length=1024)

    # Generate the response using the model
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model.generate(**inputs, max_length=1024)
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    
    # Append the answer to the first DataFrame
    dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = answer
    return dataframes[0]

# Example usage
question = "Summarize the radiological tests and findings"
hadm_id = 24962904  # Specify the HADM ID you want to query
dfs = [discharge_sections_df, radiology_sections_df]  # List of DataFrames
relevant_cols = [['Pertinent Results'], ['EXAMINATION','INDICATION','IMPRESSION']]  # List of column lists for each DataFrame
output_col_name = 'radiology tests summary'  # Specify the name of the new column

# Assuming df1 is the primary DataFrame where the output should be stored
df1 = query_with_llama(question, dfs, relevant_cols, hadm_id, output_col_name)
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

0    Summarize the radiological tests and findings ...
Name: radiology tests summary, dtype: object


In [7]:
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name].values[0])


Summarize the radiological tests and findings  ADMISSION LABS: 
___ 05:54PM BLOOD WBC-7.1 RBC-4.74 Hgb-12.8 Hct-41.1 MCV-87 
MCH-27.0 MCHC-31.1* RDW-22.6* RDWSD-69.0* Plt ___
___ 05:54PM BLOOD Neuts-81.8* Lymphs-9.6* Monos-7.6 
Eos-0.3* Baso-0.1 Im ___ AbsNeut-5.82 AbsLymp-0.68* 
AbsMono-0.54 AbsEos-0.02* AbsBaso-0.01
___ 06:35AM BLOOD Calcium-9.9 Phos-4.1 Mg-2.0
___ 05:54PM BLOOD ___ pO2-52* pCO2-49* pH-7.43 
calTCO2-34* Base XS-6
___ 05:54PM BLOOD Lactate-1.5
___ 05:54PM BLOOD proBNP-181
___ 05:54PM BLOOD cTropnT-<0.01

STUDIES: 
+ CXR (___): Mild basilar atelectasis without definite focal 
consolidation.
+ EKG: Sinus rhythm at 69, left bundle branch block, no acute ST 
or T wave changes.

DISCHARGE LABS: 
___ 06:38AM BLOOD WBC-14.4*# RBC-4.34 Hgb-11.8 Hct-37.6 
MCV-87 MCH-27.2 MCHC-31.4* RDW-22.5* RDWSD-69.4* Plt ___
___ 06:38AM BLOOD Glucose-113* UreaN-18 Creat-0.8 Na-137 
K-3.1(repleted)* Cl-94* HCO3-31 AnGap-15 Chest:  Frontal and lateral views History: ___ with dyspnea  // eval 

In [12]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM
import pandas as pd

# Load the model and tokenizer
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

def query_with_llama(question, dataframes, column_lists, hadm_id, output_column):
    
    context_data = ""

    # Iterate through each DataFrame and its corresponding columns to create the context
    for df, cols in zip(dataframes, column_lists):
        # Filter the dataframe for the specific hadm_id
        target_df = df[df['hadm_id'] == hadm_id]
        if not target_df.empty:
            # Create a context string from the specified columns
            context_part = target_df[cols].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')
            context_data += " " + context_part
    
    if not context_data:
        dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = "No records found for the given HADM ID."
        return dataframes[0]

    # Combine the question with the aggregated context data
    inputs = tokenizer(question + " " + context_data, return_tensors="pt", truncation=True, max_length=1024)

    # Generate the response using the model
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model.generate(**inputs, max_length=1024)
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    
    # Append the answer to the first DataFrame
    dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = answer
    return dataframes[0]

# Example usage
question = "Summarize"
hadm_id = 24962904  # Specify the HADM ID you want to query
dfs = [discharge_sections_df, radiology_sections_df]  # List of DataFrames
relevant_cols = [['Pertinent Results'], ['EXAMINATION','INDICATION','IMPRESSION']]  # List of column lists for each DataFrame
output_col_name = 'radiology tests summary'  # Specify the name of the new column

# Assuming df1 is the primary DataFrame where the output should be stored
df1 = query_with_llama(question, dfs, relevant_cols, hadm_id, output_col_name)
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Name: radiology tests summary, dtype: object


In [13]:
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name].values[0])


Summarize  ADMISSION LABS: 
___ 05:54PM BLOOD WBC-7.1 RBC-4.74 Hgb-12.8 Hct-41.1 MCV-87 
MCH-27.0 MCHC-31.1* RDW-22.6* RDWSD-69.0* Plt ___
___ 05:54PM BLOOD Neuts-81.8* Lymphs-9.6* Monos-7.6 
Eos-0.3* Baso-0.1 Im ___ AbsNeut-5.82 AbsLymp-0.68* 
AbsMono-0.54 AbsEos-0.02* AbsBaso-0.01
___ 06:35AM BLOOD Calcium-9.9 Phos-4.1 Mg-2.0
___ 05:54PM BLOOD ___ pO2-52* pCO2-49* pH-7.43 
calTCO2-34* Base XS-6
___ 05:54PM BLOOD Lactate-1.5
___ 05:54PM BLOOD proBNP-181
___ 05:54PM BLOOD cTropnT-<0.01

STUDIES: 
+ CXR (___): Mild basilar atelectasis without definite focal 
consolidation.
+ EKG: Sinus rhythm at 69, left bundle branch block, no acute ST 
or T wave changes.

DISCHARGE LABS: 
___ 06:38AM BLOOD WBC-14.4*# RBC-4.34 Hgb-11.8 Hct-37.6 
MCV-87 MCH-27.2 MCHC-31.4* RDW-22.5* RDWSD-69.4* Plt ___
___ 06:38AM BLOOD Glucose-113* UreaN-18 Creat-0.8 Na-137 
K-3.1(repleted)* Cl-94* HCO3-31 AnGap-15 Chest:  Frontal and lateral views History: ___ with dyspnea  // eval for pneumonia Mild basilar atelectas

In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def query_with_bart(question, dataframes, column_lists, hadm_id, output_column):
    
    context_data = ""

    # Iterate through each DataFrame and its corresponding columns to create the context
    for df, cols in zip(dataframes, column_lists):
        # Filter the dataframe for the specific hadm_id
        target_df = df[df['hadm_id'] == hadm_id]
        if not target_df.empty:
            # Create a context string from the specified columns
            context_part = target_df[cols].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')
            context_data += " " + context_part
    
    if not context_data:
        dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = "No records found for the given HADM ID."
        return dataframes[0]

    # Combine the question with the aggregated context data
    inputs = tokenizer(question + " " + context_data, return_tensors="pt", truncation=True, max_length=1024)

    # Generate the response using the model
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model.generate(**inputs, max_length=1024)
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    
    # Append the answer to the first DataFrame
    dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = answer
    return dataframes[0]

# Example usage
question = "Summarize the radiological tests and findings"
hadm_id = 24962904  # Specify the HADM ID you want to query
dfs = [discharge_sections_df, radiology_sections_df]  # List of DataFrames
relevant_cols = [['Pertinent Results'], ['EXAMINATION','INDICATION','IMPRESSION']]  # List of column lists for each DataFrame
output_col_name = 'radiology tests summary'  # Specify the name of the new column

# Assuming df1 is the primary DataFrame where the output should be stored
df1 = query_with_bart(question, dfs, relevant_cols, hadm_id, output_col_name)
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name])


0    Summarize the radiological tests and findings....
Name: radiology tests summary, dtype: object


In [17]:
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name].values[0])


Summarize the radiological tests and findings. ADMISSION LABS: RBC-4.74 Hgb-12.8 Hct-41.1 MCV-87 MCH-27.0 MCHC-31.1 RDW-22.6* RDWSD-69.0* Plt. CXR: Mild basilar atelectasis without definite focal consolidation.


In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Falconsai/text_summarization")
model = AutoModelForSeq2SeqLM.from_pretrained("Falconsai/text_summarization")

def query_with_bart(question, dataframes, column_lists, hadm_id, output_column):
    
    context_data = ""

    # Iterate through each DataFrame and its corresponding columns to create the context
    for df, cols in zip(dataframes, column_lists):
        # Filter the dataframe for the specific hadm_id
        target_df = df[df['hadm_id'] == hadm_id]
        if not target_df.empty:
            # Create a context string from the specified columns
            context_part = target_df[cols].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')
            context_data += " " + context_part
    
    if not context_data:
        dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = "No records found for the given HADM ID."
        return dataframes[0]

    # Combine the question with the aggregated context data
    inputs = tokenizer(question + " " + context_data, return_tensors="pt", truncation=True, max_length=1024)

    # Generate the response using the model
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model.generate(**inputs, max_length=1024)
    
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    
    # Append the answer to the first DataFrame
    dataframes[0].loc[dataframes[0]['hadm_id'] == hadm_id, output_column] = answer
    return dataframes[0]

# Example usage
question = "Summarize"
hadm_id = 24962904  # Specify the HADM ID you want to query
dfs = [discharge_sections_df, radiology_sections_df]  # List of DataFrames
relevant_cols = [['Pertinent Results'], ['EXAMINATION','INDICATION','IMPRESSION']]  # List of column lists for each DataFrame
output_col_name = 'radiology tests summary'  # Specify the name of the new column

# Assuming df1 is the primary DataFrame where the output should be stored
df1 = query_with_bart(question, dfs, relevant_cols, hadm_id, output_col_name)
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name])


0    ___ 05:54PM BLOOD WBC-14.4*# RBC-4.74 Hgb-12.8...
Name: radiology tests summary, dtype: object


In [25]:
print(df1.loc[df1['hadm_id'] == hadm_id, output_col_name].values[0])


___ 05:54PM BLOOD WBC-14.4*# RBC-4.74 Hgb-12.8 Hct-41.1 MCV-87 MCH-27.2* RDWSD-69.0* Plt ___ ___ 05:54PM BLOOD Lactate-1.5 ___ ___ 06:38AM BLOOD Lactate-1.5 ___ __ 05:54PM BLOOD Lactate-1.5 _


In [None]:
# question = "Summarize the radiological tests and findings"
# relevant_cols = ['Pertinent findings'] from discharge,['examination','indication','Impressions'] from radiology

In [None]:
# # question 2
# question = "Why was the patient admitted to the hospital?"
# relevant_cols = ['History of Present Illness', 'Chief Complaint']
# Reason_for_admission = query_with_chatgpt(prompt, df, relevant_cols, hadm_id )

# # question 3
# question = "What tests, diagnostic workup and treatments did the patient receive in the emergency room?"
# relevant_cols = ['History of Present Illness']
# ED_Summary = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 4
# question = "Combine the following admission and discharge medication lists into a table with the columns "name" ,"admission dose", "discharge dose" ,"route"
# relevant_cols = ['Medication List']
# Med_comp = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 5
# question = "What medications were new at discharge?"
# relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# New_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 6
# question = "What medications changed and by how much?"
# relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# Changed_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 7
# question = "What medications were stopped or discontinued?"
# relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# Discontinued_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# # question 8
# question = "Based on the medication list, what major medical problems was the patient treated for?"
# relevant_cols = ['Medication list']
# Med_indications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 9
# question = "What major treatments, surgeries or procedures did the patient receive? "
# relevant_cols = ['Major Procedures', 'History of Present Illness'] # and add input from Med_comp
# Major_treatments = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 10
# question = "Summarize the radiological tests and findings"
# relevant_cols = ['Pertinent findings', 'Impressions'] # also add radiology notes ()
# Radiology_findings = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 11
# question = "What were the most pertinent lab, radiology, or study results?"
# relevant_cols = ['Pertinent Results', 'History of Present Illness'] # add Radiology_findings from above
# Pertinent_results = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# # question 12
# question = "What diagnoses did the patient receive?"
# relevant_cols = ['discharge diagnosis'] # add in ICD codes, will need to construct a string: the patient recieved the following diagnosis from ICD version [ICD version]: codes [codes]
# Diagnosis_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 13
# question = "Create a list of problems the patient was treated for during this hospital admission.  Combine the major medical problems, the reason the patient was admitted, and icd diagnosis"
# relevant_cols = [''] # input will be answers to questions above Diagnosis_lis, Relevant_PMH, Major_treatments...
# Hospital_problems =  query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # generate outputs
# # question 15
# question = "Create a summary of the events and treatments during the hospital visit.  Start with a summary sentence describing the patient’s major medical problems and the reason for the hospital admission.  Then create a list of each problem the patient was treated for combining their major medical problems, reason for admission, and ICD diagnoses.  For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
# relevant_cols = [''] # inputs will be answers to previous questions
# BHC_generated = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 16
# question = "Summarize in 1-2 sentences, the patient’s major medical problems and the reason for the hospital admission. You may use common abbreviations of medical terms when possible."
# relevant_cols = ['Reason_for_admission', 'Relevant_PMH']
# BHC_summary_sentence = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 17
# question = "For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
# relevant_cols = ['']
# BHC_problem_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # stitch together BHC summary sentence and problem list output

# # question 18
# question = "Compose a letter to the patient that is courteous and easy to understand.  There should be limited medical jargon and it should be written in layman's language.  The letter will describe briefly the reason for admission and what treatments were given.  It will also include any major changes to the patient's current medical management. Include the following information: why the patient was admitted to the hospital, including major medical condition or symptoms. What were the most relevant diagnostic tests and what did they show? What major treatments did the patient receive? Are there any pending results?  What are the changes to the existing medications? Is there any scheduled or recommended  follow up? Include the following items: "
# relevant_cols = ['']
# Discharge_instructions_generated = query_with_chatgpt(question, df, relevant_cols, hadm_id )

In [None]:
!pip install sacremoses


In [None]:
print(cur_discharge_df['HPI'].iloc[0])
print(f"Prompt: {prompt}")
cur_discharge_df.head()
# print(f"Response: {generated_text}")
# print("\n==========================================================")

In [None]:
print(diagnosis_df['stay_id'].iloc[1])
# diagnosis_df.head(1)


In [None]:
outputs = model.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text

    print(f"Prompt: {prompt}")
    print(f"Response: {generated_text}")
    print("\n==========================================================")


# Prune Data Tables


In [None]:
# Bhanu method - segment radiology notes
def extract_text_by_subheading_radiology_bhanu(text):
    # Define regular expressions for each subheading
    headings_regex = {
        'EXAMINATION': r'EXAMINATION:(.*?)(?=INDICATION:|$)',
        'INDICATION': r'INDICATION:(.*?)(?=TECHNIQUE:|$)',
        'TECHNIQUE': r'TECHNIQUE:(.*?)(?=COMPARISON:|$)',
        'COMPARISON': r'COMPARISON:(.*?)(?=FINDINGS:|$)',
        'FINDINGS': r'FINDINGS:(.*?)(?=IMPRESSION:|$)',
        'IMPRESSION': r'IMPRESSION:(.*)'
    }

    extracted_text = {}

    # Iterate over each subheading and extract corresponding text
    for heading, regex in headings_regex.items():
        match = re.search(regex, text, re.DOTALL)
        if match:
            extracted_text[heading] = match.group(1).strip()
        else:
            extracted_text[heading] = None

    return extracted_text



In [None]:
#apply function
radiology_sections_df = radiology_df['text'].apply(extract_text_by_subheading_radiology_bhanu).apply(pd.Series)

# Concatenating the columns from radiology_df into radiology_sections_df
radiology_sections_df = pd.concat([radiology_df[['subject_id', 'hadm_id']], radiology_sections_df], axis=1)


In [None]:
# Bhanu method - discharge note segmentation by heading

subject_id = discharge_df['subject_id']
hadm_id = discharge_df['hadm_id']
stay_id = edstays_df[edstays_df['hadm_id'].isin(hadm_id)]['stay_id']
dcnote_id = discharge_df['note_id']
cctriage = triage_df[triage_df['stay_id'].isin(stay_id)]['chiefcomplaint']
icd_codes = diagnosis_df[diagnosis_df['stay_id'].isin(stay_id)]['icd_code']
icd_version = diagnosis_df[diagnosis_df['stay_id'].isin(stay_id)]['icd_version']
icd_title = diagnosis_df[diagnosis_df['stay_id'].isin(stay_id)]['icd_title']
radiologynotes = radiology_df[radiology_df['hadm_id'].isin(hadm_id)]['text']
dischargenotes = discharge_df[discharge_df['hadm_id'].isin(hadm_id)]['text']

df = pd.DataFrame({
    'subject_id': subject_id,
    'hadm_id': hadm_id,
    'stay_id': stay_id,
    'dcnote_id': dcnote_id,
    'cctriage': cctriage,
    'icd_codes': icd_codes,
    'icd_version': icd_version,
    'icd_title': icd_title,
    'radiology_notes': radiologynotes,
    'discharge_notes': dischargenotes
})
def extract_text_by_subheading_discharge_bhanu(text):
    # Define regular expressions for each subheading in discharge notes
    headings_regex = {
        'CC': r'(CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:)(.*?)(?=Service:|Specialty:|Unit:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|Major Surgical or Invasive Procedure:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Service': r'(Service:|Specialty:|Unit:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Major Surgical Procedure': r'(Major Surgical or Invasive Procedure:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'HPI': r'(HPI:|History of Present Illness:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'PMH': r'(PMH:|Past Medical History:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Social History': r'(Social History:|SOC:|SH:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Family History': r'(Family History:|PFH:|FH:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Past Surgical History': r'(Past Surgical History:|PSH:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Problem List': r'(Problem List:|Problems:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Physical Exam': r'(Physical Exam:|PE:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Medication lists': r'(Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Pertinent Results': r'(Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'BHC': r'(BHC:|Brief Hospital Course:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Disposition': r'(Disposition:|Dispo:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Discharge Instructions': r'(Discharge Instructions:|Patient Instructions:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Followup Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Followup Instructions': r'(Followup Instructions:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:|Service:|Specialty:|Unit:|Major Surgical or Invasive Procedure:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Transitional Issues:|Pertinent Results:|Pertinent Findings:|Discharge Diagnosis:$)',
        'Discharge Diagnosis': r'(Discharge Diagnosis:)(.*?)(?=CC:|Chief Complaint:|Reason for admission:|Reason for hospital admission:| Service:|Specialty:|Unit:|Major Medical Procedures:|Major Procedures:|Major Surgeries:|HPI:|History of Present Illness:|PMH:|Past Medical History:|Social History:|SOC:|SH:|Family History:|PFH:|FH:|Past Surgical History:|PSH:|Problem List:|Problems:|Physical Exam:|PE:|Medication lists:|Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:|Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:|BHC:|Brief Hospital Course:|Disposition:|Dispo:|Discharge Instructions:|Patient Instructions:|Followup Instructions:|Transitional Issues:$)',
    }

    extracted_text = {}

    # Iterate over each subheading and extract corresponding text
    for heading, regex in headings_regex.items():
        match = re.search(regex, text, re.DOTALL)
        if match:
            extracted_text[heading] = match.group(2).strip()
        else:
            extracted_text[heading] = None

    return extracted_text



In [None]:
# apply bhanu function
discharge_sections_df = discharge_df['text'].apply(extract_text_by_subheading_discharge_bhanu).apply(pd.Series)

v# Concatenating the columns from discharge_df into discharge_sections_df
discharge_sections_df = pd.concat([discharge_df[['subject_id', 'hadm_id']], discharge_sections_df], axis=1)

In [None]:
# Raj method radiology
def extract_text_by_subheading_radiology_raj(text):
    headings_regex = {
        'EXAMINATION': r'EXAMINATION:(.*?)(?=INDICATION:|$)',
        'INDICATION': r'INDICATION:(.*?)(?=TECHNIQUE:|$)',
        'TECHNIQUE': r'TECHNIQUE:(.*?)(?=COMPARISON:|$)',
        'COMPARISON': r'COMPARISON:(.*?)(?=FINDINGS:|$)',
        'FINDINGS': r'FINDINGS:(.*?)(?=IMPRESSION:|$)',
        'IMPRESSION': r'IMPRESSION:(.*)'
    }
    extracted_text = {}
    for heading, regex in headings_regex.items():
        match = re.search(regex, text, re.DOTALL)
        if match:
            extracted_text[heading] = match.group(1).strip()
        else:
            extracted_text[heading] = None
    return extracted_text



In [None]:
# Apply the raj function to each row in the 'radiology_notes' column and create a new DataFrame from the results
extracted_df = pd.DataFrame(merged_df['radiology_notes'].apply(extract_text_by_subheading_radiology_raj).tolist())

# Concatenate the new DataFrame with the original merged_df
merged_df_expanded = pd.concat([merged_df, extracted_df], axis=1)
merged_df_expanded.head()

In [None]:
# raj method discharge notes


def extract_text_by_subheading_discharge_raj(text):

    headings_regex = {
    'Chief Complaint': r'(?:CC:|Chief Complaint:| \n___ Complaint:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Service': r'(?:Service:|Specialty:|Unit:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Major Surgical Procedure': r'(?:Major Medical Procedures:|Major Procedures|Major Surgeries|Major Surgical or Invasive Procedure)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'History of Present Illness': r'(?:HPI:|History of Present Illness:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Past Medical History': r'(?:PMH:|Past Medical History:|PAST MEDICAL HISTORY:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Social History': r'(?:Social History:|SOC:|SH:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Family History': r'(?:Family History:|PFH:|FH:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Past Surgical History': r'(?:Past Surgical History:|PSH:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Problem List': r'(?:Problem List:|Problems:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Physical Exam': r'(?:Physical Exam:|PE:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Medication lists': r'(?:Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Pertinent Results': r'(?:Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Brief Hospital Course': r'(?:BHC:|Brief Hospital Course:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Disposition': r'(?:Disposition:|Dispo:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Discharge Diagnosis': r'Discharge Diagnosis:\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Discharge Instructions': r'(?:Discharge Instructions:|Patient Instructions:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Followup Instructions': r'(?:Followup Instructions:)\s*(.*?)(?=\n[A-Za-z ]+:|$)',
    'Transitional Issues': r'(?:Transitional Issues:)\s*(.*?)(?=\n[A-Za-z ]+:|$)'
}


    extracted_text = {}
    for heading, regex in headings_regex.items():
        match = re.search(regex, text, re.DOTALL)
        if match:
            extracted_text[heading] = match.group(1).strip() if match.group(1) else None
        else:
            extracted_text[heading] = None
    return extracted_text



In [None]:
# apply raj method of discharge note segmentation
# Apply the function to the 'discharge_notes' column
extracted_discharge = pd.DataFrame(merged_df_expanded['discharge_notes'].apply(extract_text_by_subheading_discharge_raj).tolist())

# Concatenate the new DataFrame with the original merged_df to include the new columns
merged_df_expanded_with_discharge = pd.concat([merged_df_expanded, extracted_discharge], axis=1)
merged_df_expanded_with_discharge.head()
print(merged_df_expanded_with_discharge.radiology_notes[0])

In [None]:
# save curated, pre-processed data to new file

# clear all loaded data from memory


In [None]:
# load pre-processed data

In [None]:
hadm_id = 22595853  # Specify the HADM ID you want to query, start with discharge_target 'hadm_id')


# question list, relevant_cols (read from csv file 'Pipeline_Prompt_Instructions_Data_Pairs.csv')
question = "What major treatments, surgeries or procedures did the patient receive?"
relevant_cols = ['History of Present Illness','Major Surgical or Invasive Procedure']  # Specify columns related to the question

answer = query_with_chatgpt(question, merged_df_expanded_with_discharge, relevant_cols, hadm_id)
print(answer)



In [None]:
# slightly different way using the segmented data frames (unmerged)
hadm_id = target_df.hadm_id[0] # select hadm_id from target table

df = merged_df_expanded_with_discharge
# seeding the conversation 0
prompt = 'This conversation is about clinical notes and documents from electronic health records'
relevant_cols = ['History of Present Illness','Major Surgical or Invasive Procedure']  # Specify columns related to the question
answer = query_with_chatgpt(prompt, df, relevant_cols, hadm_id )

# question 1
question = "What are the patient’s major medical problems?"
relevant_cols = ['History of Present Illness']
Relevant_PMH = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 2
question = "Why was the patient admitted to the hospital?"
relevant_cols = ['History of Present Illness', 'Chief Complaint']
Reason_for_admission = query_with_chatgpt(prompt, df, relevant_cols, hadm_id )

# question 3
question = "What tests, diagnostic workup and treatments did the patient receive in the emergency room?"
relevant_cols = ['History of Present Illness']
ED_Summary = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 4
question = "Combine the following admission and discharge medication lists into a table with the columns "name" ,"admission dose", "discharge dose" ,"route"
relevant_cols = ['Medication List']
Med_comp = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 5
question = "What medications were new at discharge?"
relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
New_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 6
question = "What medications changed and by how much?"
relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
Changed_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 7
question = "What medications were stopped or discontinued?"
relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
Discontinued_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# question 8
question = "Based on the medication list, what major medical problems was the patient treated for?"
relevant_cols = ['Medication list']
Med_indications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 9
question = "What major treatments, surgeries or procedures did the patient receive? "
relevant_cols = ['Major Procedures', 'History of Present Illness'] # and add input from Med_comp
Major_treatments = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 10
question = "Summarize the radiological tests and findings"
relevant_cols = ['Pertinent findings', 'Impressions'] # also add radiology notes ()
Radiology_findings = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 11
question = "What were the most pertinent lab, radiology, or study results?"
relevant_cols = ['Pertinent Results', 'History of Present Illness'] # add Radiology_findings from above
Pertinent_results = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# question 12
question = "What diagnoses did the patient receive?"
relevant_cols = ['discharge diagnosis'] # add in ICD codes, will need to construct a string: the patient recieved the following diagnosis from ICD version [ICD version]: codes [codes]
Diagnosis_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 13
question = "Create a list of problems the patient was treated for during this hospital admission.  Combine the major medical problems, the reason the patient was admitted, and icd diagnosis"
relevant_cols = [''] # input will be answers to questions above Diagnosis_lis, Relevant_PMH, Major_treatments...
Hospital_problems =  query_with_chatgpt(question, df, relevant_cols, hadm_id )

# generate outputs
# question 15
question = "Create a summary of the events and treatments during the hospital visit.  Start with a summary sentence describing the patient’s major medical problems and the reason for the hospital admission.  Then create a list of each problem the patient was treated for combining their major medical problems, reason for admission, and ICD diagnoses.  For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
relevant_cols = [''] # inputs will be answers to previous questions
BHC_generated = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 16
question = "Summarize in 1-2 sentences, the patient’s major medical problems and the reason for the hospital admission. You may use common abbreviations of medical terms when possible."
relevant_cols = ['Reason_for_admission', 'Relevant_PMH']
BHC_summary_sentence = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 17
question = "For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
relevant_cols = ['']
BHC_problem_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# stitch together BHC summary sentence and problem list output

# question 18
question = "Compose a letter to the patient that is courteous and easy to understand.  There should be limited medical jargon and it should be written in layman's language.  The letter will describe briefly the reason for admission and what treatments were given.  It will also include any major changes to the patient's current medical management. Include the following information: why the patient was admitted to the hospital, including major medical condition or symptoms. What were the most relevant diagnostic tests and what did they show? What major treatments did the patient receive? Are there any pending results?  What are the changes to the existing medications? Is there any scheduled or recommended  follow up? Include the following items: "
relevant_cols = ['']
Discharge_instructions_generated = query_with_chatgpt(question, df, relevant_cols, hadm_id )



In [None]:
# scratch cell

# template
# question
question = ""
relevant_cols = ['']
answer = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# Create a context string from the specified columns
context_data = df[columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')
prompt = f"Question: {question}\nData: {context_data}\nAnswer:"

ans_1 = query_with_x(prompt)

# to do - new query_function (prompt, data)
# generate prompt with each question.  data = new table with selected cols

In [None]:
hadm_id = target_df.hadm_id[0] # select hadm_id from target table
context = discharge_df[discharge_df['hadm_id'] == hadm_id]['text'].values[0]

question = "what symptoms did the patient present with?"

answer = 

# openAIGPT

In [None]:
import os
import openai
import pandas as pd

# Set the API key
openai.api_key = os.getenv("OPENAI_API_KEY")

def query_with_chatgpt(question, df, columns, hadm_id):
    # Filter the dataframe for the specific hadm_id
    df = df[df['hadm_id'] == hadm_id]

    if df.empty:
        return "No records found for the given HADM ID."

    # Create a context string from the specified columns
    context_data = df[columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')

    # Limit the context data length if necessary
    max_length = 4000  # Adjust according to the GPT-3 token limit
    if len(context_data) > max_length:
        context_data = context_data[:max_length]

    # Create the prompt for the API
    prompt = f"Question: {question}\nData: {context_data}\nAnswer:"

    try:
        # Call the OpenAI API
        response = openai.Completion.create(
            engine="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=150,
            temperature=0.5
        )
        # Extract the text from the response
        answer = response.choices[0].text.strip()
        return answer
    except Exception as e:
        return f"Error: {str(e)}"



In [None]:
# Example usage
question = "What major treatments, surgeries or procedures did the patient receive?"
hadm_id = 22595853  # Specify the HADM ID you want to query
relevant_cols = ['History of Present Illness','Major Surgical or Invasive Procedure']  # Specify columns related to the question
answer = query_with_chatgpt(question, merged_df_expanded_with_discharge, relevant_cols, hadm_id)
print(answer)

# bioGPT
https://github.com/microsoft/BioGPT/tree/main/src

# requirements
command line:

git clone https://github.com/pytorch/fairseq
cd fairseq
git checkout v0.12.0
pip install .
python setup.py build_ext --inplace
cd ..

git clone https://github.com/moses-smt/mosesdecoder.git
export MOSES=${PWD}/mosesdecoder

git clone https://github.com/glample/fastBPE.git
export FASTBPE=${PWD}/fastBPE
cd fastBPE
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast

pip install sacremoses

pip install scikit-learn



mkdir checkpoints
cd checkpoints
wget https://msralaphilly2.blob.core.windows.net/release/BioGPT/checkpoints/Pre-trained-BioGPT.tgz?sp=r&st=2023-11-13T15:37:35Z&se=2099-12-30T23:37:35Z&spr=https&sv=2022-11-02&sr=b&sig=3CcG1TOhqJPBhkVutvVn3PtUq0vPyLBgwggUfojypfY%3D
tar -zxvf Pre-trained-BioGPT.tgz

# example useage
import torch
from fairseq.models.transformer_lm import TransformerLanguageModel
m = TransformerLanguageModel.from_pretrained(
        "checkpoints/Pre-trained-BioGPT",
        "checkpoint.pt",
        "data",
        tokenizer='moses',
        bpe='fastbpe',
        bpe_codes="data/bpecodes",
        min_len=100,
        max_len_b=1024)
m.cuda()
src_tokens = m.encode("COVID-19 is")
generate = m.generate([src_tokens], beam=5)[0]
output = m.decode(generate[0]["tokens"])
print(output)

In [None]:
import torch
from transformers import BioGptModel, BioGptConfig

# Initializing a BioGPT microsoft/biogpt style configuration
configuration = BioGptConfig()

# Initializing a model from the microsoft/biogpt style configuration
model = BioGptModel(configuration)

# Accessing the model configuration
configuration = model.config

from src.transformer_lm_prompt import TransformerLanguageModelPrompt
m = TransformerLanguageModelPrompt.from_pretrained(
        "checkpoints/RE-DTI-BioGPT",
        "checkpoint_avg.pt",
        "data/KD-DTI/relis-bin",
        tokenizer='moses',
        bpe='fastbpe',
        bpe_codes="data/bpecodes",
        max_len_b=1024,
        beam=1)
m.cuda()


src_text="What are the patiet's major medical problems?" # input text, e.g., a PubMed abstract
src_tokens = m.encode(src_text)
generate = m.generate([src_tokens], beam=args.beam)[0]
output = m.decode(generate[0]["tokens"])
print(output)

In [None]:
src_text="What are the patiet's major medical problems?" # input text, e.g., a PubMed abstract
context_data = df[columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')

src_tokens = m.encode(src_text)
generate = m.generate([src_tokens], beam=args.beam)[0]
output = m.decode(generate[0]["tokens"])
print(output)

# Amazon Bedrock

# RAG_demonstration

# Scoring

https://github.com/Stanford-AIMI/discharge-me/tree/main


https://github.com/yuh-zha/AlignScore
