# Setup

In [1]:
# Locations of the input tables and of the target table.
# DATA_DIR is the single place to change when the CSV files live elsewhere.
DATA_DIR = '../data'

diagnosis_path = f'{DATA_DIR}/diagnosis.csv'
discharge_path = f'{DATA_DIR}/discharge.csv'
edstays_path = f'{DATA_DIR}/edstays.csv'
radiology_path = f'{DATA_DIR}/radiology.csv'
triage_path = f'{DATA_DIR}/triage.csv'
target_path = f'{DATA_DIR}/discharge_target.csv'


In [3]:
!pip install pandas
import pandas as pd
import re
import os

# Load all six tables with identical options. keep_default_na=False is passed
# so that pandas does not turn its default NA markers into NaN while parsing.
(
    diagnosis_df,
    discharge_df,
    edstays_df,
    radiology_df,
    triage_df,
    target_df,
) = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in (
        diagnosis_path,
        discharge_path,
        edstays_path,
        radiology_path,
        triage_path,
        target_path,
    )
)


Collecting pandas
  Using cached pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Using cached numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Using cached numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.2.2 pytz-2024.1 tzdata-2024.1


In [5]:
# Preview the first two rows of the ED stays table.
edstays_df.head(2)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,outtime,gender,race,arrival_transport,disposition
0,10001884,24962904,31742950,2130-12-06 16:46:00,2130-12-06 22:05:00,F,BLACK/AFRICAN AMERICAN,WALK IN,ADMITTED
1,10003019,22774359,33010597,2175-10-08 10:10:00,2175-10-08 18:58:00,M,WHITE,WALK IN,ADMITTED


In [6]:
# Preview the first two discharge notes (the full note is in the `text` column).
discharge_df.head(2)

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10001884-DS-35,10001884,24962904,DS,35,2130-12-08 00:00:00,2130-12-13 21:50:00,\nName: ___ Unit No: ___\n \nA...
1,10003019-DS-22,10003019,22774359,DS,22,2175-10-17 00:00:00,2175-10-24 14:40:00,\nName: ___. Unit No: ___\n \...


In [7]:
# Preview the first two rows of the target table.
target_df.head(2)

Unnamed: 0,note_id,hadm_id,discharge_instructions,brief_hospital_course,discharge_instructions_word_count,brief_hospital_course_word_count
0,10001884-DS-35,24962904,"Dear Ms. ___,\n\nYou were admitted to ___ afte...",Ms. ___ is a ___ female with history of \nCOPD...,87,358
1,10003019-DS-22,22774359,"Dear Mr. ___,\n\nIt has been our pleasure to b...",___ male with h/o Hodgkin's lymphoma C1D17 ABV...,140,78


In [8]:

# Preview the first three diagnosis rows (keyed by stay_id).
diagnosis_df.head(3)


Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title
0,10001884,31742950,1,J441,10,Chronic obstructive pulmonary disease w (acute...
1,10003019,33010597,1,28800,9,"NEUTROPENIA, UNSPECIFIED"
2,10003019,33010597,2,78060,9,"FEVER, UNSPECIFIED"


In [9]:
# (rows, columns) of the ED stays table.
edstays_df.shape

(10985, 9)

In [10]:
# (rows, columns) of the diagnosis table, for comparison with the merged frame.
diagnosis_df.shape

(21764, 6)

In [11]:
# Attach the hospital admission id to every diagnosis row by looking up its
# ED stay. A left join keeps every diagnosis row.
stay_to_hadm = edstays_df[['stay_id', 'hadm_id']]
diagnosis_hadm_df = pd.merge(diagnosis_df, stay_to_hadm, how='left', on='stay_id')
diagnosis_hadm_df.head(2)

Unnamed: 0,subject_id,stay_id,seq_num,icd_code,icd_version,icd_title,hadm_id
0,10001884,31742950,1,J441,10,Chronic obstructive pulmonary disease w (acute...,24962904
1,10003019,33010597,1,28800,9,"NEUTROPENIA, UNSPECIFIED",22774359


In [12]:
# Shape after the merge; compare the row count with diagnosis_df.shape.
diagnosis_hadm_df.shape

(21764, 7)

In [13]:
# Column dtypes and non-null counts of the merged frame.
diagnosis_hadm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21764 entries, 0 to 21763
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   subject_id   21764 non-null  int64 
 1   stay_id      21764 non-null  int64 
 2   seq_num      21764 non-null  int64 
 3   icd_code     21764 non-null  object
 4   icd_version  21764 non-null  int64 
 5   icd_title    21764 non-null  object
 6   hadm_id      21764 non-null  int64 
dtypes: int64(5), object(2)
memory usage: 1.2+ MB


In [14]:
# Save the merged diagnosis table to diagnosis_hadm.csv in the working directory.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
diagnosis_hadm_df.to_csv('diagnosis_hadm.csv')

In [None]:
# Print the full text of the first discharge note to inspect its section layout.
print(discharge_df['text'].iloc[0])

# Pre-process data

In [None]:
def extract_text_by_subheading_radiology(text):
    """Split a radiology report into its labelled sections.

    Args:
        text: Full report text. Anything that is not a string is treated as a
            report with no sections.

    Returns:
        dict with the keys 'EXAMINATION', 'INDICATION', 'TECHNIQUE',
        'COMPARISON', 'FINDINGS' and 'IMPRESSION' (in that order). Each value
        is the stripped section text, or None when the heading is not found.
    """
    # A section body runs (lazily) up to the next "\n<letters/spaces>:" heading
    # line or to the end of the report.
    body_until_next_heading = r'\s*(.*?)(?=\n[A-Za-z ]+:|$)'
    headings_regex = {
        'EXAMINATION': r'(?:EXAMINATION:|EXAM:)' + body_until_next_heading,
        'INDICATION': r'(?:INDICATION:|CLINICAL INFORMATION:)' + body_until_next_heading,
        'TECHNIQUE': r'(?:TECHNIQUE:)' + body_until_next_heading,
        # The colon is consumed when present; previously it leaked into the
        # captured text (": None." instead of "None.").
        'COMPARISON': r'(?:COMPARISON:?)' + body_until_next_heading,
        'FINDINGS': r'(?:FINDINGS:)' + body_until_next_heading,
        # IMPRESSION is taken to be the last section: capture to the end.
        'IMPRESSION': r'IMPRESSION:(.*)',
    }

    if not isinstance(text, str):
        return dict.fromkeys(headings_regex)

    extracted_text = {}
    for heading, regex in headings_regex.items():
        match = re.search(regex, text, re.DOTALL)
        extracted_text[heading] = match.group(1).strip() if match else None

    return extracted_text

# Parse every report into a dict of sections, then widen the dicts into columns.
radiology_parsed = radiology_df['text'].apply(extract_text_by_subheading_radiology)
radiology_section_columns = radiology_parsed.apply(pd.Series)

# Put the patient / admission identifiers in front of the extracted sections.
radiology_ids = radiology_df[['subject_id', 'hadm_id']]
radiology_sections_df = pd.concat([radiology_ids, radiology_section_columns], axis=1)

In [None]:
# Preview the extracted radiology sections for the first three reports.
radiology_sections_df.head(3)


In [None]:
# Heading spellings, grouped by the kind of section they open. Every spelling
# also acts as an end marker for the other sections.
_DISCHARGE_HEADING_GROUPS = {
    'CC': ('CC:', 'Chief Complaint:', 'Reason for admission:', 'Reason for hospital admission:'),
    'Service': ('Service:', 'Specialty:', 'Unit:'),
    'Major Procedures': ('Major Surgical or Invasive Procedure:', 'Major Medical Procedures:',
                         'Major Procedures:', 'Major Surgeries:'),
    'HPI': ('HPI:', 'History of Present Illness:'),
    'PMH': ('PMH:', 'Past Medical History:'),
    'SOC': ('Social History:', 'SOC:', 'SH:'),
    'FH': ('Family History:', 'PFH:', 'FH:'),
    'Past Surgical History': ('Past Surgical History:', 'PSH:'),
    'Problem List': ('Problem List:', 'Problems:'),
    'Physical Exam': ('Physical Exam:', 'PE:'),
    'Medications': ('Medication lists:', 'Admission Medications:', 'Medications on Admission:',
                    'Discharge Medications:', 'Medications on Discharge:'),
    'Pertinent Results': ('Pertinent Results:', 'Pertinent imaging:', 'Pertinent Labs:',
                          'Pertinent Microbiology:', 'Pertinent Micro:', 'Results:'),
    'BHC': ('BHC:', 'Brief Hospital Course:'),
    'Disposition': ('Disposition:', 'Dispo:'),
    'Discharge Instructions': ('Discharge Instructions:', 'Patient Instructions:'),
    'Followup Instructions': ('Followup Instructions:',),
    'Transitional Issues': ('Transitional Issues:',),
    'Pertinent Findings': ('Pertinent Findings:',),
    'Discharge Diagnosis': ('Discharge Diagnosis:',),
}


def _compile_discharge_section(starts, skip_groups=(), skip_headings=(), extra_stops=()):
    """Compile '(start heading)(body)' where body ends at the next stop heading or at end of text.

    starts: heading spellings that open the section, tried in the given order.
    skip_groups / skip_headings: headings that must NOT end this section.
    extra_stops: additional end markers that are not in the shared heading table.
    """
    skipped = set(skip_headings)
    for group in skip_groups:
        skipped.update(_DISCHARGE_HEADING_GROUPS[group])
    stops = [
        heading
        for headings in _DISCHARGE_HEADING_GROUPS.values()
        for heading in headings
        if heading not in skipped
    ]
    stops.extend(extra_stops)
    start_alt = '|'.join(re.escape(heading) for heading in starts)
    stop_alt = '|'.join(re.escape(heading) for heading in stops)
    # "$" must be an alternative of its own. Glued to the last heading
    # ("Discharge Diagnosis:$") it only matches when the note ends with that
    # heading, so the heading never ends a section mid-note and a section that
    # runs to the end of the note is never found.
    return re.compile('(' + start_alt + ')(.*?)(?=' + stop_alt + '|$)', re.DOTALL)


def _own_group_section(group):
    """Section opened by any heading of `group` and ended by any heading of the other groups."""
    return _compile_discharge_section(_DISCHARGE_HEADING_GROUPS[group], skip_groups=(group,))


# One compiled pattern per output key. Dict order is the column order of the result.
_DISCHARGE_SECTION_PATTERNS = {
    'CC': _own_group_section('CC'),
    'Service': _own_group_section('Service'),
    'Major Surgical Procedure': _compile_discharge_section(
        ('Major Surgical or Invasive Procedure:',), skip_groups=('Major Procedures',)),
    'HPI': _own_group_section('HPI'),
    'PMH': _own_group_section('PMH'),
    'SOC': _own_group_section('SOC'),
    'FH': _own_group_section('FH'),
    'Past Surgical History': _own_group_section('Past Surgical History'),
    'Problem List': _own_group_section('Problem List'),
    'Physical Exam': _own_group_section('Physical Exam'),
    # No medication heading is a stop here, so the section runs from the
    # admission list through the discharge list.
    'Medication Lists': _compile_discharge_section(
        ('Medication lists:', 'Admission Medications:', 'Medications on Admission:',
         'Preadmission Medication list'),
        skip_groups=('Medications',)),
    # A later "Pertinent Results:" heading still ends the section, as before.
    'Pertinent Results': _compile_discharge_section(
        _DISCHARGE_HEADING_GROUPS['Pertinent Results'],
        skip_groups=('Pertinent Results',), extra_stops=('Pertinent Results:',)),
    'BHC': _own_group_section('BHC'),
    'Transitional Issues': _compile_discharge_section(
        ('Transitional Issues:', 'Transitional Issues', 'TRANSITIONAL ISSUES:'),
        extra_stops=('CODE STATUS:',)),
    'Disposition': _own_group_section('Disposition'),
    'Discharge Instructions': _own_group_section('Discharge Instructions'),
    'Followup Instructions': _own_group_section('Followup Instructions'),
    # Stop set kept as in the original pattern, including ' Service:' with a
    # leading space. NOTE(review): the leading space may be a typo - confirm.
    'Discharge Diagnosis': _compile_discharge_section(
        ('Discharge Diagnosis:', 'Diagnosis:'),
        skip_groups=('Discharge Diagnosis', 'Pertinent Findings'),
        skip_headings=('Major Surgical or Invasive Procedure:', 'Service:'),
        extra_stops=('Discharge Condition:', ' Service:')),
}


def extract_text_by_subheading_discharge(text):
    """Split a discharge note into its labelled sections.

    Headings are matched as plain substrings. For each section the first
    occurrence of one of its headings starts the section, and the text up to
    the next heading of another section (or the end of the note) is returned.

    Args:
        text: Full discharge note. Anything that is not a string is treated as
            a note with no sections.

    Returns:
        dict keyed by section name ('CC', 'Service', 'Major Surgical Procedure',
        'HPI', 'PMH', 'SOC', 'FH', 'Past Surgical History', 'Problem List',
        'Physical Exam', 'Medication Lists', 'Pertinent Results', 'BHC',
        'Transitional Issues', 'Disposition', 'Discharge Instructions',
        'Followup Instructions', 'Discharge Diagnosis'). Each value is the
        stripped section text, or None when no heading of that section is found.
    """
    if not isinstance(text, str):
        return dict.fromkeys(_DISCHARGE_SECTION_PATTERNS)

    extracted_text = {}
    for heading, pattern in _DISCHARGE_SECTION_PATTERNS.items():
        match = pattern.search(text)
        extracted_text[heading] = match.group(2).strip() if match else None

    return extracted_text

# Parse every discharge note into a dict of sections, then widen the dicts into columns.
discharge_parsed = discharge_df['text'].apply(extract_text_by_subheading_discharge)
discharge_section_columns = discharge_parsed.apply(pd.Series)

# Put the patient / admission identifiers from discharge_df in front of the extracted sections.
discharge_ids = discharge_df[['subject_id', 'hadm_id']]
discharge_sections_df = pd.concat([discharge_ids, discharge_section_columns], axis=1)



In [None]:
# Preview the extracted discharge sections for the first two notes.
discharge_sections_df.head(2)

In [None]:
# Spot-check one extracted section: the 'Transitional Issues' text of the first note.
print(discharge_sections_df['Transitional Issues'].iloc[0])


In [None]:
# Show the reference "brief hospital course" of one target entry.
# target_df.head()
target_ind = 0
cur_hadm_id = target_df['hadm_id'][target_ind]
# print(cur_hadm_id)
reference_bhc = target_df['brief_hospital_course']
print(reference_bhc.iloc[target_ind])


# NLP Pipeline

In [None]:
!pip install transformers
# !pip install vllm
from transformers import AutoModelForCausalLM, AutoTokenizer

# model_name = "allenai/llama-large-2048"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# context = discharge_df[discharge_df['hadm_id'] == 3]['text'].values[0]
 
# input_ids = tokenizer.encode(context, return_tensors="pt")

# output = model.generate(input_ids, max_length=100, num_return_sequences=1)
# decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# from vllm import LLM, SamplingParams
# import os

In [None]:
from getpass import getpass

# Access tokens must never be stored in the notebook. Read the token from the
# environment and fall back to an interactive prompt (input is not echoed).
# Any token that was previously saved in this file should be treated as leaked
# and revoked in the Hugging Face account settings.
_hf_token = os.environ.get("HUGGING_FACE_HUB_TOKEN") or getpass("Hugging Face access token: ")

os.environ["HUGGING_FACE_HUB_TOKEN"] = _hf_token
# NOTE(review): environment variable names are case-sensitive on Linux/macOS;
# confirm which library (if any) reads this mixed-case name.
os.environ['HuggingFaceHub_API_Token'] = _hf_token

# Do not keep the secret around as a notebook variable.
del _hf_token


In [None]:
from transformers import pipeline
from vllm import LLM, SamplingParams
import os
# pipe = pipeline("text-generation", model="microsoft/BioGPT-Large")

In [None]:
from transformers import pipeline
import os

# Checkpoint loaded by the text-generation pipeline.
# Other checkpoint names noted by the author:
#   "meta-llama/Meta-Llama-3-8B", "meta-llama/Llama-2-7b-hf"
LLM_MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

pipe = pipeline(task="text-generation", model=LLM_MODEL_NAME)

In [None]:
# model = LLM(model=LLM_MODEL_NAME, gpu_memory_utilization=1, max_model_len=3000, enforce_eager=True, tensor_parallel_size=4)

In [None]:
# select data for one target entry
target_ind = 0
cur_hadm_id = target_df.hadm_id[target_ind]
# Boolean-mask selection returns a Series of stay ids (not a scalar).
cur_stay_id = edstays_df.stay_id[edstays_df['hadm_id'] == cur_hadm_id]
cur_discharge_df = discharge_sections_df[discharge_sections_df['hadm_id'] == cur_hadm_id]
cur_radiology_df = radiology_sections_df[radiology_sections_df['hadm_id'] == cur_hadm_id]
# cur_stay_id is a Series, so `diagnosis_df['stay_id'] == cur_stay_id` would
# compare two differently-labelled Series; select by membership instead.
cur_diagnosis_df = diagnosis_df[diagnosis_df['stay_id'].isin(cur_stay_id)]

# Prompt fragments for the selected admission.
formatted_HPI = cur_discharge_df['HPI'].apply(lambda x: f"History of Present Illness: {x}")
formatted_CC = cur_discharge_df['CC'].apply(lambda x: f"The patient was admitted for Chief COmplaint: {x}")
formated_Pertinent_Results = []
formatted_radiology = []
# formatted_radiology.extend(f"Exam: {str(item)}" for item in cur_radiology['Exam'])


In [None]:
# seeding the conversation 0
# NOTE: this seed text is replaced by the full prompt built below before the model is called.
prompt = 'This conversation is about clinical notes and documents from electronic health records'
# langchain this?

# question 1
question = "What are this patient's medical problems? Create a list."

# gather the relevant info for this question
# cur_discharge_df is a filtered frame, so its row label is not guaranteed to
# be 0; take the first matching row by position (.iloc) instead of by label.
# NOTE(review): despite its name, relevant_PMH is the first paragraph of the
# HPI section, not the PMH section - confirm this is intended.
relevant_PMH = (cur_discharge_df['HPI'].iloc[0] or '').split('\n\n')[0]
context = f"The patient was admitted for chief complaint {cur_discharge_df['CC'].iloc[0]}. \nThe patient has the following relevant past medical history: {relevant_PMH}"
# print(context)

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)


In [None]:
# question 2
question = "Why was the patient admitted to the hospital?"
# cur_discharge_df is a filtered frame, so its row label is not guaranteed to
# be 0; take the first matching row by position (.iloc) instead of by label.
context = f"The patient was admitted for chief complaint {cur_discharge_df['CC'].iloc[0]}. \nHPI: {cur_discharge_df['HPI'].iloc[0]}"
# print(context)

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# question 3
question = "What tests, diagnostic workup and treatments did the patient receive?"
# relevant_cols = ['History of Present Illness']
# ED_Summary = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# cur_discharge_df is a filtered frame, so its row label is not guaranteed to
# be 0; take the first matching row by position (.iloc) instead of by label.
# NOTE(review): "Perinent" in the prompt text looks like a typo for "Pertinent".
context = f"HPI: {cur_discharge_df['HPI'].iloc[0]} \n Documented surgeries and major treatments: {cur_discharge_df['Major Surgical Procedure'].iloc[0]} \n Perinent results: {cur_discharge_df['Pertinent Results'].iloc[0]}"
# print(context)

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)



In [None]:
# question 4
# question = """Combine the following admission and discharge medication lists into a table with the columns "name" ,"admission dose", "discharge dose" ,"route" """"
# relevant_cols = ['Medication List']
# Med_comp = query_with_chatgpt(question, df, relevant_cols, hadm_id )

question = "Given the admission and discharge medication lists, what medications were new, discontinued, or changed and by how much"
# cur_discharge_df is a filtered frame, so its row label is not guaranteed to
# be 0; take the first matching row by position (.iloc) instead of by label.
context = cur_discharge_df['Medication Lists'].iloc[0]

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# # # question 5
# # question = "What medications were new at discharge?"
# # relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# # New_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# # question template
# question = ""
# context = {}

# # create prompt and query model
# prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# generated_text = pipe(prompt, max_length= len(prompt), num_return_sequences=1)


# print(prompt)
# print("=================")
# print(generated_text)

In [None]:
# # # question 6
# # question = "What medications changed and by how much?"
# # relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# # Changed_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question template
# question = ""
# context = {}

# # create prompt and query model
# prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# generated_text = pipe(prompt, max_length= len(prompt), num_return_sequences=1)

# print(prompt)
# print("=================")
# print(generated_text)

In [None]:
# # # question 7
# # question = "What medications were stopped or discontinued?"
# # relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# # Discontinued_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )
# question = ""
# context = {}

# # create prompt and query model
# prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# generated_text = pipe(prompt, max_length= len(prompt), num_return_sequences=1)

# print(prompt)
# print("=================")
# print(generated_text)

In [None]:
# question 8
question = "Based on the medication lists, what major medical problems was the patient treated for?"
# relevant_cols = ['Medication list']
# Med_indications = query_with_chatgpt(question, df, relevant_cols, hadm_id )
# cur_discharge_df is a filtered frame, so its row label is not guaranteed to
# be 0; take the first matching row by position (.iloc) instead of by label.
context = cur_discharge_df['Medication Lists'].iloc[0]

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# question 9
question = "What treatments, surgeries or procedures did the patient receive? "
# relevant_cols = ['Major Procedures', 'History of Present Illness'] # and add input from Med_comp
# Major_treatments = query_with_chatgpt(question, df, relevant_cols, hadm_id )
# TODO: no clinical context is supplied yet, so the model answers from the question alone.
context = ""

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# question 10
# question = "Summarize the radiological tests and findings"
# relevant_cols = ['Pertinent findings'] from discharge,['examination','indication','Impressions'] from radiology # also add radiology notes ()
# Radiology_findings = query_with_chatgpt(question, df, relevant_cols, hadm_id )
# TODO: placeholder cell - question and context are still empty.
question = ""
# An empty string rather than {}: a dict would be interpolated into the prompt as the literal "{}".
context = ""

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# question 11
# question = "What were the most pertinent lab, radiology, or study results?"
# relevant_cols = ['Pertinent Results', 'History of Present Illness'] # add Radiology_findings from above
# Pertinent_results = query_with_chatgpt(question, df, relevant_cols, hadm_id )
# TODO: placeholder cell - question and context are still empty.
question = ""
# An empty string rather than {}: a dict would be interpolated into the prompt as the literal "{}".
context = ""

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is a token limit that includes the prompt, while len(prompt) is a
# character count; bound only the newly generated tokens (tune as needed).
generated_text = pipe(prompt, max_new_tokens=256, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# # question 12
# question = "What diagnoses did the patient receive?"
# relevant_cols = ['discharge diagnosis'] # add in ICD codes, will need to construct a string: the patient received the following diagnosis from ICD version [ICD version]: codes [codes]
# Diagnosis_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

In [None]:
# question 13
question = "Create a list of problems the patient was treated for during this hospital admission.  Combine the major medical problems, the reason the patient was admitted, and icd diagnosis"
# TODO: build `context` from the answers to the earlier questions
# (Diagnosis_list, Relevant_PMH, Major_treatments, ...). The answer was previously stored as Hospital_problems.
context = ""  # an empty dict here would be rendered as the literal "{}" inside the prompt

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is measured in tokens (prompt + completion), so len(prompt) -- a character count --
# was not a meaningful limit. Cap the completion length directly; raise it for longer answers.
generated_text = pipe(prompt, max_new_tokens=150, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# generate outputs
# question 15
question = "Create a summary of the events and treatments during the hospital visit.  Start with a summary sentence describing the patient’s major medical problems and the reason for the hospital admission.  Then create a list of each problem the patient was treated for combining their major medical problems, reason for admission, and ICD diagnoses.  For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
# TODO: build `context` from the answers to the previous questions.
# The answer was previously stored as BHC_generated.
context = ""  # an empty dict here would be rendered as the literal "{}" inside the prompt

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is measured in tokens (prompt + completion), so len(prompt) -- a character count --
# was not a meaningful limit. Cap the completion length directly; raise it for longer answers.
generated_text = pipe(prompt, max_new_tokens=150, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# question 16
question = "Summarize in 1-2 sentences, the patient’s major medical problems and the reason for the hospital admission. You may use common abbreviations of medical terms when possible."
# TODO: build `context` from the earlier answers Reason_for_admission and Relevant_PMH.
# The answer was previously stored as BHC_summary_sentence.
context = ""  # an empty dict here would be rendered as the literal "{}" inside the prompt

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is measured in tokens (prompt + completion), so len(prompt) -- a character count --
# was not a meaningful limit. Cap the completion length directly; raise it for longer answers.
generated_text = pipe(prompt, max_new_tokens=150, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# question 17
question = "For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
# TODO: decide which earlier answers / note sections feed `context` (none were listed yet).
# The answer was previously stored as BHC_problem_list.
context = ""  # an empty dict here would be rendered as the literal "{}" inside the prompt

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is measured in tokens (prompt + completion), so len(prompt) -- a character count --
# was not a meaningful limit. Cap the completion length directly; raise it for longer answers.
generated_text = pipe(prompt, max_new_tokens=150, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
# TODO: stitch together BHC summary sentence and problem list output

# question 18
question = "Compose a letter to the patient that is courteous and easy to understand.  There should be limited medical jargon and it should be written in layman's language.  The letter will describe briefly the reason for admission and what treatments were given.  It will also include any major changes to the patient's current medical management. Include the following information: why the patient was admitted to the hospital, including major medical condition or symptoms. What were the most relevant diagnostic tests and what did they show? What major treatments did the patient receive? Are there any pending results?  What are the changes to the existing medications? Is there any scheduled or recommended  follow up? Include the following items: "
# TODO: decide which earlier answers / note sections feed `context` (none were listed yet).
# The answer was previously stored as Discharge_instructions_generated.
context = ""  # an empty dict here would be rendered as the literal "{}" inside the prompt

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is measured in tokens (prompt + completion), so len(prompt) -- a character count --
# was not a meaningful limit. Cap the completion length directly; raise it for longer answers.
generated_text = pipe(prompt, max_new_tokens=150, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:


# question template: copy this cell, then fill in `question` and `context`
question = ""
context = ""  # keep this a string; an empty dict would be rendered as the literal "{}" in the prompt

# create prompt and query model
prompt = f"\n Context: question answering and summarization from clinical notes. Using information from a medical record from a patient's hospital admission, Question: {question} : \nClinical Information: \n {context} \n Answer: "
# max_length is measured in tokens (prompt + completion), so len(prompt) -- a character count --
# was not a meaningful limit. Cap the completion length directly; raise it for longer answers.
generated_text = pipe(prompt, max_new_tokens=150, num_return_sequences=1)

print(prompt)
print("=================")
print(generated_text)

In [None]:
!pip install jupyter_contrib_nbextensions
!jupyter contrib nbextension install --user
!jupyter nbextension enable codefolding/main

In [None]:
def extract_answer_from_generated_text(text, regexpkey):
    """Pull the answer portion out of generated model text.

    Parameters
    ----------
    text : str, dict, list or None
        Either the generated string itself, or the output of a Hugging Face
        text-generation pipeline (a list of dicts carrying a
        ``'generated_text'`` entry), in which case the first sequence is used.
    regexpkey : str
        Regular expression locating the answer. If it contains a capture
        group, the first group is returned; otherwise everything after the
        match (e.g. after a plain ``'Answer:'`` marker) is returned.

    Returns
    -------
    str or None
        The stripped answer text, or ``None`` when there is no usable text or
        the pattern does not match.
    """
    # Unwrap pipeline-style output: [{'generated_text': '...'}, ...]
    if isinstance(text, (list, tuple)):
        text = text[0] if text else None
    if isinstance(text, dict):
        text = text.get('generated_text')
    if not isinstance(text, str):
        return None

    match = re.search(regexpkey, text, re.DOTALL)
    if match is None:
        return None

    if match.re.groups >= 1:
        captured = match.group(1)
        return captured.strip() if captured is not None else None

    # A bare marker has no group to return, so take whatever follows it.
    return text[match.end():].strip()


# Extract the answer from the most recent model output, using 'Answer:' as the regex key.
# `generated_text` is whatever the last executed `pipe(...)` cell assigned.
answer = extract_answer_from_generated_text(generated_text, 'Answer:')
# outputs = model.generate(prompt, sampling_params)
# Relevant_PMH =  output.outputs[0].text

In [None]:
# Preview the first two rows of `discharge_sections_df`.
# NOTE(review): its definition is not visible near this cell -- confirm the intended run order.
discharge_sections_df.head(2)

In [None]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM
import pandas as pd

# Tokenizer and weights are pulled from the same Hugging Face hub checkpoint.
LLAMA_CHECKPOINT = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = LlamaTokenizer.from_pretrained(LLAMA_CHECKPOINT)
model = LlamaForCausalLM.from_pretrained(LLAMA_CHECKPOINT)

def query_with_llama(question, df, columns, hadm_id, max_input_tokens=1024, max_new_tokens=256):
    """Answer ``question`` about one admission using the notebook-level Llama model.

    Parameters
    ----------
    question : str
        Question placed in front of the context.
    df : pandas.DataFrame
        Table with an ``'hadm_id'`` column plus the text columns in ``columns``.
    columns : list of str
        Columns whose values are concatenated (space separated) into the context.
    hadm_id
        Admission id whose rows are used.
    max_input_tokens : int, optional
        Truncation limit for the tokenized question + context.
    max_new_tokens : int, optional
        Upper bound on the number of generated tokens.

    Returns
    -------
    str
        The generated continuation only (the echoed prompt is removed), or a
        fixed message when no row matches ``hadm_id``.

    Notes
    -----
    Uses the module-level ``tokenizer`` and ``model`` objects.
    """
    # Filter the dataframe for the specific hadm_id
    admission_rows = df[df['hadm_id'] == hadm_id]
    if admission_rows.empty:
        return "No records found for the given HADM ID."

    # Create a context string from the specified columns
    context_data = (
        admission_rows[columns]
        .apply(lambda row: ' '.join(row.dropna().astype(str)), axis=1)
        .str.cat(sep=' ')
    )
    inputs = tokenizer(
        question + " " + context_data,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )

    # Bound the completion with max_new_tokens: the old max_length=1024 also counted the
    # prompt, so an input truncated at 1024 tokens left no room to generate anything.
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt tokens followed by new tokens; decode only the new ones
    # so the result is the answer rather than the question and notes repeated back.
    prompt_length = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()

# Example usage (requires `discharge_sections_df` to be defined before this cell runs)
question = "What major treatments, surgeries or procedures did the patient receive?"
hadm_id = 22774359  # Specify the HADM ID you want to query
relevant_cols = ['HPI', 'Major Surgical Procedure']  # columns concatenated into the model context
answer = query_with_llama(question, discharge_sections_df, relevant_cols, hadm_id)
print(answer)


In [None]:
# # question 2
# question = "Why was the patient admitted to the hospital?"
# relevant_cols = ['History of Present Illness', 'Chief Complaint']
# Reason_for_admission = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 3
# question = "What tests, diagnostic workup and treatments did the patient receive in the emergency room?"
# relevant_cols = ['History of Present Illness']
# ED_Summary = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 4
# question = "Combine the following admission and discharge medication lists into a table with the columns 'name', 'admission dose', 'discharge dose', 'route'"
# relevant_cols = ['Medication List']
# Med_comp = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 5
# question = "What medications were new at discharge?"
# relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# New_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 6
# question = "What medications changed and by how much?"
# relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# Changed_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 7
# question = "What medications were stopped or discontinued?"
# relevant_cols = ['Medication LIst'] # or input Med_comp from previous question
# Discontinued_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# # question 8
# question = "Based on the medication list, what major medical problems was the patient treated for?"
# relevant_cols = ['Medication list']
# Med_indications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 9
# question = "What major treatments, surgeries or procedures did the patient receive? "
# relevant_cols = ['Major Procedures', 'History of Present Illness'] # and add input from Med_comp
# Major_treatments = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 10
# question = "Summarize the radiological tests and findings"
# relevant_cols = ['Pertinent findings', 'Impressions'] # also add radiology notes ()
# Radiology_findings = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 11
# question = "What were the most pertinent lab, radiology, or study results?"
# relevant_cols = ['Pertinent Results', 'History of Present Illness'] # add Radiology_findings from above
# Pertinent_results = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# # question 12
# question = "What diagnoses did the patient receive?"
# relevant_cols = ['discharge diagnosis'] # add in ICD codes, will need to construct a string: the patient received the following diagnosis from ICD version [ICD version]: codes [codes]
# Diagnosis_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 13
# question = "Create a list of problems the patient was treated for during this hospital admission.  Combine the major medical problems, the reason the patient was admitted, and icd diagnosis"
# relevant_cols = [''] # input will be answers to questions above: Diagnosis_list, Relevant_PMH, Major_treatments...
# Hospital_problems =  query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # generate outputs
# # question 15
# question = "Create a summary of the events and treatments during the hospital visit.  Start with a summary sentence describing the patient’s major medical problems and the reason for the hospital admission.  Then create a list of each problem the patient was treated for combining their major medical problems, reason for admission, and ICD diagnoses.  For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
# relevant_cols = [''] # inputs will be answers to previous questions
# BHC_generated = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 16
# question = "Summarize in 1-2 sentences, the patient’s major medical problems and the reason for the hospital admission. You may use common abbreviations of medical terms when possible."
# relevant_cols = ['Reason_for_admission', 'Relevant_PMH']
# BHC_summary_sentence = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # question 17
# question = "For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
# relevant_cols = ['']
# BHC_problem_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# # stitch together BHC summary sentence and problem list output

# # question 18
# question = "Compose a letter to the patient that is courteous and easy to understand.  There should be limited medical jargon and it should be written in layman's language.  The letter will describe briefly the reason for admission and what treatments were given.  It will also include any major changes to the patient's current medical management. Include the following information: why the patient was admitted to the hospital, including major medical condition or symptoms. What were the most relevant diagnostic tests and what did they show? What major treatments did the patient receive? Are there any pending results?  What are the changes to the existing medications? Is there any scheduled or recommended  follow up? Include the following items: "
# relevant_cols = ['']
# Discharge_instructions_generated = query_with_chatgpt(question, df, relevant_cols, hadm_id )

In [None]:

import transformers 
from transformers import pipeline
 
# Text-generation pipeline used by the question cells (they call `pipe(...)`).
pipe = pipeline("text-generation", model="microsoft/BioGPT-Large")

# NOTE: `prompt` must already exist when this line runs, while the cells that build `prompt`
# also call `pipe`; on a fresh kernel run the `pipe = ...` line above before any question cell.
# max_length counts prompt tokens as well, so a prompt longer than 150 tokens left no room
# for an answer; bound the completion itself instead.
generated_text = pipe(prompt, max_new_tokens=150, num_return_sequences=1)
# generated_text = pipe(final_prompt, max_length=150, num_return_sequences=1)

In [None]:
!pip install sacremoses


In [None]:
# Debug view: first 'HPI' value of `cur_discharge_df`, the current prompt, then a preview of the frame.
# NOTE(review): `cur_discharge_df` is not defined in the nearby cells -- confirm where it comes from.
print(cur_discharge_df['HPI'].iloc[0])
print(f"Prompt: {prompt}")
cur_discharge_df.head()
# print(f"Response: {generated_text}")
# print("\n==========================================================")

In [None]:
# Spot-check: print the stay_id of the second row (position 1) of the diagnosis table.
print(diagnosis_df['stay_id'].iloc[1])
# diagnosis_df.head(1)


In [None]:
# Run every prompt through `model.generate` and print each prompt/response pair.
# NOTE(review): `prompts` and `sampling_params` are not defined in the nearby cells, and the
# `output.prompt` / `output.outputs[0].text` access pattern looks like a vLLM-style result rather
# than what a Hugging Face `generate` call returns -- confirm which `model` this cell expects.
outputs = model.generate(prompts, sampling_params)
for output in outputs:
    # These assignments rebind the notebook-level `prompt` and `generated_text` names.
    prompt = output.prompt
    generated_text = output.outputs[0].text

    print(f"Prompt: {prompt}")
    print(f"Response: {generated_text}")
    print("\n==========================================================")


# Prune Data Tables


In [None]:
# Bhanu method - segment radiology notes
def extract_text_by_subheading_radiology_bhanu(text):
    """Split a radiology note into its standard sections.

    Parameters
    ----------
    text : str
        Full text of one radiology note. Any non-string value (e.g. ``None``
        or NaN) is treated as a note with no sections.

    Returns
    -------
    dict
        Maps 'EXAMINATION', 'INDICATION', 'TECHNIQUE', 'COMPARISON',
        'FINDINGS' and 'IMPRESSION' (in that order) to the stripped section
        text, or ``None`` when the heading is absent.

    Notes
    -----
    A section ends at the next heading of *any* other section (or at the end
    of the note), so a note that omits a heading no longer makes the previous
    section swallow everything that follows. 'IMPRESSION' is treated as the
    final section and runs to the end of the note.
    """
    section_names = ('EXAMINATION', 'INDICATION', 'TECHNIQUE', 'COMPARISON', 'FINDINGS', 'IMPRESSION')
    extracted_text = dict.fromkeys(section_names)

    if not isinstance(text, str):
        return extracted_text

    # Iterate over each subheading and extract corresponding text
    for heading in section_names:
        if heading == 'IMPRESSION':
            regex = r'IMPRESSION:(.*)'
        else:
            # Stop at whichever other heading comes first, not only the "expected" next one.
            stop_markers = '|'.join(f'{other}:' for other in section_names if other != heading)
            regex = rf'{heading}:(.*?)(?={stop_markers}|$)'

        match = re.search(regex, text, re.DOTALL)
        if match:
            extracted_text[heading] = match.group(1).strip()

    return extracted_text



In [None]:
# Parse every radiology note into a dict of sections, then expand the dicts into columns.
radiology_parsed_notes = radiology_df['text'].apply(extract_text_by_subheading_radiology_bhanu)
radiology_section_columns = radiology_parsed_notes.apply(pd.Series)

# Put the identifying columns from radiology_df in front of the parsed section columns.
radiology_id_columns = radiology_df[['subject_id', 'hadm_id']]
radiology_sections_df = pd.concat([radiology_id_columns, radiology_section_columns], axis=1)


In [None]:
# Bhanu method - discharge note segmentation by heading

# Collect the columns from each source table that go into the combined frame.
# NOTE: `hadm_id` is rebound here to the full Series of discharge admission ids
# (an earlier cell uses the same name for a single admission id).
subject_id = discharge_df['subject_id']
hadm_id = discharge_df['hadm_id']
# ED stays whose hadm_id appears among the discharge admissions.
stay_id = edstays_df[edstays_df['hadm_id'].isin(hadm_id)]['stay_id']
dcnote_id = discharge_df['note_id']
# Triage and diagnosis rows belonging to those ED stays.
cctriage = triage_df[triage_df['stay_id'].isin(stay_id)]['chiefcomplaint']
icd_codes = diagnosis_df[diagnosis_df['stay_id'].isin(stay_id)]['icd_code']
icd_version = diagnosis_df[diagnosis_df['stay_id'].isin(stay_id)]['icd_version']
icd_title = diagnosis_df[diagnosis_df['stay_id'].isin(stay_id)]['icd_title']
# Note texts for the same admissions.
radiologynotes = radiology_df[radiology_df['hadm_id'].isin(hadm_id)]['text']
dischargenotes = discharge_df[discharge_df['hadm_id'].isin(hadm_id)]['text']

# NOTE(review): pd.DataFrame aligns a dict of Series on their index labels. The Series above come
# from different tables (discharge, edstays, triage, diagnosis, radiology) and each keeps its own
# table's row index, so a row of `df` pairs values that merely share an index label -- not
# necessarily the same admission or stay. Confirm whether key-based merges on hadm_id / stay_id
# were intended before relying on `df`.
df = pd.DataFrame({
    'subject_id': subject_id,
    'hadm_id': hadm_id,
    'stay_id': stay_id,
    'dcnote_id': dcnote_id,
    'cctriage': cctriage,
    'icd_codes': icd_codes,
    'icd_version': icd_version,
    'icd_title': icd_title,
    'radiology_notes': radiologynotes,
    'discharge_notes': dischargenotes
})
# Heading spellings that open each extracted discharge-note section.
# The key order is the column order of the resulting table.
_DISCHARGE_SECTION_ALIASES = {
    'CC': ['CC:', 'Chief Complaint:', 'Reason for admission:', 'Reason for hospital admission:'],
    'Service': ['Service:', 'Specialty:', 'Unit:'],
    'Major Surgical Procedure': ['Major Surgical or Invasive Procedure:'],
    'HPI': ['HPI:', 'History of Present Illness:'],
    'PMH': ['PMH:', 'Past Medical History:'],
    'Social History': ['Social History:', 'SOC:', 'SH:'],
    'Family History': ['Family History:', 'PFH:', 'FH:'],
    'Past Surgical History': ['Past Surgical History:', 'PSH:'],
    'Problem List': ['Problem List:', 'Problems:'],
    'Physical Exam': ['Physical Exam:', 'PE:'],
    'Medication lists': ['Medication lists:', 'Admission Medications:', 'Medications on Admission:',
                         'Discharge Medications:', 'Medications on Discharge:'],
    'Pertinent Results': ['Pertinent Results:', 'Pertinent imaging:', 'Pertinent Labs:',
                          'Pertinent Microbiology:', 'Pertinent Micro:', 'Results:'],
    'BHC': ['BHC:', 'Brief Hospital Course:'],
    'Disposition': ['Disposition:', 'Dispo:'],
    'Discharge Instructions': ['Discharge Instructions:', 'Patient Instructions:'],
    'Followup Instructions': ['Followup Instructions:'],
    'Discharge Diagnosis': ['Discharge Diagnosis:'],
}

# Headings that end a section but are not extracted as sections themselves.
_DISCHARGE_BOUNDARY_ONLY_HEADINGS = [
    'Major Medical Procedures:', 'Major Procedures:', 'Major Surgeries:',
    'Transitional Issues:', 'Pertinent Findings:',
]


def _compile_discharge_section_patterns():
    """Build one compiled regex per section from the shared heading lists."""
    # A heading must not start in the middle of a word; otherwise 'SH:' matches
    # inside 'PSH:' and 'PE:' inside 'TYPE:'.
    not_inside_word = r'(?<![A-Za-z])'
    every_heading = [alias for aliases in _DISCHARGE_SECTION_ALIASES.values() for alias in aliases]
    every_heading += _DISCHARGE_BOUNDARY_ONLY_HEADINGS

    patterns = {}
    for section, aliases in _DISCHARGE_SECTION_ALIASES.items():
        start = '|'.join(re.escape(alias) for alias in aliases)
        # A section ends at any heading other than its own, or at the end of the note.
        stop = '|'.join(re.escape(heading) for heading in every_heading if heading not in aliases)
        patterns[section] = re.compile(
            rf'{not_inside_word}(?:{start})(.*?)(?={not_inside_word}(?:{stop})|\Z)',
            re.DOTALL,
        )
    return patterns


_DISCHARGE_SECTION_PATTERNS = _compile_discharge_section_patterns()


def extract_text_by_subheading_discharge_bhanu(text):
    """Split a discharge note into its headed sections.

    Each section starts at the first occurrence of one of its heading aliases
    and runs until the next recognised heading or the end of the note. The
    match is case-sensitive.

    Parameters
    ----------
    text : str
        Full text of one discharge note.

    Returns
    -------
    dict
        Maps every section name ('CC', 'Service', ..., 'Discharge Diagnosis')
        to the stripped section text, or to None when the heading is absent.
        A non-string `text` (for example a missing value) yields None for
        every section.
    """
    if not isinstance(text, str):
        return {section: None for section in _DISCHARGE_SECTION_PATTERNS}

    extracted_text = {}
    for section, pattern in _DISCHARGE_SECTION_PATTERNS.items():
        match = pattern.search(text)
        extracted_text[section] = match.group(1).strip() if match else None
    return extracted_text



In [None]:
# apply bhanu function: one dict of sections per discharge note, expanded to one column per section
discharge_section_columns = discharge_df['text'].apply(extract_text_by_subheading_discharge_bhanu).apply(pd.Series)

# Concatenating the id columns from discharge_df with the extracted section columns
discharge_sections_df = pd.concat([discharge_df[['subject_id', 'hadm_id']], discharge_section_columns], axis=1)

In [None]:
# Raj method radiology
def extract_text_by_subheading_radiology_raj(text):
    """Split a radiology report into its upper-case headed sections.

    A section starts at its heading ('EXAMINATION:', 'INDICATION:', ...) and
    runs until the next of the known headings or the end of the report. This
    way a report that omits a heading does not make the preceding section
    swallow the rest of the text.

    Parameters
    ----------
    text : str
        Full text of one radiology report.

    Returns
    -------
    dict
        Maps each of the six heading names to the stripped section text, or
        to None when the heading is absent. A non-string `text` (for example
        NaN from a join without a matching report) yields None everywhere.
    """
    headings = ('EXAMINATION', 'INDICATION', 'TECHNIQUE', 'COMPARISON', 'FINDINGS', 'IMPRESSION')

    if not isinstance(text, str):
        return {heading: None for heading in headings}

    extracted_text = {}
    for heading in headings:
        # Any other known heading closes the current section.
        boundaries = '|'.join(f'{other}:' for other in headings if other != heading)
        match = re.search(rf'{heading}:(.*?)(?={boundaries}|\Z)', text, re.DOTALL)
        extracted_text[heading] = match.group(1).strip() if match else None
    return extracted_text



In [None]:
# Run the raj radiology segmentation on every 'radiology_notes' value; each call
# returns a dict of sections, and the list of dicts becomes one column per heading.
radiology_section_records = merged_df['radiology_notes'].apply(extract_text_by_subheading_radiology_raj).tolist()
extracted_df = pd.DataFrame(radiology_section_records)

# Append the section columns to the right of the original merged_df columns
merged_df_expanded = pd.concat([merged_df, extracted_df], axis=1)
merged_df_expanded.head()

In [None]:
# raj method discharge notes


def extract_text_by_subheading_discharge_raj(text):
    """Split a discharge note into sections using line-based heading detection.

    A section starts at the first occurrence of one of its heading aliases and
    runs until the next line that looks like a heading (a newline followed by
    letters/spaces and a colon) or the end of the note.

    Parameters
    ----------
    text : str
        Full text of one discharge note.

    Returns
    -------
    dict
        Maps every section name to the stripped section text, or to None when
        the heading is absent or the section is empty. A non-string `text`
        (for example NaN) yields None for every section.
    """
    # Short abbreviations must not match inside a longer token; otherwise 'SH:'
    # is found inside 'PSH:' and 'PE:' inside 'TYPE:'.
    abbr = r'(?<![A-Za-z])'
    # Shared tail: skip whitespace after the heading, then capture lazily up to
    # the next heading-like line or the end of the text.
    section_body = r'\s*(.*?)(?=\n[A-Za-z ]+:|$)'

    section_starts = {
        'Chief Complaint': rf'{abbr}CC:|Chief Complaint:| \n___ Complaint:',
        'Service': r'Service:|Specialty:|Unit:',
        # Every alias carries its colon so the captured text never starts with ':'.
        'Major Surgical Procedure': r'Major Medical Procedures:|Major Procedures:|Major Surgeries:|Major Surgical or Invasive Procedure:',
        'History of Present Illness': rf'{abbr}HPI:|History of Present Illness:',
        'Past Medical History': rf'{abbr}PMH:|Past Medical History:|PAST MEDICAL HISTORY:',
        'Social History': rf'Social History:|{abbr}SOC:|{abbr}SH:',
        'Family History': rf'Family History:|{abbr}PFH:|{abbr}FH:',
        'Past Surgical History': rf'Past Surgical History:|{abbr}PSH:',
        'Problem List': r'Problem List:|Problems:',
        'Physical Exam': rf'Physical Exam:|{abbr}PE:',
        'Medication lists': r'Admission Medications:|Medications on Admission:|Discharge Medications:|Medications on Discharge:',
        'Pertinent Results': r'Pertinent Results:|Pertinent imaging:|Pertinent Labs:|Pertinent Microbiology:|Pertinent Micro:|Results:',
        'Brief Hospital Course': rf'{abbr}BHC:|Brief Hospital Course:',
        'Disposition': r'Disposition:|Dispo:',
        'Discharge Diagnosis': r'Discharge Diagnosis:',
        'Discharge Instructions': r'Discharge Instructions:|Patient Instructions:',
        'Followup Instructions': r'Followup Instructions:',
        'Transitional Issues': r'Transitional Issues:',
    }

    if not isinstance(text, str):
        return {section: None for section in section_starts}

    extracted_text = {}
    for section, start in section_starts.items():
        match = re.search(rf'(?:{start}){section_body}', text, re.DOTALL)
        # An empty capture is reported as None, like a missing heading.
        extracted_text[section] = (match.group(1).strip() or None) if match else None
    return extracted_text



In [None]:
# Segment every 'discharge_notes' value with the raj method: each call returns a
# dict of sections, and the list of dicts becomes one column per section.
discharge_section_records = merged_df_expanded['discharge_notes'].apply(extract_text_by_subheading_discharge_raj).tolist()
extracted_discharge = pd.DataFrame(discharge_section_records)

# Append the discharge section columns to the right of the existing columns
merged_df_expanded_with_discharge = pd.concat([merged_df_expanded, extracted_discharge], axis=1)
merged_df_expanded_with_discharge.head()
print(merged_df_expanded_with_discharge['radiology_notes'][0])

In [None]:
# save curated, pre-processed data to new file

# clear all loaded data from memory


In [None]:
# load pre-processed data

In [None]:
# HADM ID to query (start with a value from the discharge_target 'hadm_id' column)
hadm_id = 22595853


# question list, relevant_cols (read from csv file 'Pipeline_Prompt_Instructions_Data_Pairs.csv')
question = "What major treatments, surgeries or procedures did the patient receive?"
# Column names must match the section names returned by extract_text_by_subheading_discharge_raj
relevant_cols = ['History of Present Illness', 'Major Surgical Procedure']

answer = query_with_chatgpt(question, merged_df_expanded_with_discharge, relevant_cols, hadm_id)
print(answer)



In [None]:
# slightly different way using the segmented data frames (unmerged)
hadm_id = target_df.hadm_id[0] # select hadm_id from target table

df = merged_df_expanded_with_discharge


def _answers_table(**answers):
    """Return a one-row DataFrame of earlier answers, keyed by the current `hadm_id`.

    This lets the summary steps feed previous answers to query_with_chatgpt
    through the same (df, columns, hadm_id) interface used for note sections.
    """
    return pd.DataFrame([{'hadm_id': hadm_id, **answers}])


# Section column names below match the keys returned by
# extract_text_by_subheading_discharge_raj / extract_text_by_subheading_radiology_raj.

# seeding the conversation 0
prompt = 'This conversation is about clinical notes and documents from electronic health records'
relevant_cols = ['History of Present Illness', 'Major Surgical Procedure']  # Specify columns related to the question
answer = query_with_chatgpt(prompt, df, relevant_cols, hadm_id )

# question 1
question = "What are the patient’s major medical problems?"
relevant_cols = ['History of Present Illness']
Relevant_PMH = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 2
question = "Why was the patient admitted to the hospital?"
relevant_cols = ['History of Present Illness', 'Chief Complaint']
Reason_for_admission = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 3
question = "What tests, diagnostic workup and treatments did the patient receive in the emergency room?"
relevant_cols = ['History of Present Illness']
ED_Summary = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 4
question = 'Combine the following admission and discharge medication lists into a table with the columns "name" ,"admission dose", "discharge dose" ,"route"'
relevant_cols = ['Medication lists']
Med_comp = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 5
question = "What medications were new at discharge?"
relevant_cols = ['Medication lists'] # or input Med_comp from previous question
New_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 6
question = "What medications changed and by how much?"
relevant_cols = ['Medication lists'] # or input Med_comp from previous question
Changed_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 7
question = "What medications were stopped or discontinued?"
relevant_cols = ['Medication lists'] # or input Med_comp from previous question
Discontinued_medications = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# question 8
question = "Based on the medication list, what major medical problems was the patient treated for?"
relevant_cols = ['Medication lists']
Med_indications = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 9
question = "What major treatments, surgeries or procedures did the patient receive? "
relevant_cols = ['Major Surgical Procedure', 'History of Present Illness'] # and add input from Med_comp
Major_treatments = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 10
question = "Summarize the radiological tests and findings"
relevant_cols = ['FINDINGS', 'IMPRESSION'] # also add radiology notes ()
Radiology_findings = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 11
question = "What were the most pertinent lab, radiology, or study results?"
relevant_cols = ['Pertinent Results', 'History of Present Illness'] # add Radiology_findings from above
Pertinent_results = query_with_chatgpt(question, df, relevant_cols, hadm_id )


# question 12
question = "What diagnoses did the patient receive?"
relevant_cols = ['Discharge Diagnosis'] # add in ICD codes, will need to construct a string: the patient received the following diagnosis from ICD version [ICD version]: codes [codes]
Diagnosis_list = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# question 13
# Input is the answers to the questions above rather than note sections.
question = "Create a list of problems the patient was treated for during this hospital admission.  Combine the major medical problems, the reason the patient was admitted, and icd diagnosis"
relevant_cols = ['Diagnosis_list', 'Relevant_PMH', 'Reason_for_admission', 'Major_treatments']
answers_df = _answers_table(
    Diagnosis_list=Diagnosis_list,
    Relevant_PMH=Relevant_PMH,
    Reason_for_admission=Reason_for_admission,
    Major_treatments=Major_treatments,
)
Hospital_problems =  query_with_chatgpt(question, answers_df, relevant_cols, hadm_id )

# All answers collected so far, one column per answer, for the generation steps below.
answers_df = _answers_table(
    Relevant_PMH=Relevant_PMH,
    Reason_for_admission=Reason_for_admission,
    ED_Summary=ED_Summary,
    Med_comp=Med_comp,
    New_medications=New_medications,
    Changed_medications=Changed_medications,
    Discontinued_medications=Discontinued_medications,
    Med_indications=Med_indications,
    Major_treatments=Major_treatments,
    Radiology_findings=Radiology_findings,
    Pertinent_results=Pertinent_results,
    Diagnosis_list=Diagnosis_list,
    Hospital_problems=Hospital_problems,
)
all_answer_cols = [col for col in answers_df.columns if col != 'hadm_id']

# generate outputs
# question 15
question = "Create a summary of the events and treatments during the hospital visit.  Start with a summary sentence describing the patient’s major medical problems and the reason for the hospital admission.  Then create a list of each problem the patient was treated for combining their major medical problems, reason for admission, and ICD diagnoses.  For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
relevant_cols = all_answer_cols # inputs are the answers to previous questions
BHC_generated = query_with_chatgpt(question, answers_df, relevant_cols, hadm_id )

# question 16
question = "Summarize in 1-2 sentences, the patient’s major medical problems and the reason for the hospital admission. You may use common abbreviations of medical terms when possible."
relevant_cols = ['Reason_for_admission', 'Relevant_PMH']
BHC_summary_sentence = query_with_chatgpt(question, answers_df, relevant_cols, hadm_id )

# question 17
# NOTE(review): input selection is inferred from the question text - adjust as needed.
question = "For each problem, describe the presenting symptom and severity, the diagnostic workup and results, and what treatments or procedures they had related to this problem. For each problem, are there any pending results?  Is there recommended follow up related to this problem? What medications did they receive related to this problem?  If they were already being treated with the medication, was there a change from the baseline dose?  Summarize in the form of a bullet point list with a maximum of 12 items.  "
relevant_cols = all_answer_cols
BHC_problem_list = query_with_chatgpt(question, answers_df, relevant_cols, hadm_id )

# stitch together BHC summary sentence and problem list output

# question 18
# NOTE(review): input selection is inferred from the question text - adjust as needed.
question = "Compose a letter to the patient that is courteous and easy to understand.  There should be limited medical jargon and it should be written in layman's language.  The letter will describe briefly the reason for admission and what treatments were given.  It will also include any major changes to the patient's current medical management. Include the following information: why the patient was admitted to the hospital, including major medical condition or symptoms. What were the most relevant diagnostic tests and what did they show? What major treatments did the patient receive? Are there any pending results?  What are the changes to the existing medications? Is there any scheduled or recommended  follow up? Include the following items: "
relevant_cols = ['Reason_for_admission', 'Relevant_PMH', 'Pertinent_results', 'Radiology_findings', 'Major_treatments', 'New_medications', 'Changed_medications', 'Discontinued_medications']
Discharge_instructions_generated = query_with_chatgpt(question, answers_df, relevant_cols, hadm_id )



In [None]:
# scratch cell
# NOTE(review): work-in-progress template, not runnable as written - see the notes below.

# template
# question
# NOTE(review): placeholders - an empty question and [''] as the column list;
# fill both in before running.
question = ""
relevant_cols = ['']
answer = query_with_chatgpt(question, df, relevant_cols, hadm_id )

# Create a context string from the specified columns
# Each row's non-null values are joined with spaces, then all rows are joined into one string.
# NOTE(review): `columns` is not defined at notebook level in the visible cells
# (it is only a parameter of query_with_chatgpt) - presumably `relevant_cols` was meant; confirm.
context_data = df[columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')
prompt = f"Question: {question}\nData: {context_data}\nAnswer:"

# NOTE(review): `query_with_x` is not defined in the visible cells - placeholder for a
# model-specific query function.
ans_1 = query_with_x(prompt)

# to do - new query_function (prompt, data)
# generate prompt with each question.  data = new table with selected cols

In [None]:
hadm_id = target_df.hadm_id[0] # select hadm_id from target table
# Text of the first discharge note recorded for that admission
context = discharge_df[discharge_df['hadm_id'] == hadm_id]['text'].values[0]

question = "what symptoms did the patient present with?"

# TODO: send `question` and `context` to the chosen model.
# The placeholder keeps the cell syntactically valid until that call exists.
answer = None

In [None]:
# Hugging Face access tokens were pasted here in plain text. Secrets must not live in
# the notebook: revoke the exposed tokens and supply a fresh one through the environment.
hf_token = os.getenv("HF_TOKEN")

# openAIGPT

In [None]:
import os
import openai
import pandas as pd

# Set the API key from the OPENAI_API_KEY environment variable so the secret is not
# stored in the notebook. os.getenv returns None when the variable is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")

def query_with_chatgpt(question, df, columns, hadm_id, max_length=4000,
                       max_tokens=150, temperature=0.5,
                       engine="gpt-3.5-turbo-instruct"):
    """Ask an OpenAI completion model a question about one hospital admission.

    Parameters
    ----------
    question : str
        Question placed at the top of the prompt.
    df : pandas.DataFrame
        Table with an 'hadm_id' column and the text columns named in `columns`.
    columns : list of str or str
        Column(s) whose values form the context. A single name is accepted.
    hadm_id
        Admission id; only rows whose 'hadm_id' equals it are used.
    max_length : int, optional
        Maximum number of characters (not tokens) of context kept in the prompt.
    max_tokens : int, optional
        Forwarded to the API as `max_tokens`.
    temperature : float, optional
        Forwarded to the API as `temperature`.
    engine : str, optional
        Forwarded to the API as `engine`.

    Returns
    -------
    str
        The stripped completion text; a fixed message when no row matches
        `hadm_id`; or "Error: ..." when the API call raises.

    Raises
    ------
    KeyError
        If 'hadm_id' or one of `columns` is not a column of `df`.
    """
    # Accept a bare column name as well as a list of names.
    if isinstance(columns, str):
        columns = [columns]

    # Filter the dataframe for the specific hadm_id
    records = df[df['hadm_id'] == hadm_id]

    if records.empty:
        return "No records found for the given HADM ID."

    # Join the non-null values of each row with spaces, then join all rows.
    context_data = records[columns].apply(
        lambda row: ' '.join(row.dropna().astype(str)), axis=1
    ).str.cat(sep=' ')

    # Limit the context length (slicing is a no-op for shorter strings).
    context_data = context_data[:max_length]

    # Create the prompt for the API
    prompt = f"Question: {question}\nData: {context_data}\nAnswer:"

    try:
        # Call the OpenAI API
        response = openai.Completion.create(
            engine=engine,
            prompt=prompt,
            max_tokens=max_tokens,
            temperature=temperature
        )
        # Extract the text from the response
        return response.choices[0].text.strip()
    except Exception as e:
        # Best effort: report the failure as text instead of aborting the calling cell.
        return f"Error: {str(e)}"



In [None]:
# Example usage
question = "What major treatments, surgeries or procedures did the patient receive?"
hadm_id = 22595853  # Specify the HADM ID you want to query
# Column names must match the section names returned by extract_text_by_subheading_discharge_raj
relevant_cols = ['History of Present Illness', 'Major Surgical Procedure']
answer = query_with_chatgpt(question, merged_df_expanded_with_discharge, relevant_cols, hadm_id)
print(answer)

# bioGPT
https://github.com/microsoft/BioGPT/tree/main/src

# requirements
command line:

git clone https://github.com/pytorch/fairseq
cd fairseq
git checkout v0.12.0
pip install .
python setup.py build_ext --inplace
cd ..

git clone https://github.com/moses-smt/mosesdecoder.git
export MOSES=${PWD}/mosesdecoder

git clone https://github.com/glample/fastBPE.git
export FASTBPE=${PWD}/fastBPE
cd fastBPE
g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast

pip install sacremoses

pip install scikit-learn



mkdir checkpoints
cd checkpoints
wget -O Pre-trained-BioGPT.tgz "https://msralaphilly2.blob.core.windows.net/release/BioGPT/checkpoints/Pre-trained-BioGPT.tgz?sp=r&st=2023-11-13T15:37:35Z&se=2099-12-30T23:37:35Z&spr=https&sv=2022-11-02&sr=b&sig=3CcG1TOhqJPBhkVutvVn3PtUq0vPyLBgwggUfojypfY%3D"
tar -zxvf Pre-trained-BioGPT.tgz

# example usage
import torch
from fairseq.models.transformer_lm import TransformerLanguageModel
m = TransformerLanguageModel.from_pretrained(
        "checkpoints/Pre-trained-BioGPT",
        "checkpoint.pt",
        "data",
        tokenizer='moses',
        bpe='fastbpe',
        bpe_codes="data/bpecodes",
        min_len=100,
        max_len_b=1024)
m.cuda()
src_tokens = m.encode("COVID-19 is")
generate = m.generate([src_tokens], beam=5)[0]
output = m.decode(generate[0]["tokens"])
print(output)

In [None]:
import torch
from transformers import BioGptModel, BioGptConfig

# Initializing a BioGPT microsoft/biogpt style configuration
configuration = BioGptConfig()

# Initializing a model from the microsoft/biogpt style configuration
model = BioGptModel(configuration)

# Accessing the model configuration
configuration = model.config

# NOTE(review): `src.transformer_lm_prompt` is presumably the module from the BioGPT
# repository checkout (its src/ directory) - confirm it is importable from the working directory.
from src.transformer_lm_prompt import TransformerLanguageModelPrompt

# One beam width for both model construction and generation
# (the original generate call read an undefined `args.beam`).
beam_size = 1

m = TransformerLanguageModelPrompt.from_pretrained(
        "checkpoints/RE-DTI-BioGPT",
        "checkpoint_avg.pt",
        "data/KD-DTI/relis-bin",
        tokenizer='moses',
        bpe='fastbpe',
        bpe_codes="data/bpecodes",
        max_len_b=1024,
        beam=beam_size)
m.cuda()


src_text="What are the patient's major medical problems?" # input text, e.g., a PubMed abstract
src_tokens = m.encode(src_text)
generate = m.generate([src_tokens], beam=beam_size)[0]
output = m.decode(generate[0]["tokens"])
print(output)

In [None]:
src_text="What are the patient's major medical problems?" # input text, e.g., a PubMed abstract
# Join each row's non-null values with spaces, then join all rows into one string.
# NOTE(review): context_data is built but not yet passed to the model - decide how it
# should be combined with src_text. `relevant_cols` replaces the undefined `columns`; confirm.
context_data = df[relevant_cols].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1).str.cat(sep=' ')

src_tokens = m.encode(src_text)
# beam=1 matches the value the model was constructed with (the original read an undefined `args.beam`).
generate = m.generate([src_tokens], beam=1)[0]
output = m.decode(generate[0]["tokens"])
print(output)

# Amazon Bedrock

# RAG_demonstration

# Scoring

https://github.com/Stanford-AIMI/discharge-me/tree/main


https://github.com/yuh-zha/AlignScore
