# Prepare Single Note Brief Hospital Course Dataset

This notebook builds a baseline training dataset for generating the brief hospital course: the model input is the text of the discharge summary that precedes the "Brief Hospital Course" section, and no other data sources are included.

In [50]:
import pandas as pd
import re
import tiktoken

In [2]:
# Root directory of the challenge data. It must end with "/" because every
# file path in this notebook is built by plain string concatenation onto it.
challenge_data_fp = "/gpfs/gibbs/project/rtaylor/shared/DischargeMe/public/"


In [3]:
# Read the target table of each split. keep_default_na=False is passed so that
# pandas does not apply its default NaN-marker parsing to the text columns.
target_train, target_valid, target_test = (
    pd.read_csv(f"{challenge_data_fp}{split_dir}/discharge_target.csv.gz", keep_default_na=False)
    for split_dir in ("train", "valid", "test_phase_1")
)


In [4]:
# Inspect the training targets as loaded from disk.
target_train

Unnamed: 0,note_id,hadm_id,discharge_instructions,brief_hospital_course,discharge_instructions_word_count,brief_hospital_course_word_count
0,15373895-DS-19,28448473,"Dear Mr. ___,\n\nIt was a pleasure taking care...",Mr. ___ was admitted from the emergency depart...,760,398
1,19045496-DS-24,22343752,You were admitted to the hospital after walkin...,___ year old female with past medical history ...,302,515
2,15071083-DS-16,24572540,you were hospitalized for gi bleeding that was...,"___ gentleman DM2, CAD s/p CABG, ischemic card...",73,579
3,11669075-DS-16,25889399,"Dear Mr. ___,\n\nWHY WERE YOU ADMITTED TO THE ...",Mr. ___ is a pleasant ___ y/o gentleman with a...,113,42
4,17164417-DS-7,24903173,Take your pain medicine as prescribed.\n\n¨ ...,Mr. ___ was admitted from the emergency depart...,226,167
...,...,...,...,...,...,...
68780,14439892-DS-19,26082244,"Dear Mr. ___,\n\nYou were admitted to ___ due ...",Mr. ___ is a ___ year old man with a history o...,125,93
68781,19466866-DS-13,25505122,"Dear Mr. ___,\n\nThank you for choosing ___ fo...",___ is a ___ man with known metastatic \nmelan...,184,779
68782,17394776-DS-7,25400280,"Dear Ms. ___,\n\nIt was a pleasure taking care...","___ w/ history of remote cutaneous lymphoma, e...",134,637
68783,17261183-DS-24,26769375,"Dear Ms. ___, \n\nYou were admitted to the hos...",___ is a ___ yo F with mild static encephalopa...,95,176


In [5]:
def _load_splits(table_name):
    """Read one challenge table from the train, valid and test_phase_1 folders.

    Returns a 3-tuple of DataFrames in (train, valid, test) order. Files are
    read with keep_default_na=False, exactly as elsewhere in this notebook.
    """
    return tuple(
        pd.read_csv(f"{challenge_data_fp}{split_dir}/{table_name}.csv.gz", keep_default_na=False)
        for split_dir in ("train", "valid", "test_phase_1")
    )


# Targets and the discharge summaries they were derived from.
target_train, target_valid, target_test = _load_splits("discharge_target")
discharge_summ_train, discharge_summ_valid, discharge_summ_test = _load_splits("discharge")

# Additional tables shipped with the challenge data.
radiology_train, radiology_valid, radiology_test = _load_splits("radiology")
edstay_train, edstay_valid, edstay_test = _load_splits("edstays")
edtriage_train, edtriage_valid, edtriage_test = _load_splits("triage")
eddiags_train, eddiags_valid, eddiags_test = _load_splits("diagnosis")

In [6]:
# Tag every frame with the split it was loaded from, so rows can still be told
# apart once the per-split frames are concatenated.
split_labels = ("train", "valid", "test")
frames_by_table = (
    (target_train, target_valid, target_test),
    (discharge_summ_train, discharge_summ_valid, discharge_summ_test),
    (radiology_train, radiology_valid, radiology_test),
    (edstay_train, edstay_valid, edstay_test),
    (edtriage_train, edtriage_valid, edtriage_test),
    (eddiags_train, eddiags_valid, eddiags_test),
)
for table_frames in frames_by_table:
    for frame, split_label in zip(table_frames, split_labels):
        frame['split'] = split_label

In [7]:
# Stack train, valid and test (in that order) into one frame per table.
# Row labels are not renumbered here, so each split keeps its own labels.
target = pd.concat((target_train, target_valid, target_test))
discharge_summ = pd.concat((discharge_summ_train, discharge_summ_valid, discharge_summ_test))
radiology = pd.concat((radiology_train, radiology_valid, radiology_test))
edstay = pd.concat((edstay_train, edstay_valid, edstay_test))
edtriage = pd.concat((edtriage_train, edtriage_valid, edtriage_test))
eddiags = pd.concat((eddiags_train, eddiags_valid, eddiags_test))

In [8]:
# Give each combined frame a fresh 0..n-1 row index. reset_index() is called
# without drop=True, so the previous row labels are kept as an 'index' column.
target, discharge_summ, radiology, edstay, edtriage, eddiags = (
    frame.reset_index()
    for frame in (target, discharge_summ, radiology, edstay, edtriage, eddiags)
)

In [9]:
# Inspect the combined discharge summaries (train, valid and test rows).
discharge_summ

Unnamed: 0,index,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,split
0,0,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...,train
1,1,10000117-DS-21,10000117,22927623,DS,21,2181-11-15 00:00:00,2181-11-15 15:04:00,\nName: ___ Unit No: ___\n...,train
2,2,10000935-DS-19,10000935,21738619,DS,19,2187-07-12 00:00:00,2187-07-12 14:01:00,\nName: ___ Unit No: ___...,train
3,3,10000935-DS-21,10000935,25849114,DS,21,2187-10-26 00:00:00,2187-10-27 15:36:00,\nName: ___ Unit No: ___...,train
4,4,10000980-DS-20,10000980,29654838,DS,20,2188-01-05 00:00:00,2188-01-06 20:49:00,\nName: ___ Unit No: ___\n \nAdmi...,train
...,...,...,...,...,...,...,...,...,...,...
98201,14697,19997293-DS-10,19997293,26366652,DS,10,2123-11-19 00:00:00,2124-07-01 14:15:00,\nName: ___ Unit No: _...,test
98202,14698,19998330-DS-17,19998330,23151993,DS,17,2178-09-23 00:00:00,2178-09-24 11:43:00,\nName: ___ Unit No: ...,test
98203,14699,19998497-DS-9,19998497,21557581,DS,9,2145-08-01 00:00:00,2145-08-01 14:13:00,\nName: ___ Unit No: ___\n \...,test
98204,14700,19998562-DS-21,19998562,26846592,DS,21,2166-04-16 00:00:00,2166-04-17 16:08:00,\nName: ___ Unit No: ___\...,test


In [36]:
BHC_HEADING = "Brief Hospital Course:"


def extract_preceding_text(note_text):
    """Return the part of a discharge summary that precedes the BHC heading.

    Parameters
    ----------
    note_text : str
        Full text of one discharge summary.

    Returns
    -------
    str or None
        Everything before the first occurrence of ``BHC_HEADING``, or ``None``
        when the note does not contain the heading at all.

    Notes
    -----
    The previous implementation used a regex that also matched the section
    body and a following section header, then indexed ``[0][0]``. Whenever
    that regex matched, its first group was exactly the text before the first
    heading, so the result here is the same for those notes. The difference is
    that notes with no later section header no longer fail, and a missing
    heading is reported explicitly instead of as an ``IndexError``.
    """
    preceding, heading, _ = note_text.partition(BHC_HEADING)
    # partition() returns an empty separator when the heading is absent.
    return preceding if heading else None


preceding_text = discharge_summ['text'].apply(extract_preceding_text)

# Fail loudly, naming the offending notes, rather than exporting rows whose
# model input is missing.
missing_heading = preceding_text.isna()
if missing_heading.any():
    example_ids = discharge_summ.loc[missing_heading, 'note_id'].head().tolist()
    raise ValueError(
        f"{int(missing_heading.sum())} discharge summaries have no "
        f"{BHC_HEADING!r} heading, e.g. note_id {example_ids}"
    )

discharge_summ['preceding_text'] = preceding_text

In [37]:
# Show the training rows of the annotated notes next to the training targets.
display(discharge_summ[discharge_summ['split'] == "train"], target[target['split'] == "train"])

Unnamed: 0,index,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,split,preceding_text
0,0,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...,train,\nName: ___ Unit No: _...
1,1,10000117-DS-21,10000117,22927623,DS,21,2181-11-15 00:00:00,2181-11-15 15:04:00,\nName: ___ Unit No: ___\n...,train,\nName: ___ Unit No: ___\n...
2,2,10000935-DS-19,10000935,21738619,DS,19,2187-07-12 00:00:00,2187-07-12 14:01:00,\nName: ___ Unit No: ___...,train,\nName: ___ Unit No: ___...
3,3,10000935-DS-21,10000935,25849114,DS,21,2187-10-26 00:00:00,2187-10-27 15:36:00,\nName: ___ Unit No: ___...,train,\nName: ___ Unit No: ___...
4,4,10000980-DS-20,10000980,29654838,DS,20,2188-01-05 00:00:00,2188-01-06 20:49:00,\nName: ___ Unit No: ___\n \nAdmi...,train,\nName: ___ Unit No: ___\n \nAdmi...
...,...,...,...,...,...,...,...,...,...,...,...
68780,68780,19999068-DS-14,19999068,21606769,DS,14,2161-09-02 00:00:00,2161-09-02 14:37:00,\nName: ___ Unit No: ...,train,\nName: ___ Unit No: ...
68781,68781,19999287-DS-7,19999287,22997012,DS,7,2197-07-31 00:00:00,2197-08-01 17:48:00,\nName: ___ Unit No: __...,train,\nName: ___ Unit No: __...
68782,68782,19999784-DS-7,19999784,26194817,DS,7,2119-07-02 00:00:00,2119-07-03 16:21:00,\nName: ___ Unit No: __...,train,\nName: ___ Unit No: __...
68783,68783,19999828-DS-6,19999828,29734428,DS,6,2147-08-04 00:00:00,2147-08-12 15:36:00,\nName: ___ Unit No: ___...,train,\nName: ___ Unit No: ___...


Unnamed: 0,index,note_id,hadm_id,discharge_instructions,brief_hospital_course,discharge_instructions_word_count,brief_hospital_course_word_count,split
0,0,15373895-DS-19,28448473,"Dear Mr. ___,\n\nIt was a pleasure taking care...",Mr. ___ was admitted from the emergency depart...,760,398,train
1,1,19045496-DS-24,22343752,You were admitted to the hospital after walkin...,___ year old female with past medical history ...,302,515,train
2,2,15071083-DS-16,24572540,you were hospitalized for gi bleeding that was...,"___ gentleman DM2, CAD s/p CABG, ischemic card...",73,579,train
3,3,11669075-DS-16,25889399,"Dear Mr. ___,\n\nWHY WERE YOU ADMITTED TO THE ...",Mr. ___ is a pleasant ___ y/o gentleman with a...,113,42,train
4,4,17164417-DS-7,24903173,Take your pain medicine as prescribed.\n\n¨ ...,Mr. ___ was admitted from the emergency depart...,226,167,train
...,...,...,...,...,...,...,...,...
68780,68780,14439892-DS-19,26082244,"Dear Mr. ___,\n\nYou were admitted to ___ due ...",Mr. ___ is a ___ year old man with a history o...,125,93,train
68781,68781,19466866-DS-13,25505122,"Dear Mr. ___,\n\nThank you for choosing ___ fo...",___ is a ___ man with known metastatic \nmelan...,184,779,train
68782,68782,17394776-DS-7,25400280,"Dear Ms. ___,\n\nIt was a pleasure taking care...","___ w/ history of remote cutaneous lymphoma, e...",134,637,train
68783,68783,17261183-DS-24,26769375,"Dear Ms. ___, \n\nYou were admitted to the hos...",___ is a ___ yo F with mild static encephalopa...,95,176,train


In [44]:
def attach_preceding_text(targets, notes):
    """Left-join each target row with the preceding text of its note.

    Parameters
    ----------
    targets : pandas.DataFrame
        Target table for one split; must contain a ``note_id`` column.
    notes : pandas.DataFrame
        Discharge summaries with ``note_id`` and ``preceding_text`` columns.

    Returns
    -------
    pandas.DataFrame
        ``targets`` plus a ``preceding_text`` column and the merge
        ``indicator`` column, with one output row per input row.

    Raises
    ------
    ValueError
        If the merge changes the number of rows (duplicate ``note_id`` values
        in ``notes``) or if any target row has no matching note.
    """
    merged = (
        targets
        # Drop the columns from a previous run of this cell so that it can be
        # re-run; otherwise pandas rejects the already-existing indicator column.
        .drop(columns=["preceding_text", "indicator"], errors="ignore")
        .merge(notes[["note_id", "preceding_text"]], on="note_id", how="left", indicator="indicator")
    )

    if len(merged) != len(targets):
        raise ValueError(
            f"merge changed the row count from {len(targets)} to {len(merged)}; "
            "note_id is not unique in the notes table"
        )

    # The indicator column was requested but never inspected before; check it
    # so that targets without an input text cannot reach the export step.
    unmatched = merged["indicator"] != "both"
    if unmatched.any():
        example_ids = merged.loc[unmatched, "note_id"].head().tolist()
        raise ValueError(
            f"{int(unmatched.sum())} target rows have no matching discharge summary, "
            f"e.g. note_id {example_ids}"
        )
    return merged


target_train = attach_preceding_text(target_train, discharge_summ)
target_valid = attach_preceding_text(target_valid, discharge_summ)

In [42]:
# Inspect the training targets after the preceding text has been merged in.
target_train

Unnamed: 0,note_id,hadm_id,discharge_instructions,brief_hospital_course,discharge_instructions_word_count,brief_hospital_course_word_count,split,preceding_text,indicator
0,15373895-DS-19,28448473,"Dear Mr. ___,\n\nIt was a pleasure taking care...",Mr. ___ was admitted from the emergency depart...,760,398,train,\nName: ___ Unit No: __...,both
1,19045496-DS-24,22343752,You were admitted to the hospital after walkin...,___ year old female with past medical history ...,302,515,train,\nName: ___ Unit No: ___\n...,both
2,15071083-DS-16,24572540,you were hospitalized for gi bleeding that was...,"___ gentleman DM2, CAD s/p CABG, ischemic card...",73,579,train,\nName: ___. Unit No: ___\n \nAdmis...,both
3,11669075-DS-16,25889399,"Dear Mr. ___,\n\nWHY WERE YOU ADMITTED TO THE ...",Mr. ___ is a pleasant ___ y/o gentleman with a...,113,42,train,\nName: ___ Unit No: _...,both
4,17164417-DS-7,24903173,Take your pain medicine as prescribed.\n\n¨ ...,Mr. ___ was admitted from the emergency depart...,226,167,train,\nName: ___ Unit No: _...,both
...,...,...,...,...,...,...,...,...,...
68780,14439892-DS-19,26082244,"Dear Mr. ___,\n\nYou were admitted to ___ due ...",Mr. ___ is a ___ year old man with a history o...,125,93,train,\nName: ___ Unit No: ___...,both
68781,19466866-DS-13,25505122,"Dear Mr. ___,\n\nThank you for choosing ___ fo...",___ is a ___ man with known metastatic \nmelan...,184,779,train,\nName: ___ Unit No: ___...,both
68782,17394776-DS-7,25400280,"Dear Ms. ___,\n\nIt was a pleasure taking care...","___ w/ history of remote cutaneous lymphoma, e...",134,637,train,\nName: ___ Unit No: ___\n \...,both
68783,17261183-DS-24,26769375,"Dear Ms. ___, \n\nYou were admitted to the hos...",___ is a ___ yo F with mild static encephalopa...,95,176,train,\nName: ___ Unit No: ___\n ...,both


In [52]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that the named tiktoken encoding produces for `string`.

    `encoding_name` is passed to `tiktoken.get_encoding`; the return value is
    the length of the encoded token sequence.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

In [55]:
# Token count of each training example: the target text concatenated with its
# input text, encoded with the "cl100k_base" encoding.
target_train['tokens'] = [
    num_tokens_from_string(course + preceding, "cl100k_base")
    for course, preceding in zip(target_train['brief_hospital_course'], target_train['preceding_text'])
]

In [56]:
# Summary statistics of the per-example token counts computed above.
target_train['tokens'].describe()

count    68785.000000
mean      2456.550294
std       1097.577321
min        102.000000
25%       1721.000000
50%       2340.000000
75%       3032.000000
max      14117.000000
Name: tokens, dtype: float64

In [66]:
# The prompt is added in the Mistral fine-tuning notebook, so it does not have to be built here.
# target_train['prompt_with_preceding_text'] = "Summarize the following patient hospital encounter into a brief hospital course:\n\n" + target_train['preceding_text']
# target_valid['prompt_with_preceding_text'] = "Summarize the following patient hospital encounter into a brief hospital course:\n\n" + target_valid['preceding_text']

In [68]:
# Export the training pairs as a JSON array of {"input", "output"} records.
train_pairs = target_train[['preceding_text', 'brief_hospital_course']].rename(
    columns={"preceding_text": "input", "brief_hospital_course": "output"}
)
train_pairs.to_json(challenge_data_fp + "train/simple_train.json", orient="records")

In [69]:
# Export the validation pairs as a JSON array of {"input", "output"} records.
valid_pairs = target_valid[['preceding_text', 'brief_hospital_course']].rename(
    columns={"preceding_text": "input", "brief_hospital_course": "output"}
)
valid_pairs.to_json(challenge_data_fp + "valid/simple_valid.json", orient="records")