# Initial Tests

In [None]:
import pandas as pd
import ast
import numpy as np

from tqdm import tqdm
# from tqdm.auto import tqdm  # for notebooks
tqdm.pandas()

import os
import openai

In [None]:
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

In [None]:
# preprocessed file from Thomas
transfers_events = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/combined_data_export.csv')

In [None]:
# bunch of nan rows
transfers_events = transfers_events[~transfers_events['stay_id'].isna() & ~transfers_events['hadm_id'].isna()]


In [None]:
# grab subject IDs
transfers_events = transfers_events.merge(edstays[['stay_id', "subject_id"]], on="stay_id", how="left")

In [None]:
# discharge summaries
discharges = pd.read_csv("/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/train/discharge.csv.gz")

# ed stays
edstays = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/train/edstays.csv.gz')

# triage
triage = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/train/triage.csv.gz')

In [None]:
triage.stay_id.nunique(), triage.subject_id.nunique(), triage.shape

In [None]:
# ward transfers
transfers = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/transfers.csv.gz')

# higher-level services (ICU, CARD, etc)
services = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/services.csv.gz')

# get patient info
pts = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/patients.csv.gz')

# admission demographics
admissions = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/admissions.csv.gz')

# procedures
procs = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/procedures_icd.csv.gz')
procs_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_procedures.csv.gz')

# diagnoses
diags = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/diagnoses_icd.csv.gz')
diags_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_diagnoses.csv.gz')

# meds
med_orders = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/emar.csv.gz')


In [None]:
med_orders = med_orders[~med_orders['charttime'].isna() & 
                        ~med_orders['medication'].isna() & 
                        ~med_orders['event_txt'].isna()]

In [None]:
procs = procs.astype({"chartdate":"datetime64[ns]"})
med_orders = med_orders.astype({"charttime":"datetime64[ns]",
                  "scheduletime":"datetime64[ns]",
                  "storetime":"datetime64[ns]",})

discharges = discharges.astype({"charttime":"datetime64[ns]",
                               "storetime":"datetime64[ns]"})

In [None]:
# are there multiple icd code mappings? 
procs_icd.shape, procs_icd.icd_code.nunique()
# looks like only for the two versions of ICD

In [None]:
procs_icd[procs_icd['icd_code'] == "067"]

In [None]:
# drop any potential repeats
procs_icd = procs_icd.groupby(["icd_code", "icd_version"]).first().reset_index()
diags_icd = diags_icd.groupby(["icd_code", "icd_version"]).first().reset_index()

In [None]:
# grab long_titles for procs/diags
procs = procs.merge(procs_icd, on=["icd_code", "icd_version"], how="left")
diags = diags.merge(diags_icd, on=["icd_code", "icd_version"], how="left")

In [None]:

# why would someone have multiple ED stays for the same admission?
# looks like they came and left the ED multiple times during the course of their hospital course? 
transfers_events[transfers_events['hadm_id'] == 23880210].iloc[0]['careunit']

In [None]:
transfers[transfers['hadm_id'] == 29999828].sort_values("intime")[['intime',"outtime",'careunit']]

# Subset Data by Challenge Cohort

## Read in Challenge Data

In [1]:
import pandas as pd

In [2]:
challenge_data_fp = "/gpfs/milgram/project/rtaylor/shared/DischargeMe/public/"


In [3]:
target_train = pd.read_csv(challenge_data_fp + "train/discharge_target.csv.gz", keep_default_na=False)
target_valid = pd.read_csv(challenge_data_fp + "valid/discharge_target.csv.gz", keep_default_na=False)
target_test = pd.read_csv(challenge_data_fp + "test_phase_1/discharge_target.csv.gz", keep_default_na=False)


In [4]:
target_train

Unnamed: 0,note_id,hadm_id,discharge_instructions,brief_hospital_course,discharge_instructions_word_count,brief_hospital_course_word_count
0,15373895-DS-19,28448473,"Dear Mr. ___,\n\nIt was a pleasure taking care...",Mr. ___ was admitted from the emergency depart...,760,398
1,19045496-DS-24,22343752,You were admitted to the hospital after walkin...,___ year old female with past medical history ...,302,515
2,15071083-DS-16,24572540,you were hospitalized for gi bleeding that was...,"___ gentleman DM2, CAD s/p CABG, ischemic card...",73,579
3,11669075-DS-16,25889399,"Dear Mr. ___,\n\nWHY WERE YOU ADMITTED TO THE ...",Mr. ___ is a pleasant ___ y/o gentleman with a...,113,42
4,17164417-DS-7,24903173,Take your pain medicine as prescribed.\n\n¨ ...,Mr. ___ was admitted from the emergency depart...,226,167
...,...,...,...,...,...,...
68780,14439892-DS-19,26082244,"Dear Mr. ___,\n\nYou were admitted to ___ due ...",Mr. ___ is a ___ year old man with a history o...,125,93
68781,19466866-DS-13,25505122,"Dear Mr. ___,\n\nThank you for choosing ___ fo...",___ is a ___ man with known metastatic \nmelan...,184,779
68782,17394776-DS-7,25400280,"Dear Ms. ___,\n\nIt was a pleasure taking care...","___ w/ history of remote cutaneous lymphoma, e...",134,637
68783,17261183-DS-24,26769375,"Dear Ms. ___, \n\nYou were admitted to the hos...",___ is a ___ yo F with mild static encephalopa...,95,176


In [3]:
target_train = pd.read_csv(challenge_data_fp + "train/discharge_target.csv.gz", keep_default_na=False)
target_valid = pd.read_csv(challenge_data_fp + "valid/discharge_target.csv.gz", keep_default_na=False)
target_test = pd.read_csv(challenge_data_fp + "test_phase_1/discharge_target.csv.gz", keep_default_na=False)

discharge_summ_train = pd.read_csv(challenge_data_fp + "train/discharge.csv.gz", keep_default_na=False)
discharge_summ_valid = pd.read_csv(challenge_data_fp + "valid/discharge.csv.gz", keep_default_na=False)
discharge_summ_test = pd.read_csv(challenge_data_fp + "test_phase_1/discharge.csv.gz", keep_default_na=False)

radiology_train = pd.read_csv(challenge_data_fp + "train/radiology.csv.gz", keep_default_na=False)
radiology_valid = pd.read_csv(challenge_data_fp + "valid/radiology.csv.gz", keep_default_na=False)
radiology_test = pd.read_csv(challenge_data_fp + "test_phase_1/radiology.csv.gz", keep_default_na=False)

edstay_train = pd.read_csv(challenge_data_fp + "train/edstays.csv.gz", keep_default_na=False)
edstay_valid = pd.read_csv(challenge_data_fp + "valid/edstays.csv.gz", keep_default_na=False)
edstay_test = pd.read_csv(challenge_data_fp + "test_phase_1/edstays.csv.gz", keep_default_na=False)

edtriage_train = pd.read_csv(challenge_data_fp + "train/triage.csv.gz", keep_default_na=False)
edtriage_valid = pd.read_csv(challenge_data_fp + "valid/triage.csv.gz", keep_default_na=False)
edtriage_test = pd.read_csv(challenge_data_fp + "test_phase_1/triage.csv.gz", keep_default_na=False)

eddiags_train = pd.read_csv(challenge_data_fp + "train/diagnosis.csv.gz", keep_default_na=False)
eddiags_valid = pd.read_csv(challenge_data_fp + "valid/diagnosis.csv.gz", keep_default_na=False)
eddiags_test = pd.read_csv(challenge_data_fp + "test_phase_1/diagnosis.csv.gz", keep_default_na=False)

In [4]:
target_train['split'] = "train"
target_valid['split'] = "valid"
target_test['split'] = "test"
discharge_summ_train['split'] = "train"
discharge_summ_valid['split'] = "valid"
discharge_summ_test['split'] = "test"
radiology_train['split'] = "train"
radiology_valid['split'] = "valid"
radiology_test['split'] = "test"
edstay_train['split'] = "train"
edstay_valid['split'] = "valid"
edstay_test['split'] = "test"
edtriage_train['split'] = "train"
edtriage_valid['split'] = "valid"
edtriage_test['split'] = "test"
eddiags_train['split'] = "train"
eddiags_valid['split'] = "valid"
eddiags_test['split'] = "test"

In [5]:
target = pd.concat([target_train, target_valid, target_test])
discharge_summ = pd.concat([discharge_summ_train, discharge_summ_valid, discharge_summ_test])
radiology = pd.concat([radiology_train, radiology_valid, radiology_test])
edstay = pd.concat([edstay_train, edstay_valid, edstay_test])
edtriage = pd.concat([edtriage_train, edtriage_valid, edtriage_test])
eddiags = pd.concat([eddiags_train, eddiags_valid, eddiags_test])

In [6]:
target = target.reset_index()
discharge_summ = discharge_summ.reset_index()
radiology = radiology.reset_index()
edstay = edstay.reset_index()
edtriage = edtriage.reset_index()
eddiags = eddiags.reset_index()

## Read in MIMIC-IV Data

In [8]:
# ward transfers
transfers = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/transfers.csv.gz')

# higher-level services (ICU, CARD, etc)
services = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/services.csv.gz')

# get patient info
pts = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/patients.csv.gz')

# admission demographics
admissions = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/admissions.csv.gz')

# procedures
procs = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/procedures_icd.csv.gz')
procs_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_procedures.csv.gz')

# diagnoses
diags = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/diagnoses_icd.csv.gz')
diags_icd = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_icd_diagnoses.csv.gz')

# meds
med_orders = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/emar.csv.gz')


In [9]:
# drop any potential repeats
procs_icd = procs_icd.groupby(["icd_code", "icd_version"]).first().reset_index()
diags_icd = diags_icd.groupby(["icd_code", "icd_version"]).first().reset_index()

In [10]:
# grab long_titles for procs/diags
procs = procs.merge(procs_icd, on=["icd_code", "icd_version"], how="left")
diags = diags.merge(diags_icd, on=["icd_code", "icd_version"], how="left")

In [11]:
prescriptions = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/prescriptions.csv.gz')
labs = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/labevents.csv.gz')
microbio = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/microbiologyevents.csv.gz')

  prescriptions = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/prescriptions.csv.gz')
  microbio = pd.read_csv('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/microbiologyevents.csv.gz')


In [12]:
labs_icd = pd.read_csv("/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/d_labitems.csv.gz")

In [13]:
labs = labs.merge(labs_icd, on="itemid", how="left")

## Subset

In [14]:
cohort_subject_ids = edstay['subject_id'].unique()
cohort_hadm_ids = edstay['hadm_id'].unique()
cohort_stay_ids = edstay['stay_id'].unique()

In [15]:
len(cohort_subject_ids), len(cohort_hadm_ids), len(cohort_stay_ids)

(61316, 98206, 98418)

In [16]:
transfers = transfers[transfers['subject_id'].isin(cohort_subject_ids)]
services = services[services['subject_id'].isin(cohort_subject_ids)]
pts = pts[pts['subject_id'].isin(cohort_subject_ids)]
admissions = admissions[admissions['hadm_id'].isin(cohort_hadm_ids)]
med_orders = med_orders[med_orders['subject_id'].isin(cohort_subject_ids)]
procs = procs[procs['hadm_id'].isin(cohort_hadm_ids)]
diags = diags[diags['hadm_id'].isin(cohort_hadm_ids)]
prescriptions = prescriptions[prescriptions['hadm_id'].isin(cohort_hadm_ids)]
labs = labs[labs['subject_id'].isin(cohort_subject_ids)]
microbio = microbio[microbio['subject_id'].isin(cohort_subject_ids)]

In [17]:
transfers.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_transfers.pkl')
services.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_services.pkl')
pts.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_patients.pkl')
admissions.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_admissions.pkl')
med_orders.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_emar.pkl')
procs.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_procedures_icd.pkl')
diags.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_diagnoses_icd.pkl')
prescriptions.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_prescriptions.pkl')
labs.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_labevents.pkl')
microbio.to_pickle('/gpfs/milgram/project/rtaylor/shared/DischargeMe/mimiciv/hosp/cohort_microbiologyevents.pkl')