In [None]:
#  MIMIC-IV
import pandas as pd
import numpy as np
import time
import math
from collections import defaultdict
import pickle as pkl
import ast
import datetime
from tslearn.metrics import cdist_dtw
import scipy.spatial as sp, scipy.cluster.hierarchy as hc
import seaborn as sns;
import matplotlib.pyplot as plt
sns.set(color_codes=True)
import matplotlib
matplotlib.use('TkAgg')
from sklearn.cluster import AgglomerativeClustering
import json
import warnings
warnings.filterwarnings("ignore")
import re

#### Utility Functions

In [None]:
def process_value_dose_val_rx(value):
    """Normalise a raw ``dose_val_rx`` string so it can be converted to a float.

    Two formatting quirks are handled:

    * ranges written as ``'<low>-<high>'`` -> only the part before the first
      ``'-'`` is kept;
    * ``','`` characters (e.g. ``'1,000'``) -> removed.

    Both rules are applied to the same value, so ``'1,000-2,000'`` becomes
    ``'1000'``. Applying only the range rule would return ``'1,000'``, which
    ``float()`` cannot parse.

    Parameters
    ----------
    value : str
        Raw dose string (the caller passes ``astype(str)`` output).

    Returns
    -------
    str
        The cleaned string; the caller is responsible for the float conversion.
    """
    # split() returns [value] unchanged when there is no '-', so plain values
    # and comma-only values take the same path as ranges.
    lower_bound = value.split('-')[0]
    return lower_bound.replace(',', '')

#### Processing

In [None]:
# Root folder of the MIMIC-IV v2.2 extract; derived files are written to the same folder.
input_file = 'Data/MIMIC/mimic-iv-2.2/'
output_file = input_file
feats = ['subject_id', 'hadm_id', 'pharmacy_id', 'starttime', 'stoptime', 'drug_type', 'drug',
 'formulary_drug_cd', 'dose_val_rx', 'dose_unit_rx', 'form_unit_disp', 'doses_per_24_hrs', 'route']
# Only the columns in feats are used, so only those are parsed from the CSV;
# the trailing [feats] keeps the column order of the list.
df_prescription = pd.read_csv(output_file + "hosp/prescriptions.csv", usecols=feats)[feats]
df_patient = pd.read_csv(output_file + "patient_info.csv") #contains only sepsis patients

# Keep prescriptions of the cohort patients only. The explicit copy makes the
# column assignments below act on an independent frame, not on a filtered view.
# NOTE(review): the match is on subject_id only, so prescriptions from a
# patient's other admissions are kept as well - confirm this is intended.
df_prescription = df_prescription[df_prescription['subject_id'].isin(df_patient['subject_id'].values)].copy()
# .str.lower() leaves missing drug names as NaN; apply(str.lower) would raise a
# TypeError on the first non-string value.
df_prescription['drug'] = df_prescription['drug'].str.lower()

# Parse every timestamp column once so later cells can do datetime arithmetic.
df_prescription['stoptime'] = pd.to_datetime(df_prescription['stoptime'])
df_prescription['starttime'] = pd.to_datetime(df_prescription['starttime'])
df_patient['intime'] = pd.to_datetime(df_patient['intime'])
df_patient['outtime'] = pd.to_datetime(df_patient['outtime'])

In [None]:
# Keep only the prescription rows whose drug name mentions one of these steroids
steroid_drugs = ['prednisolone', 'prednisone', 'hydrocortisone', 'dexamethasone', 'methylprednisolone']
pattern = '|'.join(steroid_drugs)
# drug_sub = the steroid name found in the drug string, NaN when none is found
df_prescription['drug_sub'] = df_prescription['drug'].str.extract(f'(?i)({pattern})', expand=False)
df_prescription_filt = df_prescription.dropna(subset=['drug_sub'])

# Dose amount: clean the raw text with process_value_dose_val_rx, then cast to float
df_prescription_filt['dose_val_rx'] = (
    df_prescription_filt['dose_val_rx']
    .astype(str)
    .apply(process_value_dose_val_rx)
    .astype(float)
)
# Doses per day: a missing frequency is treated as one dose per 24 hours
df_prescription_filt['doses_per_24_hrs'] = (
    df_prescription_filt['doses_per_24_hrs']
    .astype(float)
    .fillna(1)
)
# Collapse the malformed unit label onto plain 'mg'
df_prescription_filt['dose_unit_rx'] = df_prescription_filt['dose_unit_rx'].replace({'mg\\ 0 mg':'mg'})

# Rows without a unit, a start time or a stop time cannot be placed on the daily grid
df_prescription_filt = df_prescription_filt.dropna(subset=['dose_unit_rx', 'starttime', 'stoptime'])

In [None]:
# Creating Patient Dictionary
# pats_dict maps subject_id -> {'ham_id', 'stay_id', 'intime', 'outtime'}, taken
# from the first df_patient row of each patient.
# NOTE(review): the key is spelled 'ham_id' although it stores the hadm_id value;
# it is kept as is because other code may read it under this name.
pats = pd.unique(df_patient['subject_id'])
# One pass over the frame instead of re-filtering the whole frame per patient:
# drop_duplicates(keep='first') yields the same first row per subject_id, in the
# same first-appearance order as pd.unique.
first_rows = df_patient.drop_duplicates(subset='subject_id', keep='first')
pats_dict = {
    pat: {'ham_id': hadm_id, 'stay_id': stay_id, 'intime': intime, 'outtime': outtime}
    for pat, hadm_id, stay_id, intime, outtime in zip(
        first_rows['subject_id'].values,
        first_rows['hadm_id'].values,
        first_rows['stay_id'].values,
        first_rows['intime'].values,
        first_rows['outtime'].values,
    )
}

In [None]:
# Dose units for which _daily_dose_mg has a conversion rule.
_KNOWN_DOSE_UNITS = {'mg', 'mL', 'TAB', 'gtt', 'dose', 'DROP', 'SUPP', 'Appl'}

# Multipliers applied to the daily dose per matched steroid (drug_sub).
# Steroids not listed here (i.e. hydrocortisone) are left unscaled.
_ADJUSTMENT_FACTOR = {
    'prednisolone': 4,
    'prednisone': 4,
    'dexamethasone': 25,
    'methylprednisolone': 5,
}


def _daily_dose_mg(row):
    """Return the total daily dose of one prescription row, or NaN when unknown.

    The value is ``dose_val_rx * doses_per_24_hrs`` scaled by a factor that
    depends on ``dose_unit_rx`` and, for some units, on ``formulary_drug_cd``
    or the drug name. The numeric factors are carried over unchanged from the
    original analysis.

    NaN is returned when the unit/drug combination has no conversion rule, so
    that one unsupported prescription does not abort the whole run.

    Parameters
    ----------
    row : pandas.Series
        Row with 'dose_val_rx', 'doses_per_24_hrs', 'dose_unit_rx', 'drug'
        and 'formulary_drug_cd'.
    """
    per_day = row['dose_val_rx'] * row['doses_per_24_hrs']
    unit = row['dose_unit_rx']
    drug_name = row['drug']
    drug_code = row['formulary_drug_cd']

    if unit == 'mg':
        return per_day
    if unit == 'mL':
        return per_day * 0.1 if drug_code == 'DEXA0.5L' else np.nan
    if unit == 'TAB':
        return per_day if drug_code == 'DEXA2' else np.nan
    if unit == 'gtt':
        #1gtt = 82.15mg
        if drug_name == 'prednisolone acetate 1% ophth. susp.':
            return per_day * 82.15 * 0.01
        return np.nan
    if unit == 'dose':
        return per_day * 10
    if unit == 'DROP':
        # 'dexamethasone' takes precedence over the other names, as before.
        # A name matching none of them has no factor; previously this reused
        # the factor of an earlier row or raised a NameError.
        if 'dexamethasone' in drug_name:
            factor = 1
        elif 'prednisolone acetate' in drug_name:
            factor = 10
        elif 'prednisolone' in drug_name or 'prednisone' in drug_name:
            factor = 15
        else:
            return np.nan
        return per_day * factor
    if unit == 'SUPP':
        return per_day * 25 if drug_code == 'ANHC25R' else np.nan
    if unit == 'Appl':
        if 'dexamethasone' in drug_name:
            # Based on looking up the drug dosage
            return per_day * 1
        if 'hydrocortisone acetate' in drug_name:
            # Based on looking up the drug dosage
            return per_day * 250
        if 'hydrocortisone' in drug_name:
            # The strength is read from a percentage in the drug name; names
            # without a percentage cannot be converted.
            strength = re.search(r'\d+(\.\d+)?%', drug_name)
            if strength is None:
                return np.nan
            return per_day * (float(strength.group()[:-1]) * 10)
        return np.nan
    return np.nan


patients_with_presc = pd.unique(df_prescription_filt['subject_id'])
# Split the prescriptions once instead of scanning the full frame per patient.
prescriptions_by_patient = {
    subject_id: rows for subject_id, rows in df_prescription_filt.groupby('subject_id')
}

for n_done, pat in enumerate(pats_dict, start=1):
    steroid_info = {}
    pats_dict[pat]['steroid_info'] = steroid_info
    df_prescription_pat = prescriptions_by_patient.get(pat)
    if df_prescription_pat is None:
        # No steroid prescription for this patient: every field is an empty list.
        for field in ('starttime', 'stoptime', 'duration', 'duration_days',
                      'drug', 'dose', 'adjusted_dose'):
            steroid_info[field] = []
    else:
        df_prescription_pat = df_prescription_pat.copy()
        elapsed = df_prescription_pat['stoptime'] - df_prescription_pat['starttime']
        # duration: whole minutes; duration_days: number of started 24h periods
        df_prescription_pat['duration'] = elapsed.dt.total_seconds().div(60).astype(int)
        df_prescription_pat['duration_days'] = np.ceil(df_prescription_pat['duration'] / 1440)

        daily_doses = []
        adjusted_doses = []
        for _, pat_row in df_prescription_pat.iterrows():
            if pat_row['dose_unit_rx'] not in _KNOWN_DOSE_UNITS:
                print(f"Couldn't process {pat}")
            daily_dose = _daily_dose_mg(pat_row)
            daily_doses.append(daily_dose)
            adjusted_doses.append(daily_dose * _ADJUSTMENT_FACTOR.get(pat_row['drug_sub'], 1))

        steroid_info['starttime'] = df_prescription_pat['starttime'].values
        steroid_info['stoptime'] = df_prescription_pat['stoptime'].values
        steroid_info['duration'] = df_prescription_pat['duration'].values
        steroid_info['duration_days'] = df_prescription_pat['duration_days'].values
        steroid_info['drug'] = df_prescription_pat['drug_sub'].values
        steroid_info['dose'] = np.array(daily_doses, dtype=float)
        steroid_info['adjusted_dose'] = np.array(adjusted_doses, dtype=float)
    print("Done with " + str(n_done) + " out of " + str(len(pats_dict)) + " patients", end='\r')

In [None]:
# all_info_dict: stay_id -> [subject_id, intime, start times, stop times,
#   start offsets (minutes from intime), start day indices, durations,
#   durations in days, adjusted doses] for the prescriptions kept below.
all_info_dict = {}

all_pat_dfs = []
# Daily grid: one column per day relative to intime, day_-1 ... day_28.
col_titles = [f'day_{num}' for num in range(-1, 29)]

for j, (pat, patient_info) in enumerate(pats_dict.items(), start=1):
    patient_intime = patient_info['intime']
    patient_outtime = patient_info['outtime']
    steroid_info = patient_info['steroid_info']

    # Keep prescriptions that start no later than outtime and have a positive duration.
    steroid_start_times_filt = []
    steroid_end_times_filt = []
    steroid_duration_filt = []
    steroid_duration_days_filt = []
    steroid_dose_filt = []
    for k in range(len(steroid_info['starttime'])):
        if (steroid_info['starttime'][k] <= patient_outtime) and (steroid_info['duration'][k] > 0):
            steroid_start_times_filt.append(steroid_info['starttime'][k])
            steroid_end_times_filt.append(steroid_info['stoptime'][k])
            steroid_duration_filt.append(steroid_info['duration'][k])
            steroid_duration_days_filt.append(steroid_info['duration_days'][k])
            steroid_dose_filt.append(steroid_info['adjusted_dose'][k])

    all_time_deltas = []
    all_days = []
    # Accumulate all prescriptions of the patient into one row of the grid.
    daily_dose = np.zeros(len(col_titles))
    for start, n_days, dose in zip(steroid_start_times_filt, steroid_duration_days_filt, steroid_dose_filt):
        time_delta = start - patient_intime
        all_time_deltas.append(time_delta / np.timedelta64(1, 'm'))
        raw_day = math.ceil(time_delta / np.timedelta64(24, 'h'))
        # The last covered day is derived from the UNCLAMPED start day. Clamping
        # first would shift a prescription that began well before admission into
        # the window and extend it past its real end.
        ending = min(int(raw_day + n_days), 29)
        days = max(raw_day, -1)
        all_days.append(days)
        # Column day_k sits at position k + 1. Prescriptions entirely outside
        # the grid (ending <= days) and unknown doses (NaN) contribute nothing.
        if ending > days and not pd.isna(dose):
            daily_dose[days + 1:ending + 1] += float(dose)

    df_pat_steroid = pd.DataFrame([daily_dose], columns=col_titles)
    df_pat_steroid['patientunitstayid'] = int(patient_info['stay_id'])
    all_pat_dfs.append(df_pat_steroid)
    all_info_dict[patient_info['stay_id']] = [pat, patient_info['intime'], steroid_start_times_filt, steroid_end_times_filt, all_time_deltas, all_days, steroid_duration_filt, steroid_duration_days_filt, steroid_dose_filt]
    print("Done with " + str(j) + " out of " + str(len(pats_dict)) + " patients", end='\r')

# One row per patient; move the id column (appended last) to the front.
df_steroid = pd.concat(all_pat_dfs)
cols = df_steroid.columns.tolist()
cols = [cols[-1]] + cols[:-1]
df_steroid = df_steroid[cols]

In [None]:
def sort_and_sum(lists, threshold=160):
    """Sum the sort keys of the leading records needed to reach a cumulative total.

    The five parallel lists are zipped into one record per position and the
    records are sorted by their first element. Walking the sorted records, the
    fifth element is accumulated until the running total reaches ``threshold``.
    The function then returns the sum of the first elements of the records
    consumed so far.

    Parameters
    ----------
    lists : sequence of five equal-length sequences
        ``lists[0]`` is the sort key and the quantity summed in the result;
        ``lists[4]`` is the quantity accumulated against ``threshold``.
        ``lists[1]`` to ``lists[3]`` are carried along but not used.
        (In this notebook the caller passes the last five entries of an
        ``all_info_dict`` value.)
    threshold : number, default 160
        Cumulative total of ``lists[4]`` at which the walk stops.

    Returns
    -------
    number
        Sum of the ``lists[0]`` values of the records up to and including the
        one that reaches ``threshold``; ``0`` when the lists are empty or the
        threshold is never reached.
    """
    # 1. Zip the lists into one tuple per position
    records = list(zip(lists[0], lists[1], lists[2], lists[3], lists[4]))
    # 2. Sort by the first element in each tuple (stable, so ties keep their order)
    records.sort(key=lambda record: record[0])
    # 3. Accumulate the fifth element until the threshold is reached
    cum_sum = 0
    for position, record in enumerate(records, start=1):
        cum_sum += record[4]
        if cum_sum >= threshold:
            # 4. Sum the first elements of the records consumed so far
            return sum(consumed[0] for consumed in records[:position])
    # Empty input, or the cumulative total never reached the threshold
    return 0


# For every stay in df_steroid, apply sort_and_sum to the last five entries of
# its all_info_dict record; stays without any kept prescription get 0.
all_pat_ids = df_steroid['patientunitstayid'].values
follow_up_tims = []
for stay_id in all_pat_ids:
    trailing_lists = all_info_dict[stay_id][-5:]
    has_prescriptions = len(trailing_lists[0]) > 0
    total = sort_and_sum(trailing_lists) if has_prescriptions else 0
    follow_up_tims.append(np.round(total, 2))

In [None]:
# Write the per-stay daily steroid table next to the input data, without the index column.
df_steroid.to_csv(path_or_buf=output_file + 'steroids.csv', index=False)