In [None]:
import pandas as pd
import sys 
from matplotlib import pyplot as plt
import os
%matplotlib inline
sys.path.append('../')
from src.plots import add_panel_text
import numpy as np

from src.constants import *
# Output and data locations. The defaults match the original container layout;
# set MIMIC_OUTPUT_DIR / MIMIC_DATA_DIR to run elsewhere without editing the notebook.
OUTPUT_DIR = os.environ.get('MIMIC_OUTPUT_DIR', '/app/output')
DATA_DIR = os.environ.get('MIMIC_DATA_DIR', '/app/data/mimic-iv-2.0/')

# Load Data

In [None]:
# Paths of the tables read by this notebook; all live under DATA_DIR/hosp.
(patients_file,
 admissions_file,
 lab_file,
 lab_meta_file) = (
    os.path.join(DATA_DIR, 'hosp', table_file)
    for table_file in ('patients.csv.gz', 'admissions.csv.gz', 'labevents.csv.gz', 'd_labitems.csv.gz')
)

In [None]:
patients_df = pd.read_csv(patients_file, compression='gzip')
patients_df.head()

In [None]:
COLUMNS_TO_DROP = ['dod']
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
patients_df = patients_df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [None]:
print(len(patients_df))

In [None]:
patients_df.dtypes

In [None]:
# fig, ax = plt.subplots(1,1,dpi=100)
# tmp = patients_df[[AGE_COL, GENDER_COL]]
# tmp[AGE_COL] = pd.cut(tmp[AGE_COL], bins=AGE_BINS, labels=AGE_LABELS)
# tmp.groupby([AGE_COL, GENDER_COL]).size().unstack().plot(kind='bar', ax=ax)
# ax.set_xlabel('Anchor Age [years]', fontsize=font_sz)
# ax.set_ylabel('Number of Patients', fontsize=font_sz)
# ax.set_title(f'Total Population, N={len(tmp)}', fontsize=font_sz)
# ax.legend(labels=['Female', 'Male'], title="Sex")
# ax.set_xticklabels(AGE_LABELS, rotation=90)
# fig.savefig(os.path.join(OUTPUT_DIR, 'age_gender_total.png'), dpi=300)

In [None]:
# fig, ax = plt.subplots(1,1,dpi=100)
# patients_df[YEAR_GROUP_COL].value_counts().plot.bar(ax=ax)
# ax.set_ylabel('Number of Patients', fontsize=font_sz)
# ax.set_xlabel('Anchor Year Group', fontsize=font_sz)
# for p in ax.patches:
#     ax.annotate(str(p.get_height()), (p.get_x(), p.get_height() * 1.01))

In [None]:
# Read the admissions table, parsing its timestamp columns as datetimes.
ADMISSION_DATE_COLUMNS = [ADMISSION_TIME_COL, DISCHARGE_TIME_COL, DEATH_TIME_COL, ED_REG_TIME, ED_OUT_TIME]
admissions_df = pd.read_csv(admissions_file, compression='gzip', parse_dates=ADMISSION_DATE_COLUMNS)
admissions_df

In [None]:
COLUMNS_TO_DROP = ['hospital_expire_flag', 'edouttime', 'edregtime', 'deathtime', 'language']
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so that re-running this cell does not raise a KeyError.
admissions_df = admissions_df.drop(columns=COLUMNS_TO_DROP, errors='ignore')

In [None]:
admissions_df = admissions_df.merge(patients_df, on=[SUBJECT_ID_COL])
admissions_df.shape

# Calculate Age at Admission and Group of Admission Year

Based on the MIMIC-IV documentation example: https://mimic.mit.edu/docs/iv/modules/hosp/patients/

In [None]:
# Whole years between the patient's anchor year and the admission.
years_since_anchor = admissions_df[ADMISSION_TIME_COL].dt.year - admissions_df['anchor_year']

# First year of the anchor year group (the text before the first space).
year_group_start = admissions_df[YEAR_GROUP_COL].apply(lambda group: int(group.split(' ')[0]))

# Age at admission = anchor age shifted by the same number of years.
admissions_df[ADMISSION_AGE_COL] = admissions_df[AGE_COL] + years_since_anchor

# Lower bound of the admission year = start of the year group plus the shift.
admissions_df[ADMISSION_YEAR_COL] = years_since_anchor + year_group_start

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
admissions_per_year = admissions_df[ADMISSION_YEAR_COL].value_counts().sort_index()
admissions_per_year.plot.bar(ax=ax)
ax.set_xlabel('Admission Year (lower bound)', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x(), bar_height * 1.01))

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
age_gender = admissions_df[[ADMISSION_AGE_COL, GENDER_COL]]
# One bar group per age, one bar per gender value.
age_gender_counts = age_gender.groupby([ADMISSION_AGE_COL, GENDER_COL]).size().unstack()
age_gender_counts.plot(kind='bar', ax=ax)
ax.set_title(f'Total Population, N={len(age_gender)}', fontsize=font_sz)
ax.set_xlabel('Age at Admission [years]', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
ax.legend(labels=['Female', 'Male'], title="Sex")
# Hide every second tick label to keep the x axis readable.
every_second_label = ax.get_xticklabels()[1::2]
plt.setp(every_second_label, visible=False)
plt.show()
#fig.savefig(os.path.join(OUTPUT_DIR, 'age_gender_admissions_total.png'), dpi=300)

# Calculating length of stay (LOS; exact and rounded up to whole days) and the night-admission indicator

In [None]:
NIGHT_ADMISSION_FLAG = 'night_admission'

# Exact length of stay as a timedelta.
admissions_df[LOS_EXACT_COL] = admissions_df[DISCHARGE_TIME_COL] - admissions_df[ADMISSION_TIME_COL]

# Night admission: admitted at 20:00 or later, or before 08:00.
admission_hour = admissions_df[ADMISSION_TIME_COL].dt.hour
is_night = (admission_hour >= 20) | (admission_hour < 8)
admissions_df[NIGHT_ADMISSION_FLAG] = is_night.values

# Length of stay rounded up to whole days (still a timedelta).
admissions_df[LOS_DAYS_COL] = admissions_df[LOS_EXACT_COL].dt.ceil('1d')

In [None]:
admissions_df[NIGHT_ADMISSION_FLAG].mean()

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
admission_type_counts = admissions_df[ADMISSION_TYPE_COL].value_counts()
admission_type_counts.plot.bar(ax=ax)
ax.set_xlabel('Admission Type', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x(), bar_height * 1.01))

In [None]:
max_clip_days = 30

# Admission types shown side by side, one panel each.
LOS_PANEL_ADMISSION_TYPES = ['URGENT', 'EW EMER.', 'DIRECT EMER.']


def plot_los_distribution(ax, los_days, title, max_days=max_clip_days):
    """Draw a bar chart of length-of-stay counts (whole days) on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    los_days : timedelta Series of stays rounded to whole days.
    title : panel title.
    max_days : stays are clipped to [1, max_days] days, so the last bar
        collects every stay of `max_days` days or longer.

    Days without any admission are drawn as zero-height bars. The previous
    code assigned a fixed 1..max_days index to the observed counts, which
    raises a length-mismatch ValueError as soon as one day has no admissions.
    """
    clipped = los_days.clip(pd.to_timedelta('1d'), pd.to_timedelta(f'{max_days}d'))
    day_counts = (
        clipped.dropna().dt.days
        .value_counts()
        .reindex(np.arange(1, max_days + 1), fill_value=0)
    )
    day_counts.plot.bar(ax=ax)
    ax.set_ylabel('Number of Patients', fontsize=font_sz)
    ax.set_xlabel('LOS (Days)', fontsize=font_sz)
    ax.grid(axis='y')
    ax.set_title(title, fontsize=font_sz)


fig, axes = plt.subplots(1, 3, figsize=(16, 4))

for ax, admission_type in zip(axes, LOS_PANEL_ADMISSION_TYPES):
    of_this_type = admissions_df[ADMISSION_TYPE_COL] == admission_type
    plot_los_distribution(ax, admissions_df.loc[of_this_type, LOS_DAYS_COL], admission_type)

fig.tight_layout()

# Taking only SPECIFIC_ADMISSION_TYPE admissions from now on

In [None]:
#SPECIFIC_ADMISSION_TYPE = ['URGENT']
SPECIFIC_ADMISSION_TYPE = ['DIRECT EMER.', 'EW EMER.']

In [None]:
print(len(admissions_df))
# Take an explicit copy: later cells add and overwrite columns on this subset,
# and writing to a filtered view triggers pandas' SettingWithCopyWarning.
admissions_df = admissions_df[admissions_df[ADMISSION_TYPE_COL].isin(SPECIFIC_ADMISSION_TYPE)].copy()
print(len(admissions_df))

In [None]:
# add direct emergency if needed
DIRECT_IND_COL = 'direct_emrgency_flag'

# 1 for 'DIRECT EMER.' admissions, 0 for every other admission type.
# NOTE(review): the column is only created when 'DIRECT EMER.' is one of the
# selected admission types, but the Table 1 and model-table cells select
# DIRECT_IND_COL unconditionally and would raise a KeyError otherwise --
# confirm whether the column should always be created.
if 'DIRECT EMER.' in SPECIFIC_ADMISSION_TYPE:
    admissions_df[DIRECT_IND_COL] = (admissions_df[ADMISSION_TYPE_COL] == 'DIRECT EMER.').astype(int)

# Counting SPECIFIC_ADMISSION_TYPE admissions to each patient 

In [None]:
# Number of distinct admissions per patient, named so that it can be merged
# back into the admissions table as a column.
number_of_admissions = (
    admissions_df
    .groupby(SUBJECT_ID_COL)[ADMISSION_ID_COL]
    .nunique()
    .rename(ADMISSION_COUNT_COL)
)
number_of_admissions

In [None]:
fig, ax = plt.subplots(1,1,dpi=100)
# Patients per number of admissions, on a log-scaled y axis.
number_of_admissions.value_counts().sort_index().plot.bar(ax=ax, logy=True)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
ax.set_xlabel('Number of Admissions', fontsize=font_sz)
# The first positional parameter of `grid` is the visibility flag, not the
# axis, so 'y' has to be passed by keyword to restrict the grid to the y axis.
ax.grid(axis='y', which='minor', alpha=0.4)
# Write each bar's count just above its top edge.
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x(), p.get_height() * 1.01))

In [None]:
admissions_df = admissions_df.merge(number_of_admissions, on=SUBJECT_ID_COL)
admissions_df.head()

# Add recurrent admissions group per patient according to last admission

In [None]:
ADMISSION_COUNT_GROUP_COL = ADMISSION_COUNT_COL + '_group'

# Bin edges placed between the integers so that counts of 1, 2 and
# 3-or-more land in separate groups.
ADMISSION_COUNT_BINS = [1, 1.5, 2.5, 5000]
ADMISSION_COUNT_LABELS = ['1', '2', '3up']

admission_count_groups = pd.cut(
    admissions_df[ADMISSION_COUNT_COL],
    bins=ADMISSION_COUNT_BINS,
    labels=ADMISSION_COUNT_LABELS,
    include_lowest=True,
)
admissions_df[ADMISSION_COUNT_GROUP_COL] = admission_count_groups
admissions_df.head(50)

# Add an indicator for whether the last admission was preceded by another admission within the past 30 days

In [None]:
PREV_ADMISSION_IND_COL = 'last_less_than_diff'
# Maximum gap between the previous discharge and an admission for the flag to be set.
indicator_diff = pd.to_timedelta('30d')

# Only patients with more than one admission can have a previous admission.
tmp_admissions = admissions_df[admissions_df[ADMISSION_COUNT_COL] > 1]
print(tmp_admissions.shape)
# Per patient, in chronological order: is the gap between this admission and
# the previous row's discharge at most `indicator_diff`? The first admission
# of each patient has no previous discharge (NaT), which compares as False.
ind_ser = tmp_admissions.sort_values(by=[SUBJECT_ID_COL, ADMISSION_TIME_COL]).groupby(
    SUBJECT_ID_COL).apply(
    lambda tmp_df: (tmp_df[ADMISSION_TIME_COL] - tmp_df[DISCHARGE_TIME_COL].shift(1)) <= indicator_diff)

# Drop the second index level (the original row label), keeping the patient id.
ind_ser.index = ind_ser.index.droplevel(1)
ind_ser.name = PREV_ADMISSION_IND_COL
# Keep the last entry per patient, i.e. the flag of the latest admission.
# reset_index() gives positional labels, so they can be used with iloc.
ind_ser = ind_ser.iloc[ind_ser.reset_index().drop_duplicates(subset=[SUBJECT_ID_COL], keep='last').index]
ind_ser

In [None]:
# Attach the per-patient indicator to every admission row of that patient.
admissions_df = admissions_df.merge(ind_ser.astype(int), left_on=SUBJECT_ID_COL, right_index=True, how='outer')
# Patients with a single admission have no entry in `ind_ser`; mark them 0.
# Assign the result back: `fillna(..., inplace=True)` on a column selection
# is chained assignment and is not guaranteed to update the frame.
admissions_df[PREV_ADMISSION_IND_COL] = admissions_df[PREV_ADMISSION_IND_COL].fillna(0)
admissions_df

In [None]:
# Example
admissions_df[admissions_df[PREV_ADMISSION_IND_COL] == 1].sort_values(by=[SUBJECT_ID_COL, ADMISSION_TIME_COL])

# Keep only last admission per patient

In [None]:
# Order admissions chronologically, then keep each patient's final row.
admissions_by_time = admissions_df.sort_values(by=[ADMISSION_TIME_COL])
only_last_admission = admissions_by_time.drop_duplicates(subset=[SUBJECT_ID_COL], keep='last')
len(only_last_admission)

# Take only patients with last admission after MINIMUM YEAR

In [None]:
# MINIMUM_YEAR = 2017
MINIMUM_YEAR = 2014
print(len(only_last_admission))
# Keep patients whose last admission year (lower bound) is MINIMUM_YEAR or later.
recent_enough = only_last_admission[ADMISSION_YEAR_COL] >= MINIMUM_YEAR
only_last_admission = only_last_admission[recent_enough]
print(len(only_last_admission))

In [None]:
only_last_admission[PREV_ADMISSION_IND_COL].sum()

In [None]:
# Unique patient and admission ids of the selected cohort.
pids = only_last_admission[SUBJECT_ID_COL].drop_duplicates()
adm_ids = only_last_admission[ADMISSION_ID_COL].drop_duplicates()
for unique_ids in (pids, adm_ids):
    print(len(unique_ids))

# Load relevant lab tests

In [None]:
LOAD_SPECIFIC_COLUMNS = [SUBJECT_ID_COL, ADMISSION_ID_COL, ITEM_ID_COL, 'storetime', 'flag']

In [None]:
chunksize = 10 ** 6

# The cohort rows do not change between chunks, so select them once instead
# of recomputing the same filter for every chunk.
cohort_adms = only_last_admission[only_last_admission[SUBJECT_ID_COL].isin(pids)
                                  & only_last_admission[ADMISSION_ID_COL].isin(adm_ids)]

# Collect the per-chunk results and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
lab_chunks = []
rows_loaded = 0
with pd.read_csv(lab_file, chunksize=chunksize, compression='gzip', parse_dates=[STORE_TIME_COL], usecols=LOAD_SPECIFIC_COLUMNS) as reader:
    for chunk in reader:
        # Keep only lab rows of cohort patients and admissions, then attach
        # the admission columns to each lab row.
        in_cohort = chunk[SUBJECT_ID_COL].isin(pids) & chunk[ADMISSION_ID_COL].isin(adm_ids)
        merged_chunk = chunk[in_cohort].merge(cohort_adms, on=[SUBJECT_ID_COL, ADMISSION_ID_COL])
        lab_chunks.append(merged_chunk)
        # Running total of rows kept so far (same progress output as before).
        rows_loaded += len(merged_chunk)
        print(rows_loaded)

# pd.concat raises on an empty list, so fall back to an empty frame.
full_df = pd.concat(lab_chunks) if lab_chunks else pd.DataFrame()
full_df.head()

# Continue only with the patients and admissions that have lab results (restrict patients_df and admissions_df to the ids in full_df)

In [None]:
# Patients and admissions for which lab results were loaded.
pids = full_df[SUBJECT_ID_COL].drop_duplicates().values
adms_ids = full_df[ADMISSION_ID_COL].drop_duplicates().values
print(len(patients_df))
patients_df = patients_df[patients_df[SUBJECT_ID_COL].isin(pids)]
print(len(patients_df))
print(len(admissions_df))
# Copy the subset: later cells overwrite columns of admissions_df, and writing
# to a filtered view triggers pandas' SettingWithCopyWarning.
admissions_df = admissions_df[admissions_df[ADMISSION_ID_COL].isin(adms_ids)].copy()
print(len(admissions_df))

In [None]:
len(full_df)

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
admission_location_counts = admissions_df[ADMISSION_LOCATION_COL].value_counts()
admission_location_counts.plot.bar(ax=ax)
ax.set_xlabel('Admission Location', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x(), bar_height * 1.01))

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
discharge_location_counts = admissions_df[DISCHARGE_LOCATION_COL].value_counts()
discharge_location_counts.plot.bar(ax=ax)
ax.set_xlabel('Discharge Location', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x(), bar_height * 1.01))

# Regrouping discharge location

In [None]:
# Collapses the raw discharge locations into the four groups that J_DICT later
# encodes as event types: HOME, FURTHER TREATMENT, DIED and CENSORED.
DISCHARGE_REGROUPING_DICT = {
    'HOME': 'HOME',
    'HOME HEALTH CARE': 'HOME',
    'SKILLED NURSING FACILITY': 'FURTHER TREATMENT',
    'DIED': 'DIED',
    'REHAB': 'HOME',
    'CHRONIC/LONG TERM ACUTE CARE': 'FURTHER TREATMENT',
    'HOSPICE': 'FURTHER TREATMENT',
    'AGAINST ADVICE': 'CENSORED',
    'ACUTE HOSPITAL': 'FURTHER TREATMENT',
    'PSYCH FACILITY': 'FURTHER TREATMENT',
    'OTHER FACILITY': 'FURTHER TREATMENT',
    'ASSISTED LIVING': 'HOME',
    'HEALTHCARE FACILITY': 'FURTHER TREATMENT',
}

In [None]:
# Map raw discharge locations to their groups in both tables.
# Assign the result back: `replace(..., inplace=True)` on a column selection
# is chained assignment and is not guaranteed to update the frame.
admissions_df[DISCHARGE_LOCATION_COL] = admissions_df[DISCHARGE_LOCATION_COL].replace(DISCHARGE_REGROUPING_DICT)
full_df[DISCHARGE_LOCATION_COL] = full_df[DISCHARGE_LOCATION_COL].replace(DISCHARGE_REGROUPING_DICT)

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
discharge_group_counts = admissions_df[DISCHARGE_LOCATION_COL].value_counts()
discharge_group_counts.plot.bar(ax=ax)
ax.set_xlabel('Discharge Location', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x(), bar_height * 1.01))

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
age_gender = admissions_df[[ADMISSION_AGE_COL, GENDER_COL]]
# One bar group per age, one bar per gender value.
age_gender_counts = age_gender.groupby([ADMISSION_AGE_COL, GENDER_COL]).size().unstack()
age_gender_counts.plot(kind='bar', ax=ax)
ax.set_title(f'Total Population, N={len(age_gender)}', fontsize=font_sz)
ax.set_xlabel('Age at Admission [years]', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
ax.legend(labels=['Female', 'Male'], title="Sex")
# Hide every second tick label to keep the x axis readable.
every_second_label = ax.get_xticklabels()[1::2]
plt.setp(every_second_label, visible=False)
figure_path = os.path.join(OUTPUT_DIR, 'age_gender_admissions_subset.png')
fig.savefig(figure_path, dpi=300)

# Regroup Race

In [None]:
# Collapses the raw race values into five groups: WHITE, BLACK, ASIAN,
# HISPANIC and OTHER. Unknown, declined and unobtainable answers go to OTHER.
# NOTE(review): some assignments are analysis choices rather than facts
# (e.g. 'PORTUGUESE' -> 'HISPANIC', 'WHITE - BRAZILIAN' -> 'WHITE') -- confirm.
RACE_REGROUPING_DICT = {
    'WHITE': 'WHITE',
    'UNKNOWN': 'OTHER',
    'BLACK/AFRICAN AMERICAN': 'BLACK',
    'OTHER': 'OTHER',
    'ASIAN': 'ASIAN',
    'WHITE - OTHER EUROPEAN': 'WHITE',
    'HISPANIC/LATINO - PUERTO RICAN': 'HISPANIC',
    'HISPANIC/LATINO - DOMINICAN': 'HISPANIC',
    'ASIAN - CHINESE': 'ASIAN',
    'BLACK/CARIBBEAN ISLAND': 'BLACK',
    'BLACK/AFRICAN': 'BLACK',
    'BLACK/CAPE VERDEAN': 'BLACK',
    'PATIENT DECLINED TO ANSWER': 'OTHER',
    'WHITE - BRAZILIAN': 'WHITE',
    'PORTUGUESE': 'HISPANIC', 
    'ASIAN - SOUTH EAST ASIAN': 'ASIAN',
    'WHITE - RUSSIAN': 'WHITE',
    'ASIAN - ASIAN INDIAN': 'ASIAN',
    'WHITE - EASTERN EUROPEAN': 'WHITE',
    'AMERICAN INDIAN/ALASKA NATIVE': 'OTHER',
    'HISPANIC/LATINO - GUATEMALAN': 'HISPANIC',
    'HISPANIC/LATINO - MEXICAN': 'HISPANIC',
    'HISPANIC/LATINO - SALVADORAN': 'HISPANIC',
    'SOUTH AMERICAN': 'HISPANIC',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'OTHER',
    'HISPANIC/LATINO - COLUMBIAN': 'HISPANIC',
    'HISPANIC/LATINO - CUBAN': 'HISPANIC',
    'ASIAN - KOREAN': 'ASIAN',
    'HISPANIC/LATINO - HONDURAN': 'HISPANIC',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'HISPANIC',
    'UNABLE TO OBTAIN': 'OTHER',
    'HISPANIC OR LATINO': 'HISPANIC'
}

In [None]:
# Map raw race values to their groups in both tables.
# Assign the result back: `replace(..., inplace=True)` on a column selection
# is chained assignment and is not guaranteed to update the frame.
admissions_df[RACE_COL] = admissions_df[RACE_COL].replace(RACE_REGROUPING_DICT)
full_df[RACE_COL] = full_df[RACE_COL].replace(RACE_REGROUPING_DICT)

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
race_counts = admissions_df[RACE_COL].value_counts()
race_counts.plot.bar(ax=ax)
ax.set_xlabel('Race', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x(), bar_height * 1.01))

In [None]:
fig, ax = plt.subplots(1, 1, dpi=100)
insurance_counts = admissions_df[INSURANCE_COL].value_counts()
insurance_counts.plot.bar(ax=ax)
ax.set_xlabel('Insurance', fontsize=font_sz)
ax.set_ylabel('Number of Patients', fontsize=font_sz)
# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(str(bar_height), (bar.get_x(), bar_height * 1.01))

# Taking only results stored within 24 hours of admission

In [None]:
full_df.head()

In [None]:
full_df[ADMISSION_TO_RESULT_COL] = (full_df[STORE_TIME_COL] - full_df[ADMISSION_TIME_COL])

In [None]:
# Number of lab results overall, then within one and two days of admission.
print(len(full_df[ADMISSION_TO_RESULT_COL]))
for window in ('1d', '2d'):
    within_window = full_df[ADMISSION_TO_RESULT_COL] <= pd.to_timedelta(window)
    print(len(full_df[within_window]))

In [None]:
# Number of admissions with any lab result, then with a result within one
# and two days of admission.
print(len(full_df[ADMISSION_ID_COL].drop_duplicates()))
for window in ('1d', '2d'):
    results_in_window = full_df[full_df[ADMISSION_TO_RESULT_COL] <= pd.to_timedelta(window)]
    print(len(results_in_window[ADMISSION_ID_COL].drop_duplicates()))

In [None]:
# Keep only results stored no later than one day after the admission time.
# NOTE(review): there is no lower bound, so results stored before the
# admission time (negative difference) are kept as well -- confirm intent.
full_df = full_df[full_df[ADMISSION_TO_RESULT_COL] <= pd.to_timedelta('1d')]
full_df.head()

In [None]:
print(len(full_df))
# Keep only the most recently stored result per (patient, admission, lab item).
# The previous code called drop_duplicates(inplace=True) on the temporary
# returned by sort_values, so full_df itself was never changed; assign the
# result back instead.
full_df = full_df.sort_values(by=[ADMISSION_TIME_COL, STORE_TIME_COL]).drop_duplicates(
    subset=[SUBJECT_ID_COL, ADMISSION_ID_COL, ITEM_ID_COL], keep='last')
print(len(full_df))

# Most common lab tests upon arrival

In [None]:
lab_meta_df = pd.read_csv(lab_meta_file, compression='gzip')
lab_meta_df

In [None]:
threshold = 25000

In [None]:
# Number of distinct admissions in which each lab item appears, most common first.
admissions_per_test = full_df.groupby(ITEM_ID_COL)[ADMISSION_ID_COL].nunique()
common_tests = admissions_per_test.sort_values(ascending=False)

# Keep the items seen in more than `threshold` admissions and attach their metadata.
frequent_tests = common_tests[common_tests > threshold]
included_in_threshold = frequent_tests.to_frame().merge(lab_meta_df, on=ITEM_ID_COL)
included_in_threshold

In [None]:
print(len(full_df))
full_df = full_df[full_df[ITEM_ID_COL].isin(included_in_threshold[ITEM_ID_COL].values)]
print(len(full_df))

In [None]:
minimal_item_id = included_in_threshold.iloc[-1][ITEM_ID_COL]
minimal_item_id

In [None]:
# Patients and admissions that have a result for the least common included test.
has_minimal_test = full_df[ITEM_ID_COL] == minimal_item_id
pids = full_df.loc[has_minimal_test, SUBJECT_ID_COL].drop_duplicates().values
adms_ids = full_df.loc[has_minimal_test, ADMISSION_ID_COL].drop_duplicates().values
print(len(patients_df))
patients_df = patients_df[patients_df[SUBJECT_ID_COL].isin(pids)]
print(len(patients_df))
print(len(admissions_df))
admissions_df = admissions_df[admissions_df[ADMISSION_ID_COL].isin(adms_ids)]
print(len(admissions_df))
# Report the lab-result row count before and after the filter. The previous
# code printed len(admissions_df) here, which this filter does not change.
print(len(full_df))
# Copy the subset: a later cell overwrites the 'flag' column of full_df.
full_df = full_df[full_df[SUBJECT_ID_COL].isin(pids) & full_df[ADMISSION_ID_COL].isin(adms_ids)].copy()
print(len(full_df))

In [None]:
full_df.head()

In [None]:
full_df.columns

In [None]:
full_df.shape

In [None]:
# A missing flag is treated as 'normal'; then encode normal/abnormal as 0/1.
# Assign the result back: `fillna`/`replace` with inplace=True on a column
# selection is chained assignment and is not guaranteed to update the frame.
flag_codes = {'normal': 0, 'abnormal': 1}
full_df['flag'] = full_df['flag'].fillna('normal').replace(flag_codes)

In [None]:
full_df['flag'].value_counts()

In [None]:
# Sort by admission and store time, then keep the last (most recently stored)
# row per (patient, admission, lab item).
labs_by_time = full_df.sort_values(by=[ADMISSION_TIME_COL, STORE_TIME_COL])
full_df = labs_by_time.drop_duplicates(subset=[SUBJECT_ID_COL, ADMISSION_ID_COL, ITEM_ID_COL], keep='last')
full_df

In [None]:
# Long-to-wide: one row per (patient, admission), one column per lab item,
# holding the summed flag values.
flags_long = full_df[[SUBJECT_ID_COL, ADMISSION_ID_COL, ITEM_ID_COL, 'flag']]
# Use the string alias 'sum': passing the np.sum callable as aggfunc is
# deprecated in recent pandas releases and emits a FutureWarning.
fitters_table = pd.pivot_table(flags_long, values=['flag'], index=[SUBJECT_ID_COL, ADMISSION_ID_COL],
                               columns=[ITEM_ID_COL], aggfunc='sum')
fitters_table

In [None]:
fitters_table = fitters_table.droplevel(1, axis=0).droplevel(0, axis=1)
fitters_table

In [None]:
MARITAL_STATUS_COL = 'marital_status'

In [None]:
full_df.columns

In [None]:
dummies_df = full_df.drop_duplicates(subset=[SUBJECT_ID_COL]).set_index(SUBJECT_ID_COL)
dummies_df

In [None]:
pd.get_dummies(dummies_df[INSURANCE_COL], prefix='Insurance', drop_first=True).head()

In [None]:
dummies_df[NIGHT_ADMISSION_FLAG].astype(int)

In [None]:
del full_df
del admissions_df
del patients_df

In [None]:
dummies_df[GENDER_COL].value_counts()

# Standardize age

In [None]:
from sklearn.preprocessing import StandardScaler
STANDARDIZED_AGE_COL = 'standardized_age'
scaler = StandardScaler()
# Fit the scaler on the age column and store the scaled values.
# NOTE(review): this standardizes AGE_COL, whereas Table 1 below reports
# ADMISSION_AGE_COL (the age-at-admission column computed earlier) --
# confirm which age the model is meant to use.
dummies_df[STANDARDIZED_AGE_COL] = scaler.fit_transform(dummies_df[[AGE_COL]])

In [None]:
J_DICT = {'HOME': 1, 'FURTHER TREATMENT': 2, 'DIED': 3, 'CENSORED': 0} 
GENDER_DICT = {'F': 1, 'M': 0}

In [None]:
dummies_df[GENDER_COL] = dummies_df[GENDER_COL].replace(GENDER_DICT)

# Table 1

In [None]:
from tableone import TableOne

In [None]:
# Strip spaces and commas from the lab labels so they can serve as column names.
included_in_threshold['label'] = included_in_threshold['label'].apply(
    lambda label: label.replace(' ', '').replace(',', ''))

# Lab item id -> cleaned label.
RENAME_ITEMS_DICT = included_in_threshold.set_index(ITEM_ID_COL)['label'].to_dict()
RENAME_ITEMS_DICT

In [None]:
# Integer-coded indicators plus age at admission.
table1_numeric = dummies_df[[NIGHT_ADMISSION_FLAG,
                             GENDER_COL,
                             DIRECT_IND_COL,
                             PREV_ADMISSION_IND_COL,
                             ADMISSION_AGE_COL]].astype(int)
# Categorical covariates, kept as labels (no dummy coding) for the summary table.
table1_labels = dummies_df[[INSURANCE_COL,
                            MARITAL_STATUS_COL,
                            RACE_COL,
                            ADMISSION_COUNT_GROUP_COL]]
# Length of stay in whole days and the integer-coded discharge group.
table1_los_days = dummies_df[LOS_DAYS_COL].dt.days
table1_event = dummies_df[DISCHARGE_LOCATION_COL].dropna().replace(J_DICT).astype(int)

# Column-wise concatenation, then rename lab item ids to their labels.
table1 = pd.concat(
    [fitters_table, table1_numeric, table1_labels, table1_los_days, table1_event],
    axis=1,
).rename(columns=RENAME_ITEMS_DICT)
table1

In [None]:
table1.columns

In [None]:
# Columns summarized in the first table; all but 'admission_age' and
# 'LOS days' are treated as categorical.
columns = ['gender', 'admission_age', 'race', 'insurance', 'marital_status',
           'direct_emrgency_flag', 'night_admission', 'last_less_than_diff', 
           'admissions_count_group', 'LOS days', 'discharge_location']
categorical = ['gender', 'race', 'insurance', 'marital_status',
           'direct_emrgency_flag', 'night_admission', 'last_less_than_diff', 
           'admissions_count_group', 'discharge_location']

groupby = [GENDER_COL]
# Pass the arguments by keyword so the call does not depend on the positional
# order of TableOne's parameters, which is not the same in every tableone release.
mytable = TableOne(table1.dropna(), columns=columns, categorical=categorical, groupby=groupby)
mytable

In [None]:
print(mytable.tableone.round(3).to_latex())

In [None]:
# Columns summarized in the second table: gender plus the lab-test flags.
columns = ['gender', 'AnionGap', 'Bicarbonate', 'CalciumTotal', 'Chloride', 'Creatinine',
           'Glucose', 'Magnesium', 'Phosphate', 'Potassium', 'Sodium',
           'UreaNitrogen', 'Hematocrit', 'Hemoglobin', 'MCH', 'MCHC', 'MCV',
           'PlateletCount', 'RDW', 'RedBloodCells', 'WhiteBloodCells']
# Every column of this table is categorical, so reuse the list instead of
# keeping a second hand-maintained copy in sync.
categorical = list(columns)

groupby = [GENDER_COL]
# Pass the arguments by keyword so the call does not depend on the positional
# order of TableOne's parameters, which is not the same in every tableone release.
mytable = TableOne(table1.dropna(), columns=columns, categorical=categorical, groupby=groupby)
mytable

In [None]:
print(mytable.tableone.round(3).to_latex())

In [None]:
# Dummy-code the categorical covariates, dropping the first level of each.
DUMMY_SOURCES = [
    (INSURANCE_COL, 'Insurance'),
    (MARITAL_STATUS_COL, 'Marital'),
    (RACE_COL, 'Ethnicity'),
    (ADMISSION_COUNT_GROUP_COL, 'AdmsCount'),
]
model_dummies = [
    pd.get_dummies(dummies_df[source_col], prefix=prefix, drop_first=True)
    for source_col, prefix in DUMMY_SOURCES
]

# Integer-coded binary covariates.
model_indicators = dummies_df[[NIGHT_ADMISSION_FLAG,
                               GENDER_COL,
                               DIRECT_IND_COL,
                               PREV_ADMISSION_IND_COL]].astype(int)

# Lab flags + dummies + indicators + standardized age + LOS in whole days +
# integer-coded discharge group, concatenated column-wise.
fitters_table = pd.concat(
    [
        fitters_table,
        *model_dummies,
        model_indicators,
        dummies_df[STANDARDIZED_AGE_COL],
        dummies_df[LOS_DAYS_COL].dt.days,
        dummies_df[DISCHARGE_LOCATION_COL].dropna().replace(J_DICT).astype(int),
    ],
    axis=1,
)

fitters_table

In [None]:
print(len(fitters_table))
fitters_table.dropna(inplace=True)
print(len(fitters_table))

In [None]:
fitters_table.reset_index(inplace=True)

In [None]:
fitters_table.rename({DISCHARGE_LOCATION_COL: 'J', LOS_DAYS_COL: 'X', SUBJECT_ID_COL: 'pid'}, inplace=True, axis=1)

In [None]:
fitters_table.rename(RENAME_ITEMS_DICT, inplace=True, axis=1)

In [None]:
ADMINISTRATIVE_CENSORING = 28

In [None]:
fitters_table.columns

In [None]:
# Keep only positive lengths of stay. Copy the subset so the assignments below
# write to an independent frame instead of a filtered view (avoids pandas'
# SettingWithCopyWarning).
fitters_table = fitters_table[fitters_table['X'] > 0].copy()
# Administrative censoring: stays longer than ADMINISTRATIVE_CENSORING days
# get event type 0 and their time set to ADMINISTRATIVE_CENSORING + 1.
beyond_horizon = fitters_table['X'] > ADMINISTRATIVE_CENSORING
fitters_table.loc[beyond_horizon, 'J'] = 0
fitters_table.loc[beyond_horizon, 'X'] = ADMINISTRATIVE_CENSORING + 1

In [None]:
fitters_table['J'] = fitters_table['J'].astype(int)

In [None]:
fitters_table.groupby(['X', 'J'])['pid'].count().sort_index().tail(50)

In [None]:
from pydts.examples_utils.plots import plot_example_pred_output
# NOTE(review): this re-imports `add_panel_text` and replaces the function of
# the same name imported from src.plots in the first cell -- confirm which one
# is intended.
from pydts.examples_utils.plots import add_panel_text
from pydts.fitters import TwoStagesFitter, DataExpansionFitter
from pydts.examples_utils.plots import plot_events_occurrence

from time import time

# Shorthand for building MultiIndex slices inside `.loc[...]`.
slicer = pd.IndexSlice

# Plot the model table with the pydts helper imported above.
plot_events_occurrence(fitters_table)

In [None]:
COEF_COL = '   coef   '
STDERR_COL = ' std err '
import pickle

In [None]:
# Prefix of the cached result files. It already ends with an underscore and
# the file names below add another one ('mimic_final__two_step_alpha.csv').
case = f'mimic_final_'
# Wall-clock fitting times in seconds, one entry per fit.
two_step_timing = []
lee_timing = []

# Two step fitter
new_fitter = TwoStagesFitter()
print(f'Starting two-step')
two_step_start = time()
new_fitter.fit(df=fitters_table, nb_workers=1)
two_step_end = time()
print(f'Finished two-step: {two_step_end-two_step_start}sec')

two_step_timing.append(two_step_end-two_step_start)

# Lee et al fitter
print(f'Starting Lee et al.')
lee_fitter = DataExpansionFitter()
lee_start = time()
lee_fitter.fit(df=fitters_table)
lee_end = time()
print(f'Finished lee: {lee_end-lee_start}sec')

lee_timing.append(lee_end-lee_start) 

# Select the coefficient and standard-error columns of every event type.
coef_and_se = slicer[:, [COEF_COL, STDERR_COL]]
lee_alpha_ser = lee_fitter.get_alpha_df().loc[:, coef_and_se].unstack().sort_index()
lee_beta_ser = lee_fitter.get_beta_SE().loc[:, coef_and_se].unstack().sort_index()

two_step_alpha_k_results = new_fitter.alpha_df[['J', 'X', 'alpha_jt']]
two_step_beta_k_results = new_fitter.get_beta_SE().unstack().to_frame()

lee_alpha_k_results = lee_alpha_ser.to_frame()
lee_beta_k_results = lee_beta_ser.to_frame()

# Cache results
for result_frame, result_file in [
    (two_step_alpha_k_results, f'{case}_two_step_alpha.csv'),
    (two_step_beta_k_results, f'{case}_two_step_beta.csv'),
    (lee_alpha_k_results, f'{case}_lee_alpha.csv'),
    (lee_beta_k_results, f'{case}_lee_beta.csv'),
]:
    result_frame.to_csv(os.path.join(OUTPUT_DIR, result_file))

In [None]:
covariates = [c for c in fitters_table.columns if c not in ['pid', 'J', 'X']]
covariates

In [None]:
# Reload the cached estimates written by the fitting cell.
two_step_alpha_path = os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha.csv')
two_step_beta_path = os.path.join(OUTPUT_DIR, f'{case}_two_step_beta.csv')
lee_alpha_path = os.path.join(OUTPUT_DIR, f'{case}_lee_alpha.csv')
lee_beta_path = os.path.join(OUTPUT_DIR, f'{case}_lee_beta.csv')

two_step_alpha_k_results = pd.read_csv(two_step_alpha_path, index_col=['J', 'X'])
two_step_beta_k_results = pd.read_csv(two_step_beta_path, index_col=[0, 1])
lee_alpha_k_results = pd.read_csv(lee_alpha_path, index_col=[0, 1, 2])
lee_beta_k_results = pd.read_csv(lee_beta_path, index_col=[0, 1, 2])


# Two-step summaries, one per event type (1, 2, 3): average across the value
# columns, move the first index level into columns, round, and pick a pair of
# columns by position. The row labels get the event number as a suffix.
# NOTE(review): the positions [1,0], [3,2], [5,4] presumably select the
# (estimate, SE) pair of each event in swapped order -- verify against the
# column order of the cached CSV.
twostep_beta1_summary = two_step_beta_k_results.mean(axis=1).unstack([0]).round(3).iloc[:, [1,0]]
twostep_beta1_summary.index = [f'{iii.replace(" ", "")}_1' for iii in twostep_beta1_summary.index]
twostep_beta2_summary = two_step_beta_k_results.mean(axis=1).unstack([0]).round(3).iloc[:, [3,2]]
twostep_beta2_summary.index = [f'{iii.replace(" ", "")}_2' for iii in twostep_beta2_summary.index]
twostep_beta3_summary = two_step_beta_k_results.mean(axis=1).unstack([0]).round(3).iloc[:, [5,4]]
twostep_beta3_summary.index = [f'{iii.replace(" ", "")}_3' for iii in twostep_beta3_summary.index]

# Lee et al. summaries: same idea, but the event type is selected by label
# (first index level) instead of by column position.
lee_beta1_summary = lee_beta_k_results.mean(axis=1).loc[slicer[1,:,:]].unstack([0]).round(3)
lee_beta1_summary.index = [f'{iii.replace(" ", "")}_1' for iii in lee_beta1_summary.index]
lee_beta2_summary = lee_beta_k_results.mean(axis=1).loc[slicer[2,:,:]].unstack([0]).round(3)
lee_beta2_summary.index = [f'{iii.replace(" ", "")}_2' for iii in lee_beta2_summary.index]
lee_beta3_summary = lee_beta_k_results.mean(axis=1).loc[slicer[3,:,:]].unstack([0]).round(3)
lee_beta3_summary.index = [f'{iii.replace(" ", "")}_3' for iii in lee_beta3_summary.index]
    
# Give every Lee et al. summary the same two-level header: (model, statistic).
lee_beta1_summary.columns = pd.MultiIndex.from_tuples([('Lee et al.', 'Estimate'), ('Lee et al.', 'Estimated SE')])
lee_beta2_summary.columns = pd.MultiIndex.from_tuples([('Lee et al.', 'Estimate'), ('Lee et al.', 'Estimated SE')])
lee_beta3_summary.columns = pd.MultiIndex.from_tuples([('Lee et al.', 'Estimate'), ('Lee et al.', 'Estimated SE')])

# Stack the three event types on top of each other.
beta_summary_comparison = pd.concat([lee_beta1_summary, lee_beta2_summary, lee_beta3_summary], axis=0)
#beta_summary_comparison.index = [r'$\beta_{11}$', r'$\beta_{12}$', r'$\beta_{13}$', r'$\beta_{14}$', r'$\beta_{15}$',
#                                 r'$\beta_{21}$', r'$\beta_{22}$', r'$\beta_{23}$', r'$\beta_{24}$', r'$\beta_{25}$',
#                                 r'$\beta_{31}$', r'$\beta_{32}$', r'$\beta_{33}$', r'$\beta_{34}$', r'$\beta_{35}$']
# Same two-level header for the two-step summaries.
twostep_beta1_summary.columns = pd.MultiIndex.from_tuples([('two-step', 'Estimate'), ('two-step', 'Estimated SE')])
twostep_beta2_summary.columns = pd.MultiIndex.from_tuples([('two-step', 'Estimate'), ('two-step', 'Estimated SE')])
twostep_beta3_summary.columns = pd.MultiIndex.from_tuples([('two-step', 'Estimate'), ('two-step', 'Estimated SE')])

# Stack the three two-step summaries the same way.
tmp = pd.concat([twostep_beta1_summary.round(3), twostep_beta2_summary.round(3), twostep_beta3_summary.round(3)], axis=0)
#tmp.index = [r'$\beta_{11}$', r'$\beta_{12}$', r'$\beta_{13}$', r'$\beta_{14}$', r'$\beta_{15}$',
#             r'$\beta_{21}$', r'$\beta_{22}$', r'$\beta_{23}$', r'$\beta_{24}$', r'$\beta_{25}$',
#             r'$\beta_{31}$', r'$\beta_{32}$', r'$\beta_{33}$', r'$\beta_{34}$', r'$\beta_{35}$']


beta_summary_comparison = pd.concat([beta_summary_comparison, tmp], axis=1)
beta_summary_comparison.index.name =  r'$\beta_{jk}$'
beta_summary_comparison.index = [c.replace("_", " ") for c in beta_summary_comparison.index]
beta_summary_comparison

In [None]:
# Print the comparison table as LaTeX source. escape=False asks pandas not to
# escape special LaTeX characters, so math markup in labels is emitted verbatim.
print(beta_summary_comparison.to_latex(escape=False))

In [None]:
# Figure: alpha_jt estimates of both fitters per time point (left axis) drawn over
# bars of the per-time-point row counts for J = 1, 2, 3 (right axis).
filename = 'mimic_summary_.png'

first_model_name = 'Lee et al.'
second_model_name = 'two-step'
times = range(1, ADMINISTRATIVE_CENSORING + 1)

lee_colors = ['tab:blue', 'tab:green', 'tab:red']
two_step_colors = ['navy', 'darkgreen', 'tab:brown']
true_colors = ['tab:blue', 'tab:green', 'tab:red']

fig, ax = plt.subplots(1, 1, figsize=(10, 8))

# Number of 'pid' entries per (J, X) pair, one column per J; absent pairs become 0.
counts = fitters_table.groupby(['J', 'X'])['pid'].count().unstack('J').fillna(0)

# Reload the stored alpha estimates of both fitters for the current `case`.
two_step_alpha_path = os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha.csv')
two_step_alpha_k_results = pd.read_csv(two_step_alpha_path, index_col=['J', 'X'])

lee_alpha_path = os.path.join(OUTPUT_DIR, f'{case}_lee_alpha.csv')
lee_alpha_k_results = pd.read_csv(lee_alpha_path, index_col=[0,1,2])

for tick_kind in ('major', 'minor'):
    ax.tick_params(axis='both', which=tick_kind, labelsize=15)

for j, lee_color, two_step_color in zip([1, 2, 3], lee_colors, two_step_colors):

    # Lee et al.: row-wise mean of the coefficient rows for event type j. The
    # integer x-position is parsed out of each row label (text between ')[' and ']').
    lee_mean = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
    lee_times = [int(label.split(')[')[1].split(']')[0]) for label in lee_mean.index]
    lee_alpha = pd.Series(lee_mean.values.squeeze().astype(float), index=lee_times)

    ax.scatter(lee_alpha.index, lee_alpha.values,
               label=f'J={j} ({first_model_name})', color=lee_color, marker='o', alpha=0.4, s=40)

    # Two-step: the 'alpha_jt' column for event type j.
    two_step_alpha = two_step_alpha_k_results.loc[slicer[j, 'alpha_jt']]
    ax.scatter(two_step_alpha.index, two_step_alpha.values,
               label=f'J={j} ({second_model_name})', color=two_step_color, marker='*', alpha=0.7, s=20)

# Axis labels and legend are the same for every j, so they are set once here.
ax.set_xlabel(r'Time', fontsize=18)
ax.set_ylabel(r'$\alpha_{jt}$', fontsize=18)
ax.legend(loc='upper right', fontsize=12)
ax.set_ylim([-13, 3])

# Secondary axis: one bar series per event type. J=1 uses the default bar
# alignment; J=2 and J=3 are edge-aligned with widths +0.4 and -0.4.
ax2 = ax.twinx()
bar_specs = [
    (1, dict(label='J=1', color='navy', alpha=0.4, width=0.4)),
    (2, dict(label='J=2', color='darkgreen', alpha=0.4, align='edge', width=0.4)),
    (3, dict(label='J=3', color='tab:red', alpha=0.6, align='edge', width=-0.4)),
]
for event_type, bar_kwargs in bar_specs:
    ax2.bar(counts.index, counts[event_type].values.squeeze(), **bar_kwargs)

ax2.legend(loc='upper center', fontsize=12)
ax2.set_ylabel('Number of observed events', fontsize=16, color='red')
ax2.tick_params(axis='y', colors='red')
ax2.set_ylim([0, 8000])
for tick_kind in ('major', 'minor'):
    ax2.tick_params(axis='both', which=tick_kind, labelsize=15)

fig.tight_layout()

# Only write the figure to disk when a file name is configured.
if filename is not None:
    fig.savefig(os.path.join(OUTPUT_DIR, filename), dpi=300)

In [None]:
# Fit the two-step model for the 'mimic_regularized_final_' case, time the fit,
# and store the resulting alpha / beta tables as CSV files in OUTPUT_DIR.
case = 'mimic_regularized_final_'
two_step_timing = []
lee_timing = []

# NOTE(review): the penalizer is 0 although the case name says "regularized";
# presumably this means no penalty is applied - confirm this is intended.
penalizer = 0

# Keyword arguments forwarded to the fitter through `fit_beta_kwargs`.
fit_beta_kwargs = dict(
    model_kwargs=dict(
        penalizer=penalizer,
        l1_ratio=1,
    )
)

# Two step fitter (single worker), wall-clock timed around the fit call only.
regularized_fitter = TwoStagesFitter()
print('Starting two-step')
two_step_start = time()
regularized_fitter.fit(df=fitters_table, fit_beta_kwargs=fit_beta_kwargs, nb_workers=1)
two_step_end = time()
two_step_elapsed = two_step_end - two_step_start
print(f'Finished two-step: {two_step_elapsed}sec')

two_step_timing.append(two_step_elapsed)

# Keep the J / X / alpha_jt columns of the fitted alpha table, and the beta/SE
# table unstacked and converted to a single-column frame.
two_step_alpha_k_results = regularized_fitter.alpha_df[['J', 'X', 'alpha_jt']]
two_step_beta_k_results = regularized_fitter.get_beta_SE().unstack().to_frame()

two_step_alpha_path = os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha.csv')
two_step_beta_path = os.path.join(OUTPUT_DIR, f'{case}_two_step_beta.csv')
two_step_alpha_k_results.to_csv(two_step_alpha_path)
two_step_beta_k_results.to_csv(two_step_beta_path)

In [None]:
# Reload the two-step results stored for the current `case` and append their
# beta summary (estimate + estimated SE per covariate and event type) to the
# existing comparison table.
two_step_alpha_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha.csv'), 
                                       index_col=['J', 'X'])
two_step_beta_k_results = pd.read_csv(os.path.join(OUTPUT_DIR, f'{case}_two_step_beta.csv'),
                                      index_col=[0, 1])

# Row-wise mean, first index level pivoted into columns, rounded to 3 decimals.
# Computed once and sliced per event type below.
twostep_beta_wide = two_step_beta_k_results.mean(axis=1).unstack([0]).round(3)
twostep_columns = pd.MultiIndex.from_tuples([('two-step', 'Estimate'), ('two-step', 'Estimated SE')])

# For each event type the two selected column positions are labelled, in order,
# 'Estimate' and 'Estimated SE'; row labels become '<covariate>_<j>'.
twostep_beta1_summary = twostep_beta_wide.iloc[:, [1,0]]
twostep_beta1_summary.index = [f'{iii.replace(" ", "")}_1' for iii in twostep_beta1_summary.index]
twostep_beta1_summary.columns = twostep_columns

twostep_beta2_summary = twostep_beta_wide.iloc[:, [3,2]]
twostep_beta2_summary.index = [f'{iii.replace(" ", "")}_2' for iii in twostep_beta2_summary.index]
twostep_beta2_summary.columns = twostep_columns

twostep_beta3_summary = twostep_beta_wide.iloc[:, [5,4]]
twostep_beta3_summary.index = [f'{iii.replace(" ", "")}_3' for iii in twostep_beta3_summary.index]
twostep_beta3_summary.columns = twostep_columns

tmp = pd.concat([twostep_beta1_summary.round(3), twostep_beta2_summary.round(3), twostep_beta3_summary.round(3)], 
                axis=0)


def _spaced_label(label):
    """Return a row label with underscores replaced by spaces."""
    return label.replace("_", " ")


# pd.concat(axis=1) aligns on row labels. The existing comparison table has
# already had '_' replaced by ' ' in its row labels, whereas `tmp` still uses
# underscores; concatenating them as-is yields a union of mismatched labels
# (doubled rows filled with NaN). Bring both sides to the same label format first.
# NOTE(review): the existing table presumably already contains a 'two-step'
# column pair, so the header labels repeat, and re-running this cell appends the
# columns again - confirm whether a distinct label is wanted.
beta_summary_comparison = pd.concat(
    [beta_summary_comparison.rename(index=_spaced_label), tmp.rename(index=_spaced_label)],
    axis=1,
)
# Name the index only after the labels are final; assigning a new index after
# setting the name (as was done previously) discards the name.
beta_summary_comparison.index.name =  r'$\beta_{jk}$'
beta_summary_comparison

In [None]:
# Print the updated comparison table as LaTeX source. escape=False asks pandas not
# to escape special LaTeX characters, so math markup in labels is emitted verbatim.
print(beta_summary_comparison.to_latex(escape=False))

In [None]:
# Figure for the regularized case: alpha_jt estimates of both fitters per time
# point (left axis) drawn over bars of the per-time-point row counts for
# J = 1, 2, 3 (right axis).
filename = 'mimic_regularized_summary.png'

first_model_name = 'Lee et al.'
second_model_name = 'two-step'
times = range(1, ADMINISTRATIVE_CENSORING + 1)

lee_colors = ['tab:blue', 'tab:green', 'tab:red']
two_step_colors = ['navy', 'darkgreen', 'tab:brown']
true_colors = ['tab:blue', 'tab:green', 'tab:red']

fig, ax = plt.subplots(1, 1, figsize=(10, 8))

# Number of 'pid' entries per (J, X) pair, one column per J; absent pairs become 0.
counts = fitters_table.groupby(['J', 'X'])['pid'].count().unstack('J').fillna(0)

# Reload the stored alpha estimates of both fitters for the current `case`.
two_step_alpha_path = os.path.join(OUTPUT_DIR, f'{case}_two_step_alpha.csv')
two_step_alpha_k_results = pd.read_csv(two_step_alpha_path, index_col=['J', 'X'])

# NOTE(review): the nearby fitting cell only writes the two-step CSV files for
# this case; confirm that the Lee et al. alpha file read here actually exists.
lee_alpha_path = os.path.join(OUTPUT_DIR, f'{case}_lee_alpha.csv')
lee_alpha_k_results = pd.read_csv(lee_alpha_path, index_col=[0,1,2])

for tick_kind in ('major', 'minor'):
    ax.tick_params(axis='both', which=tick_kind, labelsize=15)

for j, lee_color, two_step_color in zip([1, 2, 3], lee_colors, two_step_colors):

    # Lee et al.: row-wise mean of the coefficient rows for event type j. The
    # integer x-position is parsed out of each row label (text between ')[' and ']').
    lee_mean = lee_alpha_k_results.loc[slicer[j, COEF_COL, :]].mean(axis=1)
    lee_times = [int(label.split(')[')[1].split(']')[0]) for label in lee_mean.index]
    lee_alpha = pd.Series(lee_mean.values.squeeze().astype(float), index=lee_times)

    ax.scatter(lee_alpha.index, lee_alpha.values,
               label=f'J={j} ({first_model_name})', color=lee_color, marker='o', alpha=0.4, s=40)

    # Two-step: the 'alpha_jt' column for event type j.
    two_step_alpha = two_step_alpha_k_results.loc[slicer[j, 'alpha_jt']]
    ax.scatter(two_step_alpha.index, two_step_alpha.values,
               label=f'J={j} ({second_model_name})', color=two_step_color, marker='*', alpha=0.7, s=20)

# Axis labels and legend are the same for every j, so they are set once here.
ax.set_xlabel(r'Time', fontsize=18)
ax.set_ylabel(r'$\alpha_{jt}$', fontsize=18)
ax.legend(loc='upper right', fontsize=12)
ax.set_ylim([-13, 3])

# Secondary axis: one bar series per event type. J=1 uses the default bar
# alignment; J=2 and J=3 are edge-aligned with widths +0.4 and -0.4.
ax2 = ax.twinx()
bar_specs = [
    (1, dict(label='J=1', color='navy', alpha=0.4, width=0.4)),
    (2, dict(label='J=2', color='darkgreen', alpha=0.4, align='edge', width=0.4)),
    (3, dict(label='J=3', color='tab:red', alpha=0.6, align='edge', width=-0.4)),
]
for event_type, bar_kwargs in bar_specs:
    ax2.bar(counts.index, counts[event_type].values.squeeze(), **bar_kwargs)

ax2.legend(loc='upper center', fontsize=12)
ax2.set_ylabel('Number of observed events', fontsize=16, color='red')
ax2.tick_params(axis='y', colors='red')
ax2.set_ylim([0, 8000])
for tick_kind in ('major', 'minor'):
    ax2.tick_params(axis='both', which=tick_kind, labelsize=15)

fig.tight_layout()

# Only write the figure to disk when a file name is configured.
if filename is not None:
    fig.savefig(os.path.join(OUTPUT_DIR, filename), dpi=300)