In [1]:
import os
import gc

import numpy as np
import pandas as pd

from utils_prediction.dataloader.mimic4 import dataloader

In [2]:
# Root directory of the feature files; it is handed to the dataloader as
# `features_fpath` in the config cell. Set the MIMIC4DS_BASELINE_DATA
# environment variable to run against another location; when it is unset the
# original cluster path is used, so default behaviour is unchanged.
base_fpath = os.environ.get(
    'MIMIC4DS_BASELINE_DATA',
    '/hpf/projects/lsung/projects/mimic4ds/Experiments/baseline/data',
)

In [3]:
# Substrings that identify the time window in a feature column name; the
# counting cell tests each one against the column names with `in`.
# NOTE(review): the bounds are presumably days relative to an index time - confirm.
# NOTE(review): the last entry has no leading underscore, unlike the others -
# confirm that is intentional before changing it (it is also the label written
# to the CSVs).
timebins = [
    '_0_0.167',
    '_-7_0',
    '_-30_-7',
    '_-180_-30',
    '-365000_-180',
]

# [source, kind] token pairs. A column is attributed to a tag when both tokens
# occur in its name; the pair joined with '_' is the tag label.
feature_tags = [
    [source, 'count']
    for source in ('diag', 'hcpcs', 'lab', 'presc', 'proc')
] + [
    [source, 'measurement']
    for source in ('icucharts', 'lab')
]

# Keyword arguments for the dataloader; `analysis_id` is added per task by the
# counting cell.
config = dict(
    features_fpath=base_fpath,
    features_ftype='parquet',
    verbose=False,
    label_col='label',
    id_col='subject_id',
    group='2008 - 2010',
)

In [None]:
## grab all feature labels
# For every task, load its feature matrix, count the columns that belong to
# each (timebin, tag) combination and write the counts to
# artifacts/<task>_features.csv (columns: task, tag, timebins, count).
#
# NOTE(review): attribution is by substring match, so a column is counted under
# every tag whose two tokens both appear anywhere in its name - confirm the
# column naming scheme rules out collisions.

# NOTE(review): `all_cols` is never populated in this cell; it is kept only so
# that any other cell relying on the name still finds it.
all_cols = {}

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('artifacts', exist_ok=True)

for task in ['mortality','longlos','sepsis','invasivevent']:
    config['analysis_id'] = task

    data = dataloader(**config).load_features()
    cols = data.features.columns

    # Collect one row per (timebin, tag) and build the frame once, instead of
    # enlarging a DataFrame row by row through .loc.
    rows = []
    for timebin in timebins:
        # The time-window filter does not depend on the tag, so apply it once.
        bin_cols = [x for x in cols if timebin in x]
        for tag in feature_tags:
            rows.append([
                task,
                '_'.join(tag),
                timebin,
                sum(1 for x in bin_cols if tag[0] in x and tag[1] in x),
            ])

    df = pd.DataFrame(rows, columns=['task','tag','timebins','count'])
    df.to_csv(f'artifacts/{task}_features.csv',index=False)

In [3]:
## join dfs
# Read the per-task count files back from artifacts/ and reshape them into one
# table: rows indexed by (task, timebins), one column per tag.
per_task_counts = [
    pd.read_csv(f'artifacts/{x}_features.csv')
    for x in ['mortality','longlos','sepsis','invasivevent']
]

# Presentation order for the task level of the row index and for the columns.
task_order = ['mortality','longlos','invasivevent','sepsis']
tag_order = [
    'diag_count',
    'proc_count',
    'hcpcs_count',
    'presc_count',
    'lab_count',
    'lab_measurement',
    'icucharts_measurement',
]

df = (
    pd.concat(per_task_counts, axis=0)
    .pivot(index=['task','timebins'], columns='tag', values='count')
    .reindex(labels=task_order, level=0)
)
df = df[tag_order]

In [8]:
# total number of features
# First add up the counts over all timebins within each task (per tag), then
# add the tags together to get one total per task.
per_task_tag_totals = df.groupby('task').sum()
per_task_tag_totals.sum(axis=1)

task
mortality       48423
longlos         48423
invasivevent    44907
sepsis          40691
dtype: int64

In [9]:
# Show the pivoted count table: rows are (task, timebins), columns are tags.
df

Unnamed: 0_level_0,tag,diag_count,proc_count,hcpcs_count,presc_count,lab_count,lab_measurement,icucharts_measurement
task,timebins,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
mortality,-365000_-180,2189,545,42,2037,556,1071,0
mortality,_-180_-30,5450,1727,198,3424,729,1395,0
mortality,_-30_-7,4534,1402,121,3390,714,1326,0
mortality,_-7_0,3262,900,68,3657,743,1389,0
mortality,_0_0.167,0,0,0,3154,653,1257,2490
longlos,-365000_-180,2189,545,42,2037,556,1071,0
longlos,_-180_-30,5450,1727,198,3424,729,1395,0
longlos,_-30_-7,4534,1402,121,3390,714,1326,0
longlos,_-7_0,3262,900,68,3657,743,1389,0
longlos,_0_0.167,0,0,0,3154,653,1257,2490
