## Purpose of this notebook is to engineer lab test features from the LABEVENTS tabele
LBAEVENTS table is 1.85GB. To avoid slow processing I will load the table in chunks.

In [5]:
import pandas as pd
# let's get the header
lab_columns = pd.read_csv('../../data/raw/LABEVENTS.csv', nrows=0)
lab_columns.columns

Index(['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'ITEMID', 'CHARTTIME', 'VALUE',
       'VALUENUM', 'VALUEUOM', 'FLAG'],
      dtype='object')

## Useful rows
* HADM_ID id to merge with admissions information
* ITEMID id to merge with D_LABIMES to find the name of the lab test
* VALUENUM - numerical value of the test result

Note, some rows are missing a HADMID (outpatient services), we will filter those out. Since the lab test may have been performed multiple times during the same admission, we will calculate the summary statistic mean, variance and keep track of the counts. In the end, only the lab tests performed for the majotrity of admissions will be useful for readmission prediction. Since we don't know those from the start we will keep track of all the tests for all admissions. 

In [7]:
cols_to_drop = ['ROW_ID', 'SUBJECT_ID', 'CHARTTIME', 'VALUE', 'VALUEUOM', 'FLAG']
from datetime import datetime
start_time = datetime.now()
chunksize = 10**6
lab_summary = pd.DataFrame()
temp_lab_summary = pd.DataFrame()
for chunk in pd.read_csv('../../data/raw/LABEVENTS.csv', chunksize=chunksize):
    # drop the unused columns and empty rows
    chunk.drop(columns = cols_to_drop, inplace=True)
    chunk = chunk[~chunk.isna()]
    temp_lab_summary['Mean'] = chunk.groupby(['HADM_ID','ITEMID'])['VALUENUM'].mean()
    temp_lab_summary['Var'] = chunk.groupby(['HADM_ID','ITEMID'])['VALUENUM'].var()
    temp_lab_summary['Count'] = chunk.groupby(['HADM_ID','ITEMID'])['VALUENUM'].count()
    # variances will be combined in a later step, cannot have Nan values
    temp_lab_summary = temp_lab_summary[~temp_lab_summary.Var.isna()]
    lab_summary = pd.concat([lab_summary,temp_lab_summary], ignore_index=False)
    temp_lab_summary = pd.DataFrame()
    print(len(lab_summary))
    print("Prossing chunk took ", datetime.now()-start_time)

61667
Prossing chunk took  0:00:01.341801
124425
Prossing chunk took  0:00:02.605407
189995
Prossing chunk took  0:00:03.893482
253624
Prossing chunk took  0:00:05.134867
316952
Prossing chunk took  0:00:06.384037
384834
Prossing chunk took  0:00:07.946215
451068
Prossing chunk took  0:00:09.251154
516716
Prossing chunk took  0:00:10.545553
581235
Prossing chunk took  0:00:11.832888
646055
Prossing chunk took  0:00:13.136443
710759
Prossing chunk took  0:00:14.436015
777453
Prossing chunk took  0:00:15.742250
844316
Prossing chunk took  0:00:17.056281
909915
Prossing chunk took  0:00:18.705729
977859
Prossing chunk took  0:00:20.031913
1043378
Prossing chunk took  0:00:21.372867
1111148
Prossing chunk took  0:00:22.711146
1183051
Prossing chunk took  0:00:24.180860
1257417
Prossing chunk took  0:00:25.718282
1333113
Prossing chunk took  0:00:27.264443
1425280
Prossing chunk took  0:00:28.805513
1516493
Prossing chunk took  0:00:30.279463
1607635
Prossing chunk took  0:00:31.750113
1701

In [8]:
lab_summary.columns

Index(['Mean', 'Var', 'Count'], dtype='object')

In [9]:
# we have 2 million rows of statistical summaries of lab tests
# write a fucntion to combine the summary statistics for the same admission and the same test
def combined_stats(grouped_arrays):
    "calculate mean of means, combined variance, combined count"
    grouped_arrays = grouped_arrays.values
    if grouped_arrays.shape[0]==1:
        return pd.DataFrame({'Mean': [grouped_arrays[0,0]], 'Var': [grouped_arrays[0,1]], 'Count':[grouped_arrays[0,2]]})
    else:
        mean_comb = (grouped_arrays[:,0]*grouped_arrays[:,2]).sum()/grouped_arrays[:,2].sum()
        var_comb = (grouped_arrays[:,2]*(grouped_arrays[:,1]+(grouped_arrays[:,0]-mean_comb)**2)).sum()/grouped_arrays[:,2].sum()
        count_comb = grouped_arrays[:,2].sum()
        return pd.DataFrame({'Mean': [mean_comb], 'Var': [var_comb], 'Count':[count_comb]})

In [11]:
start_time = datetime.now()
starting_size = len(lab_summary)
lab_summary = lab_summary.groupby(['HADM_ID','ITEMID']).apply(combined_stats)
print("The number of rows was reduced from ", starting_size, "to", len(lab_summary))
print("Merging duplicate summaries took ",datetime.now()-start_time)

The number of rows was reduced from  2061634 to 1963876
Merging duplicate summaries took  0:52:51.633342


In [12]:
lab_summary.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean,Var,Count
HADM_ID,ITEMID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100001.0,50868,0,16.642857,35.17033,14.0
100001.0,50882,0,19.714286,17.912088,14.0
100001.0,50893,0,8.554545,0.350727,11.0
100001.0,50902,0,107.214286,16.027473,14.0
100001.0,50912,0,2.214286,0.047473,14.0


In [13]:
lab_summary.reset_index(inplace=True)
lab_summary.columns

Index(['HADM_ID', 'ITEMID', 'level_2', 'Mean', 'Var', 'Count'], dtype='object')

In [14]:
# load the D_LABITEM table to get the test names
lab_item_df = pd.read_csv('../../data/raw/D_LABITEMS.csv')
lab_item_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 753 entries, 0 to 752
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ROW_ID      753 non-null    int64 
 1   ITEMID      753 non-null    int64 
 2   LABEL       753 non-null    object
 3   FLUID       753 non-null    object
 4   CATEGORY    753 non-null    object
 5   LOINC_CODE  585 non-null    object
dtypes: int64(2), object(4)
memory usage: 35.4+ KB


In [16]:
# merge the two dataframes on ITEMID
lab_summary = lab_summary.merge(lab_item_df[['ITEMID', 'LABEL']], left_on='ITEMID', right_on='ITEMID', how='left')
# make sure there are no unmatched lab tests
lab_summary.LABEL.isna().sum()

0

In [23]:
#lab_summary.drop(columns=['level_2','ITEMID'])
# construct the cross tabulated table
lab_crosstab = pd.pivot_table(lab_summary, values=['Mean', 'Var', 'Count'], index='HADM_ID', columns='LABEL')
lab_crosstab.values.shape

(53791, 1038)

In [24]:
lab_crosstab.columns

MultiIndex([('Count',          '% Hemoglobin A1c'),
            ('Count',             '24 hr Calcium'),
            ('Count',          '24 hr Creatinine'),
            ('Count',             '24 hr Protein'),
            ('Count',           '25-OH Vitamin D'),
            ('Count',                 '<Albumin>'),
            ('Count',        'Absolute CD3 Count'),
            ('Count',        'Absolute CD4 Count'),
            ('Count',        'Absolute CD8 Count'),
            ('Count', 'Absolute Lymphocyte Count'),
            ...
            (  'Var',              'WBC, Pleural'),
            (  'Var',         'White Blood Cells'),
            (  'Var',                     'Yeast'),
            (  'Var',               'Young Cells'),
            (  'Var',                      'pCO2'),
            (  'Var',          'pCO2, Body Fluid'),
            (  'Var',                        'pH'),
            (  'Var',                       'pO2'),
            (  'Var',           'pO2, Body Fluid

In [25]:
# we have a multiindex column, the first level identifies the statistic and the second level the lab test
# combine the two levels
lab_crosstab.columns = lab_crosstab.columns.map('_'.join).str.strip('_')
lab_crosstab.columns

Index(['Count_% Hemoglobin A1c', 'Count_24 hr Calcium',
       'Count_24 hr Creatinine', 'Count_24 hr Protein',
       'Count_25-OH Vitamin D', 'Count_<Albumin>', 'Count_Absolute CD3 Count',
       'Count_Absolute CD4 Count', 'Count_Absolute CD8 Count',
       'Count_Absolute Lymphocyte Count',
       ...
       'Var_WBC, Pleural', 'Var_White Blood Cells', 'Var_Yeast',
       'Var_Young Cells', 'Var_pCO2', 'Var_pCO2, Body Fluid', 'Var_pH',
       'Var_pO2', 'Var_pO2, Body Fluid', 'Var_tacroFK'],
      dtype='object', length=1038)

In [26]:
# save the intermediate analysis
lab_crosstab.to_csv('../../data/intermediate/inter022020b.csv')