### Code to create desired matrices for the matrix factorisation

- Matrices' dimension CxF (C : number of ccs codes, F : number of features)
- Matrix one contains {+1, 0, -1} at each entry. These denote a positive, unknown/neutral, and negative correlation of the CCS code with the feature, respectively
- Matrix two contains the number of occurrences of each CCS code with the feature

Remaining TODO: find a way to map df_val -> {-1, 0, 1}


In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
### Paths where the input data lives and where the results will be stored.
### These are absolute locations on the author's machine; adjust them before re-running.
# data_path ends with a trailing slash because the CHARTEVENTS / D_ITEMS loaders
# build their file names by appending directly to this string.
data_path = r'/local/home/papageoa/data/mimic/mimic_iii_data/compressed/'
# NOTE: output_path is reassigned in the final cell before any file is written.
output_path = r'/local/home/papageoa/data/mimic/test/'
# Directory that contains ICD9_CCS_MAPPING.csv (the ICD-9 -> CCS lookup table).
mapping_path = r'/local/home/papageoa/data'

In [3]:
# Load chartevents
# Explicit per-column dtypes so pandas does not have to infer types for this file.
# The item identifier column is named ITEMID (it is used under that name in the
# filtering cell below); the previous "ITEM_ID" key matched no column, so its
# dtype was never applied.
dict_types = {"ROW_ID" : int,
              "SUBJECT_ID" : int,
              "HADM_ID" : int,
              "ICUSTAY_ID" : float,
              "ITEMID" : int,
              "CHARTTIME" : str,
              "STORETIME" : str,
              "CGID" : float,
              "VALUE" : str,
              "VALUENUM" : float,
              "VALUEUOM" : str,
              "WARNING" : float,
              "ERROR" : float,
              "RESULTSTATUS" :str,
              "STOPPED" : str
              }
chartevents = pd.read_csv(data_path+"CHARTEVENTS.csv.gz", dtype = dict_types, compression='gzip')
# Timestamps that do not match the expected format become NaT instead of raising.
chartevents.CHARTTIME = pd.to_datetime(chartevents.CHARTTIME, format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

# Load d_items
# index_col=0 refers to the first of the selected columns, i.e. ITEMID becomes the index.
d_items = pd.read_csv(data_path+'D_ITEMS.csv.gz', compression='gzip', index_col=0, usecols=['ITEMID','PARAM_TYPE','LINKSTO'])

In [4]:
# Filter: drop rows flagged as errors and observations stopped because of discharge.
# Rows where ERROR / STOPPED are missing are kept (a missing value is not a match).
flagged_error = chartevents.ERROR == 1
discontinued = chartevents.STOPPED == "D/C'd"
chartevents = chartevents[~(flagged_error | discontinued)]

# Filter: keep only chartevents items whose parameter type is Numeric or Checklist.
links_to_chartevents = d_items.LINKSTO == 'chartevents'
numeric_or_checklist = d_items.PARAM_TYPE.isin(['Numeric', 'Checklist'])
d_items_numCheck = d_items[links_to_chartevents & numeric_or_checklist]

# Inner join on ITEMID (d_items is indexed by ITEMID) discards every other item.
chartevents = chartevents.merge(d_items_numCheck, how='inner', left_on='ITEMID', right_index=True)
print('Number of unique ITEMID in chartevents:', chartevents.ITEMID.nunique())

Number of unique ITEMID in chartevents: 450


In [5]:
### read the ccs codes and merge the dataframes

### read the diagnoses icd codes
# ICD-9 codes are identifiers, not numbers: read them as str so that leading zeros
# are preserved and format_icd9code_string (which calls len() on each code) always
# receives a string rather than an inferred numeric value.
df_ccs = pd.read_csv(os.path.join(data_path,'DIAGNOSES_ICD.csv.gz'), compression = 'gzip',
                     dtype = {'ICD9_CODE': str})
# Rows without a diagnosis code cannot be mapped to a CCS category.
df_ccs = df_ccs[df_ccs['ICD9_CODE'].notna()]
def format_icd9code_string(code):
    """Right-pad an ICD-9 code string with spaces to a width of five characters.

    Codes that already contain five or more characters are returned unchanged.
    """
    return code.ljust(5)
# The space-padded code is the key used for the join with the mapping table below.
df_ccs['ICD9_CODE_CAST'] = df_ccs['ICD9_CODE'].map(format_icd9code_string)

### read the mapping from icd9 to ccs
mapping_file = os.path.join(mapping_path, 'ICD9_CCS_MAPPING.csv')
mapping_columns = ['ICD-9-CM CODE', 'CCS CATEGORY', 'CCS CATEGORY DESCRIPTION', 'ICD-9-CM CODE DESCRIPTION']
ccs_map = (
    pd.read_csv(mapping_file, quotechar="'", skiprows=[0], usecols=mapping_columns)
    .set_index('ICD-9-CM CODE')
)

### do the merging
# After the join, keep one row per (admission, CCS category): the one with the
# lowest SEQ_NUM, since rows are sorted by SEQ_NUM within each admission first.
df_ccs = (
    df_ccs
    .merge(ccs_map, how='inner', left_on='ICD9_CODE_CAST', right_index=True)
    .sort_values(by=['HADM_ID', 'SEQ_NUM'])
    .drop_duplicates(subset=['HADM_ID', 'CCS CATEGORY'], keep='first')
)

In [6]:
# Pair every chart measurement with each CCS category of the same admission
# (inner join on HADM_ID), then order the rows by item and admission.
df_merged = (
    chartevents[['HADM_ID', 'ITEMID', 'VALUENUM']]
    .merge(df_ccs[['HADM_ID', 'CCS CATEGORY']], on='HADM_ID')
    .sort_values(by=['ITEMID', 'HADM_ID'])
)

In [7]:
# Aggregate per (item, CCS category) pair: column means and non-null counts.
pair_groups = df_merged.groupby(['ITEMID', 'CCS CATEGORY'])
df_mean = pair_groups.mean().reset_index()
df_count = pair_groups.count().reset_index()

# Feature axis: every item that occurs in the aggregated data.
items = df_mean.ITEMID.unique()

### we have data for 272 codes but clinical bert predicts 281, so the code axis is
### taken from df_ccs instead of from df_mean['CCS CATEGORY'].unique()
ccs_codes = np.sort(df_ccs['CCS CATEGORY'].unique())

In [8]:
n_rows = ccs_codes.shape[0]
n_cols = items.shape[0]

# Build both (CCS code x item) matrices with a single pivot each instead of filtering
# the aggregated frames once per matrix cell. Rows follow ccs_codes and columns follow
# items; reindex adds the codes/items that have no aggregated data.
# array_val: mean VALUENUM per pair, NaN where the pair was never observed.
array_val = (
    df_mean
    .pivot(index='CCS CATEGORY', columns='ITEMID', values='VALUENUM')
    .reindex(index=ccs_codes, columns=items)
    .to_numpy(dtype=float)
)
# array_counts: number of non-null VALUENUM per pair, 0 where the pair was never
# observed. df_count is pivoted on its own keys rather than being indexed through
# a boolean mask computed on df_mean.
array_counts = (
    df_count
    .pivot(index='CCS CATEGORY', columns='ITEMID', values='VALUENUM')
    .reindex(index=ccs_codes, columns=items)
    .fillna(0)
    .to_numpy(dtype=float)
)
assert array_val.shape == array_counts.shape == (n_rows, n_cols)

In [12]:
### normalise the first array using the mean of each item_id
### (statistics are taken over all chartevents rows, not over df_merged)
chartevents.sort_values(by = 'ITEMID', inplace = True)
# Aggregate VALUENUM only: chartevents also holds string and datetime columns, and
# averaging those raises a TypeError on recent pandas versions.
per_item_valuenum = chartevents.groupby('ITEMID')[['VALUENUM']]
df_item_mean = per_item_valuenum.mean()
df_item_std = per_item_valuenum.std()

# Align explicitly with `items`, i.e. with the column order of array_val, so the
# broadcast in the z-score computation pairs every column with its own statistics.
item_mean = df_item_mean.VALUENUM.reindex(items).to_numpy()
item_std = df_item_std.VALUENUM.reindex(items).to_numpy()
### replace nans with 0 (item ids where we have only one measurement, so the
### n-1 denominator of the std is 0)
item_std = np.nan_to_num(item_std, nan = 0)

In [13]:
### Column-wise z-score of each (ccs code, item) mean against the item's overall mean/std.
### Some items have a standard deviation of 0 (including the NaN stds that were set to 0
### above), so the division yields inf or nan there. That is expected, so the numpy
### floating-point warnings are silenced for this one statement only.
with np.errstate(divide = 'ignore', invalid = 'ignore'):
    z_scores = (array_val - item_mean) / item_std
### Then we replace nan and inf with 0.
z_scores = np.nan_to_num(z_scores, nan = 0.0, posinf = 0.0, neginf = 0.0)

  after removing the cwd from sys.path.


In [14]:
# Discretise in place: |z| < 1 -> 0, z >= 1 -> +1, z <= -1 -> -1.
within_one_std = np.abs(z_scores) < 1
z_scores[within_one_std] = 0
z_scores[~within_one_std] = np.sign(z_scores[~within_one_std])

# Report how many entries fall into each class (+1, 0, -1).
for label in (1, 0, -1):
    print(z_scores[z_scores == label].shape)

(2265,)
(122771,)
(1414,)


In [15]:
output_path = r'/local/home/papageoa/data/mimic/matrix_factorization'
# Create the target directory up front so the writes below cannot fail on a missing
# folder after all the preceding computation.
os.makedirs(output_path, exist_ok = True)

# Rows are labelled with the CCS codes, columns with the item ids.
df_val = pd.DataFrame(z_scores, index = ccs_codes, columns = items)
df_counts = pd.DataFrame(array_counts, index = ccs_codes, columns = items)

# mf_val: discretised correlation matrix; mf_count: per-pair measurement counts.
df_val.to_csv(os.path.join(output_path, 'mf_val.csv.gz'), compression = 'gzip')
df_counts.to_csv(os.path.join(output_path, 'mf_count.csv.gz'), compression = 'gzip')