In [None]:
import numpy as np
import pandas as pd
import requests
from datetime import datetime
from collections import OrderedDict
import copy
import matplotlib.pyplot as plt
import random


# load raw data: MIMIC-IV v2.0

## 1. Cohort extraction: patients with an admission history who did not die in hospital.

In [None]:
folder_name = '/data/chao/mimic/physionet.org/files/mimiciv/2.0/hosp' ## Replace with your local path for mimiciv-2.0 data

# Load patients and build {subject_id: [gender, anchor_age, anchor_year, anchor_year_group, dod]}.
patients = pd.read_csv(folder_name + '/patients.csv')
patients_dict = {}
for row in patients.values:
    patients_dict[row[0]] = [row[1], row[2], row[3], row[4], row[5]]

# Load admissions and append, for each patient, the fields of the admission with the
# latest discharge time. An entry for a patient with admissions then reads:
#   [0] gender, [1] anchor_age, [2] anchor_year, [3] anchor_year_group, [4] dod,
#   [5] race, [6] hadm_id, [7] admittime, [8] dischtime, [9] deathtime,
#   [10] admission_type, [11] insurance, [12] admission_location, [13] discharge_location
admissions = pd.read_csv(folder_name + '/admissions.csv')
last_admission_fields = ['race', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
                         'admission_type', 'insurance', 'admission_location', 'discharge_location']
patients_anchor_age_not_last_v_y = {}
# Group on the bare column name so that `key` is a scalar subject_id on every pandas version
# (a one-element list makes pandas >= 2 yield 1-tuples, which never match patients_dict keys).
tmp = admissions.groupby('subject_id')
for key, item in tmp:
    # Skip subjects that are absent from patients.csv; the anchor-year check below
    # indexes the appended admission fields and would otherwise raise.
    if key not in patients_dict:
        continue
    group_df = item.reset_index(drop=True)
    discharge_time_list = [datetime.strptime(str_time, "%Y-%m-%d %H:%M:%S") for str_time in group_df['dischtime']]
    # First row carrying the latest discharge time.
    last_admission_row_id = discharge_time_list.index(max(discharge_time_list))
    last_admission = group_df.iloc[last_admission_row_id]
    patients_dict[key] = patients_dict[key] + [last_admission[field] for field in last_admission_fields]

    # Record patients whose anchor_year differs from the year of their last admission.
    if patients_dict[key][2] != int(patients_dict[key][7][:4]):
        patients_anchor_age_not_last_v_y[key] = [patients_dict[key][1], patients_dict[key][2], patients_dict[key][7]]

patients_dict_w_admission = {}
patients_dict_w_admission_n_die_inhosp = {}
one_y_outcome = {}
die_count = 0
for key, ele in patients_dict.items():
    if len(ele) <= 5:
        continue  # no admission fields were appended for this patient
    patients_dict_w_admission[key] = ele
    if pd.isna(ele[9]):  # deathtime missing --> patient did not die in hospital
        patients_dict_w_admission_n_die_inhosp[key] = ele
        if pd.isna(ele[4]):
            one_y_outcome[key] = 0  # not die in one year after last hospital stay
        else:
            # A recorded dod is used as the label.
            # NOTE(review): relies on dod being censored at one year post-discharge in MIMIC-IV -- confirm.
            one_y_outcome[key] = 1  # die in one year after last hospital stay
            die_count += 1

cohort_size = len(patients_dict_w_admission_n_die_inhosp)
# Guard the ratio so that an empty cohort prints nan instead of raising ZeroDivisionError.
die_ratio = die_count / cohort_size if cohort_size > 0 else float('nan')
print("Num of patients in total: %d" % len(patients_dict))        
print("Num of patients with admissions: %d" % len(patients_dict_w_admission))
print("Num of patients with admissions and not die in hospital: %d" % cohort_size)
print("    among which, %d (%.2f) patients died in one year after their last admission." % (die_count, die_ratio)) 


np.save('/data/chao/syn_mimic/preprocessing/patients_dict_w_admission.npy', patients_dict_w_admission)
np.save('/data/chao/syn_mimic/preprocessing/patients_anchor_age_not_last_v_y.npy', patients_anchor_age_not_last_v_y)
np.save('/data/chao/syn_mimic/preprocessing/patients_dict_w_admission_n_die_inhosp.npy', patients_dict_w_admission_n_die_inhosp)
np.save('/data/chao/syn_mimic/preprocessing/one_y_outcome.npy', one_y_outcome)


## 2. Diagnosis extraction

In [None]:
# Load diagnoses. ICD codes are read as strings so that purely numeric codes keep
# their leading zeros and later len()/slicing on the codes is well defined.
diagnoses = pd.read_csv(folder_name + '/diagnoses_icd.csv', dtype={'icd_code': str})

# Keep only rows of cohort patients. A single boolean mask replaces the former
# per-patient DataFrame.append loop (quadratic, and removed in pandas 2.0).
# The stable sort reproduces the previous row order: patients by ascending
# subject_id, rows within a patient in file order.
cohort_mask = diagnoses['subject_id'].isin(list(patients_dict_w_admission_n_die_inhosp.keys()))
diagnoses_pt_df = (
    diagnoses[cohort_mask]
    .sort_values('subject_id', kind='mergesort')
    .reset_index(drop=True)
)

diagnoses_pt_df.to_csv(folder_name + '/diagnoses_selected_cohort.csv', index=False)

# diagnoses_pt_df = pd.read_csv(folder_name + '/diagnoses_selected_cohort.csv')

# Build {subject_id: {'icd9': [distinct codes], 'icd10': [distinct codes]}} and collect
# the patients that have codes in both ICD versions.
patient_diagnosis_dict = {}
patients_w_icd_9_10 = []
temp = diagnoses_pt_df.groupby(['subject_id', 'icd_version'])
for (subject_id, icd_version), item in temp:
    code_lists = patient_diagnosis_dict.setdefault(subject_id, {'icd9': [], 'icd10': []})
    # Any version other than 9 is stored under 'icd10'.
    version_key = 'icd9' if icd_version == 9 else 'icd10'
    code_lists[version_key] = list(set(item['icd_code']))

    # Groups are visited in (subject_id, icd_version) order, so a patient with both
    # versions satisfies this exactly once, on the second of their groups.
    if len(code_lists['icd9']) > 0 and len(code_lists['icd10']) > 0:
        patients_w_icd_9_10.append(subject_id)

np.save('/data/chao/syn_mimic/preprocessing/patient_diagnosis_dict.npy', patient_diagnosis_dict)
np.save('/data/chao/syn_mimic/preprocessing/patients_w_icd_diag_9_10.npy', patients_w_icd_9_10)

In [None]:
# Number of patients collected in patients_w_icd_9_10 (patients with both ICD-9 and ICD-10 codes).
# NOTE(review): the procedure cell further down reuses this variable name, so the value
# shown depends on which cell ran last.
len(patients_w_icd_9_10)

## 3. Procedure extraction

In [None]:
# Load HCPCS procedure events; codes are read as strings so the per-character
# alphabetic test below is always applied to text.
procedures_hcpcs = pd.read_csv(folder_name + '/hcpcsevents.csv', dtype={'hcpcs_cd': str})

# Keep only cohort patients with one boolean mask (replaces the per-patient
# DataFrame.append loop, which is quadratic and was removed in pandas 2.0).
# The stable sort keeps the former order: ascending subject_id, file order within a patient.
cohort_mask = procedures_hcpcs['subject_id'].isin(list(patients_dict_w_admission_n_die_inhosp.keys()))
procedures_hcpcs_pt_df = (
    procedures_hcpcs[cohort_mask]
    .sort_values('subject_id', kind='mergesort')
    .reset_index(drop=True)
)

procedures_hcpcs_pt_df.to_csv(folder_name + '/procedures_hcpcs_selected_cohort.csv', index=False)

# Build {subject_id: [distinct codes without any alphabetic character]}.
# Grouping on the bare column name keeps `key` a scalar on every pandas version.
patient_procedures_hcpcs_dict = {}
temp = procedures_hcpcs_pt_df.groupby('subject_id')
for key, item in temp:
    cpt_list = [code for code in item['hcpcs_cd'] if not any(c.isalpha() for c in code)]
    patient_procedures_hcpcs_dict[key] = list(set(cpt_list))
    
np.save('/data/chao/syn_mimic/preprocessing/patient_procedures_hcpcs_dict.npy', patient_procedures_hcpcs_dict)

In [None]:
# Load ICD procedures; codes are read as strings so numeric-looking codes keep leading zeros.
procedures_icd = pd.read_csv(folder_name + '/procedures_icd.csv', dtype={'icd_code': str})

# Keep only cohort patients with one boolean mask (replaces the per-patient
# DataFrame.append loop, which is quadratic and was removed in pandas 2.0).
# The stable sort keeps the former order: ascending subject_id, file order within a patient.
cohort_mask = procedures_icd['subject_id'].isin(list(patients_dict_w_admission_n_die_inhosp.keys()))
procedures_icd_pt_df = (
    procedures_icd[cohort_mask]
    .sort_values('subject_id', kind='mergesort')
    .reset_index(drop=True)
)

procedures_icd_pt_df.to_csv(folder_name + '/procedures_icd_selected_cohort.csv', index=False)

# Build {subject_id: {'icd9': [distinct codes], 'icd10': [distinct codes]}} and collect
# the patients that have procedure codes in both ICD versions.
# NOTE: patients_w_icd_9_10 is the same variable name the diagnosis cell uses; it is
# overwritten here and saved under a procedure-specific file name.
patient_procedures_icd_dict = {}
patients_w_icd_9_10 = []
temp = procedures_icd_pt_df.groupby(['subject_id', 'icd_version'])
for (subject_id, icd_version), item in temp:
    code_lists = patient_procedures_icd_dict.setdefault(subject_id, {'icd9': [], 'icd10': []})
    # Any version other than 9 is stored under 'icd10'.
    version_key = 'icd9' if icd_version == 9 else 'icd10'
    code_lists[version_key] = list(set(item['icd_code']))

    # Groups arrive in (subject_id, icd_version) order, so this fires at most once per patient.
    if len(code_lists['icd9']) > 0 and len(code_lists['icd10']) > 0:
        patients_w_icd_9_10.append(subject_id)

np.save('/data/chao/syn_mimic/preprocessing/patient_procedures_icd_dict.npy', patient_procedures_icd_dict)
np.save('/data/chao/syn_mimic/preprocessing/patients_w_icd_proc_9_10.npy', patients_w_icd_9_10)


## 4. Prescription extraction

In [None]:
# Load prescriptions (NDC codes). Only the three columns used below are parsed, and
# ndc is read with the builtin `str` (the `np.str` alias no longer exists in NumPy).
prescriptions = pd.read_csv(
    folder_name + '/prescriptions.csv',
    dtype={"ndc": str},
    usecols=["subject_id", "hadm_id", "ndc"],
)
prescriptions = prescriptions[["subject_id", "hadm_id", "ndc"]]

# Keep only cohort patients with one boolean mask (replaces the per-patient
# DataFrame.append loop, which is quadratic and was removed in pandas 2.0).
# The stable sort keeps the former order: ascending subject_id, file order within a patient.
cohort_mask = prescriptions['subject_id'].isin(list(patients_dict_w_admission_n_die_inhosp.keys()))
prescriptions_pt_df = (
    prescriptions[cohort_mask]
    .sort_values('subject_id', kind='mergesort')
    .reset_index(drop=True)
)

prescriptions_pt_df.to_csv(folder_name + '/prescriptions_selected_cohort.csv', index=False)

# Build {subject_id: [distinct NDC codes]}, dropping the placeholder code '0'.
# Grouping on the bare column name keeps `key` a scalar on every pandas version.
patient_prescriptions_dict = {}
temp = prescriptions_pt_df.groupby('subject_id')
for key, item in temp:
    candidate_list = list(set(item['ndc']))
    if '0' in candidate_list:
        candidate_list.remove('0')
    patient_prescriptions_dict[key] = candidate_list

np.save('/data/chao/syn_mimic/preprocessing/patient_prescriptions_dict.npy', patient_prescriptions_dict)

## 5. Other measure extraction

In [None]:
# Load BMI and blood pressure (recorded as 'systolic/diastolic') from the OMR table and
# keep, per cohort patient, the value from the most recent chartdate.
measures = pd.read_csv(folder_name + '/omr.csv')
measures_pt_df = pd.DataFrame()
# Grouping on the bare column name keeps `key` a scalar on every pandas version.
tmp = measures.groupby('subject_id')

patient_bmi_bp = {}
for key, item in tmp:
    if key not in patients_dict_w_admission_n_die_inhosp:
        continue
    # Newest record first; cat_list positions line up with .iloc positions of group_df.
    group_df = item.reset_index(drop=True).sort_values(by='chartdate', ascending=False)
    cat_list = list(group_df['result_name'])

    if 'BMI (kg/m2)' in cat_list:
        bmi_index = cat_list.index('BMI (kg/m2)')
        # Builtin float replaces the removed np.float alias.
        bmi = float(group_df.iloc[bmi_index]['result_value'])
    else:
        bmi = np.nan

    if 'Blood Pressure' in cat_list:
        bp_index = cat_list.index('Blood Pressure')
        bp_str = group_df.iloc[bp_index]['result_value'].split('/')
        dias_bp = float(bp_str[1])
        syst_bp = float(bp_str[0])
        # NOTE(review): only a reading where BOTH values are non-positive is blanked;
        # confirm whether `or` was intended here.
        if dias_bp <= 0 and syst_bp <= 0:
            dias_bp = np.nan
            syst_bp = np.nan
    else:
        dias_bp = np.nan
        syst_bp = np.nan

    patient_bmi_bp[key] = [bmi, dias_bp, syst_bp]
        
np.save('/data/chao/syn_mimic/preprocessing/patient_bmi_bp_dict.npy', patient_bmi_bp)


In [None]:
# Reload the saved {subject_id: [bmi, dias_bp, syst_bp]} dict and show how many patients it holds.
patient_bmi_bp_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_bmi_bp_dict.npy', allow_pickle=True).item()
len(patient_bmi_bp_dict)



# Build patient by concept matrix

## 1.a Diagnosis (map to Phecode)  --> the tutorial uses this mapping instead of 1.b

In [None]:
def _lookup_with_prefix_fallback(code, mapping, min_len, max_trim):
    """Map `code` through `mapping`, falling back to progressively shorter prefixes.

    Returns a pair (value, attempted_fallback):
      * exact hit                           -> (mapping[code], False)
      * len(code) > min_len and a prefix obtained by trimming 1..max_trim trailing
        characters is in `mapping`          -> (mapping[prefix], True), shortest trim first
      * len(code) > min_len and no prefix hit -> (None, True)
      * len(code) <= min_len and no exact hit -> (None, False)
    """
    if code in mapping:
        return mapping[code], False
    if len(code) <= min_len:
        return None, False
    for trim in range(1, max_trim + 1):
        prefix = code[:len(code) - trim]
        if prefix in mapping:
            return mapping[prefix], True
    return None, True


# Load the ICD-9 --> phecode table as {icd9 code without dots: phecode}.
# The builtin `str` replaces the removed `np.str` alias.
phecode_icd9_map = pd.read_csv('/data/chao/syn_mimic/preprocessing/mapping_tables/phecode_icd9_map_unrolled.csv', dtype={"phecode": str})
phecode_icd9_map_dict = {}
for row in phecode_icd9_map.values:
    if str(row[1]) != 'nan':
        phecode_icd9_map_dict[row[0].replace(".", "")] = row[1]

# Load the ICD-10 --> phecode table as {icd10 code without dots: phecode}.
phecode_icd10_map = pd.read_csv('/data/chao/syn_mimic/preprocessing/mapping_tables/Phecode_map_v1_2_icd10_beta.csv', dtype={"PHECODE": str})
phecode_icd10_map_dict = {}
for row in phecode_icd10_map.values:
    if str(row[1]) != 'nan':
        phecode_icd10_map_dict[row[0].replace(".", "")] = row[1]


patient_diagnosis_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_diagnosis_dict.npy', allow_pickle=True).item()
patient_diagnosis_final_dict = {}
missed_icd9_code = []
missed_icd10_code = []
for key, code_list in patient_diagnosis_dict.items():
    phecodes_for_icd9_list = []
    phecodes_for_icd10_list = []

    # ICD-9: exact match, else (codes longer than 3 chars) trim 1 then 2 trailing characters.
    for code in code_list['icd9']:
        phecode, tried_fallback = _lookup_with_prefix_fallback(code, phecode_icd9_map_dict, 3, 2)
        if phecode is not None:
            phecodes_for_icd9_list.append(phecode)
        elif tried_fallback:
            # Only codes that went through the prefix fallback are reported as missed.
            missed_icd9_code.append(code)

    # ICD-10: exact match, else (codes longer than 4 chars) trim 1, 2, then 3 trailing characters.
    for code in code_list['icd10']:
        phecode, tried_fallback = _lookup_with_prefix_fallback(code, phecode_icd10_map_dict, 4, 3)
        if phecode is not None:
            phecodes_for_icd10_list.append(phecode)
        elif tried_fallback:
            missed_icd10_code.append(code)

    patient_diagnosis_final_dict[key] = list(set(phecodes_for_icd9_list + phecodes_for_icd10_list))
    
np.save('/data/chao/syn_mimic/preprocessing/patient_phecode_diagnosis_final_dict.npy', patient_diagnosis_final_dict)




## 1.b Diagnosis (map to icd9)


In [None]:
# Load the ICD-10 --> ICD-9 mapping as {icd10 code: icd9 code}. All columns are read as
# strings so numeric-looking codes keep their leading zeros and compare equal to the
# string codes coming from the diagnosis table.
icd10_icd9_map = pd.read_csv('/data/chao/syn_mimic/preprocessing/mapping_tables/icd10cmtoicd9gem.csv', dtype=str)
icd10_icd9_map_dict = {}
for row in icd10_icd9_map.values:
    if str(row[1]) != 'nan':
        icd10_icd9_map_dict[row[0]] = row[1]

# Load from the directory the extraction step saved to (previously './patient_diagnosis_dict.npy',
# which only resolved when the working directory happened to be the preprocessing folder).
patient_diagnosis_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_diagnosis_dict.npy', allow_pickle=True).item()
patient_diagnosis_final_dict = {}
missed_icd10_code = []
for key, code_list in patient_diagnosis_dict.items():
    icd9_list = code_list['icd9']
    icd10_list = code_list['icd10']
    mapped_icd10_list = []

    for code in icd10_list:
        # Exact code first; codes longer than 4 characters may also fall back to the
        # prefixes obtained by trimming 1, 2, then 3 trailing characters.
        candidates = [code]
        if len(code) > 4:
            candidates += [code[:len(code) - trim] for trim in (1, 2, 3)]

        for candidate in candidates:
            if candidate in icd10_icd9_map_dict:
                mapped_icd10_list.append(icd10_icd9_map_dict[candidate])
                break
        else:
            # Only codes that went through the prefix fallback are reported as missed.
            if len(code) > 4:
                missed_icd10_code.append(code)

    patient_diagnosis_final_dict[key] = list(set(icd9_list + mapped_icd10_list))
    
    
np.save('/data/chao/syn_mimic/preprocessing/patient_ICD_diagnosis_final_dict.npy', patient_diagnosis_final_dict)

## 2. Procedure (cpt4)

In [None]:
# Reload the saved per-patient HCPCS/CPT code dict and display it.
patient_procedures_hcpcs_final_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_procedures_hcpcs_dict.npy', allow_pickle=True).item()
patient_procedures_hcpcs_final_dict

## 3. Prescription (ndc-->rxcui-->ingredient rxcui)

In [None]:
# Map every patient's NDC codes to RxCUIs and to ingredient RxCUIs through the RxNav REST API.
patient_prescriptions_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_prescriptions_dict.npy', allow_pickle=True).item()

patient_prescriptions_rxcui_final_dict = {}
patient_prescriptions_ingr_rxcui_final_dict = {}

# Each distinct NDC is resolved over the network once and then reused for every other
# patient that carries it: {ndc: (rxcui or None, [ingredient rxcuis])}.
ndc_lookup_cache = {}
# Seconds to wait for RxNav before raising instead of hanging the notebook indefinitely.
request_timeout = 30

count = 0
for key, item in patient_prescriptions_dict.items():
    rxcui_list = []
    ingr_list = []
    for ndc_code in item:
        if ndc_code not in ndc_lookup_cache:
            rxcui = None
            ingredients = []
            response = requests.get(f'https://rxnav.nlm.nih.gov/REST/relatedndc.json?relation=product&ndc={ndc_code}', timeout=request_timeout)
            content = response.json()
            # An empty JSON object means RxNav returned nothing for this NDC.
            if len(content) > 0:
                rxcui = content['ndcInfoList']['ndcInfo'][0]['rxcui']
                response_ing = requests.get(f'https://rxnav.nlm.nih.gov/REST/rxcui/{rxcui}/historystatus.json?caller=RxNav', timeout=request_timeout).json()
                derived_concepts = response_ing['rxcuiStatusHistory']['derivedConcepts']
                if derived_concepts is not None:
                    ingredients = [concept['ingredientRxcui'] for concept in derived_concepts['ingredientConcept']]
            ndc_lookup_cache[ndc_code] = (rxcui, ingredients)

        rxcui, ingredients = ndc_lookup_cache[ndc_code]
        if rxcui is not None:
            rxcui_list.append(rxcui)
            ingr_list.extend(ingredients)

    patient_prescriptions_rxcui_final_dict[key] = list(set(rxcui_list))
    patient_prescriptions_ingr_rxcui_final_dict[key] = list(set(ingr_list))
    count += 1
    if count % 5000 == 1:
        print(count, flush=True)
    
    
np.save('/data/chao/syn_mimic/preprocessing/patient_prescriptions_rxcui_final_dict.npy', patient_prescriptions_rxcui_final_dict)
np.save('/data/chao/syn_mimic/preprocessing/patient_prescriptions_ingr_rxcui_final_dict.npy', patient_prescriptions_ingr_rxcui_final_dict)


In [None]:
# Same NDC --> RxCUI --> ingredient mapping as the per-patient variant, but every
# distinct NDC is resolved once and the per-patient lists are assembled from lookup tables.
patient_prescriptions_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_prescriptions_dict.npy', allow_pickle=True).item()

# {ndc: number of patients carrying it}
distinct_ndc_dict = {}
for key, item in patient_prescriptions_dict.items():
    for ndc_code in item:
        distinct_ndc_dict[ndc_code] = distinct_ndc_dict.get(ndc_code, 0) + 1

ndc_rxcui_mapping = {}        # {ndc: rxcui}, only for NDCs RxNav returned something for
ndc_rxcui_ingr_mapping = {}   # {ndc: [ingredient rxcuis]}, same key set as ndc_rxcui_mapping

# Seconds to wait for RxNav before raising instead of hanging the notebook indefinitely.
request_timeout = 30

count = 0
# One Session reuses the HTTP connection across the many small requests.
with requests.Session() as session:
    for ndc in distinct_ndc_dict.keys():
        response = session.get(f'https://rxnav.nlm.nih.gov/REST/relatedndc.json?relation=product&ndc={ndc}', timeout=request_timeout)
        content = response.json()
        # An empty JSON object means RxNav returned nothing for this NDC.
        if len(content) > 0:
            rxcui = content['ndcInfoList']['ndcInfo'][0]['rxcui']
            ndc_rxcui_mapping[ndc] = rxcui
            response_ing = session.get(f'https://rxnav.nlm.nih.gov/REST/rxcui/{rxcui}/historystatus.json?caller=RxNav', timeout=request_timeout).json()
            derived_concepts = response_ing['rxcuiStatusHistory']['derivedConcepts']
            if derived_concepts is not None:
                ndc_rxcui_ingr_mapping[ndc] = [concept['ingredientRxcui'] for concept in derived_concepts['ingredientConcept']]
            else:
                ndc_rxcui_ingr_mapping[ndc] = []
        count += 1
        if count % 500 == 1:
            print(count, flush=True)

patient_prescriptions_rxcui_final_dict = {}
patient_prescriptions_ingr_rxcui_final_dict = {}
count = 0
for key, item in patient_prescriptions_dict.items():
    individual_rxcui_list = []
    individual_ingr_list = []
    for ndc_code in item:
        if ndc_code in ndc_rxcui_mapping:
            individual_rxcui_list.append(ndc_rxcui_mapping[ndc_code])
            individual_ingr_list.extend(ndc_rxcui_ingr_mapping[ndc_code])
    
    patient_prescriptions_rxcui_final_dict[key] = list(set(individual_rxcui_list))
    patient_prescriptions_ingr_rxcui_final_dict[key] = list(set(individual_ingr_list))
    count += 1
    if count % 5000 == 1:
        print(count, flush=True)
        
np.save('/data/chao/syn_mimic/preprocessing/patient_prescriptions_rxcui_final_dict.npy', patient_prescriptions_rxcui_final_dict)
np.save('/data/chao/syn_mimic/preprocessing/patient_prescriptions_ingr_rxcui_final_dict.npy', patient_prescriptions_ingr_rxcui_final_dict)
        

In [None]:
# Number of NDC codes that resolved to an RxCUI.
print(len(ndc_rxcui_mapping))
# Number of distinct ingredient RxCUIs across all resolved NDC codes.
distinct_ingredient_rxcuis = set()
for ingredient_rxcuis in ndc_rxcui_ingr_mapping.values():
    distinct_ingredient_rxcuis.update(ingredient_rxcuis)
print(len(distinct_ingredient_rxcuis))

## 4. Other measures

In [None]:
# Reload the saved {subject_id: [bmi, dias_bp, syst_bp]} dict and display it.
patient_bmi_bp_final_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_bmi_bp_dict.npy', allow_pickle=True).item()
patient_bmi_bp_final_dict


## 5. Demographics

In [None]:
# Build {subject_id: [age_at_last_discharge, gender, race]} for the cohort.
# Note: this rebinds patients_dict to the saved cohort dict (admitted, not died in hospital).
patients_dict = np.load('/data/chao/syn_mimic/preprocessing/patients_dict_w_admission_n_die_inhosp.npy', allow_pickle=True).item()
patients_final_dict = {}
race_set = set()
gender_set = set()
for subject_id, record in patients_dict.items():
    gender, anchor_age, anchor_year, race = record[0], record[1], record[2], record[5]
    # record[8] is the discharge time string; its first four characters are the year.
    discharge_year = int(record[8][:4])
    # Age at last discharge, capped at 91.
    age_at_last_discharge = min(anchor_age + discharge_year - anchor_year, 91)
    patients_final_dict[subject_id] = [age_at_last_discharge, gender, race]
    race_set.add(race)
    gender_set.add(gender)
    
np.save('/data/chao/syn_mimic/preprocessing/patients_final_dict.npy', patients_final_dict)

In [None]:
# Display {subject_id: [age_at_last_discharge, gender, race]}.
patients_final_dict

## 6. Combine all info into a patient by concept matrix

In [None]:
## figure out all columns
# diagnoses: one binary indicator column per distinct phecode, columns in sorted code order
patient_phecode_diagnosis_final_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_phecode_diagnosis_final_dict.npy', allow_pickle=True).item()
distinct_phecodes = {code for codes in patient_phecode_diagnosis_final_dict.values() for code in codes}
patient_phecode_space = OrderedDict((code, 0) for code in sorted(distinct_phecodes))
phecode_key_list = list(patient_phecode_space.keys())
# code -> column position; replaces list.index() inside the loop, which rescanned the
# whole column list for every code of every patient.
phecode_position = {code: position for position, code in enumerate(phecode_key_list)}

patient_diagnosis_section = {}
for key, item in patient_phecode_diagnosis_final_dict.items():
    individual_phecode_indicator = [0] * len(phecode_key_list)
    for code in item:
        individual_phecode_indicator[phecode_position[code]] = 1
    patient_diagnosis_section[key] = individual_phecode_indicator
    
# procedures (not included in demo): same indicator encoding for the CPT codes
patient_procedures_hcpcs_final_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_procedures_hcpcs_dict.npy', allow_pickle=True).item()
distinct_cpt_codes = {code for codes in patient_procedures_hcpcs_final_dict.values() for code in codes}
patient_cpt_space = OrderedDict((code, 0) for code in sorted(distinct_cpt_codes))
cpt_key_list = list(patient_cpt_space.keys())
cpt_position = {code: position for position, code in enumerate(cpt_key_list)}

patient_procedure_section = {}
for key, item in patient_procedures_hcpcs_final_dict.items():
    individual_cpt_indicator = [0] * len(cpt_key_list)
    for code in item:
        individual_cpt_indicator[cpt_position[code]] = 1
    patient_procedure_section[key] = individual_cpt_indicator
    
# medication (not included in demo)

# other measures
patient_bmi_bp_final_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_bmi_bp_dict.npy', allow_pickle=True).item()
patient_measure_section = copy.deepcopy(patient_bmi_bp_final_dict)

# demographics
patients_final_dict = np.load('/data/chao/syn_mimic/preprocessing/patients_final_dict.npy', allow_pickle=True).item()
# merge race: the raw race string is bucketed by prefix, tested in the key order of
# race_count; anything matching none of the prefixes falls into 'OTHER'.
patient_demo_section = {}
race_count = {'WHITE':0, 'BLACK':0, 'ASIAN':0, 'HISPANIC':0, 'UN':0, 'OTHER':0}
race_labels = list(race_count.keys())
for key, item in patients_final_dict.items():
    race_label = 'OTHER'
    for prefix in race_labels[:-1]:
        if item[2].startswith(prefix):
            race_label = prefix
            break
    # One-hot vector in race_labels order.
    race = [1 if label == race_label else 0 for label in race_labels]
    race_count[race_label] = race_count[race_label] + 1

    # Gender is encoded as 1 for 'F' and 0 for anything else.
    gender = 1 if item[1] == 'F' else 0

    patient_demo_section[key] = [item[0], gender] + race
    


In [None]:
# Assemble one row per patient: [outcome] + demographics + measures + diagnosis indicators.
one_y_outcome = np.load('/data/chao/syn_mimic/preprocessing/one_y_outcome.npy', allow_pickle=True).item()

# Patients without diagnoses get an all-zero indicator vector; patients without
# measures get three NaNs.
patient_concept_mat = [
    [one_y_outcome[subject_id]]
    + demo_vec
    + patient_measure_section.get(subject_id, [np.nan] * 3)
    + patient_diagnosis_section.get(subject_id, [0] * len(phecode_key_list))
    for subject_id, demo_vec in patient_demo_section.items()
]

outcome_and_demo_columns = ['DIE_1y', 'AGE', 'GENDER'] + list(race_count.keys())
measure_columns = ['BMI','DIASTOLIC','SYSTOLIC']
column_list = outcome_and_demo_columns + measure_columns + phecode_key_list
assert len(column_list) == len(patient_concept_mat[0])
np.save('/data/chao/syn_mimic/preprocessing/column_list_only_diagnosis.npy', column_list)
np.save('/data/chao/syn_mimic/preprocessing/patient_concept_mat_only_diagnosis.npy', patient_concept_mat)


In [None]:
# Wrap the matrix in a DataFrame with named columns, write it to CSV, and display it.
patient_concept_mat_df = pd.DataFrame(patient_concept_mat, columns = column_list)
patient_concept_mat_df.to_csv('/data/chao/syn_mimic/preprocessing/patient_concept_mat_df.csv', index=False)
patient_concept_mat_df

# Data summarization

## Overall characteristics
- **Number of patients**: 181,294
- **Number of columns**: 1,600

## Column breakdown
- **Outcome (binary)**: Die in one year since last admission: 11.3%
- **Age (continuous)**: 56.20 ± 20.39
- **Gender (binary)**: Female vs Male: 53.3% vs 46.7%
- **Race (onehot length 6)**: WHITE (67.3%)  BLACK (13.2%)  ASIAN (4.2%)  HISPANIC (5.5%)  UNKNOWN (4.2%)  OTHER (5.6%)
- **BMI (continuous)**: 21.06 ± 277.03, max: 107840.2 (subject to outlier removal)  
- **DIASTOLIC (continuous)**: 47.55 ± 36.42, max: 168.0
- **SYSTOLIC (continuous)**: 81.87 ± 62.27, max: 243.0

**Most prevalent phecodes (top 11):**
- `401`: Hypertension (31.57%)
- `272`: Disorders of lipoid metabolism (21.63%)
- `285`: Other anemias (18.74%)
- `401.1`: Essential hypertension (17.48%)
- `272.1`: Hyperlipidemia (15.45%)
- `530`: Diseases of esophagus (14.28%)
- `427`: Cardiac dysrhythmias (13.95%)
- `296`: Mood disorders (13.90%)
- `318`: Tobacco use disorder (13.32%)
- `276`: Disorders of fluid, electrolyte, and acid-base balance (13.18%)
- `250`: Diabetes mellitus (13.12%)

**Number of phecodes with less than x prevalence:**
- 10<sup>-6</sup>: 0
- 10<sup>-5</sup>: 25
- 10<sup>-4</sup>: 224
- 10<sup>-3</sup>: 718

In [None]:
from operator import itemgetter

# Prevalence (column mean) of every phecode, ordered from most to least prevalent.
prevalence_pairs = ((code, np.mean(patient_concept_mat_df[code])) for code in phecode_key_list)
phecode_prevalence = OrderedDict(sorted(prevalence_pairs, key=itemgetter(1), reverse=True))
phecode_prevalence

In [None]:
# Number of phecodes whose prevalence is below 0.001.
np.sum(np.array(list(phecode_prevalence.values()))<0.001)

In [None]:
# Show both values: (maximum BMI, number of distinct CPT codes). As two separate
# expression lines only the last one was displayed and the BMI maximum was discarded.
np.max(patient_concept_mat_df['BMI']), len(cpt_key_list)

In [None]:
# Count the distinct ICD codes produced by the ICD-10 --> ICD-9 variant of the diagnosis mapping.
patient_ICD_diagnosis_final_dict = np.load('/data/chao/syn_mimic/preprocessing/patient_ICD_diagnosis_final_dict.npy', allow_pickle=True).item()
distinct_icd_codes = {
    code
    for code_list in patient_ICD_diagnosis_final_dict.values()
    for code in code_list
}
patient_ICD_space = OrderedDict((code, 0) for code in sorted(distinct_icd_codes))
ICD_key_list = list(patient_ICD_space)

# Disabled alternative that builds the diagnosis section from ICD codes:
# patient_diagnosis_section = {}
# for key, item in patient_ICD_diagnosis_final_dict.items():
#     individual_ICD_indicator = [0] * len(ICD_key_list)
#     for code in item:
#         individual_ICD_indicator[ICD_key_list.index(code)] = 1
#     patient_diagnosis_section[key] = individual_ICD_indicator
len(ICD_key_list)

In [None]:
# Reload the patient-by-concept matrix from CSV and display it.
patient_concept_mat_df = pd.read_csv('/data/chao/syn_mimic/preprocessing/patient_concept_mat_df.csv')
patient_concept_mat_df

# Data preprocessing for GAN training

## 1. Remove outliers


In [None]:
## Non-binary features only: print NaN-aware summary statistics and plot a histogram
## for each of AGE, BMI, DIASTOLIC, and SYSTOLIC.
cols = ['AGE', 'BMI', 'DIASTOLIC', 'SYSTOLIC']
for col_name in cols:
    col_data = list(patient_concept_mat_df[col_name])
    col_min = np.nanmin(col_data)
    col_max = np.nanmax(col_data)
    col_mean = np.nanmean(col_data)
    col_median = np.nanmedian(col_data)
    print(col_name)
    print('   min value: ', col_min, ' max value: ', col_max, ' mean value: ', col_mean, ' median value: ', col_median)

for col_name in cols:
    plt.hist(list(patient_concept_mat_df[col_name]), density=True, bins=80)
    plt.ylabel('Probability')
    plt.xlabel(col_name)
    plt.show()
    


In [None]:
## Process outliers
# Rows with a missing (NaN) measure are never removed here: every comparison against
# NaN evaluates to False, so such rows stay in the data for the imputation step.

# AGE: looks normal except that there are ~7 thousand patients with age 91 because of the age procedure for privacy protection (age >=91 --> 91).

# BMI: the max value is not reasonable --> correspond to an outlier. We remove patients with BMI > 60 or BMI < 10.
bmi_too_high = patient_concept_mat_df['BMI'] > 60
bmi_too_low = patient_concept_mat_df['BMI'] < 10
print('Num of patients with BMI > 60: %d' % np.sum(bmi_too_high))
print('Num of patients with BMI < 10: %d' % np.sum(bmi_too_low))
patient_concept_mat_df = patient_concept_mat_df[~(bmi_too_high | bmi_too_low)]

# DIASTOLIC: the max value is reasonable but the min value isn't --> correspond to an outlier. We remove patients with Diastolic pressure < 30.
dias_too_low = patient_concept_mat_df['DIASTOLIC'] < 30
print('Num of patients with DIASTOLIC pressure < 30: %d' % np.sum(dias_too_low))
patient_concept_mat_df = patient_concept_mat_df[~dias_too_low]

# SYSTOLIC: the min/max value is reasonable given the cohort was admitted to ICU/ED.

# if DIASTOLIC >= SYSTOLIC (both present), remove the patient.
# Two defects are fixed here: the systolic values used to be read from the DIASTOLIC
# column, and the NaN test was inverted, so no violation could ever be detected.
bp_violation = patient_concept_mat_df['DIASTOLIC'] >= patient_concept_mat_df['SYSTOLIC']
num_bp_violations = int(np.sum(bp_violation))
if num_bp_violations > 0:
    patient_concept_mat_df = patient_concept_mat_df[~bp_violation]
    print("%d patients with wrong BP relationship removed." % num_bp_violations)
else:
    print("No BP violations.")

# Re-number the rows once, after all removals, so the index is contiguous again.
patient_concept_mat_df = patient_concept_mat_df.reset_index(drop=True)


for i in range(len(cols)):
    plt.hist(list(patient_concept_mat_df[cols[i]]), density=True, bins=80)
    plt.ylabel('Probability')
    plt.xlabel(cols[i])
    plt.show()
    

## 2. Handle missing values

In [None]:
## Non-binary features only (AGE, BMI, DIASTOLIC, SYSTOLIC).
# Column-wise missing rate.
missing_ratio = {
    col_name: patient_concept_mat_df[col_name].isna().sum() / len(patient_concept_mat_df)
    for col_name in cols
}
print(missing_ratio)

# Missing value imputation strategy.
# The missing rate of the three measures is high, so dropping these columns is an option.
# This demo instead imputes naively: every missing entry is replaced by a value drawn at
# random from the observed values of the same column. A ML model predicting the measures
# would be another option.
cols = [col_name for col_name in cols if missing_ratio[col_name] > 0]
random.seed(4)
for col_name in cols:
    original_values = list(patient_concept_mat_df[col_name])
    observed_values = [value for value in original_values if not np.isnan(value)]
    # One random draw per missing entry, in row order.
    patient_concept_mat_df[col_name] = [
        random.choices(observed_values)[0] if np.isnan(value) else value
        for value in original_values
    ]

for col_name in cols:
    plt.hist(list(patient_concept_mat_df[col_name]), density=True, bins=80)
    plt.ylabel('Probability')
    plt.xlabel(col_name)
    plt.show()


## 3. Normalize continuous columns

In [None]:
## Min-max scale the continuous features into [0,1].
# min_max_log keeps [min, max] per feature so the scaling can be undone later.
min_max_log = {}
cols = ['AGE', 'BMI', 'DIASTOLIC', 'SYSTOLIC']
for col_name in cols:
    raw_values = np.array(patient_concept_mat_df[col_name])
    lowest, highest = np.min(raw_values), np.max(raw_values)
    min_max_log[col_name] = [lowest, highest]
    scaled_values = (raw_values - lowest) / (highest - lowest)
    patient_concept_mat_df[col_name] = list(scaled_values)
print(min_max_log)
np.save('/data/chao/syn_mimic/preprocessing/min_max_log.npy', min_max_log)


In [None]:
# Display the frame after the preceding preprocessing steps.
patient_concept_mat_df

## 4. Remove extremely rare features

In [None]:
## Remove the binary columns with less than x 1s.
cols = list(patient_concept_mat_df.columns)
cols.remove('AGE')
cols.remove('BMI')
cols.remove('DIASTOLIC')
cols.remove('SYSTOLIC')
col_to_remove = []
for col_name in cols:
    col_value = np.array(patient_concept_mat_df[col_name])
    if np.sum(col_value)/len(col_value) < 0.00005:
        col_to_remove.append(col_name)
for col_name in col_to_remove:
    cols.remove(col_name)
    
cols = ['WHITE', 'BLACK', 'ASIAN', 'HISPANIC', 'UN', 'OTHER', 'DIE_1y', 'GENDER'] + cols[8:]

patient_concept_mat_df_ = patient_concept_mat_df[cols + ['AGE','BMI','DIASTOLIC','SYSTOLIC']]
patient_concept_mat_df_.to_csv('/data/chao/syn_mimic/preprocessing/preprocessed_training_data.csv', index=False)

In [None]:
# Display the frame that was written to preprocessed_training_data.csv.
patient_concept_mat_df_

In [None]:
# Number of patients in each race category (sum of each one-hot column).
for race_col in ['WHITE', 'BLACK', 'ASIAN', 'HISPANIC', 'UN', 'OTHER']:
    print(np.sum(patient_concept_mat_df_[race_col]))

In [None]:
# Display the list of kept binary column names (continuous columns are not part of `cols`).
cols

In [None]:
# Shuffle the rows with a fixed seed and split 70% / 30% into training and testing sets.
# An explicit copy is shuffled: `.values` may hand back a view of the frame's data, so
# an in-place shuffle could reorder the DataFrame itself or fail on a read-only view.
all_data = patient_concept_mat_df_.to_numpy(copy=True)
np.random.seed(0)
np.random.shuffle(all_data)
split_at = int(len(all_data)*0.7)
training_data = all_data[:split_at]
testing_data = all_data[split_at:]
training_data_df = pd.DataFrame(data=training_data, columns=list(patient_concept_mat_df_.columns))
testing_data_df = pd.DataFrame(data=testing_data, columns=list(patient_concept_mat_df_.columns))
# Outcome rate in each split, as a quick check that the two are comparable.
print(np.sum(training_data_df['DIE_1y'])/len(training_data_df))
print(np.sum(testing_data_df['DIE_1y'])/len(testing_data_df))

training_data_df.to_csv('/data/chao/syn_mimic/preprocessing/normalized_training_data.csv', index=False)
testing_data_df.to_csv('/data/chao/syn_mimic/preprocessing/normalized_testing_data.csv', index=False)

# Undo the min-max scaling of the continuous columns (x * max + (1 - x) * min) and save
# the splits again in original units.
min_max_log = np.load('/data/chao/syn_mimic/preprocessing/min_max_log.npy', allow_pickle=True).item()
for key, min_max in min_max_log.items():
    min_, max_ = min_max[0], min_max[1]
    for split_df in (training_data_df, testing_data_df):
        col_values = np.array(split_df[key])
        split_df[key] = (1 - col_values)*min_ + col_values*max_
    
training_data_df.to_csv('/data/chao/syn_mimic/preprocessing/original_training_data.csv', index=False)
testing_data_df.to_csv('/data/chao/syn_mimic/preprocessing/original_testing_data.csv', index=False)


In [None]:
# Display the training split (continuous columns back in original units).
training_data_df

In [None]:
# Display the testing split (continuous columns back in original units).
testing_data_df