In [1]:
## Authors:     Michael Quillen & Max Parker; M.D. Candidates @ University of Florida
## Project:     'Towards prediction of CRC in patients under the age of 50'
## PIs:         Dr. Thomas George, MD; Dr. Jiang Bian, PhD
## 
## **base code adapted from Dr. Xi Yang, PhD project: 'Early Prediction of Alzheimer's Disease and Related Dementias
##                                                          Using Electronic Health Records'

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

In [3]:
import time
import os
import sys
import re 
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, wait
from functools import partial
import mmap
import json
import pickle as pkl
import gc
import logging
from tqdm import tqdm
CPU_COUNT=9

In [4]:
def pkl_dump(data, file):
    """Pickle ``data`` to the path ``file`` using pickle's default protocol.

    The target is opened in binary write mode, so an existing file is overwritten.
    """
    with open(file, "wb") as sink:
        pkl.dump(data, sink)

        
def pkl_load(file):
    """Return the object unpickled from the path ``file``.

    Unpickling can execute arbitrary code, so only use this on files the project wrote itself.
    """
    with open(file, "rb") as source:
        return pkl.load(source)


def pkl4_dump(data, file):
    """Pickle ``data`` to ``file`` with the highest protocol this interpreter supports.

    Despite the name, the protocol is ``pkl.HIGHEST_PROTOCOL`` rather than a fixed 4.
    """
    with open(file, "wb") as sink:
        pkl.dump(data, sink, protocol=pkl.HIGHEST_PROTOCOL)

        
def pkl4_load(file):
    """Return the object unpickled from the path ``file`` (counterpart of ``pkl4_dump``).

    Unpickling can execute arbitrary code, so only use this on files the project wrote itself.
    """
    with open(file, "rb") as source:
        return pkl.load(source)

# Case_Control

In [5]:
# Work from the `psm_result` directory: the matched case/control CSVs read in the
# next cells are opened with paths relative to it.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/psm_result/')
os.getcwd()
#case_control = pd.read_csv('DATA_v2/matched_case_control.csv')
#case_control[case_control.has_CRC ==1].head()

'/mnt/data1/chong/2021-CRC/updated_data/psm_result'

In [6]:
# Matched case/control table for the CC_01yr cohort.
case_control_CC_01yr = pd.read_csv("matched_case_control_CC_01yr.csv")
# Filters the rows with has_CRC == 1; this is not the cell's last expression, so the
# result is computed and discarded rather than displayed.
case_control_CC_01yr[case_control_CC_01yr.has_CRC ==1].head()
print(case_control_CC_01yr.shape)

(4103, 22)


In [7]:
# Matched case/control table for the CC_3yr cohort.
case_control_CC_3yr = pd.read_csv("matched_case_control_CC_3yr.csv")
# Filters the rows with has_CRC == 1; this is not the cell's last expression, so the
# result is computed and discarded rather than displayed.
case_control_CC_3yr[case_control_CC_3yr.has_CRC ==1].head()
print(case_control_CC_3yr.shape)

(2236, 22)


In [8]:
# Matched case/control table for the CC_5yr cohort.
case_control_CC_5yr = pd.read_csv("matched_case_control_CC_5yr.csv")
# Filters the rows with has_CRC == 1; this is not the cell's last expression, so the
# result is computed and discarded rather than displayed.
case_control_CC_5yr[case_control_CC_5yr.has_CRC ==1].head()
print(case_control_CC_5yr.shape)

(892, 22)


In [9]:
# One patient-keyed aggregate dict per matched cohort (01yr / 3yr / 5yr); the
# helper functions defined in later cells fill these in place.
global_data_CC_01yr, global_data_CC_3yr, global_data_CC_5yr = {}, {}, {}


In [10]:
# function which puts cases and control into global data

def f_case_control01(x):
    """Add one matched case/control row to ``global_data_CC_01yr``.

    ``x`` is a row exposing ``PATID``, ``has_CRC``, ``age_index``, ``index_CRC`` and
    ``ref_index_date``. Any existing entry for the same ``PATID`` is replaced by a new
    dict holding ``index_date``, ``age`` and ``has_CRC``. Returns ``None``.
    """
    patient_id = x['PATID']
    is_case = x['has_CRC']
    age_at_index = x['age_index']
    # Cases are anchored on their CRC index date, all other rows on the reference date.
    date_field = 'index_CRC' if is_case else 'ref_index_date'
    global_data_CC_01yr[patient_id] = {
        # Keep only the text before the first space (drops a trailing time part, if any).
        'index_date': str(x[date_field]).split(" ")[0],
        'age': age_at_index,
        'has_CRC': is_case,
    }

def f_case_control3(x):
    """Add one matched case/control row to ``global_data_CC_3yr``.

    ``x`` is a row exposing ``PATID``, ``has_CRC``, ``age_index``, ``index_CRC`` and
    ``ref_index_date``. Any existing entry for the same ``PATID`` is replaced by a new
    dict holding ``index_date``, ``age`` and ``has_CRC``. Returns ``None``.
    """
    patient_id = x['PATID']
    is_case = x['has_CRC']
    age_at_index = x['age_index']
    # Cases are anchored on their CRC index date, all other rows on the reference date.
    date_field = 'index_CRC' if is_case else 'ref_index_date'
    global_data_CC_3yr[patient_id] = {
        # Keep only the text before the first space (drops a trailing time part, if any).
        'index_date': str(x[date_field]).split(" ")[0],
        'age': age_at_index,
        'has_CRC': is_case,
    }

def f_case_control5(x):
    """Add one matched case/control row to ``global_data_CC_5yr``.

    ``x`` is a row exposing ``PATID``, ``has_CRC``, ``age_index``, ``index_CRC`` and
    ``ref_index_date``. Any existing entry for the same ``PATID`` is replaced by a new
    dict holding ``index_date``, ``age`` and ``has_CRC``. Returns ``None``.
    """
    patient_id = x['PATID']
    is_case = x['has_CRC']
    age_at_index = x['age_index']
    # Cases are anchored on their CRC index date, all other rows on the reference date.
    date_field = 'index_CRC' if is_case else 'ref_index_date'
    global_data_CC_5yr[patient_id] = {
        # Keep only the text before the first space (drops a trailing time part, if any).
        'index_date': str(x[date_field]).split(" ")[0],
        'age': age_at_index,
        'has_CRC': is_case,
    }

In [11]:
# puts cases and controls into global data
# Switch to the aggregate directory first so the pickle below is written there.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
os.getcwd()
# The helpers fill the global cohort dicts as a side effect and have no return
# statement, so x01 / x3 / x5 only hold the unused per-row results of apply.
x01 = case_control_CC_01yr.apply(f_case_control01, axis=1)
x3 = case_control_CC_3yr.apply(f_case_control3, axis=1)
x5 = case_control_CC_5yr.apply(f_case_control5, axis=1)

# Checkpoint the 01yr cohort; the 3yr and 5yr cohorts are saved in the next two cells.
pkl_dump(global_data_CC_01yr, "aggCC01.pkl")
#global_data_CC_01yr['11e750610fb3ce06bc3a0050569ea8fb'], len(global_data_CC_01yr)

In [12]:
# Checkpoint the 3yr cohort aggregate (path is relative to the current working directory).
pkl_dump(global_data_CC_3yr, "aggCC3.pkl")
#global_data_CC_3yr['11e750610fb3ce06bc3a0050569ea8fb'], len(global_data_CC_3yr)

In [13]:
# Checkpoint the 5yr cohort aggregate (path is relative to the current working directory).
pkl_dump(global_data_CC_5yr, "aggCC5.pkl")
#global_data_CC_5yr['11e750610fb3ce06bc3a0050569ea8fb'], len(global_data_CC_5yr)

# Data Preprocessing

#### ENCOUNTERS

In [14]:
# Load only the patient id and admission date of every encounter, both as strings.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs/')
fields = ['PATID','ADMIT_DATE']
df_enc = pd.read_csv("ENCOUNTER.csv", dtype =str,skipinitialspace = True, usecols = fields)
df_enc.head()

Unnamed: 0,PATID,ADMIT_DATE
0,11e75061a5e8c7c8a3fa0050569ea8fb,2013-04-19
1,11e7506100fa593488ed0050569ea8fb,2017-09-05
2,11e75061216d13e68cac0050569ea8fb,2014-02-15
3,11e7506101fb9c9e9be90050569ea8fb,2012-08-21
4,11e75060c0e2203e94360050569ea8fb,2018-08-02


In [15]:
# First encounter per patient: for each PATID, sort that patient's ADMIT_DATE strings
# ascending and keep the first one under the key "f_enc".
enc_dates = {
    patient_id: {"f_enc": visits.sort_values('ADMIT_DATE')['ADMIT_DATE'].iloc[0]}
    for patient_id, visits in df_enc.groupby('PATID')
}

#### DEMOGRAPHICS

In [16]:
# Demographics table, every column read as a string (path relative to the current directory).
df_dem = pd.read_csv("DEMOGRAPHIC.csv",dtype=str)

In [17]:
# Inspect the codes present in each demographic column and define which of them the
# f_dem* helpers collapse into the single unknown code 'UN'.
print(set(df_dem['SEX'])) # a missing SEX is stored as 'UN'
print(set(df_dem['HISPANIC'])) # NI, R => UN
print(set(df_dem['RACE'])) # '07', 'NI' => UN
et_map = {'NI', 'R'}  # HISPANIC codes rewritten to 'UN'
race_map = {'07', 'NI'}  # RACE codes rewritten to 'UN'

{'M', 'F'}
{'NI', 'Y', 'UN', 'N', 'OT', 'R'}
{'01', '02', '06', 'NI', 'UN', '07', 'OT', '04', '03', '05'}


In [18]:
# Reload the three cohort aggregates from their pickles in the aggregate directory, so
# the demographics step starts from the saved state rather than in-memory leftovers.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
global_data_CC_01yr = pkl_load("aggCC01.pkl")
global_data_CC_3yr = pkl_load("aggCC3.pkl")
global_data_CC_5yr = pkl_load("aggCC5.pkl")

In [19]:
# Count missing values in each demographic column used by the f_dem* helpers.
print(df_dem['HISPANIC'].isna().sum())
print(df_dem['SEX'].isna().sum())
print(df_dem['RACE'].isna().sum())

0
0
0


In [20]:
# function to fix issues with unknowns in demographics

def f_dem01(x):
    """Copy one demographics row onto its patient in ``global_data_CC_01yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. A missing ``SEX`` is stored
    as ``'UN'``; ``HISPANIC`` / ``RACE`` codes listed in ``et_map`` / ``race_map`` are
    stored as ``'UN'`` too. Values land under the keys ``SEX``, ``Hispanic``, ``Race``.
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_01yr:
        return
    record = global_data_CC_01yr[patient_id]
    record['SEX'] = 'UN' if pd.isna(x['SEX']) else x['SEX']
    record['Hispanic'] = 'UN' if x['HISPANIC'] in et_map else x['HISPANIC']
    record['Race'] = 'UN' if x['RACE'] in race_map else x['RACE']
        
def f_dem3(x):
    """Copy one demographics row onto its patient in ``global_data_CC_3yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. A missing ``SEX`` is stored
    as ``'UN'``; ``HISPANIC`` / ``RACE`` codes listed in ``et_map`` / ``race_map`` are
    stored as ``'UN'`` too. Values land under the keys ``SEX``, ``Hispanic``, ``Race``.
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_3yr:
        return
    record = global_data_CC_3yr[patient_id]
    record['SEX'] = 'UN' if pd.isna(x['SEX']) else x['SEX']
    record['Hispanic'] = 'UN' if x['HISPANIC'] in et_map else x['HISPANIC']
    record['Race'] = 'UN' if x['RACE'] in race_map else x['RACE']
        
def f_dem5(x):
    """Copy one demographics row onto its patient in ``global_data_CC_5yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. A missing ``SEX`` is stored
    as ``'UN'``; ``HISPANIC`` / ``RACE`` codes listed in ``et_map`` / ``race_map`` are
    stored as ``'UN'`` too. Values land under the keys ``SEX``, ``Hispanic``, ``Race``.
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_5yr:
        return
    record = global_data_CC_5yr[patient_id]
    record['SEX'] = 'UN' if pd.isna(x['SEX']) else x['SEX']
    record['Hispanic'] = 'UN' if x['HISPANIC'] in et_map else x['HISPANIC']
    record['Race'] = 'UN' if x['RACE'] in race_map else x['RACE']

In [21]:
# puts demographics data into global data by patient ID
# Each helper updates its cohort dict in place and has no return statement, so xx is
# only a throwaway holder for the per-row results of apply.
xx = df_dem.apply(f_dem01,axis=1)
xx = df_dem.apply(f_dem3, axis =1)
xx = df_dem.apply(f_dem5, axis=1)

#global_data_CC_01yr['11e750610fb3ce06bc3a0050569ea8fb'], len(global_data_CC_01yr)

In [22]:
# Checkpoint all three cohorts after the demographics step (overwrites the earlier pickles).
pkl_dump(global_data_CC_01yr, "aggCC01.pkl")
pkl_dump(global_data_CC_3yr, "aggCC3.pkl")
pkl_dump(global_data_CC_5yr, "aggCC5.pkl")

#### DIAGNOSES

In [23]:
# Load the ICD -> PheCode lookup table (all columns as strings), keep only the two
# columns needed and index the table by ICD code.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/02_MQ_Pre_Processing_Agg_Files/01_MQ_Mapping_Files')
icd2phewas = pd.read_csv("icd2phewasfinal.csv",dtype = str)
icd2phewas = icd2phewas[['ICD','PheCode']]
icd2phewas = icd2phewas.set_index('ICD')
icd2phewas.head()

Unnamed: 0_level_0,PheCode
ICD,Unnamed: 1_level_1
1.0,8.0
1.0,8.0
1.1,8.0
1.9,8.0
2.0,8.0


In [24]:
# Turn the ICD-indexed table into a plain dict {ICD code: PheCode} for per-row lookups.
# NOTE: icd2phewas is rebound from a DataFrame to a dict here, so this cell cannot be
# re-run on its own without first re-running the cell that builds the table.
icd2phewas = icd2phewas.PheCode.to_dict()
#idc2phewas = dict(zip(icd2phewas.ICD,icd2phewasi))

In [25]:
# Sanity check: size of the mapping and two sample lookups.
len(icd2phewas), icd2phewas['745'], icd2phewas['E11.40']

(98549, '747.11', '250.24')

In [26]:
# Reload the saved cohort aggregates, then give every patient an empty 'diag' dict;
# the f_dx* helpers later fill it as {date: [codes]}.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
global_data_CC_01yr = pkl_load("aggCC01.pkl")
global_data_CC_3yr = pkl_load("aggCC3.pkl")
global_data_CC_5yr = pkl_load("aggCC5.pkl")
for cohort in (global_data_CC_01yr, global_data_CC_3yr, global_data_CC_5yr):
    for patient in cohort.values():
        patient['diag'] = {}

In [27]:
# Open DIAGNOSIS.csv as a chunked reader (100000 rows per chunk, all columns as strings);
# the chunks are consumed by the loop in a later cell.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs/')
df_dx = pd.read_csv("DIAGNOSIS.csv", chunksize = 100000, dtype = str)

In [28]:
def f_dx01(x):
    """File one DIAGNOSIS row under its patient in ``global_data_CC_01yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. ``DX`` is replaced by its
    ``icd2phewas`` entry when it has one and kept unchanged otherwise, then appended to
    the list stored at ``['diag'][ADMIT_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_01yr:
        return
    visit_date = x['ADMIT_DATE']
    raw_code = x['DX']
    codes = list({icd2phewas[raw_code]}) if raw_code in icd2phewas else [raw_code]
    global_data_CC_01yr[patient_id]['diag'].setdefault(visit_date, []).extend(codes)
            
def f_dx3(x):
    """File one DIAGNOSIS row under its patient in ``global_data_CC_3yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. ``DX`` is replaced by its
    ``icd2phewas`` entry when it has one and kept unchanged otherwise, then appended to
    the list stored at ``['diag'][ADMIT_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_3yr:
        return
    visit_date = x['ADMIT_DATE']
    raw_code = x['DX']
    codes = list({icd2phewas[raw_code]}) if raw_code in icd2phewas else [raw_code]
    global_data_CC_3yr[patient_id]['diag'].setdefault(visit_date, []).extend(codes)
            
def f_dx5(x):
    """File one DIAGNOSIS row under its patient in ``global_data_CC_5yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. ``DX`` is replaced by its
    ``icd2phewas`` entry when it has one and kept unchanged otherwise, then appended to
    the list stored at ``['diag'][ADMIT_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_5yr:
        return
    visit_date = x['ADMIT_DATE']
    raw_code = x['DX']
    codes = list({icd2phewas[raw_code]}) if raw_code in icd2phewas else [raw_code]
    global_data_CC_5yr[patient_id]['diag'].setdefault(visit_date, []).extend(codes)

In [29]:
# Stream the diagnosis chunks from df_dx and file every row into all three cohorts.
# The helpers append to existing lists, so running this again over the same rows
# duplicates codes unless the 'diag' dicts are reset first.
# NOTE(review): df_dx is presumably a single-pass reader; re-running this cell without
# recreating it would then process nothing - confirm.
for each in df_dx:
    xx = each.apply(f_dx01, axis =1)
    yy = each.apply(f_dx3, axis =1)
    zz = each.apply(f_dx5, axis = 1)

In [30]:
# Checkpoint all three cohorts after the diagnosis step, in the aggregate directory.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')

pkl_dump(global_data_CC_01yr, "aggCC01.pkl")
pkl_dump(global_data_CC_3yr, "aggCC3.pkl")
pkl_dump(global_data_CC_5yr, "aggCC5.pkl")

#### PROCEDURE

In [31]:
# Load the procedure-code lookup used by the f_px* helpers and show its size plus one
# sample lookup. NOTE(review): presumably procedure code -> CCS category, judging by
# the names - confirm.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/02_MQ_Pre_Processing_Agg_Files/01_MQ_Mapping_Files')
proc2ccs = pkl_load("icd2ccs.pkl")
len(proc2ccs), proc2ccs['009B00Z']

(138930, '1')

In [32]:
# Peek at PROCEDURES.csv: read only the first 5-row chunk and keep it in x for inspection.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs/')
for each in pd.read_csv("PROCEDURES.csv", chunksize=5, dtype=str):
    x = each
    break

In [33]:
# Reload the three cohort aggregates from their pickles before adding procedures.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
global_data_CC_01yr = pkl_load("aggCC01.pkl")
global_data_CC_3yr = pkl_load("aggCC3.pkl")
global_data_CC_5yr = pkl_load("aggCC5.pkl")

In [34]:
# Give every patient in every cohort an empty 'proc' dict; the f_px* helpers later
# fill it as {date: [codes]}.
for cohort in (global_data_CC_01yr, global_data_CC_3yr, global_data_CC_5yr):
    for patient in cohort.values():
        patient['proc'] = {}

In [35]:
def f_px01(x):
    """File one PROCEDURES row under its patient in ``global_data_CC_01yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. ``PX`` is replaced by its
    ``proc2ccs`` entry when it has one and kept unchanged otherwise, then appended to
    the list stored at ``['proc'][PX_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_01yr:
        return
    proc_date = x['PX_DATE']
    raw_code = x['PX']
    codes = list({proc2ccs[raw_code]}) if raw_code in proc2ccs else [raw_code]
    global_data_CC_01yr[patient_id]['proc'].setdefault(proc_date, []).extend(codes)
            
def f_px3(x):
    """File one PROCEDURES row under its patient in ``global_data_CC_3yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. ``PX`` is replaced by its
    ``proc2ccs`` entry when it has one and kept unchanged otherwise, then appended to
    the list stored at ``['proc'][PX_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_3yr:
        return
    proc_date = x['PX_DATE']
    raw_code = x['PX']
    codes = list({proc2ccs[raw_code]}) if raw_code in proc2ccs else [raw_code]
    global_data_CC_3yr[patient_id]['proc'].setdefault(proc_date, []).extend(codes)

def f_px5(x):
    """File one PROCEDURES row under its patient in ``global_data_CC_5yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. ``PX`` is replaced by its
    ``proc2ccs`` entry when it has one and kept unchanged otherwise, then appended to
    the list stored at ``['proc'][PX_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_5yr:
        return
    proc_date = x['PX_DATE']
    raw_code = x['PX']
    codes = list({proc2ccs[raw_code]}) if raw_code in proc2ccs else [raw_code]
    global_data_CC_5yr[patient_id]['proc'].setdefault(proc_date, []).extend(codes)

In [36]:
# Open PROCEDURES.csv as a chunked reader (100000 rows per chunk, all columns as strings).
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs/')
df_proc = pd.read_csv("PROCEDURES.csv", chunksize=100000, dtype=str)

In [37]:
# Move to the aggregate directory (where the next cell writes its pickles), then stream
# the procedure chunks and file every row into all three cohorts. The helpers append,
# so re-running over the same rows duplicates codes unless 'proc' is reset first.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
for each in df_proc:
    xx = each.apply(f_px01, axis=1)
    xx = each.apply(f_px3, axis=1)
    xx = each.apply(f_px5, axis=1)

In [38]:
# Checkpoint all three cohorts after the procedure step.
pkl_dump(global_data_CC_01yr, "aggCC01.pkl")
pkl_dump(global_data_CC_3yr, "aggCC3.pkl")
pkl_dump(global_data_CC_5yr, "aggCC5.pkl")

#### MEDICATION

In [39]:
# Load the lookup the medication helpers use to expand an RxNorm code into a list of
# codes. NOTE(review): presumably RxCUI -> ingredient codes, judging by the file name.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/02_MQ_Pre_Processing_Agg_Files/01_MQ_Mapping_Files')
r2i = pkl_load("rxcode2ingred.pkl")

In [42]:
# Peek at PRESCRIBING.csv: read only the first 5-row chunk and keep it in x for inspection.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs')
for each in pd.read_csv("PRESCRIBING.csv",chunksize=5, dtype =str):
    x = each
    break

# Reload the three cohort aggregates from their pickles before adding medications.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
global_data_CC_01yr = pkl_load("aggCC01.pkl")
global_data_CC_3yr = pkl_load("aggCC3.pkl")
global_data_CC_5yr = pkl_load("aggCC5.pkl")

In [43]:
# Give every 01yr patient empty dicts for prescribed ('med_p') and dispensed ('med_d')
# medications; the px* / fx_disp* helpers later fill them as {date: [codes]}.
for patient in global_data_CC_01yr.values():
    patient['med_p'] = {}
    patient['med_d'] = {}

In [44]:
# Give every 3yr patient empty dicts for prescribed ('med_p') and dispensed ('med_d')
# medications; the px* / fx_disp* helpers later fill them as {date: [codes]}.
for patient in global_data_CC_3yr.values():
    patient['med_p'] = {}
    patient['med_d'] = {}

In [45]:
# Give every 5yr patient empty dicts for prescribed ('med_p') and dispensed ('med_d')
# medications; the px* / fx_disp* helpers later fill them as {date: [codes]}.
for patient in global_data_CC_5yr.values():
    patient['med_p'] = {}
    patient['med_d'] = {}

In [46]:
# read in prescribing data
# px_df is a chunked reader (100000 rows per chunk, all columns as strings); ndc2rxcui
# starts empty and is filled by the px* helpers with RAW_RX_NDC -> RXNORM_CUI pairs.

os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs')
px_df = pd.read_csv("PRESCRIBING.csv", chunksize=100000, dtype=str)
ndc2rxcui = dict()

In [47]:
# functions that add prescribing data to the aggregate cohort data

def px01(x):
    """File one PRESCRIBING row into ``global_data_CC_01yr`` and record its NDC -> RxCUI pair.

    Rows without an ``RXNORM_CUI`` are ignored. For every other row the
    ``RAW_RX_NDC`` -> ``RXNORM_CUI`` pair is stored in ``ndc2rxcui`` (for all patients,
    not only cohort members) unless the NDC itself is missing. For cohort patients the
    code is expanded through ``r2i`` when it has an entry there and kept as is
    otherwise, then appended to the list at ``['med_p'][date]``, where ``date`` is
    ``RX_ORDER_DATE`` or, when that is missing, ``RX_START_DATE``.
    """
    rxcui = x['RXNORM_CUI']
    if pd.isna(rxcui):
        return
    ndc = x['RAW_RX_NDC']
    # Never store a missing NDC: a NaN key would let dispensing rows that also lack an
    # NDC resolve to whichever RxCUI happened to be written under it last.
    if not pd.isna(ndc):
        ndc2rxcui[ndc] = rxcui
    pid = x['PATID']
    if pid not in global_data_CC_01yr:
        return
    date = x['RX_ORDER_DATE']
    if pd.isna(date):
        date = x['RX_START_DATE']
    codes = list(r2i[rxcui]) if rxcui in r2i else [rxcui]
    global_data_CC_01yr[pid]['med_p'].setdefault(date, []).extend(codes)

def px3(x):
    """File one PRESCRIBING row into ``global_data_CC_3yr`` and record its NDC -> RxCUI pair.

    Rows without an ``RXNORM_CUI`` are ignored. For every other row the
    ``RAW_RX_NDC`` -> ``RXNORM_CUI`` pair is stored in ``ndc2rxcui`` (for all patients,
    not only cohort members) unless the NDC itself is missing. For cohort patients the
    code is expanded through ``r2i`` when it has an entry there and kept as is
    otherwise, then appended to the list at ``['med_p'][date]``, where ``date`` is
    ``RX_ORDER_DATE`` or, when that is missing, ``RX_START_DATE``.
    """
    rxcui = x['RXNORM_CUI']
    if pd.isna(rxcui):
        return
    ndc = x['RAW_RX_NDC']
    # Never store a missing NDC: a NaN key would let dispensing rows that also lack an
    # NDC resolve to whichever RxCUI happened to be written under it last.
    if not pd.isna(ndc):
        ndc2rxcui[ndc] = rxcui
    pid = x['PATID']
    if pid not in global_data_CC_3yr:
        return
    date = x['RX_ORDER_DATE']
    if pd.isna(date):
        date = x['RX_START_DATE']
    codes = list(r2i[rxcui]) if rxcui in r2i else [rxcui]
    global_data_CC_3yr[pid]['med_p'].setdefault(date, []).extend(codes)

def px5(x):
    """File one PRESCRIBING row into ``global_data_CC_5yr`` and record its NDC -> RxCUI pair.

    Rows without an ``RXNORM_CUI`` are ignored. For every other row the
    ``RAW_RX_NDC`` -> ``RXNORM_CUI`` pair is stored in ``ndc2rxcui`` (for all patients,
    not only cohort members) unless the NDC itself is missing. For cohort patients the
    code is expanded through ``r2i`` when it has an entry there and kept as is
    otherwise, then appended to the list at ``['med_p'][date]``, where ``date`` is
    ``RX_ORDER_DATE`` or, when that is missing, ``RX_START_DATE``.
    """
    rxcui = x['RXNORM_CUI']
    if pd.isna(rxcui):
        return
    ndc = x['RAW_RX_NDC']
    # Never store a missing NDC: a NaN key would let dispensing rows that also lack an
    # NDC resolve to whichever RxCUI happened to be written under it last.
    if not pd.isna(ndc):
        ndc2rxcui[ndc] = rxcui
    pid = x['PATID']
    if pid not in global_data_CC_5yr:
        return
    date = x['RX_ORDER_DATE']
    if pd.isna(date):
        date = x['RX_START_DATE']
    codes = list(r2i[rxcui]) if rxcui in r2i else [rxcui]
    global_data_CC_5yr[pid]['med_p'].setdefault(date, []).extend(codes)

In [48]:
# Stream the prescribing chunks and file every row into all three cohorts. Each helper
# also writes the row's NDC -> RxCUI pair into the shared ndc2rxcui dict. The helpers
# append, so re-running over the same rows duplicates codes unless 'med_p' is reset first.
for each in px_df:
    xx = each.apply(px01, axis=1)
    xx = each.apply(px3, axis=1)
    xx = each.apply(px5, axis=1)

In [49]:
# Checkpoint all three cohorts after the prescribing step, in the aggregate directory.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
pkl_dump(global_data_CC_01yr, "aggCC01.pkl")
pkl_dump(global_data_CC_3yr, "aggCC3.pkl")
pkl_dump(global_data_CC_5yr, "aggCC5.pkl")

#### DISPENSING

In [50]:
# view dispensing data
# Only the first 5-row chunk is read; it is kept in x for inspection.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs')
for each in pd.read_csv("DISPENSING.csv", chunksize=5, dtype=str):
    x = each
    break

In [51]:
# Merge the saved NDC -> RxCUI table into the pairs collected from PRESCRIBING: entries
# already in ndc2rxcui win, missing ones are added, and the merged map is saved.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/02_MQ_Pre_Processing_Agg_Files/01_MQ_Mapping_Files')
n2r = pkl_load("ndc2rxcui.pkl")

for k, v in n2r.items():
    ndc2rxcui.setdefault(k, v)

pkl_dump(ndc2rxcui, "ndc2rxcuifinalCC.pkl")

In [52]:
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs')

# Chunked reader over DISPENSING.csv (1000000 rows per chunk, all columns as strings).
df_dp = pd.read_csv("DISPENSING.csv", chunksize=1000000,dtype =str)

# Filled by unmapndcs with the dispensed NDCs that have no entry in ndc2rxcui.
unmapped_ndc = set()

def unmapndcs(x):
    """Record the row's ``NDC`` in ``unmapped_ndc`` when ``ndc2rxcui`` cannot resolve it.

    An NDC counts as mapped when either the code itself or the code prefixed with
    ``'0'`` is a key of ``ndc2rxcui``. Rows with a missing NDC are skipped: they carry
    no code to report, and prefixing a non-string value with ``'0'`` raises ``TypeError``.
    """
    ndc = x['NDC']
    if pd.isna(ndc):
        return
    if ndc not in ndc2rxcui and '0' + ndc not in ndc2rxcui:
        unmapped_ndc.add(ndc)

# Scan every dispensing chunk and collect the unmapped NDCs into unmapped_ndc.
for each in df_dp:
    each.apply(unmapndcs,axis =1)

In [53]:
# Reload the three cohort aggregates from their pickles before adding dispensing data.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
global_data_CC_01yr = pkl_load("aggCC01.pkl")
global_data_CC_3yr = pkl_load("aggCC3.pkl")
global_data_CC_5yr = pkl_load("aggCC5.pkl")

In [54]:
def fx_disp01(x):
    """File one DISPENSING row under its patient in ``global_data_CC_01yr``.

    Rows whose ``PATID`` is not in the cohort are ignored, as are rows with a missing
    ``NDC`` and rows whose NDC cannot be resolved. The NDC is resolved through
    ``ndc2rxcui``, first as is and then prefixed with ``'0'``. The resulting code is
    expanded through ``r2i`` when it has an entry there and kept as is otherwise, then
    appended to the list at ``['med_d'][DISPENSE_DATE]`` (created on first use).
    """
    pid = x['PATID']
    if pid not in global_data_CC_01yr:
        return
    ndc = x['NDC']
    # A missing NDC is not a string: prefixing it with '0' raises TypeError, and it
    # could also hit a stray NaN key in ndc2rxcui, so skip such rows explicitly.
    if pd.isna(ndc):
        return
    padded = '0' + ndc
    if ndc in ndc2rxcui:
        rxcui = ndc2rxcui[ndc]
    elif padded in ndc2rxcui:
        rxcui = ndc2rxcui[padded]
    else:
        return
    date = x['DISPENSE_DATE']
    codes = list(r2i[rxcui]) if rxcui in r2i else [rxcui]
    global_data_CC_01yr[pid]['med_d'].setdefault(date, []).extend(codes)

def fx_disp3(x):
    """File one DISPENSING row under its patient in ``global_data_CC_3yr``.

    Rows whose ``PATID`` is not in the cohort are ignored, as are rows with a missing
    ``NDC`` and rows whose NDC cannot be resolved. The NDC is resolved through
    ``ndc2rxcui``, first as is and then prefixed with ``'0'``. The resulting code is
    expanded through ``r2i`` when it has an entry there and kept as is otherwise, then
    appended to the list at ``['med_d'][DISPENSE_DATE]`` (created on first use).
    """
    pid = x['PATID']
    if pid not in global_data_CC_3yr:
        return
    ndc = x['NDC']
    # A missing NDC is not a string: prefixing it with '0' raises TypeError, and it
    # could also hit a stray NaN key in ndc2rxcui, so skip such rows explicitly.
    if pd.isna(ndc):
        return
    padded = '0' + ndc
    if ndc in ndc2rxcui:
        rxcui = ndc2rxcui[ndc]
    elif padded in ndc2rxcui:
        rxcui = ndc2rxcui[padded]
    else:
        return
    date = x['DISPENSE_DATE']
    codes = list(r2i[rxcui]) if rxcui in r2i else [rxcui]
    global_data_CC_3yr[pid]['med_d'].setdefault(date, []).extend(codes)

def fx_disp5(x):
    """File one DISPENSING row under its patient in ``global_data_CC_5yr``.

    Rows whose ``PATID`` is not in the cohort are ignored, as are rows with a missing
    ``NDC`` and rows whose NDC cannot be resolved. The NDC is resolved through
    ``ndc2rxcui``, first as is and then prefixed with ``'0'``. The resulting code is
    expanded through ``r2i`` when it has an entry there and kept as is otherwise, then
    appended to the list at ``['med_d'][DISPENSE_DATE]`` (created on first use).
    """
    pid = x['PATID']
    if pid not in global_data_CC_5yr:
        return
    ndc = x['NDC']
    # A missing NDC is not a string: prefixing it with '0' raises TypeError, and it
    # could also hit a stray NaN key in ndc2rxcui, so skip such rows explicitly.
    if pd.isna(ndc):
        return
    padded = '0' + ndc
    if ndc in ndc2rxcui:
        rxcui = ndc2rxcui[ndc]
    elif padded in ndc2rxcui:
        rxcui = ndc2rxcui[padded]
    else:
        return
    date = x['DISPENSE_DATE']
    codes = list(r2i[rxcui]) if rxcui in r2i else [rxcui]
    global_data_CC_5yr[pid]['med_d'].setdefault(date, []).extend(codes)
            
            

In [55]:
# Re-open DISPENSING.csv as a fresh chunked reader (df_dp was already iterated by the
# unmapped-NDC scan) and file every row into all three cohorts.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs')
df_dp = pd.read_csv("DISPENSING.csv", chunksize=1000000,dtype =str)
for each in df_dp:
    xx = each.apply(fx_disp01, axis=1)
    xx = each.apply(fx_disp3, axis=1)
    xx = each.apply(fx_disp5, axis=1)

In [56]:
# Checkpoint all three cohorts after the dispensing step, in the aggregate directory.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
pkl_dump(global_data_CC_01yr, "aggCC01.pkl")
pkl_dump(global_data_CC_3yr, "aggCC3.pkl")
pkl_dump(global_data_CC_5yr, "aggCC5.pkl")

#### VITALS

In [57]:
# Directory holding the pre-processed vitals pickle loaded in the next cell.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/02_MQ_Pre_Processing_Agg_Files')

In [58]:
# Pre-processed vitals table (one PATID column plus the *_mean columns shown below).
df_vitX = pkl_load("vitalspro.pkl")

In [59]:
# Show the available columns of the vitals table.
df_vitX.columns

Index(['PATID', 'BMI_mean', 'DIASTOLIC_mean', 'SYSTOLIC_mean'], dtype='object')

In [60]:
# Vitals columns that the fx_v* helpers copy for each patient.
vital_cols = ['BMI_mean', 'DIASTOLIC_mean', 'SYSTOLIC_mean']

In [61]:
# One {PATID: {vital column: value}} store per cohort; defaultdict(dict) creates the
# inner dict the first time a patient id is accessed.
d_l_v01 = defaultdict(dict)
d_l_v3 = defaultdict(dict)
d_l_v5 = defaultdict(dict)

In [62]:
def fx_v01(x):
    """Copy the ``vital_cols`` values of one vitals row into ``d_l_v01`` under its ``PATID``.

    The row is stored whether or not the patient belongs to a cohort; a later row for
    the same ``PATID`` overwrites the values column by column.
    """
    patient_id = x['PATID']
    for column in vital_cols:
        d_l_v01[patient_id][column] = x[column]
    
def fx_v3(x):
    """Copy the ``vital_cols`` values of one vitals row into ``d_l_v3`` under its ``PATID``.

    The row is stored whether or not the patient belongs to a cohort; a later row for
    the same ``PATID`` overwrites the values column by column.
    """
    patient_id = x['PATID']
    for column in vital_cols:
        d_l_v3[patient_id][column] = x[column]

def fx_v5(x):
    """Copy the ``vital_cols`` values of one vitals row into ``d_l_v5`` under its ``PATID``.

    The row is stored whether or not the patient belongs to a cohort; a later row for
    the same ``PATID`` overwrites the values column by column.
    """
    patient_id = x['PATID']
    for column in vital_cols:
        d_l_v5[patient_id][column] = x[column]

In [63]:
# Fill the three vitals stores. All three helpers read the same table and do not
# filter by cohort, so the three dicts end up with the same content.
xx = df_vitX.apply(fx_v01, axis=1)
yy = df_vitX.apply(fx_v3, axis=1)
zz = df_vitX.apply(fx_v5, axis=1)

In [64]:
# Save the vitals stores as their own pickles in the aggregate directory; they are kept
# separate from the aggCC*.pkl cohort aggregates.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
pkl_dump(d_l_v01, "d_l_v_CC_01.pkl")
pkl_dump(d_l_v3, "d_l_v_CC_3.pkl")
pkl_dump(d_l_v5, "d_l_v_CC_5.pkl")

#### LABS

In [65]:
# Reload the three cohort aggregates from their pickles before adding lab results.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')

global_data_CC_01yr = pkl_load("aggCC01.pkl")
global_data_CC_3yr = pkl_load("aggCC3.pkl")
global_data_CC_5yr = pkl_load("aggCC5.pkl")

In [66]:
# Give every patient in every cohort an empty 'lab' dict; the f_lv* helpers later fill
# it as {date: [LOINC codes]}.
for cohort in (global_data_CC_01yr, global_data_CC_3yr, global_data_CC_5yr):
    for patient in cohort.values():
        patient['lab'] = {}

In [67]:
def f_lv01(x):
    """File one lab-result row under its patient in ``global_data_CC_01yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. The row's ``LAB_LOINC`` is
    appended to the list stored at ``['lab'][RESULT_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_01yr:
        return
    result_date = x['RESULT_DATE']
    loinc = x['LAB_LOINC']
    global_data_CC_01yr[patient_id]['lab'].setdefault(result_date, []).append(loinc)
            
def f_lv3(x):
    """File one lab-result row under its patient in ``global_data_CC_3yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. The row's ``LAB_LOINC`` is
    appended to the list stored at ``['lab'][RESULT_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_3yr:
        return
    result_date = x['RESULT_DATE']
    loinc = x['LAB_LOINC']
    global_data_CC_3yr[patient_id]['lab'].setdefault(result_date, []).append(loinc)
            
def f_lv5(x):
    """File one lab-result row under its patient in ``global_data_CC_5yr``.

    Rows whose ``PATID`` is not in the cohort are ignored. The row's ``LAB_LOINC`` is
    appended to the list stored at ``['lab'][RESULT_DATE]`` (created on first use).
    """
    patient_id = x['PATID']
    if patient_id not in global_data_CC_5yr:
        return
    result_date = x['RESULT_DATE']
    loinc = x['LAB_LOINC']
    global_data_CC_5yr[patient_id]['lab'].setdefault(result_date, []).append(loinc)

In [68]:
# Load the complete lab-result table in one go, every column as a string.
os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs')
labs = pd.read_csv("LAB_RESULT_CM.csv",dtype= str)

In [69]:
# Collapse ABN_IND to a single abnormal flag and keep only the abnormal results.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
NOT_ABNORMAL_CODES = ['NI', 'UN', 'NL', 'OT']    # blanked, then dropped by the filter below
ABNORMAL_CODES = ['AH', 'AL', 'CR', 'CH', 'CL']  # every abnormal variant is recoded to 'AB'
labs['ABN_IND'] = (
    labs['ABN_IND']
    .replace(NOT_ABNORMAL_CODES, np.nan)
    .replace(ABNORMAL_CODES, 'AB')
)

# Rows whose ABN_IND is missing (originally or after the recode) are discarded.
labs = labs[labs['ABN_IND'].notna()]

In [72]:
# Write the filtered lab table to ablabsCC.csv in the aggregate directory, then read it
# back as a chunked reader (1000000 rows per chunk, all columns as strings).
# NOTE: labs is rebound from the DataFrame to the reader here, so this cell cannot be
# re-run on its own without first rebuilding the DataFrame in the cells above.
os.chdir('/mnt/data1/chong/2021-CRC/updated_data/agg_files')
labs.to_csv("ablabsCC.csv")
labs = pd.read_csv("ablabsCC.csv", chunksize=1000000,dtype =str)

In [73]:
# Stream the filtered lab chunks and file every row into all three cohorts. The helpers
# append, so re-running over the same rows duplicates codes unless 'lab' is reset first.
for each in labs:
    xx = each.apply(f_lv01, axis=1)
    xx = each.apply(f_lv3, axis=1)
    xx = each.apply(f_lv5, axis=1)

In [74]:
# Checkpoint all three cohorts after the lab step (written to the current directory).
pkl_dump(global_data_CC_01yr, "aggCC01.pkl")
pkl_dump(global_data_CC_3yr, "aggCC3.pkl")
pkl_dump(global_data_CC_5yr, "aggCC5.pkl")

In [75]:
# IMMUNIZATIONS - decided not to use because of bad data
# os.chdir('/mnt/data1/chong/2021-CRC/original_data/EHRs')
# df_im = pd.read_csv("IMMUNIZATION.csv")