In [1]:
import os
import json

In [2]:
from dask_jobqueue import SLURMCluster
import dask.dataframe as dd
from dask.distributed import Client

In [3]:
import numpy as np
from glob import glob

In [4]:
# Paths of the symptom and condition database JSON files (same directory).
symptom_db_json, condition_db_json = (
    os.path.join("/home/oagba/bulk/data/output_new_100k/json", file_name)
    for file_name in ("symptom_db.json", "condition_db.json")
)

In [5]:
# Load the symptom and condition databases from their JSON files; each file is
# closed as soon as it has been parsed.
with open(symptom_db_json) as fp:
    symptom_db = json.load(fp)
with open(condition_db_json) as fp:
    condition_db = json.load(fp)

In [6]:
# Sort the keys so the same code gets the same index on every run. The previous
# version iterated over sets of strings, whose order depends on Python's per-process
# hash randomisation, so label indices (and the symptom column positions derived
# from symptom_vector) changed from one run to the next.
symptom_vector = sorted(symptom_db.keys())
condition_codes = sorted(condition_db.keys())
# Integer class index for every condition code.
condition_labels = {code: idx for idx, code in enumerate(condition_codes)}

In [7]:
# Configure a Dask cluster whose workers are launched through SLURM: jobs are
# submitted to the 'general' queue and request 8 cores, 34 GB of memory and a
# two-hour walltime.
# NOTE(review): cores/memory are presumably per job and death_timeout=60 presumably
# in seconds — confirm against the dask-jobqueue documentation.
cluster = SLURMCluster(
    queue='general',
    # project='medvice_parse',
    cores=8,
    memory='34 GB',
    walltime='02:00:00',
    death_timeout=60
)

In [8]:
# Connect a distributed client to the cluster so Dask work is scheduled on it.
client = Client(cluster)
# Request a cluster size of 8.
# NOTE(review): whether this counts workers or SLURM jobs depends on dask-jobqueue
# semantics — confirm.
cluster.scale(8)

In [9]:
# Root of the CSV export, followed by the patients file and the glob patterns that
# match the condition and symptom shards underneath it.
csv_dir = "/shares/bulk/oagba/data/output_new_100k/csv"
patients_csv, conditions_csv, symptoms_csv = [
    os.path.join(csv_dir, relative)
    for relative in ("patients.csv", "conditions/x*", "symptoms/x*")
]

In [10]:
# Columns pulled from patients.csv: the patient id plus the demographic fields.
patient_sel_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']

# BIRTHDATE is parsed as a datetime; GENDER and RACE are read as categoricals.
patients = dd.read_csv(
    patients_csv,
    usecols=patient_sel_columns,
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
    dtype=dict.fromkeys(('GENDER', 'RACE'), 'category'),
)



In [11]:
# Column layout of the condition shards, in file order. Column names are supplied
# through `names`, so the shards are read as header-less.
# NOTE(review): if any shard still starts with a header row it would be parsed as
# data — confirm how the x* shards were produced.
condition_columns = ['START','STOP','PATIENT','ENCOUNTER','CODE','DESCRIPTION']
# Subset of the columns that is actually loaded.
condition_sel_columns = ['ENCOUNTER', 'PATIENT', 'CODE', 'START']
conditions = dd.read_csv(
    conditions_csv, 
    names=condition_columns,
    usecols=condition_sel_columns, 
    parse_dates=['START'], 
    infer_datetime_format=True
)

In [12]:
# Column layout of the symptom shards, in file order (supplied through `names`,
# so the shards are read as header-less).
symptom_columns = ['SYMPTOM_CODE','SYMPTOM_DISPLAY','ENCOUNTER','PATIENT']
# Subset of the columns that is actually loaded; SYMPTOM_DISPLAY is dropped.
symptom_sel_colums = ['ENCOUNTER', 'PATIENT', 'SYMPTOM_CODE']
symptoms = dd.read_csv(symptoms_csv, names=symptom_columns, usecols=symptom_sel_colums)

In [13]:
def _race_txform(val):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(val)
def _label_txform(val, labels):
    return labels.get(val)

In [14]:
# Replace the RACE names with the integer codes produced by _race_txform.
patients['RACE'] = patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# Encode GENDER as 0 for 'F' and 1 for anything else.
# `np.bool` was only an alias of the builtin `bool`; it was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where it raises AttributeError. The builtin is the
# exact equivalent and works on every NumPy version.
patients['GENDER'] = patients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

In [15]:
# Start computing the encoded patients table on the cluster and keep the result
# there, so later steps reuse it rather than rebuilding it from the CSV.
patients = client.persist(patients)

In [16]:
# Map every condition CODE to its integer class index; codes missing from
# condition_labels map to None.
# NOTE(review): condition_labels is keyed by the JSON keys of condition_db, which
# are str; if CODE is inferred as a numeric column no key will match — confirm the
# CSV dtype.
conditions['LABEL'] = conditions['CODE'].apply(_label_txform, labels=condition_labels, meta=('CODE', np.uint16))

In [17]:
# Assign every symptom its column position in the output row. Positions 0-4 hold
# the leading non-symptom fields, so symptom slots start at 5.
label_map = {symptom: slot for slot, symptom in enumerate(symptom_vector, start=5)}

In [19]:
def transform_symptom_codes(item, label_map):
    """Return the column position registered for ``item`` in ``label_map``.

    Items that have no entry in the mapping give None.
    """
    return label_map[item] if item in label_map else None

In [20]:
# Replace each symptom code with its column position from label_map (positions
# start at 5); codes missing from label_map map to None.
# NOTE(review): label_map keys come from the JSON keys of symptom_db and are str —
# confirm SYMPTOM_CODE is read as text so the lookups can match.
symptoms['SYMPTOM_CODE'] = symptoms['SYMPTOM_CODE'].apply(transform_symptom_codes, label_map=label_map, meta=('SYMPTOM_CODE', np.uint16))

In [21]:
# Attach patient demographics to every condition row (left join: conditions.PATIENT
# against patients.Id; clashing patient columns get the '_pat' suffix).
df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
# Attach the condition/patient data to every symptom row through the shared
# ENCOUNTER id. This is a left join, so symptom rows without a matching condition
# are kept with missing values in the joined columns. Clashing columns coming from
# `symptoms` get the '_symp' suffix.
df = symptoms.merge(df, how='left', left_on='ENCOUNTER', right_on='ENCOUNTER', suffixes=('_symp', ''))

In [22]:
# Patient age in years at the condition START, counted in whole elapsed months.
# `.astype('timedelta64[M]')` is rejected by pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported), so the month count is derived from total seconds.
# 2629746 s is the mean Gregorian month (365.2425 days / 12).
# NOTE(review): this is believed to be the factor NumPy applies to month-unit
# timedeltas — confirm the ages match the previous output.
SECONDS_PER_MONTH = 2629746
df['AGE'] = (
    ((df['START'] - df['BIRTHDATE']).dt.total_seconds() // SECONDS_PER_MONTH) / 12
).astype(np.float32)

In [23]:
# Column order of the rows handed to the fold. bin_op addresses these fields by
# position: 0-4 are copied as the row header, 5 is the symptom slot to flag.
ordered_keys = ['ENCOUNTER', 'LABEL', 'RACE', 'GENDER', 'AGE', 'SYMPTOM_CODE']

In [24]:
# Keep only the columns listed in ordered_keys, in that order.
df = df[ordered_keys]

In [25]:
# Convert the dataframe into a Dask bag of rows; the fold functions below index
# each row positionally.
bag = df.to_bag()

In [27]:
# Width of one output row: 5 leading fields (ENCOUNTER, LABEL, RACE, GENDER, AGE)
# followed by one flag slot per entry in label_map.
num_columns = 5 + len(label_map)

In [28]:
# Template accumulator for the fold: a row of num_columns '0' strings.
initial_entry = ['0'] * num_columns

In [30]:
def bin_op(v1, v2):
    """Fold one row ``v2`` into the accumulator ``v1`` and return the accumulator.

    ``v2`` is laid out as (ENCOUNTER, LABEL, RACE, GENDER, AGE, SYMPTOM_CODE).
    An accumulator whose first element is ``'0'`` is treated as the untouched
    initial template: it is copied (never mutated), and the copy's first five
    slots are filled with the string form of the row's first five fields.
    Any other accumulator is updated in place. In both cases the slot at index
    ``v2[5]`` is set to ``'1'``.
    """
    is_template = v1[0] == '0'
    if not is_template:
        # Accumulator already belongs to an encounter: just flag the symptom.
        v1[v2[5]] = '1'
        return v1

    # First row for this key: work on a private copy of the template.
    row = list(v1)
    for pos in range(5):
        row[pos] = str(v2[pos])
    row[v2[5]] = '1'
    return row

def combine_op(v1, v2):
    """Merge the symptom flags of accumulator ``v2`` into ``v1`` and return ``v1``.

    Only slots from index 5 onwards are considered. Every slot that is ``'1'``
    in ``v2`` is set to ``'1'`` in ``v1`` (in place); the first five slots of
    ``v1`` are left untouched.
    """
    for slot in range(5, len(v2)):
        if v2[slot] == '1':
            v1[slot] = '1'
    return v1

In [32]:
# Group the rows by ENCOUNTER (element 0) and fold each group into one row vector:
# bin_op folds a row into an accumulator that starts from initial_entry, and
# combine_op merges two accumulators built for the same encounter.
folded_w_key= bag.foldby(lambda v: v[0], binop=bin_op, initial=initial_entry, combine=combine_op)

In [34]:
# Spread the folded results over 256 partitions.
# NOTE(review): presumably so the output is written as 256 files — confirm.
folded_w_key = folded_w_key.repartition(256)

In [35]:
# Drop the grouping key (element 0) and render each row vector (element 1) as one
# comma-separated line.
text_folded = folded_w_key.map(lambda v: ",".join(v[1]))

In [37]:
# Output path pattern for the parsed rows.
# NOTE(review): the '*' is presumably replaced per partition by to_textfiles — confirm.
csv_op = "/home/oagba/bulk/data/output_new_100k/parsed/data-*.csv"

In [None]:
# Write the comma-separated lines to disk using the csv_op pattern; the value
# returned by to_textfiles is kept in op_files.
op_files = text_folded.to_textfiles(csv_op)

In [None]:
# Disconnect the client, then shut the cluster down as well. Closing only the
# client leaves the cluster object (and the SLURM worker jobs it manages) alive
# until the kernel exits or the jobs reach their walltime.
client.close()
cluster.close()