In [36]:
import os
import json

In [37]:
from dask_jobqueue import SLURMCluster
import dask.dataframe as dd
from dask.distributed import Client

In [38]:
import numpy as np
from copy import deepcopy
from glob import glob

In [39]:
# Location of the exported CSV/JSON inputs and of the parsed output.
# NOTE(review): these are absolute, user-specific paths; adjust for your environment.
data_dir = "/home/oagba/bulk/data/output_100k/csv"
output_dir = "/home/oagba/bulk/data/output_100k/parsed"

# makedirs(exist_ok=True) creates missing parent directories too and is safe to
# re-run, unlike a separate isdir() check followed by mkdir().
os.makedirs(output_dir, exist_ok=True)

In [40]:
# JSON side files stored alongside the CSV exports in `data_dir`.
symptom_codes_json = os.path.join(data_dir, "symptom_vector.json")
condition_codes_json = os.path.join(data_dir, "condition_codes.json")
condition_label_json = os.path.join(data_dir, "condition_labels.json")

In [41]:
# Load the condition codes and the symptom codes; duplicates are collapsed by set().
with open(condition_codes_json) as fp:
    condition_codes = set(json.load(fp))
with open(symptom_codes_json) as fp:
    symptom_vector = set(json.load(fp))

# Give every condition code an integer label and persist the mapping.
# The codes are enumerated in sorted order: iterating a set of strings yields a
# different order in every interpreter run (hash randomisation), which would
# silently reshuffle the labels each time this cell is re-executed.
# key=str keeps the sort well-defined even if the codes are of mixed types.
condition_label = {code: idx for idx, code in enumerate(sorted(condition_codes, key=str))}
with open(condition_label_json, "w") as fp:
    json.dump(condition_label, fp, indent=4)

In [42]:
# patients.csv is a single file; the conditions/ and symptoms/ sub-directories
# are addressed through the glob pattern "x*".
patients_csv = os.path.join(data_dir, "patients.csv")
conditions_csv_glob, symptoms_csv_glob = (
    os.path.join(data_dir, subdir, "x*") for subdir in ("conditions", "symptoms")
)

In [43]:
def _race_txform(val):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(val)

def _label_txform(val, labels):
    return labels.get(val)

def _key_func(item):
    return item[0]

def _mapper(v1, initial, _key_map, _symp_indx):
    _base = deepcopy(initial)
    _cnd, items = v1

    for _k, _v in _key_map.items():
        _base[_k] = items[0][_v]
    
    for _arg in items:
        _base[_arg[_symp_indx]] = 1
    
    return _cnd, _base

def _dict_to_csv(data, keys):
    return ",".join([str(data[_k]) for _k in keys])

In [44]:
def parser(patients_csv, conditions_csv, symptoms_csv, condition_label, symptom_vector, output_dir):
    """Join patients, conditions and symptoms and write one feature line per group.

    Symptom rows are joined to their condition and patient, grouped with
    ``_key_func`` (the first column of the joined frame, CONDITION_ID), and each
    group is flattened into one comma-separated line:
    LABEL, RACE, GENDER, AGE, followed by one 0/1 flag per entry of
    ``symptom_vector``. The flags are laid out in sorted order (entries compared
    as strings) so the column layout is reproducible. No header line is written.

    Parameters
    ----------
    patients_csv : str
        Patients CSV; the columns Id, BIRTHDATE, RACE and GENDER are read.
    conditions_csv : str
        Glob of condition CSV files, read with explicitly supplied column names.
    symptoms_csv : str
        Glob of symptom CSV files, read with explicitly supplied column names.
    condition_label : dict
        Maps a condition code to its integer label.
    symptom_vector : iterable
        Symptom codes; each becomes one 0/1 column.
    output_dir : str
        Directory that receives the ``data-*.csv`` files.

    Returns
    -------
    The value returned by the bag's ``to_textfiles`` call.
    """
    # At least 4 output partitions; more when both inputs consist of more files.
    num_partitions = max(4, min(len(glob(conditions_csv)), len(glob(symptoms_csv))))

    patient_sel_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']

    patients = dd.read_csv(
        patients_csv,
        usecols=patient_sel_columns,
        parse_dates=['BIRTHDATE'],
        infer_datetime_format=True,
        dtype={
            'GENDER': 'category',
            'RACE': 'category'
        }
    )

    condition_columns = ['Id', 'PATIENT', 'CODE', 'DESCRIPTION', 'ONSET', 'DIAGNOSED']
    condition_sel_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
    conditions = dd.read_csv(
        conditions_csv,
        usecols=condition_sel_columns,
        parse_dates=['ONSET'],
        infer_datetime_format=True,
        names=condition_columns
    )

    symptom_sel_columns = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']
    symptom_columns = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY', 'VALUE_CODE', 'VALUE_DISPLAY']
    symptoms = dd.read_csv(symptoms_csv, usecols=symptom_sel_columns, names=symptom_columns)

    # Encode RACE through _race_txform and GENDER as 0 for 'F', 1 otherwise.
    patients['RACE'] = patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
    # `bool` is exactly what the old `np.bool` alias referred to; the alias was
    # removed in NumPy 1.24 and now raises AttributeError.
    patients['GENDER'] = patients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

    # conditions <- patients, then symptoms <- (conditions + patients); both left joins.
    df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
    df = symptoms.merge(df, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))

    # NOTE(review): recent pandas releases may reject the 'timedelta64[Y]' cast
    # (and warn about infer_datetime_format) - confirm before upgrading pandas.
    df['AGE'] = abs((df['ONSET'] - df['BIRTHDATE']).astype('timedelta64[Y]')).astype(np.uint8)
    df['LABEL'] = df['CODE'].apply(_label_txform, labels=condition_label, meta=('CODE', np.uint16))

    df = df.drop(columns=['PATIENT_symp', 'Id', 'PATIENT', 'CODE', 'ONSET', 'Id_pat', 'BIRTHDATE'])

    # Bag records are addressed by position, so remember where each column of
    # interest sits in the frame.
    _interest_keys = {'LABEL', 'RACE', 'GENDER', 'AGE'}
    _symptom_code_idx = None
    key_map = {}

    for idx, itm in enumerate(df.columns):
        if itm == 'SYMPTOM_CODE':
            _symptom_code_idx = idx
        elif itm in _interest_keys:
            key_map[itm] = idx

    # Template record: LABEL, RACE, GENDER, AGE, then one zeroed flag per symptom.
    _initial = {
        'LABEL': None,
        'RACE': None,
        'GENDER': None,
        'AGE': None
    }

    # Sorted so the (header-less) output has a reproducible column order; plain
    # set iteration order for strings changes between interpreter runs.
    for itm in sorted(symptom_vector, key=str):
        _initial[itm] = 0

    _grp = df.to_bag()

    _data_keys = list(_initial.keys())

    _res_g = _grp.groupby(_key_func).map(_mapper, initial=_initial, _key_map=key_map, _symp_indx=_symptom_code_idx)
    _res_g = _res_g.map(lambda x: x[1]).map(_dict_to_csv, keys=_data_keys)
    _res_g = _res_g.repartition(npartitions=num_partitions)
    output_files = os.path.join(output_dir, "data-*.csv")
    return _res_g.to_textfiles(output_files, last_endline=True)

In [45]:
# SLURM-backed dask cluster; see the dask_jobqueue documentation for the exact
# meaning of each option.
cluster = SLURMCluster(
    queue='general',
    cores=8, memory='34 GB',
    walltime='02:00:00', death_timeout=60,
    # project='medvice_parse',  (left disabled)
)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


In [11]:
# Attach a client to the cluster and scale it to 8 before running the pipeline.
client = Client(cluster)
cluster.scale(8)

# Arguments are given in the order of parser()'s signature.
parser(
    patients_csv,
    conditions_csv_glob,
    symptoms_csv_glob,
    condition_label,
    symptom_vector,
    output_dir,
)

In [12]:
# The cells below repeat parser()'s steps at top level, so bind the globs to the
# names parser() uses for its parameters.
conditions_csv, symptoms_csv = conditions_csv_glob, symptoms_csv_glob

In [13]:
# Read only the patient columns used downstream; BIRTHDATE is parsed as a date,
# GENDER and RACE are loaded as categoricals.
patient_sel_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']

patients = dd.read_csv(
    patients_csv,
    usecols=patient_sel_columns,
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
    dtype={'GENDER': 'category', 'RACE': 'category'},
)



In [14]:
# Encode RACE through _race_txform and GENDER as 0 for 'F', 1 otherwise.
patients['RACE'] = patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# `bool` is exactly what the old `np.bool` alias referred to; the alias was
# removed in NumPy 1.24 and now raises AttributeError.
patients['GENDER'] = patients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

In [15]:
# Column names are supplied explicitly; only a subset is kept and ONSET is
# parsed as a date.
condition_columns = ['Id', 'PATIENT', 'CODE', 'DESCRIPTION', 'ONSET', 'DIAGNOSED']
condition_sel_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']

conditions = dd.read_csv(
    conditions_csv,
    names=condition_columns,
    usecols=condition_sel_columns,
    parse_dates=['ONSET'],
    infer_datetime_format=True,
)

In [16]:
# Column names are supplied explicitly; only the first three columns are kept.
symptom_columns = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY', 'VALUE_CODE', 'VALUE_DISPLAY']
symptom_sel_colums = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']

symptoms = dd.read_csv(
    symptoms_csv,
    names=symptom_columns,
    usecols=symptom_sel_colums,
)

In [17]:
# Left-join patients onto conditions, then left-join that result onto symptoms,
# so the final frame has one row per symptom row.
df = symptoms.merge(
    conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat')),
    how='left',
    left_on='CONDITION_ID',
    right_on='Id',
    suffixes=('_symp', ''),
)

In [18]:
# AGE: absolute ONSET - BIRTHDATE difference, cast to years and stored as uint8.
df['AGE'] = abs(
    (df['ONSET'] - df['BIRTHDATE']).astype('timedelta64[Y]')
).astype(np.uint8)

# LABEL: integer label looked up from the condition CODE.
df['LABEL'] = df['CODE'].apply(
    _label_txform,
    labels=condition_label,
    meta=('CODE', np.uint16),
)

In [19]:
# Unlike parser(), keep every column here: the drop below is deliberately left disabled.
# df = df.drop(columns=['PATIENT_symp', 'Id', 'PATIENT', 'CODE', 'ONSET', 'Id_pat', 'BIRTHDATE'])

In [20]:
# Columns whose position in `df` has to be known once rows are handled by index.
_interest_keys = {'LABEL', 'RACE', 'GENDER', 'AGE'}
key_map = dict()          # column name -> position in df.columns
_symptom_code_idx = None  # position of SYMPTOM_CODE, filled in by the scan over df.columns

In [21]:
# Record the position of SYMPTOM_CODE and of each column of interest.
for idx, itm in enumerate(df.columns):
    if itm == 'SYMPTOM_CODE':
        _symptom_code_idx = idx
        continue
    if itm in _interest_keys:
        key_map[itm] = idx

# Template record; the four leading fields start out as None.
_initial = dict.fromkeys(('LABEL', 'RACE', 'GENDER', 'AGE'))

In [22]:
# Add one zeroed flag per symptom code. Iterate in sorted order (entries compared
# as strings) so the column layout is reproducible: plain set iteration order
# for strings changes between interpreter runs.
for itm in sorted(symptom_vector, key=str):
    _initial[itm] = 0

In [23]:
# Convert the joined frame into a dask bag so rows can be grouped and mapped
# with plain Python functions.
# NOTE(review): presumably each bag element is a row tuple without the index,
# which is what the positional lookups (key_map, _symptom_code_idx) rely on - confirm.
_grp = df.to_bag()

In [24]:
# Output column order: the key order of the template record.
_data_keys = list(_initial)

In [25]:
# Group rows by their first element, flatten every group into one record,
# discard the group key and render each record as a CSV line.
_res_g = (
    _grp
    .groupby(_key_func)
    .map(_mapper, initial=_initial, _key_map=key_map, _symp_indx=_symptom_code_idx)
    .map(lambda pair: pair[1])
    .map(_dict_to_csv, keys=_data_keys)
)

In [None]:
# Write the rendered lines to data-*.csv files under output_dir.
# Unlike parser(), this path does not repartition the bag before writing.
output_files = os.path.join(output_dir, "data-*.csv")
op = _res_g.to_textfiles(output_files, last_endline=True)