Trying out dask as an alternative to joblib + pandas combination

In a way it makes sense that the joblib panda combo didn't work. Pandas already uses joblib internally I believe. So I was probably serverly limiting it's capabilities

In [1]:
# So we can use the *thesislib* package
import sys
import os

module_path = os.path.abspath("..")

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from thesislib.utils import pathutils

In [3]:
import json
import dask.dataframe as dd

In [4]:
with open(pathutils.get_data_file("prob-synthea-1/data/condition_codes.json")) as fp:
    condition_codes = json.load(fp)
with open(pathutils.get_data_file("prob-synthea-1/data/conditions_db.json")) as fp:
    condition_db = json.load(fp)
with open(pathutils.get_data_file("prob-synthea-1/data/symptom_vector.json")) as fp:
    symptom_vector = json.load(fp)
with open(pathutils.get_data_file("prob-synthea-1/data/symptoms_db.json")) as fp:
    symptoms_db = json.load(fp)

condition_label = {item: idx for idx, item in enumerate(condition_codes)}

In [5]:
from dask.distributed import Client, progress
client = Client(n_workers=2, threads_per_worker=2, memory_limit='1GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:54413  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 2.00 GB


In [6]:
import dask.dataframe as dd

In [7]:
patients_csv = pathutils.get_data_file("prob-synthea-1/data/patients.csv")
patient_conditions_csv = pathutils.get_data_file("prob-synthea-1/data/patient_conditions.csv")

In [8]:
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
patients = dd.read_csv(patients_csv, usecols=patient_columns, parse_dates=['BIRTHDATE'], infer_datetime_format=True)

condition_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
conditions = dd.read_csv(patient_conditions_csv, usecols=condition_columns, parse_dates=['ONSET'], infer_datetime_format=True)

In [9]:
from glob import glob

In [10]:
symptom_dir = pathutils.get_data_file("prob-synthea-1/data/symptoms")
symptom_files = glob(symptom_dir + "/symp_*")

In [11]:
symptom_colums = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']
_cols = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY', 'VALUE_CODE', 'VALUE_DISPLAY']
symptoms = dd.read_csv(symptom_files,names=_cols, usecols=symptom_colums)

In [12]:
import numpy as np
def _race_txform(item):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(item, np.nan)
    
patients['RACE'] = patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
patients['GENDER'] = patients['GENDER'].apply(lambda x: 0 if x == 'F' else 1, meta=('GENDER', np.bool))

In [13]:
df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'), )

In [14]:
df = symptoms.merge(df, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))

In [15]:
df['age'] = abs((df['ONSET'] - df['BIRTHDATE']).astype('timedelta64[Y]'))

In [16]:
df = df.drop(columns=['ONSET', 'BIRTHDATE', 'PATIENT_symp', 'Id', 'Id_pat', 'PATIENT'])

In [17]:
def _transform_condition_codes(val, condition_label):
    return condition_label[val]

In [18]:
df['CODE'] = df['CODE'].apply(_transform_condition_codes, condition_label=condition_label, meta=('CODE', np.uint16))

In [19]:
_grp_meta = {'RACE': np.uint8, 'GENDER': np.bool, 'age': np.uint16, 'CODE': np.uint16}
_grp_meta.update({symp: np.bool for symp in symptom_vector})

In [20]:
def grp_apply(_df, symptomcodes):
    cols = ['RACE', 'GENDER', 'age', 'CODE']
    symptoms = {itm: 1 for itm in df['SYMPTOM_CODE'].values}
    
    _s = _df.head(1)[cols].copy()
    for _symp in symptomcodes:
        _s[_symp] = symptoms.get(_symp, 0)
    
    return _s

In [25]:
_grp = df.groupby('CONDITION_ID')

In [26]:
res = _grp.apply(grp_apply, symptomcodes=symptom_vector, meta=_grp_meta)

In [27]:
op = pathutils.get_data_file("prob-synthea-1/output/dask/") + "export-*.csv"
res.to_csv(op)

ValueError: Arrays chunk sizes are unknown: (nan,)

A possible solution: https://docs.dask.org/en/latest/array-chunks.html#unknown-chunks
Summary: to compute chunks sizes, use

   x.compute_chunk_sizes()  # for Dask Array `x`
   ddf.to_dask_array(lengths=True)  # for Dask DataFrame `ddf`