### Description
This notebook tries a Dask implementation of the parser. The work is handed over to a Dask bag once the computation stops being DataFrame-friendly.

In [5]:
# So we can use the *thesislib* package
import sys
import os

# Put the parent directory on the import path (once) so *thesislib* resolves.
module_path = os.path.abspath("..")

if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [6]:
from thesislib.utils import pathutils
import json

In [7]:
import dask.dataframe as dd
import numpy as np
from dask import delayed, compute
import pandas as pd

In [8]:
# Load the reference data exported for the prob-synthea-1 data set.
with open(pathutils.get_data_file("prob-synthea-1/data/condition_codes.json")) as fp:
    condition_codes = set(json.load(fp))
with open(pathutils.get_data_file("prob-synthea-1/data/conditions_db.json")) as fp:
    condition_db = json.load(fp)
with open(pathutils.get_data_file("prob-synthea-1/data/symptom_vector.json")) as fp:
    symptom_vector = set(json.load(fp))
with open(pathutils.get_data_file("prob-synthea-1/data/symptoms_db.json")) as fp:
    symptoms_db = json.load(fp)

# Map every condition code to an integer label. Enumerate a *sorted* sequence:
# iterating the set directly gives an order that can change between kernel
# restarts, which would silently change the code -> label mapping between runs.
# Assumes the codes are mutually comparable (e.g. all strings) -- TODO confirm.
condition_label = {item: idx for idx, item in enumerate(sorted(condition_codes))}

In [5]:
# Start a Dask distributed client with 2 workers x 2 threads and a 3GB memory
# limit per worker. `progress` is imported but not used in the visible cells.
from dask.distributed import Client, progress
client = Client(n_workers=2, threads_per_worker=2, memory_limit='3GB')
# Last expression of the cell: displays the client / cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:53229  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 6.00 GB


In [9]:
# Resolve the three input CSV files through the project's data-file helper.
patients_csv, conditions_csv, symptoms_csv = (
    pathutils.get_data_file(relative_path)
    for relative_path in (
        "prob-synthea-1/data/patients.csv",
        "prob-synthea-1/data/patient_conditions.csv",
        "prob-synthea-1/data/patient_condition_symptoms.csv",
    )
)

In [12]:
# Only the header is needed to list the column names, so read zero data rows
# instead of loading the whole symptoms file into memory with pandas.
cd = pd.read_csv(symptoms_csv, nrows=0)
cd.columns

Index(['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY',
       'VALUE_CODE', 'VALUE_DISPLAY'],
      dtype='object')

In [7]:
# Columns actually needed from each CSV; everything else is skipped at read time.
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
condition_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
symptom_colums = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']

# Patients: parse the birth date and keep the two demographic columns categorical.
patients = dd.read_csv(
    patients_csv,
    usecols=patient_columns,
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
    dtype=dict(GENDER='category', RACE='category'),
)

# Conditions: parse the onset date.
conditions = dd.read_csv(
    conditions_csv,
    usecols=condition_columns,
    parse_dates=['ONSET'],
    infer_datetime_format=True,
)

# Symptoms: read, then spread over 4 partitions.
symptoms = dd.read_csv(symptoms_csv, usecols=symptom_colums).repartition(npartitions=4)



In [8]:
def _race_txform(val):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(val)
def _label_txform(val, labels):
    return labels.get(val)

In [9]:
# Replace the RACE category by its integer code (see _race_txform).
patients['RACE'] = patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# Encode GENDER as 0 for 'F' and 1 for every other value.
# `np.bool` was only an alias of the builtin `bool`; it was deprecated in
# NumPy 1.20 and later removed, so the builtin is used for the meta dtype.
patients['GENDER'] = patients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

In [10]:
# Left-join every condition to its patient (conditions.PATIENT == patients.Id);
# patient columns whose names collide get the '_pat' suffix.
df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
# Left-join every symptom row to that condition/patient frame
# (symptoms.CONDITION_ID == Id); colliding symptom-side columns get '_symp'.
df = symptoms.merge(df, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))

In [11]:
# Age in whole years at condition onset. pandas >= 2.0 rejects
# astype('timedelta64[Y]'), so derive the years from the day count instead,
# using the mean Gregorian year length (365.2425 days).
df['AGE'] = abs((df['ONSET'] - df['BIRTHDATE']).dt.days // 365.2425).astype(np.uint8)
# Integer class label for the condition code (None when the code is unknown).
df['LABEL'] = df['CODE'].apply(_label_txform, labels=condition_label, meta=('CODE', np.uint16))

In [12]:
# Drop the join keys and the raw columns (CODE, ONSET, BIRTHDATE) that AGE and
# LABEL were derived from in the previous cell.
df = df.drop(columns=['PATIENT_symp', 'Id', 'PATIENT', 'CODE',
       'ONSET', 'Id_pat', 'BIRTHDATE'])

In [63]:
# Columns whose position inside a row has to be looked up.
_interest_keys = {'LABEL', 'RACE', 'GENDER', 'AGE'}
# Position of the SYMPTOM_CODE column; filled in by the loop over df.columns.
_symptom_code_idx = None
# Column name -> column position for the columns in _interest_keys.
key_map = {}

In [67]:
# Record where each column of interest sits in df.columns.
for idx, itm in enumerate(df.columns):
    if itm == 'SYMPTOM_CODE':
        _symptom_code_idx = idx
        continue
    if itm in _interest_keys:
        key_map[itm] = idx

In [13]:
# Hand the dataframe over to a dask bag; the reductions below key each row on
# element [0] and read the other fields by position.
_grp = df.to_bag()

In [14]:
def _key_func(item):
    return item[0]

# Template accumulator for one condition: the four demographic/label fields
# (LABEL, RACE, GENDER, AGE) start as None and every known symptom starts at 0.
_initial = {
    'LABEL': None,
    'RACE': None,
    'GENDER': None,
    'AGE': None
}

# Insert the symptom columns in sorted order: iterating the set directly gives
# an order that can change between kernel restarts, which would change the
# column order of the written CSV files from run to run.
# Assumes the symptom codes are mutually comparable (e.g. all strings) -- TODO confirm.
for itm in sorted(symptom_vector):
    _initial[itm] = 0

In [15]:
from copy import deepcopy

```python
def _bin_op(v1, v2):
    # ['CONDITION_ID', 'SYMPTOM_CODE', 'LABEL', 'RACE', 'GENDER', 'AGE']
    _key_map = {
        'LABEL': 2,
        'RACE': 3,
        'GENDER': 4,
        'AGE': 5
    }
    if v1['LABEL'] is None:
        _base = deepcopy(v1)
    else:
        _base = v1
    _arg = v2
    _symp_indx = 1
    
    _base[_arg[_symp_indx]] = 1
    for _k, _v in _key_map.items():
        _base[_k] = _arg[_v]
    return _base

def _combine_op(v1, v2):
    _keys = [_k for _k in v1.keys() if len(_k) == 56]
    for _k in _keys:
        _val = v2.get(_k)
        if _val == 1:
            v1[_k] = 1
    return v1
```

In [17]:
# Reduce the rows of each condition into one accumulator dict without a full shuffle.
_res_f = _grp.foldby(_key_func, binop=_bin_op, initial=_initial, combine=_combine_op)
# foldby yields (key, accumulator) pairs; keep only the accumulator.
_res_f = _res_f.map(lambda x: x[1])
# to_textfiles writes one line per element and needs text, not dicts:
# serialise every accumulator as a CSV line in the column order of _initial.
_res_f = _res_f.map(
    lambda row, keys: ",".join(str(row[_k]) for _k in keys),
    keys=list(_initial),
)
_res_f = _res_f.repartition(npartitions=4)
# NOTE: the groupby variant further down writes to this same file pattern.
output_files = pathutils.get_data_file("compare-parsing/output/dask-soln/data-*.csv")
_res_f.to_textfiles(output_files, last_endline=True)

### An Alternative Solution

- Use `groupby` instead: let the full shuffle happen first, then do the reduction on each group.

In [71]:
def _mapper(v1, initial, _key_map, _symp_indx):
    _base = deepcopy(initial)
    _cnd, items = v1

    for _k, _v in _key_map.items():
        _base[_k] = items[0][_v]
    
    for _arg in items:
        _base[_arg[_symp_indx]] = 1
    
    return _cnd, _base

def _dict_to_csv(data, keys):
    return ",".join([str(data[_k]) for _k in keys])

In [72]:
# Column order of the CSV output: the keys of the template record, in insertion order.
_data_keys = [*_initial]

In [73]:
output_files = pathutils.get_data_file("compare-parsing/output/dask-soln/data-*.csv")

# Group the rows per key, build one record per group, drop the key, render
# every record as a CSV line and spread the lines over 4 partitions.
_res_g = (
    _grp.groupby(_key_func)
    .map(_mapper, initial=_initial, _key_map=key_map, _symp_indx=_symptom_code_idx)
    .map(lambda x: x[1])
    .map(_dict_to_csv, keys=_data_keys)
    .repartition(npartitions=4)
)
_res_g.to_textfiles(output_files, last_endline=True)

['/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/compare-parsing/output/dask-soln/data-0.csv',
 '/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/compare-parsing/output/dask-soln/data-1.csv',
 '/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/compare-parsing/output/dask-soln/data-2.csv',
 '/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/compare-parsing/output/dask-soln/data-3.csv']

In [1]:
import distributed

In [2]:
# Scratch check: show the default client extensions of the installed `distributed`.
distributed.client.DEFAULT_EXTENSIONS

[distributed.pubsub.PubSubClientExtension]

In [13]:
import fsspec

In [15]:
# Scratch check: show the installed fsspec version.
fsspec.__version__

'0.6.3'

In [17]:
# Scratch expression; its result is not assigned or used by any other cell.
min([1, 2])

1