### Description
This notebook tries a Dask implementation of the parser, handing the data over to a Dask bag once the computation stops being DataFrame-friendly.

In [1]:
# Put the parent of the current working directory on the import path so
# that the *thesislib* package can be imported from this notebook.
import os
import sys

module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from thesislib.utils import pathutils
import json

In [3]:
import dask.dataframe as dd
import numpy as np
from dask import delayed, compute
import pandas as pd

In [4]:
# Load the reference vocabularies shipped with the prob-synthea-1 data set:
# the condition codes, the condition database, the symptom vector and the
# symptom database.
with open(pathutils.get_data_file("prob-synthea-1/data/condition_codes.json")) as fp:
    condition_codes = set(json.load(fp))
with open(pathutils.get_data_file("prob-synthea-1/data/conditions_db.json")) as fp:
    condition_db = json.load(fp)
with open(pathutils.get_data_file("prob-synthea-1/data/symptom_vector.json")) as fp:
    symptom_vector = set(json.load(fp))
with open(pathutils.get_data_file("prob-synthea-1/data/symptoms_db.json")) as fp:
    symptoms_db = json.load(fp)

# Map every condition code to an integer class label.
# A set of strings has no stable iteration order across interpreter runs
# (string hashing is randomised per process), so enumerate the codes in
# sorted order to get the same code -> label mapping on every kernel restart.
# Assumes the codes are mutually comparable (they look like hash strings in
# the CSV) -- TODO confirm.
condition_label = {item: idx for idx, item in enumerate(sorted(condition_codes))}

In [5]:
from dask.distributed import Client, progress
# Start a local distributed cluster: 2 worker processes with 2 threads each
# and a 3GB memory limit (the summary below reports 6.00 GB in total).
client = Client(n_workers=2, threads_per_worker=2, memory_limit='3GB')
# Display the client summary (scheduler address and dashboard link).
client

0,1
Client  Scheduler: tcp://127.0.0.1:65474  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 6.00 GB


In [6]:
# Resolve the three CSV inputs under prob-synthea-1/data: patients, their
# conditions, and the symptoms recorded for each condition.
patients_csv = pathutils.get_data_file("prob-synthea-1/data/patients.csv")
conditions_csv = pathutils.get_data_file("prob-synthea-1/data/patient_conditions.csv")
symptoms_csv = pathutils.get_data_file("prob-synthea-1/data/patient_condition_symptoms.csv")

In [7]:
# Peek at the column names of the symptoms CSV.
# NOTE(review): this loads the entire file into pandas just to list the
# header; `nrows=0` would presumably be enough if `cd` is not needed
# elsewhere -- confirm before changing.
cd = pd.read_csv(symptoms_csv)
cd.columns

Index(['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY',
       'VALUE_CODE', 'VALUE_DISPLAY'],
      dtype='object')

In [8]:
# Patients: keep only the id and the demographic fields, parse the birth
# date, and read RACE/GENDER as categoricals.
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
patients = dd.read_csv(
    patients_csv,
    usecols=patient_columns,
    parse_dates=['BIRTHDATE'],
    # NOTE(review): `infer_datetime_format` is deprecated in recent pandas
    # releases -- confirm against the pinned pandas version.
    infer_datetime_format=True,
    dtype={
        'GENDER': 'category',
        'RACE': 'category'
    }
)

# Conditions: one row per condition, keyed by `Id`, with the owning patient,
# the condition code and the onset date.
condition_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
conditions = dd.read_csv(conditions_csv, usecols=condition_columns, parse_dates=['ONSET'], infer_datetime_format=True)

# Symptoms: one row per (condition, symptom) pair; the display/value columns
# of the CSV are not loaded.
symptom_colums = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']
symptoms = dd.read_csv(symptoms_csv, usecols=symptom_colums)
# Rebalance the symptoms frame into 4 partitions.
symptoms = symptoms.repartition(npartitions=4)



In [9]:
def _race_txform(val):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(val)
def _label_txform(val, labels):
    return labels.get(val)

In [10]:
# Encode the demographic columns as small integers.
# RACE goes through _race_txform; names it does not know become None.
patients['RACE'] = patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# GENDER is 0 for 'F' and 1 for every other value (missing values included).
# The meta dtype is np.uint8 rather than the original `np.bool`: that alias
# was removed in NumPy 1.24 (AttributeError), and the lambda returns the
# integers 0/1, so an integer meta also describes the real output.
patients['GENDER'] = patients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', np.uint8))

In [11]:
# Attach the patient's demographics to every condition row. Both frames have
# an `Id` column; the patient's one is renamed with the `_pat` suffix.
df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
# Attach the condition (+ patient) record to every symptom row. Both sides
# have a `PATIENT` column; the symptom table's one gets the `_symp` suffix.
df = symptoms.merge(df, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))

In [12]:
# Age at onset in whole years, stored as uint8.
# The elapsed time is taken in days and divided by the mean Gregorian year
# (365.2425 days, the length NumPy uses for its 'Y' unit) rather than cast
# with `.astype('timedelta64[Y]')`, which pandas >= 2.0 rejects (only the
# 's', 'ms', 'us' and 'ns' resolutions are supported). The final astype
# truncates towards zero, giving completed years.
# abs() presumably guards against onsets recorded before the birth date.
df['AGE'] = ((df['ONSET'] - df['BIRTHDATE']).dt.days.abs() / 365.2425).astype(np.uint8)
# Map each condition code to its integer class label; codes that are not in
# condition_label come back as None.
df['LABEL'] = df['CODE'].apply(_label_txform, labels=condition_label, meta=('CODE', np.uint16))

In [13]:
# Drop the join keys and the raw fields that have already been encoded
# (CODE -> LABEL, ONSET/BIRTHDATE -> AGE). What remains is CONDITION_ID,
# SYMPTOM_CODE, RACE, GENDER, AGE and LABEL.
# NOTE: `df` is reassigned in place, so re-running this cell on its own
# fails because the columns are already gone.
df = df.drop(columns=['PATIENT_symp', 'Id', 'PATIENT', 'CODE',
       'ONSET', 'Id_pat', 'BIRTHDATE'])

In [15]:
# Map the position of each column of interest within df.columns to its name,
# so that positional bag records can be decoded later.
interest_keys = {'LABEL', 'RACE', 'GENDER', 'AGE', 'CONDITION_ID', 'SYMPTOM_CODE'}
key_map = {
    position: name
    for position, name in enumerate(df.columns)
    if name in interest_keys
}

In [18]:
from collections import OrderedDict

In [19]:
# Layout of the output vector: the five fixed fields first, followed by one
# slot per symptom code.
parts = ['CONDITION_ID', 'LABEL', 'RACE', 'GENDER', 'AGE']
label_map = OrderedDict((itm, idx) for idx, itm in enumerate(parts))
# Index of the first symptom slot; everything at or after it is a symptom.
symptom_start = symptom_idx = len(parts)
# symptom_vector is a set of strings, whose iteration order changes between
# interpreter runs (hash randomisation). Walk it in sorted order so that each
# symptom lands in the same slot on every kernel restart.
for item in sorted(symptom_vector):
    label_map[item] = symptom_idx
    symptom_idx += 1

In [20]:
# Hand the merged frame over to a dask bag; the later stages treat each
# record as a positional sequence of column values (see key_map).
_grp = df.to_bag()

In [22]:
def append_index(row, key_map, label_map):
    """Tag the values of one record with their output-vector index.

    Parameters
    ----------
    row : sequence
        Positional record (one value per column).
    key_map : mapping
        Column position -> column name; positions that are absent are skipped.
    label_map : mapping
        Column name (or, for SYMPTOM_CODE, the symptom code itself) -> index
        in the output vector.

    Returns
    -------
    list of str
        One "<index>|<value>" string per retained column, in row order.

    Raises
    ------
    KeyError
        If a column name or symptom code is missing from `label_map`.
    """
    tagged = []
    for position, value in enumerate(row):
        if position in key_map:
            column = key_map[position]
            # A symptom is indexed by its own code; every other column is
            # indexed by the column name.
            slot = label_map[value] if column == 'SYMPTOM_CODE' else label_map[column]
            tagged.append("%d|%s" % (slot, value))
    return tagged

In [23]:
def key_func(row):
    """Return the grouping key (the condition id) of a tagged record.

    `row` is a list of "<index>|<value>" strings as produced by
    append_index. The first entry is assumed to be the CONDITION_ID entry,
    which holds as long as CONDITION_ID is the first column of the frame.

    Only the first "|" is treated as the separator, so a value that itself
    contains "|" is returned intact instead of raising ValueError.
    """
    _, condition = row[0].split("|", 1)
    return condition

In [25]:
# Tag every value of every record with its output-vector index.
transformed_grp = _grp.map(append_index, key_map=key_map, label_map=label_map)

In [26]:
# Pull two tagged records for a quick sanity check.
x_xformed = transformed_grp.take(2)

In [29]:
# Group the tagged records by condition id; each element becomes a
# (condition_id, [records]) pair.
res_g = transformed_grp.groupby(key_func)

In [34]:
# Length of the output vector: the fixed fields plus one slot per symptom.
num_labels = len(label_map)

In [38]:
def do_reduce(item, num_labels, symptom_start):
    """Collapse all tagged records of one condition into a single vector.

    Parameters
    ----------
    item : tuple
        (condition_id, rows), where each row is a list of "<index>|<value>"
        strings.
    num_labels : int
        Length of the output vector.
    symptom_start : int
        First index that belongs to a symptom. Lower indices are fixed
        fields and keep their value; symptom indices are set to '1'.

    Returns
    -------
    tuple
        (condition_id, vector), where vector is a list of `num_labels`
        strings and untouched slots hold '0'.

    Only the first "|" separates the index from the value, so values that
    themselves contain "|" are kept intact instead of raising ValueError.
    """
    condition_id, rows = item
    reduction = ['0'] * num_labels
    for row in rows:
        for col in row:
            idx, val = col.split("|", 1)
            idx = int(idx)
            # Fixed fields carry their value; symptoms are presence flags.
            reduction[idx] = val if idx < symptom_start else '1'
    return condition_id, reduction

In [39]:
# Reduce every (condition_id, records) group to one encoded vector.
reduced_g = res_g.map(do_reduce, num_labels=num_labels, symptom_start=symptom_start)

In [45]:
# Pull one reduced (condition_id, vector) pair for inspection.
x_reduced = reduced_g.take(1)

In [48]:
# Spot-check a single, hard-coded condition id: keep only its reduced vector.
reduced_filtered = reduced_g.filter(lambda k: k[0] == 'b59b83f9-0ee9-420f-83cd-531f0cb6bc0e')
# Overwrites the x_reduced sample taken earlier.
x_reduced = reduced_filtered.take(1)

In [61]:
# The encoded vector (second element) of the first sampled pair.
x_reduced_row = x_reduced[0][1]

In [66]:
# Count the symptoms flagged for this condition. The literal 5 skips the
# five fixed fields, i.e. it equals symptom_start (len(parts)).
sum([int(val) for val in x_reduced_row[5:]])

1

In [69]:
# Cross-check: fetch the un-reduced group for the same condition id.
x_grp = res_g.filter(lambda k: k[0] == 'b59b83f9-0ee9-420f-83cd-531f0cb6bc0e').compute()

In [70]:
# Show the raw tagged records grouped under that condition id.
x_grp

[('b59b83f9-0ee9-420f-83cd-531f0cb6bc0e',
  [['0|b59b83f9-0ee9-420f-83cd-531f0cb6bc0e',
    '44|f1e386f5773e030aa2451368ccc79679145f798b75fa38179b2e1bce',
    '2|0',
    '3|0',
    '4|19',
    '1|248']])]

In [71]:
# Compare with the merged dataframe rows for the same condition id.
df[df['CONDITION_ID'] == 'b59b83f9-0ee9-420f-83cd-531f0cb6bc0e'].head()

Unnamed: 0,CONDITION_ID,SYMPTOM_CODE,RACE,GENDER,AGE,LABEL
6,b59b83f9-0ee9-420f-83cd-531f0cb6bc0e,f1e386f5773e030aa2451368ccc79679145f798b75fa38...,0,0,19,248


In [72]:
# NOTE(review): pandas is already imported near the top of the notebook, and
# the same CSV was already read into `cd` with an identical call -- this
# import and re-read look redundant.
import pandas as pd
# Load the raw symptoms CSV with plain pandas to verify against the source.
symptoms_pd = pd.read_csv(symptoms_csv)

In [73]:
# Raw symptom rows recorded for the spot-checked condition id.
symptoms_pd[symptoms_pd['CONDITION_ID'] == 'b59b83f9-0ee9-420f-83cd-531f0cb6bc0e']

Unnamed: 0,CONDITION_ID,PATIENT,SYMPTOM_CODE,SYMPTOM_DISPLAY,VALUE_CODE,VALUE_DISPLAY
6,b59b83f9-0ee9-420f-83cd-531f0cb6bc0e,b3d23730-2323-416e-9fb6-190c7adf8b02,f1e386f5773e030aa2451368ccc79679145f798b75fa38...,Back pain,f1e386f5773e030aa2451368ccc79679145f798b75fa38...,Back pain (finding)


In [74]:
# Load the raw conditions CSV with plain pandas for the same verification.
conditions_pd = pd.read_csv(conditions_csv)

In [75]:
# Raw condition record for the spot-checked condition id.
conditions_pd[conditions_pd['Id'] == 'b59b83f9-0ee9-420f-83cd-531f0cb6bc0e']

Unnamed: 0,Id,PATIENT,CODE,DESCRIPTION,ONSET,DIAGNOSED
2,b59b83f9-0ee9-420f-83cd-531f0cb6bc0e,b3d23730-2323-416e-9fb6-190c7adf8b02,fa8aec47e3893d0a9acb2a67e6cdab4d2e79ecb1ab9ceb...,Cystitis,1990-11-20,1990-11-20
