This notebook parses two dask implementations: groupby and foldby. Comparisons are made in terms of memory consumption and CPU run time. 

Also we progressively increase the data size to see what the effects of increased data size are on the performance of the respective algorithms

In [4]:
%matplotlib inline
# So we can use the *thesislib* package
import sys
import os

# Absolute path of the parent directory, which is added to the import path.
module_path = os.path.abspath("..")

# Append only once so re-running the cell does not add a duplicate entry.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
import dask.dataframe as dd

In [5]:
from thesislib.utils import pathutils
from glob import glob

In [5]:
# Directory that receives the artefacts written by this notebook.
output_directory = pathutils.get_data_file("exploration_II/output")

# makedirs(exist_ok=True) also creates missing parent directories and is safe
# to re-run, unlike a separate isdir() check followed by mkdir().
os.makedirs(output_directory, exist_ok=True)

In [6]:
def extract_condition_symptom_from_modules(module_name):
    """Read one module JSON file and pull out its condition and its symptoms.

    Parameters
    ----------
    module_name : str
        Path to a JSON file that has a top-level ``states`` mapping.

    Returns
    -------
    tuple
        ``(condition_code, condition_name, symptoms)``. Code and name come from
        the first entry of ``codes`` on a ``ConditionOnset`` state (the last
        such state wins; both stay ``None`` when there is none). ``symptoms``
        maps the code of every ``Symptom`` state to its ``symptom`` name.
    """
    with open(module_name) as handle:
        states = json.load(handle).get('states')

    code, name = None, None
    found = {}
    for entry in states.values():
        kind = entry.get('type')
        if kind == 'Symptom':
            found[entry.get('symptom_code').get('code')] = entry.get('symptom')
        elif kind == 'ConditionOnset':
            first_code = entry.get('codes')[0]
            code = first_code.get('code')
            name = first_code.get('display')
    return code, name, found

In [7]:
# symptom db and conditions db files
# Both JSON files live in the notebook's output directory and act as a cache.
symptom_db_json = pathutils.get_data_file("exploration_II/output/symptom_db.json")
condition_db_json = pathutils.get_data_file("exploration_II/output/condition_db.json")

In [8]:
# Build the symptom/condition lookup tables once and cache them as JSON;
# later runs reload the cached files instead of re-parsing every module.
if not os.path.isfile(symptom_db_json) or not os.path.isfile(condition_db_json):
    module_files = glob(pathutils.get_data_file("exploration_II/data/all_modules/*.json"))
    symptom_db = {}
    condition_db = {}

    for file in module_files:
        c_code, c_name, syms = extract_condition_symptom_from_modules(file)
        # A module without a ConditionOnset state yields c_code=None. Storing it
        # would create a None key on this run but a "null" key once the file is
        # reloaded from JSON, so the two branches of this cell would disagree.
        if c_code is not None:
            condition_db[c_code] = c_name
        symptom_db.update(syms)
    with open(symptom_db_json, "w") as fp:
        json.dump(symptom_db, fp)
    with open(condition_db_json, "w") as fp:
        json.dump(condition_db, fp)
else:
    with open(symptom_db_json) as fp:
        symptom_db = json.load(fp)
    with open(condition_db_json) as fp:
        condition_db = json.load(fp)

In [9]:
# Sort the codes before numbering them: iterating a set of strings has no
# stable order across interpreter sessions, so labels built from a set would
# not be reproducible between runs.
symptom_vector = sorted(symptom_db.keys())
condition_codes = sorted(condition_db.keys())
condition_labels = {code: idx for idx, code in enumerate(condition_codes)}

In [10]:
# Input CSV files (patients, conditions, symptoms) resolved through pathutils.
patients_csv = pathutils.get_data_file("exploration_II/data/new/patients.csv")
conditions_csv = pathutils.get_data_file("exploration_II/data/new/conditions.csv")
symptoms_csv = pathutils.get_data_file("exploration_II/data/new/symptoms.csv")

In [11]:
from dask.distributed import Client, progress
# Start a dask client with 2 workers, 2 threads each and a 4GB memory limit per worker.
client = Client(n_workers=2, threads_per_worker=2, memory_limit='4GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:63669  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 4  Memory: 8.00 GB


In [12]:
# Columns read from each CSV (passed as `usecols` to read_csv).
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
condition_columns = ['ENCOUNTER', 'PATIENT', 'CODE', 'START']
symptom_colums = ['ENCOUNTER', 'PATIENT', 'SYMPTOM_CODE']

In [34]:
# let's try the group by first

# Patient table; BIRTHDATE is parsed as a date, RACE and GENDER are read as categoricals.
patients = dd.read_csv(
    patients_csv,
    usecols=patient_columns,
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
    dtype={
        'GENDER': 'category',
        'RACE': 'category'
    }
)


# Condition rows, with START parsed as a date.
conditions = dd.read_csv(conditions_csv, usecols=condition_columns, parse_dates=['START'], infer_datetime_format=True)


# Symptom rows, redistributed over 4 partitions.
symptoms = dd.read_csv(symptoms_csv, usecols=symptom_colums)
symptoms = symptoms.repartition(npartitions=4)



In [13]:
def _race_txform(val):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(val)
def _label_txform(val, labels):
    return labels.get(val)

In [None]:
# Encode RACE as an integer code and GENDER as 0 ('F') / 1 (anything else).
patients['RACE'] = patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# `np.bool` was only an alias of the builtin and is gone from NumPy >= 1.24.
patients['GENDER'] = patients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

In [38]:
# Encode every condition CODE as its integer label from condition_labels.
conditions['LABEL'] = conditions['CODE'].apply(_label_txform, labels=condition_labels, meta=('CODE', np.uint16))

In [39]:
# Left-join patient attributes onto conditions, then left-join that onto the symptom rows by ENCOUNTER.
df = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
df = symptoms.merge(df, how='left', left_on='ENCOUNTER', right_on='ENCOUNTER', suffixes=('_symp', ''))

In [41]:
# AGE: time from BIRTHDATE to START, converted to months and divided by 12.
df['AGE'] = ((df['START'] - df['BIRTHDATE']).astype('timedelta64[M]')/12).astype(np.float32)

In [None]:
# Positional index -> name, restricted to the columns of interest.
interest_keys = {'LABEL', 'RACE', 'GENDER', 'AGE', 'ENCOUNTER', 'SYMPTOM_CODE'}
key_map = {
    position: name
    for position, name in enumerate(df.columns)
    if name in interest_keys
}

In [14]:
from collections import OrderedDict

In [25]:
# Output layout: the five leading columns first, then one slot per symptom
# code in sorted order. label_map maps every column name to its position.
parts = ['ENCOUNTER', 'LABEL', 'RACE', 'GENDER', 'AGE']
label_map = OrderedDict(zip(parts, range(len(parts))))
symptom_start = len(parts)
label_map.update(
    (code, symptom_start + offset)
    for offset, code in enumerate(sorted(symptom_vector))
)
# Same end state as counting up one by one: one past the last symptom slot.
symptom_idx = symptom_start + len(symptom_vector)

In [62]:
# enforce the order of the keys, and select just those we want
# SYMPTOM_CODE goes last, i.e. at position 5 of every row.
ordered_keys = parts + ['SYMPTOM_CODE']
df = df[ordered_keys]

In [63]:
# change to a bag because we're going to do some non dataframe friendly operations
# Each bag element is one row of df.
bag = df.to_bag()

In [65]:
# these are the columns in the order we want them:
# Index(['ENCOUNTER', 'LABEL', 'RACE', 'GENDER', 'AGE', 'SYMPTOM_CODE'], dtype='object')

In [86]:
# Swap the symptom code of a row for its slot in the final vector so the
# reduction step knows which position to switch on.
def append_index_to_symptom(row, label_map):
    """Return ``row`` as a list whose element 5 is replaced by its ``label_map`` entry.

    Raises ``KeyError`` when the symptom code is not a key of ``label_map``.
    """
    fields = list(row)
    # SYMPTOM_CODE is the sixth column, i.e. position 5.
    fields[5] = label_map[fields[5]]
    return fields

In [87]:
t_bag = bag.map(append_index_to_symptom, label_map=label_map)

In [101]:
def do_reduce(item, num_columns):
    """Collapse all rows of one encounter into a single one-hot encoded row.

    Parameters
    ----------
    item : tuple
        ``(encounter, rows)``. Each row holds five leading values followed by
        the output position of one symptom.
    num_columns : int
        Length of the output row.

    Returns
    -------
    tuple
        ``(encounter, reduction)``. ``reduction`` is a list of strings:
        positions 0-4 hold the stringified leading values (from the last row
        seen), every symptom position that occurred is ``'1'`` and all other
        positions are ``'0'``.
    """
    encounter, rows = item
    reduction = ['0'] * num_columns
    for row in rows:
        for idx, val in enumerate(row):
            if idx <= 4:
                reduction[idx] = str(val)
            else:
                # Position 5 carries the slot index of the symptom, not a value.
                reduction[val] = '1'
    return encounter, reduction

In [102]:
# group by the first column, i.e. the encounter
grouped = t_bag.groupby(lambda v: v[0])

In [16]:
# Width of one output row: one slot per entry of label_map.
num_columns = len(label_map)

In [108]:
# and the reduction ??
# do_reduce yields (encounter, row); the second map keeps only the row.
reduced_w_keys = grouped.map(do_reduce, num_columns=num_columns)
reduced = reduced_w_keys.map(lambda v: v[1])

In [109]:
# get it into a string so we can put it in text files
# Every row becomes one comma-separated line.
text_reduced = reduced.map(lambda v: ",".join(v))

In [110]:
# output dir
# The '*' in the pattern is filled in per output file; op_files holds the written paths.
csv_output = pathutils.get_data_file("exploration_II/output/parsed/data-*.csv")
op_files = text_reduced.to_textfiles(csv_output)

['/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/exploration_II/output/parsed/data-0.csv',
 '/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/exploration_II/output/parsed/data-1.csv',
 '/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/exploration_II/output/parsed/data-2.csv',
 '/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/exploration_II/output/parsed/data-3.csv']

In [114]:
# Header for the parsed files: the keys of label_map in insertion order.
column_names = list(label_map.keys())

In [115]:
# Read the first groupby output file back. The path is rebuilt through
# pathutils instead of a machine-specific absolute path, so the cell also
# runs outside the original author's home directory.
d1 = pd.read_csv(pathutils.get_data_file("exploration_II/output/parsed/data-0.csv"), names=column_names)

In [146]:
# Load the raw symptoms file with pandas; it is the reference for the checks that follow.
symp_pd = pd.read_csv(symptoms_csv)

## Very slow comparison
- did we get it right?
- compare read results, make sure the right symptoms were encoded!
```python
for val in d1.itertuples():
    encounterId = val[1]
    recorded_symptoms = []
    for idx, item in enumerate(val[6:]):
        if item == 1:
            recorded_symptoms.append(column_names[idx + 5])
    # get the symptoms that were present previously
    encounter_symptoms = symp_pd[symp_pd.ENCOUNTER == encounterId].SYMPTOM_CODE
    num_recorded_symptoms = len(recorded_symptoms)
    num_actual_symptoms = len(encounter_symptoms)
    assert num_recorded_symptoms == num_actual_symptoms, "Expected %d but got %d symptoms for %s" % (num_actual_symptoms, num_recorded_symptoms, encounterId)
    
    # make sure we have the same set
    recorded_symptoms = set(recorded_symptoms)
    encounter_symptoms = set(encounter_symptoms)
    same = recorded_symptoms & encounter_symptoms
    num_similar = len(same)
    assert num_similar == num_actual_symptoms, "Expected both recorded symptoms and actual symptoms to be the same"
```

In [None]:
# can we make the test faster !?
# Index the column names with the non-zero positions of each row instead of
# looping over every flag, and look encounters up in a pre-built groupby.
np_column_names = np.array(column_names)
grouped_symptoms = symp_pd.groupby('ENCOUNTER')
for val in d1.itertuples():
    # val[0] is the frame index, so val[1] is the first column (ENCOUNTER).
    encounterId = val[1]
    # val[6:] are the symptom flags; +5 skips the five leading column names.
    recorded_symptoms = np_column_names[np.nonzero(val[6:])[0] + 5]
    encounter_symptoms = grouped_symptoms.get_group(encounterId).SYMPTOM_CODE
    same_length = len(recorded_symptoms) == len(encounter_symptoms)
    same_elem = set(recorded_symptoms) == set(encounter_symptoms)
    assert same_length and same_elem, "Expected both to be the same"

So we know now that this method works. And it seemed much faster than previous attempts. 

Memory consumption was much better and it was (in my opinion) almost as fast if not faster than the pd approach

Can we try the foldBy approach? Would it be better (since it's supposed to avoid expensive operations as per the dask documentation)

We'll start all over so we get a fair comparison

In [228]:
# and now for foldby
# Re-read all three inputs so the foldby run starts from scratch.
fpatients = dd.read_csv(
    patients_csv,
    usecols=patient_columns,
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
    dtype={
        'GENDER': 'category',
        'RACE': 'category'
    }
)

fconditions = dd.read_csv(conditions_csv, usecols=condition_columns, parse_dates=['START'], infer_datetime_format=True)

fsymptoms = dd.read_csv(symptoms_csv, usecols=symptom_colums)
fsymptoms = fsymptoms.repartition(npartitions=4)

In [229]:
# Encode RACE as an integer code and GENDER as 0 ('F') / 1 (anything else).
fpatients['RACE'] = fpatients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# `np.bool` was only an alias of the builtin and is gone from NumPy >= 1.24.
fpatients['GENDER'] = fpatients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

In [230]:
# Label the conditions of THIS pipeline: the source column must come from
# fconditions, not from the `conditions` frame of the earlier groupby run.
fconditions['LABEL'] = fconditions['CODE'].apply(_label_txform, labels=condition_labels, meta=('CODE', np.uint16))

In [231]:
# Left-join patient attributes onto conditions, then left-join that onto the symptom rows by ENCOUNTER.
fdf = fconditions.merge(fpatients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
fdf = fsymptoms.merge(fdf, how='left', left_on='ENCOUNTER', right_on='ENCOUNTER', suffixes=('_symp', ''))

In [232]:
# AGE: time from BIRTHDATE to START, converted to months and divided by 12.
fdf['AGE'] = ((fdf['START'] - fdf['BIRTHDATE']).astype('timedelta64[M]')/12).astype(np.float32)

In [17]:
# Swap a symptom code for the value registered for it in label_map.
def transform_symptom_codes(item, label_map):
    """Return the ``label_map`` entry for ``item``, or ``None`` when it is absent."""
    return label_map.get(item)

In [234]:
# Replace every symptom code by its entry in label_map.
fdf['SYMPTOM_CODE'] = fdf['SYMPTOM_CODE'].apply(transform_symptom_codes, label_map=label_map, meta=('SYMPTOM_CODE', np.uint16))

In [236]:
# force the column order
# bin_op reads rows by position, so the order of ordered_keys matters.
fdf = fdf[ordered_keys]

In [237]:
# Display the columns to confirm the order.
fdf.columns

Index(['ENCOUNTER', 'LABEL', 'RACE', 'GENDER', 'AGE', 'SYMPTOM_CODE'], dtype='object')

In [238]:
# Starting accumulator for foldby: a row of num_columns '0' strings.
initial_entry = ['0'] * num_columns

In [244]:
def bin_op(v1, v2):
    """Fold one row ``v2`` into the running accumulator ``v1``.

    Row layout: ['ENCOUNTER', 'LABEL', 'RACE', 'GENDER', 'AGE', 'SYMPTOM_CODE'].

    An accumulator whose first element is still ``'0'`` is treated as
    untouched: it is copied (so the caller's list is not modified) and its
    first five positions are filled with the stringified values of ``v2``.
    Otherwise ``v1`` is updated in place. In both cases the slot named by
    ``v2[5]`` is set to ``'1'`` and the accumulator is returned.
    """
    if v1[0] != '0':
        acc = v1
    else:
        acc = list(v1)
        for pos in range(5):
            acc[pos] = str(v2[pos])
    acc[v2[5]] = '1'
    return acc

def combine_op(v1, v2):
    """Merge two partial accumulators for the same key.

    Every position from 5 onwards that is ``'1'`` in ``v2`` is switched on in
    ``v1``. ``v1`` is modified in place and returned; ``v2`` is left untouched.
    """
    for pos in range(5, len(v2)):
        if v2[pos] == '1':
            v1[pos] = '1'
    return v1

In [245]:
# Convert to a bag so rows can be folded with plain Python functions.
fbag = fdf.to_bag()

In [246]:
# Fold rows per encounter (key = first element); partial results are merged with combine_op.
folded_w_key= fbag.foldby(lambda v: v[0], binop=bin_op, initial=initial_entry, combine=combine_op)

In [247]:
# Drop the key and join every folded row into one comma-separated line.
text_folded = folded_w_key.map(lambda v: ",".join(v[1]))

In [248]:
# output dir
# Note: this rebinds op_files, which earlier held the groupby output paths.
fcsv_output = pathutils.get_data_file("exploration_II/output/parsed_foldby/data-*.csv")
op_files = text_folded.to_textfiles(fcsv_output)

In [253]:
# Read the first foldby output file back for verification.
fd1 = pd.read_csv(op_files[0], names=column_names)

In [254]:
# is it correct!
# Same check as for the groupby output; reuses np_column_names and grouped_symptoms from that cell.
for val in fd1.itertuples():
    # val[0] is the frame index, so val[1] is the first column (ENCOUNTER).
    encounterId = val[1]
    # val[6:] are the symptom flags; +5 skips the five leading column names.
    recorded_symptoms = np_column_names[np.nonzero(val[6:])[0] + 5]
    encounter_symptoms = grouped_symptoms.get_group(encounterId).SYMPTOM_CODE
    same_length = len(recorded_symptoms) == len(encounter_symptoms)
    same_elem = set(recorded_symptoms) == set(encounter_symptoms)
    assert same_length and same_elem, "Expected both to be the same"

Foldby was blazing fast!!! 

I'll add a timer for quantitative evaluation. But this was incredible speed.

**Update**

Sadly, although these methods work well on the LocalCluster, they seem to fail on the distributed scheduler. 

I am going to try for a dataframe only approach and see. I have an idea.

In [18]:
# let's start all over.
# Unlike the two earlier runs, RACE and GENDER are not read as categoricals here.
dfpatients = dd.read_csv(
    patients_csv,
    usecols=patient_columns,
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
)

dfconditions = dd.read_csv(conditions_csv, usecols=condition_columns, parse_dates=['START'], infer_datetime_format=True)

dfsymptoms = dd.read_csv(symptoms_csv, usecols=symptom_colums)
dfsymptoms = dfsymptoms.repartition(npartitions=4)



In [19]:
# Encode RACE as an integer code and GENDER as 0 ('F') / 1 (anything else).
dfpatients['RACE'] = dfpatients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# `np.bool` was only an alias of the builtin and is gone from NumPy >= 1.24.
dfpatients['GENDER'] = dfpatients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

In [20]:
# Encode every condition CODE as its integer label from condition_labels.
dfconditions['LABEL'] = dfconditions['CODE'].apply(_label_txform, labels=condition_labels, meta=('CODE', np.uint16))

In [28]:
# Replace every symptom code by its entry in label_map.
dfsymptoms['SYMPTOM_CODE'] = dfsymptoms['SYMPTOM_CODE'].apply(transform_symptom_codes, label_map=label_map, meta=('SYMPTOM_CODE', np.uint16))

In [29]:
# Left-join patient attributes onto conditions, then left-join that onto the symptom rows by ENCOUNTER.
comb = dfconditions.merge(dfpatients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
comb = dfsymptoms.merge(comb, how='left', left_on='ENCOUNTER', right_on='ENCOUNTER', suffixes=('_symp', ''))

In [30]:
# AGE: time from BIRTHDATE to START, converted to months and divided by 12, stored as uint8.
comb['AGE'] = ((comb['START'] - comb['BIRTHDATE']).astype('timedelta64[M]')/12).astype(np.uint8)

In [31]:
# Leading columns followed by SYMPTOM_CODE.
ordered_keys = parts + ['SYMPTOM_CODE']

In [32]:
# Keep only the columns listed in ordered_keys, in that order.
comb = comb[ordered_keys]

In [33]:
# Constant 1 per row; its per-encounter sum is the divisor used after the aggregation.
comb['counter'] = 1

In [34]:
# Keys of label_map as a list, so they can be addressed by position.
label_keys = list(label_map.keys())

In [36]:
# need to grow the data frame
# One 0/1 indicator column per symptom. The symptom entries of label_keys start
# right after the leading `parts` columns and run to the end of the list, so
# the bounds are derived from the data instead of hard-coding 5 and 381.
for idx in range(len(parts), len(label_keys)):
    comb[label_keys[idx]] = (comb.SYMPTOM_CODE == idx).astype(np.uint8)

In [37]:
# group by the encounterId
comb_grouped = comb.groupby('ENCOUNTER')

In [38]:
# Sum every column within each encounter.
sumed = comb_grouped.agg('sum')

In [39]:
# Divide the summed LABEL by the row count (assumes LABEL is the same on every row of an encounter — TODO confirm).
sumed['LABEL'] = (sumed['LABEL']/sumed['counter']).astype(np.uint16)

In [40]:
# Same division by the row count for RACE.
sumed['RACE'] = (sumed['RACE']/sumed['counter']).astype(np.uint8)

In [41]:
# Same division by the row count for AGE.
sumed['AGE'] = (sumed['AGE']/sumed['counter']).astype(np.uint8)

In [42]:
# Same division by the row count for GENDER. `np.bool` was only an alias of
# the builtin and is gone from NumPy >= 1.24, so the builtin is used directly.
sumed['GENDER'] = (sumed['GENDER']/sumed['counter']).astype(bool)

In [43]:
# and now we should have what we want ...
# but we got into the one dataframe issue again. And it'll be tricky to verify the results because we also lost the condition index. 
# but let's see ..
# Written next to the foldby output, with a `dfdata-` prefix.
dffcsv_output = pathutils.get_data_file("exploration_II/output/parsed_foldby/dfdata-*.csv")
sumed.to_csv(dffcsv_output)

['/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/exploration_II/output/parsed_foldby/dfdata-0.csv']

In [44]:
# This was quite fast too!
# but we also got the 1-partition business, so I don't know how this works on the cluster 
# but first let's see if this is correct!
# Raw symptoms, loaded with pandas as the reference for the check.
symp_pd = pd.read_csv(symptoms_csv)

In [45]:
# Read the dataframe-only output back. The path is rebuilt through pathutils
# instead of a machine-specific absolute path, so the cell also runs outside
# the original author's home directory.
dfsoln = pd.read_csv(pathutils.get_data_file("exploration_II/output/parsed_foldby/dfdata-0.csv"))

In [46]:
# Keys of label_map, as a list and as an array that supports index-array lookups.
column_names = list(label_map.keys())
np_column_names = np.array(column_names)

In [48]:
# Group the raw symptoms once so each encounter can be fetched with get_group.
grouped_symptoms = symp_pd.groupby('ENCOUNTER')

In [59]:
# Compare the symptoms recorded in every output row with the raw symptoms of that encounter.
for val in dfsoln.itertuples():
    # val[0] is the frame index, so val[1] is the first CSV column.
    encounterId = val[1]
    # val[8:] skips the index, the six ordered_keys columns and 'counter';
    # +5 skips the five leading names in np_column_names.
    recorded_symptoms = np_column_names[np.nonzero(val[8:])[0] + 5]
    encounter_symptoms = grouped_symptoms.get_group(encounterId).SYMPTOM_CODE
    same_length = len(recorded_symptoms) == len(encounter_symptoms)
    same_elem = set(recorded_symptoms) == set(encounter_symptoms)
    assert same_length and same_elem, "Expected both to be the same"

**Update**

The dataframe solution has one huge problem: memory explosion!

When you think about it, I am expanding every symptom, every single one of them only to sum them up in the end. 

If I could avoid this expansion and do it after grouping, then that would be great. 

It would of course require a different kind of symptom encoding so that I can use bit wise operators to recover the actual symptoms that were combined.

Memory conservation

In [368]:
import pandas as pd
import numpy as np
import json

In [369]:
import dask.dataframe as dd

In [370]:
from thesislib.utils import pathutils
from glob import glob

In [371]:
# Directory that receives the artefacts written by this notebook.
output_directory = pathutils.get_data_file("exploration_II/output")

# makedirs(exist_ok=True) also creates missing parent directories and is safe
# to re-run, unlike a separate isdir() check followed by mkdir().
os.makedirs(output_directory, exist_ok=True)

In [372]:
def extract_condition_symptom_from_modules(module_name):
    """Read one module JSON file and pull out its condition and its symptoms.

    Parameters
    ----------
    module_name : str
        Path to a JSON file that has a top-level ``states`` mapping.

    Returns
    -------
    tuple
        ``(condition_code, condition_name, symptoms)``. Code and name come from
        the first entry of ``codes`` on a ``ConditionOnset`` state (the last
        such state wins; both stay ``None`` when there is none). ``symptoms``
        maps the code of every ``Symptom`` state to its ``symptom`` name.
    """
    with open(module_name) as handle:
        states = json.load(handle).get('states')

    code, name = None, None
    found = {}
    for entry in states.values():
        kind = entry.get('type')
        if kind == 'Symptom':
            found[entry.get('symptom_code').get('code')] = entry.get('symptom')
        elif kind == 'ConditionOnset':
            first_code = entry.get('codes')[0]
            code = first_code.get('code')
            name = first_code.get('display')
    return code, name, found

In [373]:
# symptom db and conditions db files
# Both JSON files live in the notebook's output directory and act as a cache.
symptom_db_json = pathutils.get_data_file("exploration_II/output/symptom_db.json")
condition_db_json = pathutils.get_data_file("exploration_II/output/condition_db.json")

In [374]:
# Build the symptom/condition lookup tables once and cache them as JSON;
# later runs reload the cached files instead of re-parsing every module.
if not os.path.isfile(symptom_db_json) or not os.path.isfile(condition_db_json):
    module_files = glob(pathutils.get_data_file("exploration_II/data/all_modules/*.json"))
    symptom_db = {}
    condition_db = {}

    for file in module_files:
        c_code, c_name, syms = extract_condition_symptom_from_modules(file)
        # A module without a ConditionOnset state yields c_code=None. Storing it
        # would create a None key on this run (which cannot be sorted together
        # with string codes) but a "null" key once reloaded from JSON.
        if c_code is not None:
            condition_db[c_code] = c_name
        symptom_db.update(syms)
    with open(symptom_db_json, "w") as fp:
        json.dump(symptom_db, fp)
    with open(condition_db_json, "w") as fp:
        json.dump(condition_db, fp)
else:
    with open(symptom_db_json) as fp:
        symptom_db = json.load(fp)
    with open(condition_db_json) as fp:
        condition_db = json.load(fp)

In [375]:
# Sorted codes give every symptom and condition a position that does not change between runs.
symptom_vector = sorted(symptom_db.keys())
condition_codes = sorted(condition_db.keys())
# Condition code -> integer label (its position in the sorted list).
condition_labels = {code: idx for idx, code in enumerate(condition_codes)}

In [376]:
# Input CSV files (patients, conditions, symptoms) resolved through pathutils.
patients_csv = pathutils.get_data_file("exploration_II/data/new/patients.csv")
conditions_csv = pathutils.get_data_file("exploration_II/data/new/conditions.csv")
symptoms_csv = pathutils.get_data_file("exploration_II/data/new/symptoms.csv")

In [377]:
from dask.distributed import Client, progress
# Start a dask client with 2 workers, 2 threads each and a 4GB memory limit per worker.
client = Client(n_workers=2, threads_per_worker=2, memory_limit='4GB')
client

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


0,1
Client  Scheduler: tcp://127.0.0.1:63640  Dashboard: http://127.0.0.1:63641/status,Cluster  Workers: 2  Cores: 4  Memory: 8.00 GB


In [378]:
# Columns read from each CSV (passed as `usecols` to read_csv).
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'GENDER']
condition_columns = ['ENCOUNTER', 'PATIENT', 'CODE', 'START']
symptom_colums = ['ENCOUNTER', 'PATIENT', 'SYMPTOM_CODE']

In [379]:
def _race_txform(val):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(val)
def _label_txform(val, labels):
    return labels.get(val)

In [380]:
# let's start all over.
# Fresh reads of all three inputs for the memory-conserving variant.
df2patients = dd.read_csv(
    patients_csv,
    usecols=patient_columns,
    parse_dates=['BIRTHDATE'],
    infer_datetime_format=True,
)

df2conditions = dd.read_csv(conditions_csv, usecols=condition_columns, parse_dates=['START'], infer_datetime_format=True)

df2symptoms = dd.read_csv(symptoms_csv, usecols=symptom_colums)
df2symptoms = df2symptoms.repartition(npartitions=4)

In [381]:
# Encode RACE as an integer code and GENDER as 0 ('F') / 1 (anything else).
df2patients['RACE'] = df2patients['RACE'].apply(_race_txform, meta=('RACE', np.uint8))
# `np.bool` was only an alias of the builtin and is gone from NumPy >= 1.24.
df2patients['GENDER'] = df2patients['GENDER'].apply(lambda gender: 0 if gender == 'F' else 1, meta=('GENDER', bool))

In [382]:
# Encode every condition CODE as its integer label from condition_labels.
df2conditions['LABEL'] = df2conditions['CODE'].apply(_label_txform, labels=condition_labels, meta=('CODE', np.uint16))

In [383]:
# Left-join patient attributes onto conditions, then left-join that onto the symptom rows by ENCOUNTER.
# NOTE(review): the later cells of this section work on `comb`, not on df2 — confirm df2 is still needed.
df2 = df2conditions.merge(df2patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
df2 = df2symptoms.merge(df2, how='left', left_on='ENCOUNTER', right_on='ENCOUNTER', suffixes=('_symp', ''))

In [384]:
# AGE: time from BIRTHDATE to START, converted to months and divided by 12, stored as uint16.
df2['AGE'] = ((df2['START'] - df2['BIRTHDATE']).astype('timedelta64[M]')/12).astype(np.uint16)

In [385]:
from collections import OrderedDict

In [386]:
# Leading (non-symptom) columns of the output.
parts = ['ENCOUNTER', 'LABEL', 'RACE', 'GENDER', 'AGE']

In [387]:
# Give every symptom its own bit: the i-th symptom maps to 2**i, so a set of
# symptoms can be packed into one integer and recovered with bitwise AND.
label_map = OrderedDict({})
# dtype=object keeps the values as exact Python ints. Without it numpy infers
# the dtype from the values, so the result (and power.dtype, which is used as
# dask meta further down) would depend on how many symptoms there are.
power = np.array([2**idx for idx in range(len(symptom_vector))], dtype=object)
for idx, item in enumerate(symptom_vector):
    label_map[item] = power[idx]

In [388]:
# enforce the order of the keys, and select just those we want
# Leading columns followed by SYMPTOM_CODE.
ordered_keys = parts + ['SYMPTOM_CODE']

In [389]:
# One column per symptom plus the leading columns.
num_columns = len(symptom_vector) + len(parts)

In [390]:
# Swap a symptom code for the value registered for it in label_map.
def transform_symptom_codes(item, label_map):
    """Return the ``label_map`` entry for ``item``, or ``None`` when it is absent."""
    return label_map.get(item)

In [391]:
# NOTE(review): pp is not referenced by any later cell shown here; presumably a scratch check of how numpy stores a very large int — confirm before removing.
pp = np.array([2**376])

In [392]:
# Replace every symptom code by its bit value from label_map; meta reuses the dtype of `power`.
df2symptoms['SYMPTOM_CODE'] = df2symptoms['SYMPTOM_CODE'].apply(transform_symptom_codes, label_map=label_map, meta=('SYMPTOM_CODE', power.dtype))

In [393]:
# Left-join patient attributes onto conditions, then left-join that onto the bit-encoded symptom rows by ENCOUNTER.
comb = df2conditions.merge(df2patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
comb = df2symptoms.merge(comb, how='left', left_on='ENCOUNTER', right_on='ENCOUNTER', suffixes=('_symp', ''))

In [394]:
# AGE: time from BIRTHDATE to START, converted to months and divided by 12, stored as uint8.
comb['AGE'] = ((comb['START'] - comb['BIRTHDATE']).astype('timedelta64[M]')/12).astype(np.uint8)

In [395]:
# Keep only the columns listed in ordered_keys, in that order.
comb = comb[ordered_keys]

In [396]:
# Constant 1 per row; its per-encounter sum is the divisor used in map_agg.
comb['counter'] = 1

In [397]:
# group by the encounterId
comb_grouped = comb.groupby('ENCOUNTER')

In [398]:
# Sum every column per encounter; split_out=2 spreads the result over 2 output partitions.
# NOTE(review): summing the bit values only equals their bitwise OR if no symptom repeats within an encounter — confirm.
sumed = comb_grouped.agg('sum', split_out=2)

In [399]:
# Column dtypes of the aggregated frame, used as dask `meta`.
dtypes = {
    'LABEL': np.uint16,
    'RACE': np.uint8,
    'AGE': np.uint8,
    'GENDER': np.uint8,
    # `np.object` was only an alias of the builtin and is gone from NumPy >= 1.24.
    'SYMPTOM_CODE': object
}

In [400]:
def map_agg(df):
    """Turn per-encounter sums back into per-encounter values.

    LABEL, RACE, AGE and GENDER are each divided by ``counter`` and cast to
    their target integer type. The columns are overwritten on ``df`` itself.

    Returns
    -------
    DataFrame
        ``df`` restricted to LABEL, RACE, AGE, GENDER and SYMPTOM_CODE.
    """
    targets = (
        ('LABEL', np.uint16),
        ('RACE', np.uint8),
        ('AGE', np.uint8),
        ('GENDER', np.uint8),
    )
    for column, dtype in targets:
        df[column] = (df[column] / df['counter']).astype(dtype)
    return df[['LABEL', 'RACE', 'AGE', 'GENDER', 'SYMPTOM_CODE']]

In [401]:
# Apply map_agg to every partition; meta describes the resulting columns.
sumed = sumed.map_partitions(map_agg, meta=dtypes)

In [402]:
def check(val, comp):
    """Return 1 when ``val & comp`` is greater than zero, otherwise 0."""
    return 1 if (val & comp) > 0 else 0

In [403]:
def map_expand(df, label_map):
    """Unpack the bit-encoded SYMPTOM_CODE column into one 0/1 column per symptom.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``SYMPTOM_CODE`` column of integers.
    label_map : mapping
        New column name -> the bit value tested for that column.

    Returns
    -------
    DataFrame
        The same ``df`` object, with one column added per ``label_map`` entry.
    """
    def check_inner(val, comp):
        # 1 when the bitwise AND is greater than zero, else 0.
        return 1 if (val & comp) > 0 else 0

    for symptom, bit in label_map.items():
        df[symptom] = df['SYMPTOM_CODE'].apply(check_inner, comp=bit)
    return df

In [404]:
# meta for the expanded frame: the aggregated columns plus one uint8 column per symptom.
full_dtype = {**dtypes, **dict.fromkeys(label_map, np.uint8)}

In [405]:
# Apply map_expand to every partition, i.e. only after the rows were reduced to one per encounter.
sumed = sumed.map_partitions(map_expand, label_map=label_map, meta=full_dtype)

In [406]:
# for item in symptom_vector:
#     sumed[item] = sumed.SYMPTOM_CODE.apply(check, comp=label_map.get(item), meta=('SYMPTOM_CODE', np.uint8))

In [407]:
# and now we should have what we want ...
# but we got into the one dataframe issue again. And it'll be tricky to verify the results because we also lost the condition index. 
# but let's see ..
# The '*' in the pattern is filled in per output file.
dffcsv_output = pathutils.get_data_file("exploration_II/output/parsed_df/dfdata-*.csv")
sumed.to_csv(dffcsv_output)

['/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/exploration_II/output/parsed_df/dfdata-0.csv',
 '/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/exploration_II/output/parsed_df/dfdata-1.csv']

In [408]:
# This was quite fast too!
# but we also got the 1-partition business, so I don't know how this works on the cluster 
# but first let's see if this is correct!
# Raw symptoms, loaded with pandas as the reference for the check.
symp_pd = pd.read_csv(symptoms_csv)

In [409]:
# uint8 dtype for every symptom column. The name must not be `dd`: that would
# rebind the `dask.dataframe` alias and break every later `dd.read_csv` call.
symptom_dtypes = {item: np.uint8 for item in symptom_vector}

In [410]:
# Read the first output partition back. The path is rebuilt through pathutils
# instead of a machine-specific absolute path, so the cell also runs outside
# the original author's home directory.
dfsoln = pd.read_csv(pathutils.get_data_file("exploration_II/output/parsed_df/dfdata-0.csv"))

In [411]:
# Column names from position 7 onwards, used to translate flag positions back into symptom codes.
# NOTE(review): map_agg drops 'counter', so the first symptom column appears to sit at position 6 — confirm that 7 does not skip it.
np_column_names = np.array(dfsoln.columns)[7:]

In [412]:
# Group the raw symptoms once so each encounter can be fetched with get_group.
grouped_symptoms = symp_pd.groupby('ENCOUNTER')

In [413]:
# Compare the symptoms recorded in every output row with the raw symptoms of that encounter.
for val in dfsoln.itertuples():
    # val[0] is the frame index, so val[1] is the first CSV column.
    encounterId = val[1]
    # val[8:] lines up with np_column_names (columns from position 7 onwards).
    # NOTE(review): both offsets look copied from the variant that still had a
    # 'counter' column — confirm they do not skip the first symptom column.
    recorded_symptoms = np_column_names[np.nonzero(val[8:])[0]]
    encounter_symptoms = grouped_symptoms.get_group(encounterId).SYMPTOM_CODE
    same_length = len(recorded_symptoms) == len(encounter_symptoms)
    same_elem = set(recorded_symptoms) == set(encounter_symptoms)
    assert same_length and same_elem, "Expected both to be the same"

**Comments**

This method does indeed work. 

It is a pity that I had to resort to using an apply function to get the bitwise and working as opposed to pandas most likely more efficient solution for handling these kind of broadcast operations.

None the less, the question remains: will this fall within the memory requirements on the cluster ?

For sure this method takes much less memory than the previous attempt. And I can do even more adjustments to make some more room, but we'll have to see if we can make something that does indeed work.

In [414]:
# Close the dask client created earlier in this notebook.
client.close()