### Description

Due to the difficulty in parsing the previous format of the data, a more compact version of the data was generated.

This format eliminates the need for joins, merges, and groupby operations, which were the most troublesome operations in the previous method.

The data is also more compact (17G vs 60G) and memory is less likely to be an issue. 

Eliminating the shuffle-triggering operations also means that this operation is far more parallelizable and should be well within the capabilities of the processor.

In the worst case, if there are still problems, the data can easily be split and used as is.

It also readily lends itself to a Dask DataFrame solution (credit goes to some of the supposedly 'must work' solutions from the previous approach).

In [1]:
import os
import json

In [2]:
from dask_jobqueue import SLURMCluster
import dask.dataframe as dd
from dask.distributed import Client

In [3]:
import numpy as np
from glob import glob

In [4]:
import pandas as pd

In [5]:
# Both lookup tables live in the same JSON directory.
json_dir = "/home/oagba/bulk/data/kk/json"
symptom_db_json, condition_db_json = (
    os.path.join(json_dir, fname)
    for fname in ("symptom_db.json", "condition_db.json")
)

In [6]:
# Load the symptom and condition lookup tables from disk. JSON text is
# UTF-8, so decode explicitly instead of relying on the locale's default.
with open(symptom_db_json, encoding="utf-8") as fp:
    symptom_db = json.load(fp)
with open(condition_db_json, encoding="utf-8") as fp:
    condition_db = json.load(fp)

In [7]:
# Sorted orderings of the symptom and condition codes (the dict keys).
symptom_vector = sorted(symptom_db)
condition_codes = sorted(condition_db)
# A condition's label is its position in the sorted code list.
condition_labels = dict(zip(condition_codes, range(len(condition_codes))))

In [8]:
# Settings passed to SLURMCluster when creating the Dask cluster.
slurm_options = dict(
    queue='general',
    # project='medvice_parse',
    cores=16,
    memory='100 GB',
    walltime='02:00:00',
)
cluster = SLURMCluster(**slurm_options)

In [9]:
# Attach a distributed client to the cluster, then scale the cluster to 1.
# NOTE(review): whether `1` counts workers or SLURM jobs depends on the
# dask_jobqueue version in use — confirm.
client = Client(cluster)
cluster.scale(1)

In [10]:
# Directory holding the symptoms CSV, and the full path to that file.
csv_dir = "/shares/bulk/oagba/data/output_new_100k/symptoms/csv"
symptoms_csv = os.path.join(csv_dir, "symptoms.csv")

In [11]:
def _race_txform(val):
    """Return the integer code for a race name, or None if it is not known."""
    codes = dict(white=0, black=1, asian=2, native=3, other=4)
    if val in codes:
        return codes[val]
    return None

def _label_txform(val, labels):
    """Look up `val` in the `labels` mapping; return None when it is absent."""
    return labels[val] if val in labels else None

def _symptom_transform(val, labels):
    """Combine a ';'-separated list of symptom codes into one integer bitmask.

    Args:
        val: String of symptom codes separated by ';'.
        labels: Mapping from symptom code to its integer bit value.

    Returns:
        The bitwise OR of the values of every code listed in `val`.

    Raises:
        TypeError: If a code in `val` is missing from `labels` (the lookup
            yields None, which cannot be OR-ed into the mask).
    """
    mask = 0
    for code in val.split(";"):
        # OR rather than add: a code listed twice must not carry into the
        # neighbouring bit and corrupt the mask.
        mask |= labels.get(code)
    return mask

In [12]:
# Open the symptoms CSV as a Dask dataframe.
symptoms_df = dd.read_csv(symptoms_csv)



In [13]:
# Keep only the rows that report at least one symptom.
has_symptoms = symptoms_df['NUM_SYMPTOMS'] > 0
symptoms_df = symptoms_df.loc[has_symptoms]

In [14]:
# Do not repartition; keep the partitions produced by read_csv.
#symptoms_df = symptoms_df.repartition(npartitions=200)

In [15]:
# Encode each PATHOLOGY value as its integer label from condition_labels.
symptoms_df['LABEL'] = symptoms_df['PATHOLOGY'].apply(
    _label_txform,
    labels=condition_labels,
    meta=('PATHOLOGY', np.uint16),
)

In [16]:
# Replace each race name with its integer code.
symptoms_df['RACE'] = symptoms_df['RACE'].apply(
    _race_txform,
    meta=('RACE', np.uint8),
)

In [17]:
# Encode gender as 0 for 'F' and 1 for anything else.
symptoms_df['GENDER'] = symptoms_df['GENDER'].apply(
    lambda gender: 1 if gender != 'F' else 0,
    meta=('GENDER', np.uint8),
)

In [18]:
# Expose the AGE_BEGIN column under the shorter name AGE.
column_renames = {'AGE_BEGIN': 'AGE'}
symptoms_df = symptoms_df.rename(columns=column_renames)

In [19]:
from collections import OrderedDict

In [20]:
# Give every symptom code its own bit: the code at position i maps to 2**i.
symptom_index_map = OrderedDict(
    (code, 1 << position) for position, code in enumerate(symptom_vector)
)

In [21]:
# Collapse each row's ';'-separated SYMPTOMS string into a single integer
# built from the per-symptom bit values in symptom_index_map. The values
# are Python ints of arbitrary size, hence the object dtype in `meta`.
# The builtin `object` is used because the `np.object` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
symptoms_df['NSYMPTOMS'] = symptoms_df.SYMPTOMS.apply(
    _symptom_transform,
    labels=symptom_index_map,
    meta=('SYMPTOMS', object),
)

In [22]:
# now we grow the dataframe to the vector format that we want!
def handle_bit_wise(val, comp):
    """Return 1 if `val & comp` is greater than zero, otherwise 0."""
    shared_bits = val & comp
    return 1 if shared_bits > 0 else 0

In [23]:
def map_expand(df, vector):
    """Expand the NSYMPTOMS bitmask into one 0/1 column per symptom code.

    The incoming frame is left untouched: the flag columns are built
    separately and joined in a single concat, instead of being inserted
    into `df` one at a time.

    Args:
        df: pandas DataFrame with LABEL, GENDER, RACE, AGE and NSYMPTOMS
            columns.
        vector: Ordered symptom codes; the code at position i corresponds
            to bit 2**i of NSYMPTOMS.

    Returns:
        A new DataFrame with columns LABEL, GENDER, RACE, AGE followed by
        one column per entry of `vector`, holding 1 where that symptom's
        bit is set and 0 otherwise.
    """
    import pandas as pd

    def handle_bit_wise(val, comp):
        # 1 when the bit selected by `comp` is set in `val`, else 0.
        return 1 if val & comp > 0 else 0

    masks = df.NSYMPTOMS
    flag_columns = [
        masks.apply(handle_bit_wise, comp=2 ** idx).rename(code)
        for idx, code in enumerate(vector)
    ]
    leading = df[['LABEL', 'GENDER', 'RACE', 'AGE']]
    return pd.concat([leading, *flag_columns], axis=1)

In [24]:
# Column -> dtype mapping: the four fixed columns first, then one uint8
# entry per symptom code.
full_dtype = {
    'LABEL': np.uint16,
    'GENDER': np.uint8,
    'RACE': np.uint8,
    'AGE': np.uint16,
    **dict.fromkeys(symptom_vector, np.uint8),
}

In [25]:
# Run map_expand over every partition, declaring the output schema via meta.
symptoms_df = symptoms_df.map_partitions(
    map_expand,
    vector=symptom_vector,
    meta=full_dtype,
)

In [26]:
# Output filename pattern for the parsed CSV data.
# NOTE(review): the `*` is presumably replaced per partition by Dask's
# to_csv — confirm against the Dask version in use.
csv_op = "/home/oagba/bulk/data/output_new_100k/parsed/data-*.csv"

In [None]:
# Write the expanded dataframe out as CSV using the csv_op pattern.
symptoms_df.to_csv(csv_op)

In [None]:
# Close the distributed client.
client.close()

In [None]:
# Close the SLURM cluster object.
cluster.close()

In [None]:
# Display the CPU count reported by the OS for the machine running this code.
os.cpu_count()