In [1]:
%matplotlib inline

In [2]:
# So we can use the *thesislib* package
import sys
import os

# Put the parent directory on the import path exactly once, so re-running
# this cell never appends a duplicate entry.
module_path = os.path.abspath("..")
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
from thesislib.utils import pathutils
import json

In [4]:
import pandas as pd
import numpy as np

In [31]:
from collections import OrderedDict

In [103]:
# Locations of the symptom db and condition db JSON files,
# resolved through the project's pathutils.get_data_file helper.
symptom_db_json = pathutils.get_data_file("exploration_II/output/symptom_db.json")
condition_db_json = pathutils.get_data_file("exploration_II/output/condition_db.json")

In [104]:
# Parse both JSON files. Later cells call .keys() on both objects and index
# symptom_db by code, so they are presumably dicts keyed by code — TODO confirm.
with open(symptom_db_json) as fp:
    symptom_db = json.load(fp)
with open(condition_db_json) as fp:
    condition_db = json.load(fp)

In [105]:
# Fix a deterministic (sorted) order for the symptom and condition codes.
# A condition's position in its sorted list becomes its integer class label.
symptom_vector = sorted(symptom_db)
condition_codes = sorted(condition_db)
condition_labels = dict(zip(condition_codes, range(len(condition_codes))))

In [106]:
# CSV with the raw records that this notebook converts to the vector format.
symptoms_csv = pathutils.get_data_file("04_06_new_data/data/symptoms.csv")

In [107]:
# Columns used by the cells below: NUM_SYMPTOMS, PATHOLOGY, RACE, GENDER,
# AGE_BEGIN and SYMPTOMS.
symptoms_df = pd.read_csv(symptoms_csv)

In [108]:
def _race_txform(val):
    race_code = {'white': 0, 'black':1, 'asian':2, 'native':3, 'other':4}
    return race_code.get(val)
def _label_txform(val, labels):
    return labels.get(val)

In [109]:
# Drop the rows that have no symptoms at all (keep only NUM_SYMPTOMS > 0).
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a slice of the original frame
# (which is what triggers pandas' SettingWithCopyWarning).
symptoms_df = symptoms_df.loc[symptoms_df.NUM_SYMPTOMS > 0].copy()

In [110]:
# Integer class label per row. A PATHOLOGY code that is missing from
# condition_labels yields None, because _label_txform uses dict.get.
symptoms_df['LABEL'] = symptoms_df.PATHOLOGY.apply(_label_txform, labels=condition_labels)

In [111]:
# Replace each race name with its integer code; names outside
# _race_txform's table become None.
symptoms_df.RACE = symptoms_df.RACE.apply(_race_txform)

In [112]:
# Binary-encode gender: 'F' -> 0, every other value -> 1.
# NOTE(review): "every other value" includes missing or unexpected entries,
# which are therefore silently encoded as 1 — confirm this is intended.
symptoms_df.GENDER = symptoms_df.GENDER.apply(lambda gender: 0 if gender == 'F' else 1)

In [113]:
# The raw file names the column AGE_BEGIN; the output columns selected
# further down use the name AGE.
symptoms_df = symptoms_df.rename(columns={'AGE_BEGIN': 'AGE'})

In [114]:
# Give every symptom code its own bit: the code at position idx of
# symptom_vector maps to 1 << idx, so a whole set of symptoms can be stored
# as one integer.
symptom_index_map = OrderedDict(
    (code, 1 << idx) for idx, code in enumerate(symptom_vector)
)

In [115]:
# next transform the symptom column
def _symptom_transform(val, labels):
    if type(val) is not str:
        print(val)
    parts = val.split(";")
    res = sum([labels.get(item) for item in parts])
    return res

In [116]:
# NSYMPTOMS holds, per row, one integer built from the bit values
# (symptom_index_map) of the codes in that row's ';'-separated SYMPTOMS string.
symptoms_df['NSYMPTOMS'] = symptoms_df.SYMPTOMS.apply(_symptom_transform, labels=symptom_index_map)

In [64]:
# now we grow the dataframe to the vector format that we want!
def handle_bit_wise(val, comp):
    """Return 1 when ``val & comp`` is positive, otherwise 0."""
    shared_bits = val & comp
    return 1 if shared_bits > 0 else 0

In [119]:
# Expand the NSYMPTOMS integer into one 0/1 indicator column per symptom code.
# Bit idx belongs to symptom_vector[idx], the same enumeration that built
# symptom_index_map.
for idx, code in enumerate(symptom_vector):
    val = 2**idx  # mask that selects this symptom's bit
    is_present = (symptoms_df.NSYMPTOMS & val).gt(0)
    symptoms_df[code] = is_present.astype(np.uint8)

In [122]:
# Spot check: show the symptom_db entry for `code`. This relies on `code`
# still holding the last value assigned by the preceding per-symptom loop.
symptom_db[code]

'Vaginal bleeding after menopause'

In [123]:
# Output column order: the label and the three demographic fields first,
# then one column per symptom code in symptom_vector order.
ordered_keys = ['LABEL', 'GENDER', 'RACE', 'AGE']
ordered_keys.extend(symptom_vector)

In [124]:
# Keep only the columns listed in ordered_keys, in that order. Every other
# column (e.g. PATHOLOGY, SYMPTOMS, NSYMPTOMS, NUM_SYMPTOMS) is dropped here.
symptoms_df = symptoms_df[ordered_keys]

In [125]:
# and we can save to csv
output_dir = pathutils.get_data_file("04_06_new_data/output")

# makedirs also creates any missing parent directories and, with
# exist_ok=True, does nothing when the directory already exists, so there is
# no check-then-create race.
os.makedirs(output_dir, exist_ok=True)

In [126]:
# Destination file for the vectorised data set, inside output_dir.
output_csv = os.path.join(output_dir, "data.csv")

In [127]:
# NOTE: to_csv writes the row index by default, so the file gets an unnamed
# index column ahead of LABEL. Anything that reads the file back by column
# position has to account for that extra column.
symptoms_df.to_csv(output_csv)

In [75]:
# is this parsing correct?
# hmm, there's actually no way to compare row by row because there is no unique identifier; either an encounter ID or a condition ID would be useful ...
# but there is no reason why the parsing should not be correct
# what we can do is group by condition and build a sorted list of per-row symptom counts; each element should be the same in the raw and the parsed data!

In [128]:
# Reload the file that was just written, so the check below runs against
# the on-disk result rather than the in-memory frame.
parsed_df = pd.read_csv(output_csv)

In [129]:
# Fresh, untransformed copy of the input CSV to compare against.
raw_df = pd.read_csv(symptoms_csv)

In [130]:
# Same "at least one symptom" filter that was applied before parsing, so both
# frames describe the same rows.
raw_df = raw_df[raw_df["NUM_SYMPTOMS"] > 0]

In [131]:
# Distinct integer labels present in the parsed data; each one is an index
# into condition_codes.
unique_conditions = parsed_df.LABEL.unique()

In [132]:
# For every condition, the sorted per-row symptom counts of the parsed data
# must equal the sorted NUM_SYMPTOMS values of the raw data for that condition.
for condition in unique_conditions:
    # LABEL is the position of the condition code in the sorted condition_codes list
    code = condition_codes[condition]

    num_orig_symptoms = sorted(raw_df.loc[raw_df.PATHOLOGY == code].NUM_SYMPTOMS.values)

    df = parsed_df.loc[parsed_df.LABEL == condition]
    # Select the symptom columns by name rather than by position, so the count
    # does not depend on how many leading columns the CSV has (saved index,
    # LABEL, GENDER, RACE, AGE) and no symptom column is skipped by accident.
    # Each symptom column is 0/1, so the row sum is the number of symptoms.
    num_parsed_symptoms = sorted(df[symptom_vector].sum(axis=1).tolist())

    assert num_parsed_symptoms == num_orig_symptoms, "Expected both to be same"

In [96]:
# so at least we know that we catch the same number of symptoms