In [1]:
# So we can use the *thesislib* package
import sys
import os

# Resolve the parent directory once and register it on the import path
# (only if it is not already listed) so local packages can be imported.
module_path = os.path.abspath("..")

if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import json
import numpy as np
import joblib

In [3]:
from thesislib.utils import pathutils

In [4]:
def _load_json(relative_path):
    """Resolve ``relative_path`` via ``pathutils.get_data_file`` and parse the file as JSON."""
    with open(pathutils.get_data_file(relative_path)) as handle:
        return json.load(handle)


condition_codes = _load_json("prob-synthea-1/data/condition_codes.json")
condition_db = _load_json("prob-synthea-1/data/conditions_db.json")
symptom_vector = _load_json("prob-synthea-1/data/symptom_vector.json")
symptoms_db = _load_json("prob-synthea-1/data/symptoms_db.json")

# Map every condition code to its position in `condition_codes`.
condition_label = {code: position for position, code in enumerate(condition_codes)}

In [86]:
# Read only the columns listed below from each CSV.
patient_columns = ['Id', 'BIRTHDATE', 'RACE', 'ETHNICITY', 'GENDER']
patients = pd.read_csv(
    pathutils.get_data_file("prob-synthea-1/data/patients.csv"),
    usecols=patient_columns,
)

condition_columns = ['Id', 'PATIENT', 'CODE', 'ONSET']
conditions = pd.read_csv(
    pathutils.get_data_file("prob-synthea-1/data/patient_conditions.csv"),
    usecols=condition_columns,
)

In [88]:
# Attach the patient attributes to every condition row; the patient table's
# own `Id` column is kept as `Id_pat` so it does not clash with the condition `Id`.
patient_conditions = pd.merge(
    conditions,
    patients,
    how='left',
    left_on='PATIENT',
    right_on='Id',
    suffixes=('', '_pat'),
)

In [12]:
import os
from dateutil.parser import parse as date_parser
from dateutil.relativedelta import relativedelta

In [92]:
symptom_dir = pathutils.get_data_file("prob-synthea-1/data/symptoms")

# Keep every directory entry that is not itself a directory, ordered by name
# so that the index assigned to each file is stable between runs.
_temp_list = sorted(
    entry
    for entry in os.listdir(symptom_dir)
    if not os.path.isdir(os.path.join(symptom_dir, entry))
)

# One (index, full path, is-first-file) tuple per symptom file.
symptoms_list = [
    (position, os.path.join(symptom_dir, name), position == 0)
    for position, name in enumerate(_temp_list)
]

In [74]:
output_path = pathutils.get_data_file("prob-synthea-1/data/processed")

# `exist_ok=True` replaces the racy isdir()/mkdir() pair and also creates any
# missing parent directory instead of failing with FileNotFoundError.
os.makedirs(output_path, exist_ok=True)

In [90]:
def parse_symptoms(patient_conditions, condition_labels, symptom_vector, output_path, file_data):
    """Turn one symptom CSV into a per-condition design matrix saved as JSON.

    Every distinct ``CONDITION_ID`` in the symptom file becomes one sample:
    a 0/1 indicator for each entry of ``symptom_vector`` plus the ``label``,
    ``age``, ``gender`` and ``race`` taken from the matching row of
    ``patient_conditions``. The matrix is stored column-wise (a dict of equally
    long lists) in ``<output_path>/processed_<file_index>.json``.

    Parameters
    ----------
    patient_conditions : pandas.DataFrame
        Must provide the columns ``Id`` (joined against ``CONDITION_ID``),
        ``ONSET``, ``BIRTHDATE``, ``GENDER``, ``RACE`` and ``CODE``.
    condition_labels : mapping
        Maps a ``CODE`` value to the label stored for the sample.
    symptom_vector : iterable
        Symptom codes that become indicator columns of the design matrix.
    output_path : str
        Directory the JSON file is written into.
    file_data : tuple
        ``(file_index, symptom_file, pass_columns)``. When ``pass_columns`` is
        true the column names are supplied to ``read_csv`` explicitly instead
        of being read from the file's first row.

    Returns
    -------
    None
        The result is written to disk. If the symptom file contains no rows,
        nothing is written.

    Raises
    ------
    KeyError
        If a ``RACE`` or ``CODE`` value is missing from its lookup table, or a
        symptom code in the file is not part of ``symptom_vector``.
    """
    file_index, symptom_file, pass_columns = file_data

    race_code = {'white': 0, 'black': 1, 'asian': 2, 'native': 3, 'other': 4}

    columns = ["CONDITION_ID", "PATIENT", "SYMPTOM_CODE", "SYMPTOM_DISPLAY", "VALUE_CODE", "VALUE_DISPLAY"]
    usecols = ['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE']

    if pass_columns:
        # NOTE(review): passing `names` makes pandas treat the first line as
        # data, so this presumably targets a header-less file - confirm.
        symptoms = pd.read_csv(symptom_file, names=columns, usecols=usecols)
    else:
        symptoms = pd.read_csv(symptom_file, usecols=usecols)

    # Skip files without any symptom rows. The previous check looked at
    # shape[1] (the column count, fixed by `usecols`) and so never triggered.
    if symptoms.empty:
        return None

    merged = symptoms.merge(
        patient_conditions,
        how='left',
        left_on='CONDITION_ID',
        right_on='Id',
        suffixes=('_symp', ''),
    )

    # Column order: the four fixed features first, then one column per symptom.
    design_matrix = {
        "label": [],
        "age": [],
        "gender": [],
        "race": [],
    }
    for item in symptom_vector:
        design_matrix[item] = []

    for _, group in merged.groupby('CONDITION_ID'):
        vector = {code: 0 for code in symptom_vector}

        # All rows of a group describe the same condition, so the first row
        # is used for the patient/condition attributes.
        onset_date = date_parser(group['ONSET'].iloc[0])
        patient_birthdate = date_parser(group["BIRTHDATE"].iloc[0])
        # NOTE(review): calendar-year difference only; month/day are ignored,
        # so this can differ by one from the exact age at onset.
        vector['age'] = abs(patient_birthdate.year - onset_date.year)
        vector['gender'] = 0 if group['GENDER'].iloc[0] == 'F' else 1
        vector['race'] = race_code[group['RACE'].iloc[0]]
        vector['label'] = condition_labels[group['CODE'].iloc[0]]

        for symptom_code in group["SYMPTOM_CODE"]:
            vector[symptom_code] = 1

        # A symptom code outside `symptom_vector` has no column and raises
        # KeyError here rather than being dropped silently.
        for key, value in vector.items():
            design_matrix[key].append(value)

    output_file = os.path.join(output_path, "processed_%d.json" % file_index)
    with open(output_file, 'w') as fp:
        json.dump(design_matrix, fp)

In [81]:
from joblib import Parallel, delayed

In [91]:
# Run parse_symptoms once per (index, path, is-first-file) entry of
# `symptoms_list`, using two parallel joblib workers.
_jobs = (
    delayed(parse_symptoms)(patient_conditions, condition_label, symptom_vector, output_path, file_data)
    for file_data in symptoms_list
)
_ = Parallel(n_jobs=2)(_jobs)

In [83]:
# NOTE(review): `_syms` is not defined in any cell visible here, so this
# inspection cell presumably relies on leftover kernel state - confirm or remove.
_syms.columns

Index(['CONDITION_ID', 'PATIENT', 'SYMPTOM_CODE', 'SYMPTOM_DISPLAY',
       'VALUE_CODE', 'VALUE_DISPLAY'],
      dtype='object')