In this notebook, we select the Synthea JSON modules we are interested in and extract all of the symptom definitions (and also the conditions) they contain.

In [1]:
# So we can use the *thesislib* package
import sys
import os

# Resolve the parent directory and register it on the import path exactly once.
module_path = os.path.abspath("..")

if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import json
import re
import hashlib
from collections import deque
import copy

In [3]:
from thesislib.utils import pathutils

In [19]:
# input files: the Synthea modules this notebook works on
asthma_json = pathutils.get_data_file("simple-synthea/data/synthea-modules/asthma.json")
bronchitis_json = pathutils.get_data_file("simple-synthea/data/synthea-modules/bronchitis.json")
sinusitis_json = pathutils.get_data_file("simple-synthea/data/synthea-modules/sinusitis.json")
sore_throat_json = pathutils.get_data_file("simple-synthea/data/synthea-modules/sore_throat.json")
urinary_infection_json = pathutils.get_data_file("simple-synthea/data/synthea-modules/urinary_tract_infections.json")

# output dir: where the transformed modules are written.
# makedirs also creates missing parent directories and, with exist_ok=True,
# is a no-op when the directory already exists (no check-then-create race).
output_dir = pathutils.get_data_file("simple-synthea/output/modules")
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Every module file handled by the cells below, in processing order.
json_files = [
    asthma_json,
    bronchitis_json,
    sinusitis_json,
    sore_throat_json,
    urinary_infection_json,
]

In [7]:
# Accumulators filled while parsing the module files (two distinct lists).
symptoms, conditions = [], []

In [13]:
def parse_module(jsonfile):
    """Collect the symptom names and condition-onset states of a Synthea module.

    Args:
        jsonfile: Path to a Synthea module JSON file.

    Returns:
        A ``(symptoms, conditions)`` tuple of lists. ``symptoms`` holds the
        ``"symptom"`` value of every state whose type is ``"Symptom"``;
        ``conditions`` holds the key of every state whose type is
        ``"ConditionOnset"``. Both follow the order of the ``"states"`` object.

    Raises:
        ValueError: If the module has no ``"states"`` entry.
    """
    with open(jsonfile, encoding="utf-8") as fp:
        data = json.load(fp)

    states = data.get("states")
    # Fail early with a clear message instead of an AttributeError on
    # ``None.items()`` further down.
    if states is None:
        raise ValueError("No valid states in synthea module")

    symptoms = []
    conditions = []
    for key, state in states.items():
        state_type = state.get("type")
        if state_type == "Symptom":
            symptoms.append(state.get("symptom"))
        elif state_type == "ConditionOnset":
            conditions.append(key)
    return symptoms, conditions

def transform_symptom(symptom):
    """Normalise a symptom name into a lowercase, underscore-separated token.

    The name is lowercased, surrounding whitespace is removed, and every
    remaining run of whitespace is replaced by a single underscore.
    """
    normalised = symptom.lower().strip()
    return re.sub(r"\s+", "_", normalised)

def get_next_states(current):
    """Return the names of the states directly reachable from ``current``.

    The first key of ``current`` containing the text ``"transition"`` is
    taken as the state's transition definition.

    Args:
        current: A state definition (dict) from a Synthea module.

    Returns:
        A list of target state names, in definition order. The list is empty
        when the state has no transition key or its value is ``None``.

    Raises:
        ValueError: If the transition key is not one of
            ``direct_transition``, ``distributed_transition``,
            ``conditional_transition`` or ``complex_transition``.
    """
    transition_name = next((key for key in current if "transition" in key), None)
    if transition_name is None:
        return []

    transition = current.get(transition_name)
    if transition is None:
        return []

    if transition_name == "direct_transition":
        return [transition]

    if transition_name in ("distributed_transition", "conditional_transition"):
        return [item["transition"] for item in transition]

    if transition_name == "complex_transition":
        targets = []
        for item in transition:
            distributions = item.get("distributions")
            if distributions is not None:
                targets.extend(entry.get("transition") for entry in distributions)
            elif "transition" in item:
                # An entry may name its target directly instead of listing
                # distributions; iterating a missing list would fail.
                targets.append(item["transition"])
        return targets

    raise ValueError("Invalid transition: %s found!" % transition_name)
        
def transform_state(file):
    """Load a Synthea module and annotate its symptom states.

    Starting from every ``ConditionOnset`` state, the state graph is walked
    breadth-first along the transitions reported by ``get_next_states``.
    Each ``Symptom`` state reached this way is replaced by a copy whose
    ``"symptom"`` is the normalised name (``transform_symptom``), that
    carries ``"symptom_code"`` and ``"value_code"`` entries built from the
    module-level ``symptom_hash_dict``, and whose ``"condition_codes"`` lists
    the first code of every condition onset it is reachable from (one entry
    per distinct ``"code"`` value).

    Args:
        file: Path to a Synthea module JSON file.

    Returns:
        The parsed module (dict) with the transformed symptom states merged
        into its ``"states"`` mapping.

    Raises:
        ValueError: If a transition points to a state that is not defined in
            the module, or if ``get_next_states`` rejects a transition.
    """
    with open(file, encoding="utf-8") as fp:
        data = json.load(fp)

    states = data.get("states")

    # Map each condition-onset state name to the first of its codes.
    condition_onsets = {
        name: state.get("codes")[0]
        for name, state in states.items()
        if state.get("type") == "ConditionOnset"
    }

    transformed_symptoms = {}

    # Walk the graph once per condition so each symptom learns every
    # condition it can follow from.
    for onset_name, c_code in condition_onsets.items():
        queue = deque([onset_name])
        visited = set()
        while queue:
            name = queue.popleft()
            # A state reachable along several paths can be queued more than
            # once before it is first processed; handle it only once.
            if name in visited:
                continue
            visited.add(name)

            current = states.get(name)
            if current is None:
                raise ValueError(
                    "State %s is referenced by a transition but not defined" % name
                )

            if current.get("type") == "Symptom":
                symptom = transformed_symptoms.get(name)
                if symptom is None:
                    # First time this symptom is seen: build its transformed
                    # copy, leaving the original state untouched for now.
                    symptom = copy.deepcopy(current)
                    symptom_name = transform_symptom(symptom.get("symptom"))
                    symptom_hash = symptom_hash_dict.get(symptom_name)
                    symptom.update({
                        "symptom": symptom_name,
                        "symptom_code": {
                            "system": "sha224",
                            "code": symptom_hash,
                            "display": "is there %s" % symptom_name
                        },
                        "value_code": {
                            "system": "sha224",
                            "code": symptom_hash,
                            "display": "%s (finding)" % symptom_name
                        },
                        "condition_codes": []
                    })
                    transformed_symptoms[name] = symptom

                # Record the condition unless a code with the same value is
                # already attached to this symptom.
                condition_codes = symptom["condition_codes"]
                already_listed = any(
                    code.get("code") == c_code.get("code")
                    for code in condition_codes
                )
                if not already_listed:
                    condition_codes.append(c_code)

            for item in get_next_states(current):
                if item not in visited:
                    queue.append(item)

    # we can now update all the states
    states.update(transformed_symptoms)

    data["states"] = states
    return data

In [14]:
# Gather the symptoms and condition states of every module into the
# notebook-level accumulators.
for module_file in json_files:
    module_symptoms, module_conditions = parse_module(module_file)
    symptoms.extend(module_symptoms)
    conditions.extend(module_conditions)

In [15]:
# Normalise every collected symptom name (duplicates are still present here).
transformed_symptoms = list(map(transform_symptom, symptoms))

In [16]:
# De-duplicate the normalised names and keep them in sorted order.
unique_symptoms = sorted(set(transformed_symptoms))

In [17]:
# Map each unique symptom name to the hex SHA-224 digest of its UTF-8 bytes.
symptom_hash_dict = {
    name: hashlib.sha224(name.encode("utf-8")).hexdigest()
    for name in unique_symptoms
}

In [20]:
# make the synthea module files ready for plug-in into the patientcondition/condition symptom branch of synthea
for file in json_files:
    data = transform_state(file)
    # basename honours the platform's path separator, unlike splitting on "/"
    filename = os.path.basename(file)
    # Each transformed module is written under its original file name.
    with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as fp:
        json.dump(data, fp, indent=4)