In [2]:
# setup definition file for RL
import json
import os

In [3]:
# Path of the JSON file that is loaded into `synthea_def`. The SYNTHEA_FILE
# environment variable overrides the default, so the notebook is not tied to
# the directory layout of the machine it was authored on.
synthea_file = os.environ.get(
    "SYNTHEA_FILE",
    "/Users/teliov/TUD/symcat-to-synthea/output/ai-med-extended.json",
)

In [4]:
# Parse the file at `synthea_file` into `synthea_def`. The encoding is given
# explicitly so the result does not depend on the platform's default encoding.
with open(synthea_file, encoding="utf-8") as fp:
    synthea_def = json.load(fp)

In [5]:
import hashlib

In [6]:
# Names of the demographic buckets. The two open-ended age buckets carry an
# "l" / "g" marker ("age-l-1-years", "age-g-75-years") so that AGE_KEYS lists
# exactly the same key names that AGE_PROB uses.
AGE_KEYS = [
    "age-l-1-years", "age-1-4-years", "age-5-14-years", "age-15-29-years",
    "age-30-44-years", "age-45-59-years", "age-60-74-years", "age-g-75-years"
]
RACE_KEYS = [
    "black", "hispanic",
    "white", "other"
]
SEX_KEYS = ["male", "female"]

In [15]:
# Uniform distributions over the race and age buckets: four race buckets at
# 0.25 each and eight age buckets at 0.125 each. RACE_PROB is attached to
# every condition; AGE_PROB is what get_age_prob returns when a condition has
# no age data.
RACE_PROB = dict.fromkeys(("black", "white", "hispanic", "other"), 0.25)
AGE_PROB = dict.fromkeys(
    (
        "age-l-1-years",
        "age-1-4-years",
        "age-5-14-years",
        "age-15-29-years",
        "age-30-44-years",
        "age-45-59-years",
        "age-60-74-years",
        "age-g-75-years",
    ),
    0.125,
)

In [21]:
def get_prob(obj):
    """Convert one entry into a probability.

    Parameters
    ----------
    obj : dict
        Entry carrying an "odds" value and/or a "probability" value. The
        "probability" value is divided by 100, i.e. treated as a percentage.

    Returns
    -------
    float
        ``odds / (odds + 1)`` when "odds" is present and non-zero, otherwise
        ``probability / 100``. Odds of exactly 0 with no "probability" value
        yield 0.0.

    Raises
    ------
    ValueError
        If the entry has neither a usable "odds" nor a "probability" value.
    """
    odds = obj.get("odds")
    if odds:
        return odds / (odds + 1)

    # Missing or zero odds: prefer the percentage when one is given.
    probability = obj.get("probability")
    if probability is not None:
        return probability / 100

    if odds == 0:
        # Odds of zero are a legitimate value: 0 / (0 + 1) == 0.
        return 0.0

    raise ValueError(
        "entry has neither 'odds' nor 'probability': {!r}".format(obj)
    )

In [22]:
def get_sex_prob(sex):
    """Return the male/female distribution for a condition.

    Parameters
    ----------
    sex : dict
        Either empty, or a mapping with "sex-male" and "sex-female" entries
        that ``get_prob`` can convert.

    Returns
    -------
    dict
        ``{"male": p, "female": q}`` with ``p + q == 1``. An empty ``sex``
        mapping yields an even 0.5 / 0.5 split.

    Raises
    ------
    ZeroDivisionError
        If both converted probabilities are zero.
    """
    if len(sex) == 0:
        # No data for this condition: split evenly. Both entries must be 0.5
        # so that the distribution sums to 1.
        return {
            "male": 0.5,
            "female": 0.5
        }

    male_prob = get_prob(sex.get("sex-male"))
    female_prob = get_prob(sex.get("sex-female"))
    # Rescale so that the two values sum to 1.
    total = female_prob + male_prob
    return {
        "male": male_prob / total,
        "female": female_prob / total
    }

In [27]:
def do_hash(value):
    """Return the hexadecimal SHA-224 digest of ``value`` encoded as UTF-8."""
    hasher = hashlib.sha224()
    hasher.update(value.encode("utf-8"))
    return hasher.hexdigest()

In [17]:
def get_symptoms_prob(symptoms):
    """Return symptom probabilities keyed by hashed name and rescaled to sum to 1.

    Each key of ``symptoms`` is replaced by ``do_hash(key)`` and each value is
    converted with ``get_prob``; every result is then divided by the total.
    An empty mapping gives an empty result.
    """
    weights = {}
    for name, entry in symptoms.items():
        weights[do_hash(name)] = get_prob(entry)

    total = sum(weights.values())

    normalised = {}
    for hashed_name, weight in weights.items():
        normalised[hashed_name] = weight / total
    return normalised

In [16]:
def get_age_prob(age):
    """Return the age-bucket distribution for a condition.

    Parameters
    ----------
    age : dict
        Either empty, or a mapping from age-bucket key to an entry that
        ``get_prob`` can convert.

    Returns
    -------
    dict
        Bucket key -> probability, rescaled to sum to 1. When ``age`` is
        empty, a fresh copy of the uniform ``AGE_PROB`` default.

    Raises
    ------
    ZeroDivisionError
        If every converted probability is zero.
    """
    if len(age) == 0:
        # Return a copy rather than AGE_PROB itself: otherwise every condition
        # without age data would share one dict, and changing one of them
        # would silently change all the others and the module-level default.
        return dict(AGE_PROB)

    weights = {key: get_prob(entry) for key, entry in age.items()}
    total = sum(weights.values())
    return {key: weight / total for key, weight in weights.items()}

In [28]:
# Accumulator for the converted definition: one entry per hashed condition name.
rl_definition = dict()

In [29]:
# Convert every condition in `synthea_def` into an entry of `rl_definition`,
# keyed by do_hash() of the condition name. Missing "sex", "age" or "symptoms"
# sections are treated as empty mappings.
for condition_name, condition_data in synthea_def.items():
    condition_hash = do_hash(condition_name)

    sex_dist = get_sex_prob(condition_data.get("sex", {}))
    age_dist = get_age_prob(condition_data.get("age", {}))
    symptom_dist = get_symptoms_prob(condition_data.get("symptoms", {}))
    rl_definition[condition_hash] = {
        "age": age_dist,
        "sex": sex_dist,
        # Copy so that each condition owns its race distribution; storing
        # RACE_PROB itself would make every entry alias the same dict.
        "race": dict(RACE_PROB),
        "symptoms": symptom_dist
    }

In [30]:
# Show the assembled `rl_definition` mapping as this cell's output.
rl_definition

{'1eebbc48b667086fff2958b8419f68ac99bbcdb181c2ad835ece5abc': {'age': {'age-0-6-years': 0.03714113631800844,
   'age-7-13-years': 0.24242120056213612,
   'age-14-17-years': 0.14766111222646056,
   'age-18-25-years': 0.21481630194740012,
   'age-26-35-years': 0.16522786589038346,
   'age-36-64-years': 0.16261794820317207,
   'age-g-65-years': 0.030114434852439267},
  'sex': {'male': 0.547, 'female': 0.45299999999999996},
  'race': {'black': 0.25, 'white': 0.25, 'hispanic': 0.25, 'other': 0.25},
  'symptoms': {'a1b4c06c3ea6ab83d933b1980b6738a683aec3825b3f73a390c0c741': 0.07075471698113217,
   'f2976346f909cf2583e92512c2eb644cea2b0713d7e3132d3c0b7532': 0.22169811320754748,
   '7d063801e12b750d1176a339c4a5eebf4a39554e7502734e1355ec3d': 0.10377358490566052,
   '884f9b6946d3dd9a0d4124a93eb3c83983ab55e8909b8233f9f754d3': 0.17688679245283043,
   'dd52980213ed3f58007375b494cf13182420dd104acf39cb84c683ab': 0.12735849056603793,
   '87835ab9d2d63526b21f44ff91c065b1c834e9863baabdd1dcbf39a9': 0.23584