In [1]:
# parse the body parts
from thesislib.utils import stringutils

In [2]:
import pathlib
import os

In [3]:
# data dir for this advanced nlice parsing
# Set the NLICE_DATA_DIR environment variable to run the notebook on another machine;
# without it the original location is used.
data_dir = os.environ.get(
    "NLICE_DATA_DIR",
    "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/definitions/nlice-adv",
)
pathlib.Path(data_dir).mkdir(exist_ok=True, parents=True)

In [4]:
# body-parts.txt sits in the definitions directory, i.e. the parent of data_dir;
# deriving it from data_dir avoids repeating the absolute path.
body_parts_file = str(pathlib.Path(data_dir).parent / "body-parts.txt")

In [5]:
# Read the raw body-part definitions, one list entry per line of the file.
with open(body_parts_file) as body_parts_handle:
    body_lines = list(body_parts_handle)

In [6]:
# Group sub-parts under their main body part.
# A line starting with "s-" opens a new main body part; every following line is a
# sub-part stored as "<main>_<sub>" in that main part's list.
body_parts = {}
main_body_part = None
for line in body_lines:
    line = line.strip()
    if not line:
        # blank lines would otherwise be slugified and stored as a sub-part
        continue
    if line.startswith("s-"):
        main_body_part = stringutils.slugify(line[2:])
        body_parts[main_body_part] = []
        continue
    if main_body_part is None:
        # a sub-part cannot be attached before the first "s-" header has been seen
        raise ValueError(
            "body part %r appears before any 's-' main body part line" % line
        )
    body_part_name = "_".join([main_body_part, stringutils.slugify(line)])
    body_parts[main_body_part].append(body_part_name)

In [7]:
import json
# Persist the parsed body-part hierarchy as indented JSON.
parsed_body_parts_file = os.path.join(data_dir, "body-parts.json")
with open(parsed_body_parts_file, "w") as out_file:
    json.dump(body_parts, out_file, indent=4)

In [8]:
# Numeric encoding for locations.
# 0 is reserved for "symptom not present" and 1 for "symptom present, no location
# specified", so real locations are numbered from 2 upwards: each main body part
# first, immediately followed by its sub-parts.
body_parts_encoded = {}
count = 2
for main_part, sub_parts in body_parts.items():
    for location_name in [main_part, *sub_parts]:
        body_parts_encoded[location_name] = count
        count += 1

In [9]:
# Persist the location encoding as indented JSON.
encoded_body_parts_file = os.path.join(data_dir, "body-parts-enc.json")
with open(encoded_body_parts_file, "w") as out_file:
    json.dump(body_parts_encoded, out_file, indent=4)

In [10]:
# get all the possible nature values
# The condition definitions sit in the definitions directory, i.e. the parent of
# data_dir; deriving the path from data_dir avoids repeating the absolute path.
condition_definition_file = str(pathlib.Path(data_dir).parent / "ai-nlice-adv.json")

In [11]:
# Load the condition definitions from their JSON file.
with open(condition_definition_file) as definition_handle:
    condition_definition = json.loads(definition_handle.read())

In [12]:
# Every NLICE value seen across all conditions, one set per field.
all_nature = set()
all_excitation = set()
all_vas = set()
all_location = set()
all_frequency = set()

# The same values, grouped per symptom.
by_symptom = dict()

In [13]:
# Walk every condition and record which NLICE values each symptom uses,
# both per symptom (by_symptom) and across all symptoms (the all_* sets).
for condition_data in condition_definition.values():
    symptoms = condition_data.get("symptoms")
    for symptom in symptoms:
        if symptom not in by_symptom:
            by_symptom[symptom] = {
                field: set()
                for field in (
                    "nature",
                    "location",
                    "location_main",
                    "vas",
                    "excitation",
                    "frequency",
                )
            }
        symptom_entry = by_symptom[symptom]
        nlice = symptoms.get(symptom).get("nlice", {})

        # location_main is a single value, not a mapping like the other fields
        location_main = nlice.get("location_main", None)
        if location_main:
            symptom_entry["location_main"].add(location_main)

        for field, seen_everywhere in (
            ("nature", all_nature),
            ("location", all_location),
            ("vas", all_vas),
            ("excitation", all_excitation),
            ("frequency", all_frequency),
        ):
            for value in nlice.get(field, {}).keys():
                symptom_entry[field].add(value)
                seen_everywhere.add(value)

In [14]:
# Replace every collected set by a list, in place, ahead of the JSON dump.
for obj in by_symptom.values():
    for key, collected in obj.items():
        obj[key] = list(collected)

In [15]:
# Persist the per-symptom NLICE overview as indented JSON.
symptom_nlice_file = os.path.join(data_dir, "symptom_nlice.json")
with open(symptom_nlice_file, "w") as out_file:
    json.dump(by_symptom, out_file, indent=4)

In [16]:
# save the collection of nlice
# Values are sorted so the file is identical from run to run; plain set order changes
# between interpreter sessions because string hashing is randomised.
all_nlice = {
    "nature": sorted(all_nature),
    "location": sorted(all_location),
    "vas": sorted(all_vas),
    "excitation": sorted(all_excitation),
    # legacy misspelled key, kept so existing readers of all_nlice.json keep working
    "exciation": sorted(all_excitation),
    # frequency values were collected but previously never written out
    "frequency": sorted(all_frequency),
}

all_nlice_file = os.path.join(data_dir, "all_nlice.json")
with open(all_nlice_file, "w") as fp:
    json.dump(all_nlice, fp, indent=4)

In [17]:
# excitation encoding
# "other" is always 1; the remaining values are numbered from 2 in sorted order.
# Sorting matters: iterating the set directly gives a different order, and therefore
# different codes, in every interpreter session (string hashes are randomised).
excitation_encoding = {
    "other": 1
}
for count, item in enumerate(sorted(all_excitation), start=2):
    excitation_encoding[item] = count

excitation_encoding_file = os.path.join(data_dir, "excitation_encoding.json")
with open(excitation_encoding_file, "w") as fp:
    json.dump(excitation_encoding, fp, indent=4)

In [18]:
# vas encoding
# "other" is always 1; the remaining values are numbered from 2 in sorted order.
# Sorting matters: iterating the set directly gives a different order, and therefore
# different codes, in every interpreter session (string hashes are randomised).
vas_encoding = {"other": 1}
for count, item in enumerate(sorted(all_vas), start=2):
    vas_encoding[item] = count

vas_encoding_file = os.path.join(data_dir, "vas_encoding.json")
with open(vas_encoding_file, "w") as fp:
    json.dump(vas_encoding, fp, indent=4)

In [19]:
# nature encoding
# Values are numbered from 2 in sorted order (no "other" entry is created here).
# Sorting matters: iterating the set directly gives a different order, and therefore
# different codes, in every interpreter session (string hashes are randomised).
nature_encoding = {}
for count, item in enumerate(sorted(all_nature), start=2):
    nature_encoding[item] = count
nature_encoding_file = os.path.join(data_dir, "nature_encoding.json")
with open(nature_encoding_file, "w") as fp:
    json.dump(nature_encoding, fp, indent=4)

In [20]:
# frequency_encoding
# "other" is always 1; the remaining values are numbered from 2 in sorted order.
# Sorting matters: iterating the set directly gives a different order, and therefore
# different codes, in every interpreter session (string hashes are randomised).
frequency_encoding = {
    "other": 1
}
for count, item in enumerate(sorted(all_frequency), start=2):
    frequency_encoding[item] = count
frequency_encoding_file = os.path.join(data_dir, "frequency_encoding.json")
with open(frequency_encoding_file, "w") as fp:
    json.dump(frequency_encoding, fp, indent=4)

In [21]:
# Comma-separated names of the main body locations.
main_locations = (
    "abdomen, hip, chest, head, "
    "left_arm, right_arm, lower_back, lower_extremities"
)

In [22]:
# need to augment the JSON definitions so that all categories are present for every symptom
# for those missing, let's assign a low weight/probability, e.g. 5%
# let's make magic happen

In [23]:
# Condition definitions after padding, keyed by condition; filled in further down.
updated_definitions = dict()

In [24]:
# Weight given to a category value that a symptom's definition does not list explicitly.
INCLUDE_PERCENT = 5

In [25]:
def get_all_location(location_main=None, location_list=None):
    """Expand locations to the complete family of their main body part.

    Uses the module-level ``body_parts`` mapping (main part -> list of sub-parts).

    Args:
        location_main: Name of a main body part. When given, ``location_list`` is
            ignored and the part plus all of its sub-parts is returned.
        location_list: Iterable of location names (main parts or sub-parts). Each
            one is replaced by its main part followed by that part's sub-parts.

    Returns:
        list: Location names. Names not found in ``body_parts`` are passed through
        unchanged. For ``location_list`` the result has no duplicates and keeps
        first-seen order.
    """
    if location_main is not None:
        # .get: an unknown main part is passed through, like unknown list items below
        return [location_main] + body_parts.get(location_main, [])

    _locations = []
    seen = set()
    # "or ()" covers the None default without a shared mutable default argument
    for item in location_list or ():
        expanded = [item]
        for key, locations in body_parts.items():
            if item == key or item in locations:
                expanded = [key] + locations
                break
        # several items may share one main part; add each location only once
        for location in expanded:
            if location not in seen:
                seen.add(location)
                _locations.append(location)

    return _locations

In [26]:
import copy

# Weight of the catch-all "other" value when a symptom defines no value at all for a field.
OTHER_PERCENT = 90

# Pad every symptom of every condition so that all known category values are present;
# values the definition does not mention get INCLUDE_PERCENT.
for condition, condition_data in condition_definition.items():
    # Deep copy: condition_definition must stay as loaded from disk, otherwise
    # re-running the cells that read it would pick up the padded values added here.
    condition_data = copy.deepcopy(condition_data)
    updated_definitions[condition] = condition_data

    symptoms = condition_data.get("symptoms")
    updated_symptoms = {}
    for symptom in symptoms:
        symptom_data = symptoms[symptom]
        nlice_data = symptom_data.get("nlice", {})

        # nature: every nature value recorded for this symptom in by_symptom.
        # sorted: the by_symptom lists come from sets, whose order changes between sessions
        nature = nlice_data.get("nature", {})
        for item in sorted(by_symptom[symptom].get("nature", [])):
            nature.setdefault(item, INCLUDE_PERCENT)

        # location: the whole family of the main location, or of each listed location
        location_main = nlice_data.get("location_main", None)
        location = nlice_data.get("location", {})
        for item in get_all_location(location_main, location):
            location.setdefault(item, INCLUDE_PERCENT)

        # intensity/vas, excitation and frequency share one rule: an empty field gets a
        # dominant "other", then every value known to the encoding is padded in
        padded = {}
        for field, encoding in (
            ("vas", vas_encoding),
            ("excitation", excitation_encoding),
            ("frequency", frequency_encoding),
        ):
            values = nlice_data.get(field, {})
            if len(values) == 0:
                values["other"] = OTHER_PERCENT
            for item in sorted(encoding):
                values.setdefault(item, INCLUDE_PERCENT)
            padded[field] = values

        # nature and location may legitimately stay empty and are then left out
        if len(nature) > 0:
            nlice_data["nature"] = nature
        if len(location) > 0:
            nlice_data["location"] = location
        # the padded fields always hold at least "other"
        nlice_data.update(padded)

        symptom_data["nlice"] = nlice_data
        updated_symptoms[symptom] = symptom_data
    updated_definitions[condition]["symptoms"] = updated_symptoms

In [27]:
# Persist the padded condition definitions as indented JSON.
updated_definitions_file = os.path.join(data_dir, "ai-nlice-adv-updated.json")
with open(updated_definitions_file, "w") as out_file:
    json.dump(updated_definitions, out_file, indent=4)

In [28]:
# Condition keys in sorted order; a key's position becomes its index in condition_db.
conditions = sorted(updated_definitions)

In [29]:
# Map every condition key to its position in the sorted list.
condition_db = dict(zip(conditions, range(len(conditions))))

In [30]:
# Persist the condition index as indented JSON.
condition_db_file = os.path.join(data_dir, "conditions_db.json")
with open(condition_db_file, "w") as out_file:
    json.dump(condition_db, out_file, indent=4)

In [31]:
# Symptom keys in sorted order; a key's position becomes its index in symptom_db.
symptoms = sorted(by_symptom)

In [32]:
# Map every symptom key to its position in the sorted list.
symptom_db = dict(zip(symptoms, range(len(symptoms))))

In [33]:
# Persist the symptom index as indented JSON.
symptom_db_file = os.path.join(data_dir, "symptoms_db.json")
with open(symptom_db_file, "w") as out_file:
    json.dump(symptom_db, out_file, indent=4)

In [38]:
# Rebuild the per-symptom NLICE overview, this time from the updated definitions.
# The catch-all "other" value is left out of vas, excitation and frequency.
by_symptom = {}
for condition_data in updated_definitions.values():
    symptoms = condition_data.get("symptoms")
    for symptom in symptoms:
        if symptom not in by_symptom:
            by_symptom[symptom] = {
                field: set()
                for field in (
                    "nature",
                    "location",
                    "location_main",
                    "vas",
                    "excitation",
                    "frequency",
                )
            }
        symptom_entry = by_symptom[symptom]
        nlice = symptoms.get(symptom).get("nlice", {})

        # location_main is a single value, not a mapping like the other fields
        location_main = nlice.get("location_main", None)
        if location_main:
            symptom_entry["location_main"].add(location_main)

        # nature and location keep every key
        for field in ("nature", "location"):
            for value in nlice.get(field, {}).keys():
                symptom_entry[field].add(value)

        # the remaining fields keep every key except "other"
        for field in ("vas", "excitation", "frequency"):
            for value in nlice.get(field, {}).keys():
                if value != "other":
                    symptom_entry[field].add(value)

In [39]:
# Replace every collected set by a list, in place, ahead of the JSON dump.
for obj in by_symptom.values():
    for key, collected in obj.items():
        obj[key] = list(collected)

In [40]:
# Persist the rebuilt per-symptom NLICE overview as indented JSON.
updated_nlice_symptom_file = os.path.join(data_dir, "symptom_nlice_updated.json")
with open(updated_nlice_symptom_file, "w") as out_file:
    json.dump(by_symptom, out_file, indent=4)

In [41]:
by_symptom

{'nausea': {'nature': [],
  'location': [],
  'location_main': [],
  'vas': ['moderate', 'severe', 'mild'],
  'excitation': ['worsening_after_meals',
   'walking',
   'car_over_bumps',
   'walking_bending_forward',
   'sitting_bending_forward',
   'standing'],
  'frequency': ['regular', 'rare', 'often']},
 'abdominal_pain': {'nature': ['stinging', 'dull', 'cramping'],
  'location': ['abdomen_upper_abdomen',
   'abdomen_urq',
   'abdomen_llq',
   'abdomen_umbilical',
   'abdomen_right_abdomen',
   'abdomen_lower_abdomen',
   'abdomen_epigastric',
   'abdomen_lrq',
   'abdomen_ulq',
   'abdomen_left_abdomen',
   'abdomen'],
  'location_main': ['abdomen'],
  'vas': ['moderate', 'severe', 'mild'],
  'excitation': ['worsening_after_meals',
   'walking',
   'car_over_bumps',
   'walking_bending_forward',
   'sitting_bending_forward',
   'standing'],
  'frequency': ['regular', 'rare', 'often']},
 'vomiting': {'nature': [],
  'location': [],
  'location_main': [],
  'vas': ['moderate', 'severe