In [1]:
# /Users/teliov/TUD/Thesis/Medvice/Notebooks/data/04_06_new_data/data/split
# So we can use the *thesislib* package
import sys
import os

# Absolute path of the parent directory (".." resolved against the current working directory).
module_path = os.path.abspath("..")

# The membership check keeps re-runs of this cell from appending the same entry again.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
%matplotlib inline

In [3]:
from thesislib.utils import pathutils

In [4]:
import json

In [5]:
# Resolve the definitions file through pathutils and load it as JSON.
# NOTE(review): ai_med_data is not referenced by any later cell visible here — confirm it is still needed.
ai_med_data_file = pathutils.get_data_file('definitions/ai-med-data.json')
with open(ai_med_data_file) as fp:
    ai_med_data = json.load(fp)

In [6]:
from glob import glob
import hashlib
import re

In [7]:
def slugify_condition(condition_name):
    """Turn a display name into a lowercase, hyphen-separated slug.

    The name is lowercased, each run of whitespace is collapsed into one
    "-", each apostrophe is replaced by "-", and round brackets are
    dropped. All other characters are kept as they are.
    """
    lowered = condition_name.lower()
    hyphenated = re.sub(r"\s+", "-", lowered).replace("'", "-")
    return hyphenated.replace("(", "").replace(")", "")

In [8]:
def get_symptom_condition_map(module_dir):
    """Collect symptom and condition slugs from the JSON module files in a directory.

    Every ``*.json`` file directly inside ``module_dir`` is parsed and its
    ``states`` mapping is scanned:

    * ``ConditionOnset`` states add ``code -> slug`` to the condition map,
      using the first entry of the state's ``codes`` list.
    * ``Symptom`` states add ``sha224(slug) -> slug`` to the symptom map,
      using the state's ``symptom_code``.

    States of any other type are ignored. Files are visited in sorted order
    so the insertion order of both dicts (and which value survives when a
    condition code is repeated across files) is reproducible. A file with a
    missing or null ``states`` entry contributes nothing.

    Args:
        module_dir: Directory that holds the module JSON files.

    Returns:
        A ``(symptom_map, condition_map)`` tuple of dicts.

    Raises:
        TypeError, IndexError, AttributeError: if a ``ConditionOnset`` or
            ``Symptom`` state lacks the ``codes`` / ``symptom_code`` data
            read above.
    """
    symptom_map = {}
    condition_map = {}
    # glob() yields paths in arbitrary order; sorting makes the result deterministic.
    for module_file in sorted(glob(os.path.join(module_dir, "*.json"))):
        with open(module_file) as fp:
            module = json.load(fp)
        # `or {}` tolerates a JSON file that has no (or a null) "states" entry.
        states = module.get("states") or {}
        for state in states.values():
            state_type = state.get("type")
            if state_type == "ConditionOnset":
                code = state.get("codes")[0]
                condition_map[code["code"]] = slugify_condition(code.get("display"))
            elif state_type == "Symptom":
                symptom_code = state.get("symptom_code")
                slug = slugify_condition(symptom_code.get("display"))
                # The hash of the slug is the key, so identical slugs collapse to one entry.
                slug_hash = hashlib.sha224(slug.encode("utf-8")).hexdigest()
                symptom_map[slug_hash] = slug
    return symptom_map, condition_map

In [9]:
# Directory that is scanned for module *.json files. The original machine-specific
# path stays as the default; set AI_MED_BASIC_MODULE_DIR to run the notebook elsewhere.
basic_module_dir = os.environ.get(
    "AI_MED_BASIC_MODULE_DIR",
    "/Users/teliov/TUD/symcat-to-synthea/output/module_ai_med_basic",
)

In [10]:
# Build the symptom (sha224 of slug -> slug) and condition (code -> slug) lookups from the basic modules.
basic_symptom_map, basic_condition_map = get_symptom_condition_map(basic_module_dir)

In [11]:
# Location of "05_27_nlice" as resolved by pathutils; everything for the basic run lives in its "basic" subdirectory.
data_dir = pathutils.get_data_file("05_27_nlice")
basic_data_dir = os.path.join(data_dir, "basic")

In [12]:
# makedirs(..., exist_ok=True) creates missing parent directories as well and does not
# fail when the directory already exists, so this cell is safe to re-run.
os.makedirs(basic_data_dir, exist_ok=True)

In [13]:
# Sanity check: number of entries in the symptom map (one per distinct slug hash).
len(basic_symptom_map)

24

In [14]:
# Persist the symptom map as indented JSON; the file path is reused by later cells.
basic_symptom_map_file = os.path.join(basic_data_dir, "symptom_db.json")
with open(basic_symptom_map_file, "w") as fp:
    json.dump(basic_symptom_map, fp, indent=4)

In [15]:
# Persist the condition map as indented JSON; the file path is reused by later cells.
basic_condition_map_file = os.path.join(basic_data_dir, "condition_db.json")
with open(basic_condition_map_file, "w") as fp:
    json.dump(basic_condition_map, fp, indent=4)

In [16]:
from thesislib.utils.ml import process
from thesislib.utils.ml import runners, models

In [17]:
import importlib
# Reload the project modules so the kernel picks up their current source; assigning
# to `_` keeps the returned module object out of the cell output.
# NOTE(review): `models` is imported alongside `runners` but is not reloaded here — confirm that is intentional.
_ = importlib.reload(process)
_ = importlib.reload(runners)

In [18]:
# Symptoms CSV for the basic run; it is the input to the train/test split and to the distribution checks.
basic_data_csv = pathutils.get_data_file("05_27_nlice/ai/output_med_ai_basic/symptoms/csv/symptoms.csv")

In [19]:
# Second argument to split_data (presumably where the split files are written — confirm in thesislib).
basic_op_data_dir = os.path.join(basic_data_dir, "data")
# split into a train and test set
# NOTE(review): the FileNotFoundError recorded for this cell names "symptoms100k.csv", while
# basic_data_csv ends in "symptoms.csv" — the saved output looks stale; re-run to confirm.
basic_train_file, basic_test_file = process.split_data(basic_data_csv, basic_op_data_dir)

# Passed to process.parse_data later on (presumably as its output directory — confirm).
basic_parsed_data_dir = os.path.join(basic_op_data_dir, "parsed")

FileNotFoundError: [Errno 2] File b'/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/05_27_nlice/ai/output_med_ai_basic/symptoms/csv/symptoms100k.csv' does not exist: b'/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/05_27_nlice/ai/output_med_ai_basic/symptoms/csv/symptoms100k.csv'

In [None]:
import pandas as pd
import numpy as np

In [None]:
# parse the train set and let's train
# Parse the training split using the condition and symptom map files written earlier;
# the result is what both training runs consume.
# NOTE(review): the last argument is presumably the output directory — confirm against process.parse_data.
basic_parsed_train = process.parse_data(
    basic_train_file,
    basic_condition_map_file,
    basic_symptom_map_file,
    basic_parsed_data_dir
)

In [None]:
# train with RF and then with NB
# Train the random-forest run first; the naive Bayes run follows in the next cell.
basic_rf_dir = os.path.join(basic_op_data_dir, "output/rf")
# Override two fields on the parameter object: 200 estimators and max_depth left as None
# (how RFParams forwards these is defined in thesislib — not visible here).
rfparams = models.RFParams()
rfparams.n_estimators = 200
rfparams.max_depth = None

# NOTE(review): run_ok is never inspected in the visible cells — consider checking it.
run_ok = runners.train_ai_med_rf(
    basic_parsed_train,
    basic_symptom_map_file,
    basic_rf_dir,
    rfparams,
    "Basic AI-MED Run",
    "local-pc"
)

In [None]:
# train NB
# Train the naive Bayes run on the same parsed training data, into its own directory.
basic_nb_dir = os.path.join(basic_op_data_dir, "output/nb")

# NOTE(review): this rebinds run_ok, discarding the value from the random-forest cell; neither is checked.
run_ok = runners.train_ai_med_nb(
    basic_parsed_train,
    basic_symptom_map_file,
    basic_nb_dir,
    "Basic AI-MED Run",
    "local-pc"
)

In [None]:
# Did we have a balanced distribution of conditions, though?

In [None]:
# Load the same CSV that was fed to the train/test split, to inspect its distribution.
df = pd.read_csv(basic_data_csv)

In [None]:
# Number of rows for each PATHOLOGY value.
cnd_size = df.groupby('PATHOLOGY').size()

In [None]:
# Display the per-PATHOLOGY row counts.
cnd_size

In [None]:
# Bar chart of row counts per PATHOLOGY value. The title and y-label let the figure stand
# on its own; the trailing ';' keeps the returned object's repr out of the cell output.
cnd_size.plot.bar(title="Rows per PATHOLOGY").set_ylabel("Number of rows");

In [None]:
# two conditions (migraine and tension-type headache) are the most prevalent

In [None]:
# Number of rows for each NUM_SYMPTOMS value.
symp_group = df.groupby('NUM_SYMPTOMS').size()

In [None]:
# Bar chart of row counts per NUM_SYMPTOMS value. The title and y-label let the figure stand
# on its own; the trailing ';' keeps the returned object's repr out of the cell output.
symp_group.plot.bar(title="Rows per NUM_SYMPTOMS").set_ylabel("Number of rows");

In [None]:
# most conditions have between 2 and 4 symptoms