This notebook trains a Random Forest (RF) model and a Naive Bayes (NB) model on the extended data.

A separate dataset would be used to train the RL agent.

The downside of this approach is that the RL agent relies heavily on the RF and NB models, so its performance is limited by their accuracy.

In [7]:
# Module directory handed to process.get_symptom_condition_map to build the
# symptom and condition maps.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
module_dir = "/Users/teliov/TUD/symcat-to-synthea/output/module_ai_med_extended"

In [8]:
from thesislib.utils.ml import process

In [9]:
# Build the symptom and condition maps from the module directory; both are
# dumped to JSON files further down in this notebook.
symptom_map, condition_map = process.get_symptom_condition_map(module_dir)

In [12]:
import pathlib

In [13]:
# Root output directory for this experiment.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
data_dir = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/06_18_nlice_plus/extended"
# Create it together with any missing parents; a no-op if it already exists.
pathlib.Path(data_dir).mkdir(exist_ok=True, parents=True)

In [16]:
import json
import os

In [17]:
# Persist the symptom map as pretty-printed JSON; the resulting file path is
# what the parsing and training steps below receive.
symptom_map_file = os.path.join(data_dir, "symptom_db.json")
with open(symptom_map_file, mode="w") as fp:
    fp.write(json.dumps(symptom_map, indent=4))

In [25]:
# Persist the condition map as pretty-printed JSON; the resulting file path is
# what the parsing steps below receive.
condition_map_file = os.path.join(data_dir, "condition_db.json")
with open(condition_map_file, mode="w") as fp:
    fp.write(json.dumps(condition_map, indent=4))

In [19]:
from thesislib.utils.ml import process
from thesislib.utils.ml import runners, models

In [20]:
# Source symptoms CSV that gets split into train and test files below.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
data_csv = "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/06_18_nlice_plus/ai/output_med_ai_ext/symptoms/csv/symptoms.csv"

In [21]:
# Sub-directory of data_dir that receives the split files, the parsed data and
# the model output directories created later in this notebook.
op_data_dir = os.path.join(data_dir, "data")

In [22]:
# split into train and test
# train_split=0.9 presumably sends 90% of the rows to the train file — confirm
# against process.split_data.
train_file, test_file = process.split_data(data_csv, op_data_dir, train_split=0.9)

# Directory passed to process.parse_data for both the train and the test split.
parsed_data_dir = os.path.join(op_data_dir, "parsed")

In [23]:
import pandas as pd
import numpy as np

In [26]:
# Parse the training split; the result is what both trainers below consume.
parsed_train = process.parse_data(
    train_file, condition_map_file, symptom_map_file, parsed_data_dir
)

In [27]:
# Parse the held-out test split with the same maps and output directory as the
# training split, so it is ready for evaluation.
parsed_test = process.parse_data(
    test_file, condition_map_file, symptom_map_file, parsed_data_dir
)

In [29]:
# train with RF and then with NB
# Output directory for the RF run.
rf_dir = os.path.join(op_data_dir, "output/rf")
# RF hyper-parameters: 200 estimators, max_depth left as None.
# NOTE(review): presumably forwarded to the underlying forest implementation,
# where None would mean no depth limit — confirm in models.RFParams.
rfparams = models.RFParams()
rfparams.n_estimators = 200
rfparams.max_depth = None

# Make sure the output directory exists before training starts.
pathlib.Path(rf_dir).mkdir(parents=True, exist_ok=True)

In [31]:
# Train the RF model on the parsed training split, using the parameters and
# output directory prepared above.
# NOTE(review): the two trailing strings look like a run name and a location
# label; run_ok is never inspected afterwards — confirm both with the runner.
run_ok = runners.train_ai_med_rf(
    parsed_train, symptom_map_file, rf_dir, rfparams,
    "Basic AI-MED Run", "local-pc",
)

In [32]:
# train NB
nb_dir = os.path.join(op_data_dir, "output/nb")
# Create the NB output directory up front, exactly as is done for rf_dir, so
# the run does not depend on the runner creating it. Idempotent: a no-op when
# the directory already exists.
pathlib.Path(nb_dir).mkdir(parents=True, exist_ok=True)

# Same parsed training split and symptom map as the RF run; only the output
# directory differs.
# NOTE(review): run_ok is overwritten here and never inspected — confirm what
# the runner returns on failure.
run_ok = runners.train_ai_med_nb(
    parsed_train,
    symptom_map_file,
    nb_dir,
    "Basic AI-MED Run",
    "local-pc"
)

In [1]:
# We'll evaluate on the unseen data.

In [2]:
import joblib

In [3]:
# Load a serialized NB bundle from disk.
# NOTE(review): this path points at the "nlice-adv" / "nb_100k" run, not at the
# "output/nb" directory this notebook trains into — confirm that is intended.
# SECURITY: joblib.load unpickles arbitrary objects; only load files from a
# trusted source.
nb_data = joblib.load("/Users/teliov/TUD/Thesis/Medvice/Notebooks/data/06_18_nlice_plus/nlice-adv/data/output/nb_100k/nb_serialized_sparse.joblib")

In [4]:
# Pull the classifier out of the loaded bundle. Fail right here with a clear
# message if the entry is missing, instead of letting a silent None travel on
# to the predict_proba call further down.
nb_clf = nb_data.get('clf')
if nb_clf is None:
    raise KeyError("loaded NB bundle has no 'clf' entry")

In [12]:
# Hand-written single-sample feature vector (one row, mostly zeros) used to
# probe the loaded NB model.
# NOTE(review): the column order/meaning is not visible here; it must match the
# feature layout the model was trained with — confirm.
vec=[[1,0,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,126,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

In [8]:
import numpy as np

In [13]:
# Convert the nested list into a NumPy array. This rebinds `vec`, so the cell
# expects `vec` to still be the list literal defined earlier.
vec = np.array(vec)

In [15]:
from scipy.sparse import csc_matrix

In [16]:
# Wrap the array in a SciPy CSC sparse matrix before prediction.
# NOTE(review): the loaded file name ("..._sparse.joblib") suggests the model
# expects sparse input — confirm. This rebinds `vec` once more.
vec = csc_matrix(vec)

In [17]:
# Class-probability estimates from the loaded NB classifier for the single
# sample row in `vec`.
res = nb_clf.predict_proba(vec)

In [18]:
# Display the predicted probabilities (one row; presumably one column per
# condition class — confirm against the classifier's classes_).
res

array([[4.92455255e-26, 2.42705705e-19, 4.85147978e-07, 7.40938886e-12,
        9.99996200e-01, 8.45675481e-36, 3.31381133e-06, 8.67802527e-10,
        1.27774149e-40, 1.13143592e-44, 1.46078507e-12, 1.21781484e-22,
        4.11544543e-20, 2.53092584e-27]])