In [1]:
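# NOTE: machine-specific path left by the author (appears to be a data split directory):
#   /Users/teliov/TUD/Thesis/Medvice/Notebooks/data/04_06_new_data/data/split
# Add the parent directory to sys.path so that the local *thesislib* package can be imported.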
import sys
import os

# Absolute path of the directory one level above the current working directory.
module_path = os.path.abspath("..")

# Append only when absent, so re-running this cell never adds a duplicate entry.
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [2]:
from thesislib.utils import pathutils
import json
import hashlib

In [3]:
from thesislib.utils.ml import process
from thesislib.utils.ml import runners, models
from sklearn import naive_bayes

In [4]:
import importlib
# Re-import the modules under active development so edits made on disk are
# picked up without restarting the kernel. The order matches the original
# cell: naive_bayes, process, models, runners.
for _stale_module in (naive_bayes, process, models, runners):
    # Assigning to `_` keeps the returned module out of the cell output.
    _ = importlib.reload(_stale_module)

In [5]:
# Name of the dataset folder, resolved to a path by the project's pathutils helper.
dataset_name = "06_18_nlice_plus"
data_dir = pathutils.get_data_file(dataset_name)

# The NLICE artefacts live in the "nlice" sub-directory of the dataset.
nlice_data_dir = os.path.join(data_dir, "nlice")

In [6]:
# Directory of the generated NLICE module (presumably the symcat-to-synthea
# output -- confirm). It can be overridden with the NLICE_MODULE_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original hard-coded location is used.
nlice_module_dir = os.environ.get(
    "NLICE_MODULE_DIR",
    "/Users/teliov/TUD/symcat-to-synthea/output/module_ai_med_plus_nlice",
)

# CSV with the generated symptoms, resolved through the project's pathutils helper.
nlice_data_csv = pathutils.get_data_file("06_18_nlice_plus/ai/output_med_ai_plus_nlice/symptoms/csv/symptoms.csv")

# Working directories under the NLICE data dir: "data" and its "parsed" sub-directory.
nlice_op_data_dir = os.path.join(nlice_data_dir, "data")
nlice_parsed_data_dir = os.path.join(nlice_op_data_dir, "parsed")

In [7]:
def _load_json(path):
    """Return the decoded content of the JSON file at *path*.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.
    """
    with open(path, encoding="utf-8") as fp:
        return json.load(fp)


# Locations of the NLICE helper maps inside the operational data directory.
transformation_map_file = os.path.join(nlice_op_data_dir, "transformation_map.json")
encoding_map_file = os.path.join(nlice_op_data_dir, "encoding_map.json")
reduction_map_file = os.path.join(nlice_op_data_dir, "reduction_map.json")
# The path is defined here but the file itself is not read in this cell.
encoding_count_file = os.path.join(nlice_op_data_dir, "encoding_count.json")
nlice_symptoms_file = os.path.join(nlice_op_data_dir, "nlice_symptoms.json")

transformation_map = _load_json(transformation_map_file)
encoding_map = _load_json(encoding_map_file)
reduction_map = _load_json(reduction_map_file)
nlice_symptoms = _load_json(nlice_symptoms_file)

# Invert every inner mapping of encoding_map (inner value -> inner key),
# keeping the same outer keys.
reverse_encoding_map = {
    key: {v: k for k, v in value.items()}
    for key, value in encoding_map.items()
}

In [8]:
# Split the symptoms CSV into a train file and a test file.
# process.split_data is a project helper; it is given the operational data
# directory as its second argument (presumably the output location -- confirm).
_split_files = process.split_data(nlice_data_csv, nlice_op_data_dir)
nlice_train_file, nlice_test_file = _split_files

In [9]:
# Resolve the symptom and condition database files of the "basic" dataset,
# in that order, through the project's pathutils helper.
basic_symptom_map_file, basic_condition_map_file = (
    pathutils.get_data_file(relative_path)
    for relative_path in (
        "06_18_nlice_plus/basic/symptom_db.json",
        "06_18_nlice_plus/basic/condition_db.json",
    )
)

In [10]:
# Parse the training split with the NLICE-specific maps loaded earlier.
# The keyword options are gathered first so the call itself stays short.
nlice_parse_options = {
    "is_nlice": True,
    "transform_map": transformation_map,
    "encode_map": encoding_map,
    "reduce_map": reduction_map,
}
nlice_parsed_train = process.parse_data(
    nlice_train_file,
    basic_condition_map_file,
    basic_symptom_map_file,
    nlice_parsed_data_dir,
    **nlice_parse_options
)

In [11]:
# Random forest first, naive Bayes afterwards.
# Output directory for the random-forest run.
nlice_rf_dir = os.path.join(nlice_op_data_dir, "output/rf")

# Hyper-parameters applied to the project's RFParams object, in this order.
rf_hyperparams = {"n_estimators": 20, "max_depth": None}
rfparams = models.RFParams()
for _attr_name, _attr_value in rf_hyperparams.items():
    setattr(rfparams, _attr_name, _attr_value)

In [12]:
# Positional arguments for the random-forest training runner, in call order.
# NOTE(review): the meaning of the trailing "local-pc" / True arguments is
# defined by runners.train_ai_med_rf -- confirm against its signature.
rf_train_args = (
    nlice_parsed_train,
    basic_symptom_map_file,
    nlice_rf_dir,
    rfparams,
    "NLICE AI-MED Run",
    "local-pc",
    True,
)
run_ok = runners.train_ai_med_rf(*rf_train_args)

In [13]:
# Naive Bayes run: output directory plus per-symptom hash bookkeeping.
nlice_nb_dir = os.path.join(nlice_op_data_dir, "output/nb")

# SHA-224 hex digest of every NLICE symptom (UTF-8 encoded), in iteration order.
nlice_symptom_hash = [
    hashlib.sha224(symptom.encode('utf-8')).hexdigest()
    for symptom in nlice_symptoms
]

# Map each digest to the reversed encoding of the symptom it was computed from.
nlice_symptom_encoding = {
    digest: reverse_encoding_map[symptom]
    for digest, symptom in zip(nlice_symptom_hash, nlice_symptoms)
}

In [14]:
# Positional arguments for the naive-Bayes training runner, in call order.
# NOTE(review): the meaning of the "local-pc" / True arguments is defined by
# runners.train_ai_med_nb -- confirm against its signature.
nb_train_args = (
    nlice_parsed_train,
    basic_symptom_map_file,
    nlice_nb_dir,
    "NLICE AI-MED Run",
    "local-pc",
    True,
    nlice_symptom_hash,
    nlice_symptom_encoding,
)
run_ok = runners.train_ai_med_nb(*nb_train_args)

In [15]:
from sklearn.preprocessing import OrdinalEncoder

In [16]:
# Scratch experiment: an OrdinalEncoder with default settings, used by the cells below.
l = OrdinalEncoder()

In [17]:
import numpy as np
# Small 1-D sample with repeated values (4 and 5) and a gap (6, 7 missing).
sample_values = [1, 2, 3, 4, 4, 5, 5, 8]
arr = np.array(sample_values)

In [18]:
# Reshape the 1-D array into a single column before fitting the encoder.
l.fit(arr.reshape(-1, 1))

OrdinalEncoder()

In [19]:
# Inspect the categories learned by fit (the output below shows the unique values).
l.categories_

[array([1, 2, 3, 4, 5, 8])]

In [20]:
# Encode the same column; in the output below each value becomes the index of its category.
l.transform(arr.reshape(-1, 1))

array([[0.],
       [1.],
       [2.],
       [3.],
       [3.],
       [4.],
       [4.],
       [5.]])