In [1]:
# /Users/teliov/TUD/Thesis/Medvice/Notebooks/data/04_06_new_data/data/split
# So we can use the *thesislib* package
import sys
import os

# Absolute path of the notebook's parent directory; it is put on the import
# path so the *thesislib* package can be imported by the cells below.
module_path = os.path.abspath("..")

# Register the directory only once, even when this cell is executed again.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from thesislib.utils import pathutils
import json
import hashlib

In [3]:
from thesislib.utils.ml import process
from thesislib.utils.ml import runners, models
from sklearn import naive_bayes

In [4]:
import importlib
# Reload these already-imported modules in place so the kernel executes their
# current source again; `_` ends up bound to the last reloaded module.
for _reloaded in (naive_bayes, process, runners):
    _ = importlib.reload(_reloaded)
del _reloaded

In [5]:
# Location of the "05_27_nlice" data set as resolved by pathutils.
data_dir = pathutils.get_data_file("05_27_nlice")
# "nlice" sub-directory beneath it; later cells derive their paths from here.
nlice_data_dir = os.path.join(data_dir, "nlice")

In [6]:
# Directory of the "module_ai_med_adv" output (presumably the generated
# Synthea module -- TODO confirm). The location can be overridden through the
# NLICE_MODULE_DIR environment variable so the notebook is not tied to a
# single machine; when the variable is unset the original path is used.
nlice_module_dir = os.environ.get(
    "NLICE_MODULE_DIR",
    "/Users/teliov/TUD/symcat-to-synthea/output/module_ai_med_adv",
)
# Symptom CSV that is split into train and test sets further down.
nlice_data_csv = pathutils.get_data_file("05_27_nlice/ai/output_med_ai_adv/symptoms/csv/symptoms100k.csv")

In [7]:
# Directory that receives the split files and, below it, the parsed output.
nlice_op_data_dir = os.path.join(nlice_data_dir, "data")
# split into a train and test set
# process.split_data is handed the source CSV and the output directory and
# yields two values, unpacked here as the train file and the test file.
nlice_train_file, nlice_test_file = process.split_data(nlice_data_csv, nlice_op_data_dir)

# Target directory passed to process.parse_data when the train set is parsed.
nlice_parsed_data_dir = os.path.join(nlice_op_data_dir, "parsed")

In [8]:
# Paths of the JSON artefacts that live next to the split data.
transformation_map_file = os.path.join(nlice_op_data_dir, "transformation_map.json")
encoding_map_file = os.path.join(nlice_op_data_dir, "encoding_map.json")
reduction_map_file = os.path.join(nlice_op_data_dir, "reduction_map.json")
# Only the path is recorded for this one; the cell does not read the file.
encoding_count_file = os.path.join(nlice_op_data_dir, "encoding_count.json")

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode JSON on machines whose default
# encoding is not UTF-8.
with open(transformation_map_file, encoding="utf-8") as fp:
    transformation_map = json.load(fp)
with open(encoding_map_file, encoding="utf-8") as fp:
    encoding_map = json.load(fp)
with open(reduction_map_file, encoding="utf-8") as fp:
    reduction_map = json.load(fp)

In [9]:
# Output directory for the parsed data. The same value is already assigned in
# the cell that splits the data, so this cell only matters when run on its own.
nlice_parsed_data_dir = os.path.join(nlice_op_data_dir, "parsed")

In [10]:
# Re-resolve the data set root (same call as in the earlier NLICE cell).
data_dir = pathutils.get_data_file("05_27_nlice")
# The "basic" sub-directory holds the symptom and condition JSON databases
# that are passed to the parsing and training helpers below.
basic_data_dir = os.path.join(data_dir, "basic")
basic_symptom_map_file = os.path.join(basic_data_dir, "symptom_db.json")
basic_condition_map_file = os.path.join(basic_data_dir, "condition_db.json")

In [11]:
# parse the train set and let's train
# Positional arguments, in order: the train split, the condition database,
# the symptom database and the output directory. The NLICE flag and the three
# JSON maps loaded earlier are passed by keyword.
# NOTE(review): the result is later given to pd.read_csv, so it is presumably
# the path of the parsed CSV -- confirm against process.parse_data.
nlice_parsed_train = process.parse_data(
    nlice_train_file,
    basic_condition_map_file,
    basic_symptom_map_file,
    nlice_parsed_data_dir,
    is_nlice=True,
    transform_map=transformation_map,
    encode_map=encoding_map,
    reduce_map=reduction_map
)

In [12]:
# Random forest run first; the naive Bayes run is handled separately.
nlice_rf_dir = os.path.join(nlice_op_data_dir, "output/rf")

# Parameter object for the forest: 20 estimators, max_depth left as None.
rfparams = models.RFParams()
rfparams.n_estimators, rfparams.max_depth = 20, None

# Arguments: parsed train data, symptom database, output directory,
# parameters, run name, location label and a boolean flag
# (NOTE(review): meaning of the flag is not visible here -- see runners).
run_ok = runners.train_ai_med_rf(
    nlice_parsed_train, basic_symptom_map_file, nlice_rf_dir,
    rfparams, "NLICE AI-MED Run", "local-pc", True,
)

In [None]:
# Naive Bayes run.
nlice_nb_dir = os.path.join(nlice_op_data_dir, "output/nb")

# The four symptom names are handed to the trainer as SHA-224 hex digests of
# their UTF-8 bytes (presumably to identify the NLICE symptom columns --
# TODO confirm against runners.train_ai_med_nb).
nlice_symptom_names = ["headache", "limb-weakness", "abdominal-pain", "fever"]
nlice_symptom_hash = [
    hashlib.sha224(symptom_name.encode("utf-8")).hexdigest()
    for symptom_name in nlice_symptom_names
]

run_ok = runners.train_ai_med_nb(
    nlice_parsed_train, basic_symptom_map_file, nlice_nb_dir,
    "NLICE AI-MED Run", "local-pc", True, nlice_symptom_hash,
)

In [None]:
import pandas as pd
import numpy as np
# Load the parsed training data, using its "Index" column as the row index.
df = pd.read_csv(nlice_parsed_train, index_col="Index")

In [None]:
# Load the symptom database and count its entries; the count is what the
# sparse encoder is constructed with. The file is decoded explicitly as UTF-8
# so the result does not depend on the platform's default encoding.
with open(basic_symptom_map_file, encoding="utf-8") as fp:
    symptom_db = json.load(fp)
num_symptoms = len(symptom_db)

In [None]:
# Project transformer built with the number of entries in the symptom
# database; it is applied to the training frame with fit_transform below.
sparsifier = models.ThesisAIMEDSymptomSparseMaker(num_symptoms=num_symptoms)

In [None]:
# Keep the target column aside before narrowing the frame.
labels = df["LABEL"]
# Feature columns, in the order they are handed to the sparsifier.
ordered_keys = ["GENDER", "RACE", "AGE", "SYMPTOMS"]
# Note: `df` is rebound here, so LABEL is no longer part of it afterwards.
df = df.loc[:, ordered_keys]

In [None]:
# Fit the sparsifier on the feature frame and transform it in one step.
# NOTE(review): the name and the later .toarray() calls suggest a scipy sparse
# (CSC) matrix -- confirm in ThesisAIMEDSymptomSparseMaker.
df_csc = sparsifier.fit_transform(df)

In [None]:
# Columns pulled to the front: positions 0-2 followed by 9, 12, 20 and 25.
reg_indices = [0, 1, 2] + [9, 12, 20, 25]
# Every other column follows, keeping its original relative order.
bern_indices = [
    idx for idx in range(df_csc.shape[1]) if idx not in reg_indices
]
# Full column permutation: front block first, then the remainder.
new_indices = reg_indices + bern_indices

In [None]:
# Apply the column permutation. `df_csc` is rebound, so running this cell a
# second time permutes the already-permuted matrix again.
df_csc = df_csc[:, new_indices]

In [None]:
# After the reordering, columns 3-6 are the ones that originally sat at
# positions 9, 12, 20 and 25.
nlice_symptoms = df_csc[:, 3:7]

In [None]:
# Convert each of the four selected columns to its own dense array.
nlice1, nlice2, nlice3, nlice4 = (
    nlice_symptoms[:, column].toarray() for column in range(4)
)

In [None]:
# Show the distinct values that occur in the third selected column.
np.unique(nlice3)

In [None]:
# Print the distinct values of each selected column, one line per column.
for column_values in (nlice1, nlice2, nlice3, nlice4):
    print(np.unique(column_values))

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Ordinal-encode the four selected columns together; the encoder needs a
# dense array, hence the .toarray() conversion.
enc = OrdinalEncoder()
transformed = enc.fit_transform(nlice_symptoms.toarray())

In [None]:
# Split the encoded matrix into its four columns (1-D each).
t_nlice1, t_nlice2, t_nlice3, t_nlice4 = (
    transformed[:, column] for column in range(4)
)

In [None]:
# With axis=1, np.unique de-duplicates whole columns of the encoded matrix.
# NOTE(review): if distinct rows (value combinations) were intended, this
# would need axis=0 -- confirm.
np.unique(transformed, axis=1)

In [None]:
# Separate encoder fitted on the first selected column only; fit() returns
# the encoder itself, which is then shown as the cell output.
enc1 = OrdinalEncoder().fit(nlice1)
enc1

In [None]:
# Categories learned by the single-column encoder.
enc1.categories_

In [None]:
# Categories learned by the four-column encoder, one array per column.
enc.categories_

In [None]:
# Column vector (5 rows, 1 column) holding the probe values 1 through 5.
k = np.array([[1], [2], [3], [4], [5]])

In [None]:
# Broadcast comparison of the column vector k against the categories of the
# first encoded column: p[i, j] is True when k[i] equals category j.
p = k == enc.categories_[0]

In [None]:
# Row positions of k whose value matches none of the first column's categories.
np.where(~np.any(k == enc.categories_[0], axis=1))

In [None]:
# Distinct values of the first selected column and how often each occurs.
unique, counts = np.unique(nlice1, return_counts=True)

In [None]:
# Display the occurrence counts (aligned with `unique`).
counts

In [None]:
# Position, within `unique`, of the most frequent value.
counts.argmax()

In [None]:
import numpy as np
# Hard-coded single-column array of integer codes (values 0-3), used as a
# fixture for the broadcast comparison in the next cell.
ll = np.array([[0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [1], [1], [0], [0], [2], [0], [0], [1], [0], [2], [1], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [2], [0], [2], [1], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [2], [0], [0], [1], [0], [0], [2], [0], [1], [2], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [1], [0], [2], [0], [0], [3], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [1], [0], [0], [2], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [2], [0], [2], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [1], [0], [0], [0], [1], [0], [0], [0], [1], [0], [1], [2], [0], [1], [0], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [1], [0], [2], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [0], [0], [2], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [1], [0], [0], [0], [0], [0], [0], [0], [0], [0], [1], [1], [2], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]])

In [None]:
# Compare the single column (kept 2-D) against the candidate codes 0-4 by
# broadcasting: one boolean row per sample, one column per candidate code.
ll[:, :1] == [0, 1, 2, 3, 4]