In [1]:
# Leftover path note (not referenced by any code in this cell):
# /Users/teliov/TUD/Thesis/Medvice/Notebooks/data/04_06_new_data/data/split
# Put the parent directory on sys.path so we can use the *thesislib* package.
import sys
import os

# Absolute path of the parent of the current working directory.
module_path = os.path.abspath("..")

# Membership check keeps sys.path free of duplicates when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, make_scorer, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
import json
import os
import joblib
import argparse
from timeit import default_timer as timer
from sklearn import naive_bayes
import sys

In [12]:
from thesislib.utils.ml import models, report

In [37]:
import numpy as np

In [8]:
# Locations of the parsed symptom records and of the symptom / condition
# definition files. The data root defaults to the original author's directory;
# set the MEDVICE_DATA_ROOT environment variable to run the notebook against a
# copy of the data stored elsewhere.
DATA_ROOT = os.environ.get(
    "MEDVICE_DATA_ROOT",
    "/Users/teliov/TUD/Thesis/Medvice/Notebooks/data",
).rstrip("/")

data_file = DATA_ROOT + "/04_06_new_data/parsed/symptoms_parsed_sparse.csv"
symptoms_db_json = DATA_ROOT + "/definitions/symptoms_db.json"
conditions_db_json = DATA_ROOT + "/definitions/conditions_db.json"

In [56]:
# Load the symptom and condition definition files into memory.
# The encoding is given explicitly: JSON is UTF-8, whereas open() would
# otherwise fall back to the platform's locale encoding.
with open(symptoms_db_json, encoding="utf-8") as fp:
    symptoms_db = json.load(fp)
with open(conditions_db_json, encoding="utf-8") as fp:
    conditions_db = json.load(fp)

In [57]:
# Number of top-level entries in each definition file.
num_symptoms = len(symptoms_db)
num_conditions = len(conditions_db)

# Sanity check of the loaded definitions.
print("Num Symptoms %d\t Num Conditions %d" % (num_symptoms, num_conditions))

Num Symptoms 376	 Num Conditions 801


In [58]:
# Provisional class list: one integer id per condition definition.
# NOTE(review): `classes` is reassigned from the labels present in the data in
# a later cell, so this value is superseded before the classifier is built.
classes = list(range(num_conditions))

In [59]:
# Read the parsed records, using the CSV's 'Index' column as the row index.
data = pd.read_csv(data_file, index_col='Index')

In [63]:
# Map every condition code (a key of conditions_db, in its iteration order) to
# its positional index.
condition_code = {code: idx for idx, code in enumerate(conditions_db.keys())}
# Labels that actually occur in the data, in order of first appearance
# (pandas `unique` does not sort).
unique_conditions = data.LABEL.unique()
# Replaces the earlier range-based `classes` with the observed labels.
classes = unique_conditions.tolist()

In [64]:
# Pull the targets out as a NumPy array before the LABEL column is dropped.
label_values = data.LABEL.values
# Feature columns, in the order in which they are selected below.
ordered_keys = ['GENDER', 'RACE', 'AGE', 'SYMPTOMS']
# NOTE: rebinds `data` to the feature-only frame, so this cell is not
# idempotent: re-running it without re-reading the CSV fails because LABEL is
# no longer present.
data = data[ordered_keys]

In [65]:
# Transform the feature frame with the project's ThesisSymptomSparseMaker.
# NOTE(review): presumably expands SYMPTOMS into `num_symptoms` indicator
# columns of a sparse matrix — confirm in thesislib.utils.ml.models.
sparsifier = models.ThesisSymptomSparseMaker(num_symptoms=num_symptoms)
# `data` is rebound once more: from the DataFrame to the transformer's output.
data = sparsifier.fit_transform(data)

In [66]:
# Hold out 20% of the rows as a test set, stratified on the label so both
# splits keep the class proportions.
# The seed is fixed so that every run produces the same split (and therefore
# comparable scores); without it each execution draws a different split.
SPLIT_RANDOM_STATE = 42
split_t = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=SPLIT_RANDOM_STATE)

# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split_t.split(data, label_values))

train_data = data[train_index]
train_labels = label_values[train_index]
test_data = data[test_index]
test_labels = label_values[test_index]

In [67]:
# One naive Bayes estimator per feature group; they are combined through
# `classifier_map` in the next cell.
symptom_clf = naive_bayes.BernoulliNB()    # symptoms
gender_clf = naive_bayes.BernoulliNB()     # gender
race_clf = naive_bayes.MultinomialNB()     # race
age_clf = naive_bayes.GaussianNB()         # age

In [68]:
# Pair each estimator with a column specification for the transformed data.
# NOTE(review): each entry looks like [estimator, [column index or
# (start, end) range, is-sparse flag]] — confirm against
# models.ThesisSparseNaiveBayes.
classifier_map = [
    [gender_clf, [0, False]],
    [race_clf, [1, False]],
    [age_clf, [2, False]],
    [symptom_clf, [(3, None), True]],
]

# Composite classifier built from the per-feature estimators; `classes` is the
# list of labels observed in the data.
clf = models.ThesisSparseNaiveBayes(classifier_map=classifier_map, classes=classes)

In [72]:
# Scorer for macro-averaged, one-vs-one ROC AUC computed from predicted
# probabilities.
# NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in
# favour of `response_method` — check the installed version before upgrading.
roc_auc_scorer_unweighted = make_scorer(roc_auc_score, average='macro', multi_class='ovo', needs_proba=True)

In [73]:
# ROC AUC of the classifier on the training split.
# NOTE(review): no `clf.fit(...)` call appears in the visible cells before this
# one and the execution counter jumps from 68 to 72 — presumably the fit ran in
# a cell that was later removed; confirm the notebook runs top to bottom on a
# fresh kernel.
roc_train_score_unweighted = roc_auc_scorer_unweighted(clf, train_data, train_labels)

(852, 1)
(852, 1)
(852, 1)
(852, 376)


In [75]:
# Preview 20 evenly spaced values from 0.1 to 1.0 (display only, nothing is
# assigned).
np.linspace(0.1, 1.0, 20)

array([0.1       , 0.14736842, 0.19473684, 0.24210526, 0.28947368,
       0.33684211, 0.38421053, 0.43157895, 0.47894737, 0.52631579,
       0.57368421, 0.62105263, 0.66842105, 0.71578947, 0.76315789,
       0.81052632, 0.85789474, 0.90526316, 0.95263158, 1.        ])

In [76]:
from sklearn.model_selection import learning_curve

In [88]:
# Second, independent copy of the pipeline from the earlier cells: re-read the
# CSV, split off the labels, sparsify the features and assemble a fresh
# classifier (`clf1`). New names (df, dat, *_clf1, classifier_map1) leave the
# earlier `data` / `clf` objects untouched, while `label_values`,
# `ordered_keys` and `sparsifier` are rebound.
# NOTE(review): this repeats the earlier cells almost line for line — a
# candidate for a shared helper function.
df = pd.read_csv(data_file, index_col='Index')
label_values = df.LABEL.values
ordered_keys = ['GENDER', 'RACE', 'AGE', 'SYMPTOMS']
df = df[ordered_keys]

sparsifier = models.ThesisSymptomSparseMaker(num_symptoms=num_symptoms)
dat = sparsifier.fit_transform(df)
# Fresh estimators so nothing is shared with the first classifier.
symptom_clf1 = naive_bayes.BernoulliNB()
gender_clf1 = naive_bayes.BernoulliNB()
race_clf1 = naive_bayes.MultinomialNB()
age_clf1 = naive_bayes.GaussianNB()

# Same estimator-to-column layout as `classifier_map`.
classifier_map1 = [
    [gender_clf1, [0, False]],
    [race_clf1, [1, False]],
    [age_clf1, [2, False]],
    [symptom_clf1, [(3, None), True]],
]

# Reuses the `classes` list computed in an earlier cell.
clf1 = models.ThesisSparseNaiveBayes(classifier_map=classifier_map1, classes=classes)

In [87]:
# Pick up edits made to thesislib/utils/ml/models.py without restarting the
# kernel. `importlib` is imported here because no other cell imports it.
# Objects created before the reload keep referring to the old class
# definitions; re-create them afterwards if the new code should apply.
import importlib

importlib.reload(models)

<module 'thesislib.utils.ml.models' from '/Users/teliov/TUD/Thesis/Medvice/Notebooks/thesislib/utils/ml/models.py'>

In [92]:
from sklearn.preprocessing import LabelEncoder

In [100]:
# Ground-truth labels of the held-out split.
y_target = test_labels
# NOTE: despite its name, `y_pred` holds the output of predict_proba (later
# cells argsort it along axis 1), not hard label predictions.
y_pred = clf.predict_proba(test_data)

(214, 1)
(214, 1)
(214, 1)
(214, 376)


In [101]:
# Encode class labels as integer positions. LabelEncoder stores its classes in
# sorted order.
# NOTE(review): later cells use the encoded values as column indices into
# `y_pred`. That is only valid if predict_proba's columns follow the same
# sorted order, whereas `classes` itself is in order of first appearance —
# confirm against models.ThesisSparseNaiveBayes.
labelbin = LabelEncoder()
labelbin.fit(classes)

LabelEncoder()

In [102]:
# Integer code of the true label for every test row.
encoded_labels = labelbin.transform(y_target)

In [103]:
# Per row, the column indices of `y_pred` ordered from highest to lowest value
# (the array is negated because argsort sorts in ascending order).
sorted_prob = np.argsort(-y_pred, axis=1)

In [150]:
# Number of highest-ranked columns to keep per row.
top_n = 2
# First `top_n` column indices of every row of the descending ranking.
top_n_predictions = sorted_prob[:, :top_n]

In [151]:
# For every row, pick the `y_pred` entry in the column given by that row's
# encoded true label, then flatten the (n, 1) result to shape (n,).
true_label_columns = encoded_labels.reshape(-1, 1)
encoded_probability = np.take_along_axis(y_pred, true_label_columns, axis=1).ravel()

In [152]:
def check_is_in(needles, haystack):
    """Report, row by row, whether a needle occurs in the matching haystack row.

    Parameters
    ----------
    needles : np.ndarray
        One value per row, shape ``(n,)`` or ``(n, 1)``.
    haystack : np.ndarray
        Candidate values, shape ``(n, k)``.

    Returns
    -------
    np.ndarray
        Boolean array of shape ``(n,)``; element ``i`` is True when
        ``needles[i]`` equals at least one entry of ``haystack[i, :]``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of rows.
    """
    if needles.shape[0] != haystack.shape[0]:
        raise ValueError("Needles and Haystack shape mismatch")

    # Broadcast each needle against its own row in a single comparison instead
    # of calling np.isin once per row from a Python loop.
    needle_column = np.reshape(needles, (needles.shape[0], 1))
    return (haystack == needle_column).any(axis=1)

In [153]:
# Loop-based top-n hit flags, kept as a reference to cross-check the vectorised
# computation in a later cell.
old1 = check_is_in(encoded_labels, top_n_predictions)

In [143]:
# One flag per test row is expected.
old1.shape

(214,)

In [155]:
# Vectorised top-n hit test: compare every true label (as a column vector)
# with that row's top-n predicted indices.
is_in = encoded_labels.reshape(-1, 1) == top_n_predictions
# A row is a hit when any of its top-n entries matched. `any` yields a boolean
# array directly; the former `.astype(np.bool)` raises AttributeError on
# NumPy >= 1.24, where the `np.bool` alias no longer exists.
is_in = is_in.any(axis=1)

In [156]:
# Should match old1.shape: one flag per test row.
is_in.shape

(214,)

In [157]:
# Number of test rows whose true label is among the top-n predictions
# (vectorised result).
np.sum(is_in)

204

In [154]:
# Same count from the loop-based reference; should equal np.sum(is_in).
np.sum(old1)

204

In [159]:
# Preview 20 evenly spaced values from 0.1 to 1 (display only; repeats an
# earlier cell).
np.linspace(0.1, 1, 20)

array([0.1       , 0.14736842, 0.19473684, 0.24210526, 0.28947368,
       0.33684211, 0.38421053, 0.43157895, 0.47894737, 0.52631579,
       0.57368421, 0.62105263, 0.66842105, 0.71578947, 0.76315789,
       0.81052632, 0.85789474, 0.90526316, 0.95263158, 1.        ])