In [2]:
# So we can use the *thesislib* package
import sys
import os

# Absolute path of the directory one level above the current working directory;
# it is appended to the import path only if it is not already listed.
module_path = os.path.abspath(os.pardir)

if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
import pandas as pd
import numpy as np
import json
from dateutil.parser import parse as date_parser
from dateutil.relativedelta import relativedelta

In [57]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from tabulate import tabulate

In [4]:
from thesislib.utils import pathutils

In [7]:
# Resolve the three input CSV locations through the project's path helper.
patients_csv, patient_conditions_csv, condition_symptom_csv = (
    pathutils.get_data_file(relative_path)
    for relative_path in (
        "prob-synthea-1/data/patients.csv",
        "prob-synthea-1/data/patient_conditions.csv",
        "prob-synthea-1/data/patient_condition_symptoms.csv",
    )
)

In [8]:
# Read the patient, condition and symptom tables into DataFrames.
patients, conditions, symptoms = (
    pd.read_csv(csv_path)
    for csv_path in (patients_csv, patient_conditions_csv, condition_symptom_csv)
)

In [10]:
# Sorted list of the distinct condition codes present in the data.
condition_codes = sorted(conditions['CODE'].unique().tolist())

# Map every code to the DESCRIPTION found on its first row. The table is
# de-duplicated once and indexed by code, instead of being re-scanned with a
# boolean mask for each code.
_first_condition_rows = conditions.drop_duplicates(subset='CODE').set_index('CODE')
conditions_db = {code: _first_condition_rows.loc[code, 'DESCRIPTION'] for code in condition_codes}

In [12]:
# Sorted list of the distinct symptom codes present in the data; its order
# fixes the order of the symptom feature columns built later.
symptom_vector = sorted(symptoms['SYMPTOM_CODE'].unique().tolist())

# Map every symptom code to the SYMPTOM_DISPLAY found on its first row. The
# table is de-duplicated once and indexed by code, instead of being re-scanned
# with a boolean mask for each code.
_first_symptom_rows = symptoms.drop_duplicates(subset='SYMPTOM_CODE').set_index('SYMPTOM_CODE')
symptoms_db = {code: _first_symptom_rows.loc[code, 'SYMPTOM_DISPLAY'] for code in symptom_vector}

**Note**

If we wanted to grab the full set of symptoms and not just those appearing in the available data:

```python
with open("synthea/symptoms_db.json") as fp:
    symptom_db = json.load(fp)
```

In [15]:
# Number of condition rows per condition, keyed by the human-readable name.
# A single value_counts() pass replaces one full-table filter per code; int()
# keeps the values plain Python ints so the printed dict looks the same.
_rows_per_code = conditions['CODE'].value_counts()
condition_count = {conditions_db[code]: int(_rows_per_code.loc[code]) for code in condition_codes}
print(condition_count)

{'Urethritis': 37740, 'Asthma': 39700, 'Acute sinusitis': 39458, 'Pharyngitis': 39771, 'Pyelonephritis': 38265, 'Acute bronchitis': 39669, 'Strep throat': 39664, 'Chronic sinusitis': 39675, 'Cystitis': 38200}


In [66]:
# Encode each condition code as an integer class label: its position in the
# sorted code list. (This is a label encoding, not a one-hot encoding.)
condition_labels = {code: idx for idx, code in enumerate(condition_codes)}

# Reverse lookup written to disk: label index -> condition description.
_condition_index_map = {idx: conditions_db[code] for code, idx in condition_labels.items()}

# The output directory may not exist yet when this cell runs, so create it
# here rather than letting open() fail with FileNotFoundError.
_labels_map_path = pathutils.get_data_file("prob-synthea-1/output/labels_map.json")
os.makedirs(os.path.dirname(_labels_map_path), exist_ok=True)
with open(_labels_map_path, "w") as fp:
    json.dump(_condition_index_map, fp, indent=4)

In [17]:
# Integer code for each RACE value, assigned in the order listed here.
race_code = {
    race: code
    for code, race in enumerate(('white', 'black', 'asian', 'native', 'other'))
}

In [18]:
# Join every condition row to its patient record (conditions.PATIENT ->
# patients.Id). The left join keeps all condition rows; column names that
# clash get the "_pat" suffix on the patient side.
combined = pd.merge(
    conditions,
    patients,
    how='left',
    left_on='PATIENT',
    right_on='Id',
    suffixes=('', '_pat'),
)

In [19]:
# Join every symptom row to its condition+patient record
# (symptoms.CONDITION_ID -> combined.Id). The left join keeps all symptom rows;
# column names that clash get the "_symp" suffix on the symptom side.
complete = pd.merge(
    symptoms,
    combined,
    how='left',
    left_on='CONDITION_ID',
    right_on='Id',
    suffixes=('_symp', ''),
)

In [20]:
# Columns removed from the merged frame before the design matrix is built.
to_drop = [
    'DESCRIPTION', 'DIAGNOSED', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT',
    'PREFIX', 'FIRST', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'ETHNICITY',
    'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'ZIP', 'LAT', 'LON',
    'HEALTHCARE_EXPENSES', 'Id', 'Id_pat', 'SYMPTOM_DISPLAY', 'VALUE_CODE',
    'VALUE_DISPLAY', 'HEALTHCARE_COVERAGE',
]
complete = complete.drop(to_drop, axis="columns")

In [21]:
# One group per condition occurrence: all symptom rows that share a
# CONDITION_ID end up in the same group.
condition_grp = complete.groupby(by=['CONDITION_ID'])

In [46]:
# Directory that receives every artefact written by this notebook.
output_dir = pathutils.get_data_file("prob-synthea-1/output")
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory appears between a check and the creation.
os.makedirs(output_dir, exist_ok=True)

In [47]:
# Build the design matrix (one row per condition occurrence) or reload it from
# the JSON cache if a previous run already produced it.
#
# Layout: a dict of equally long column lists -- "label", "age", "gender",
# "race", plus one 0/1 indicator column per entry of `symptom_vector`.
data_dump = os.path.join(output_dir, "data.json")
if os.path.exists(data_dump):
    with open(data_dump) as fp:
        design_matrix = json.load(fp)
else:
    design_matrix = {
        "label": [],
        "age": [],
        "gender": [],
        "race": [],
    }
    for item in symptom_vector:
        design_matrix[item] = []

    # Iterating a GroupBy yields (group key, sub-frame) pairs. The original
    # loop called an undefined name here and raised NameError.
    for item, df in condition_grp:
        # Start with every symptom absent.
        vector = {_: 0 for _ in symptom_vector}

        # All rows of a group describe the same condition occurrence, so the
        # patient/condition attributes are read from the first row.
        onset_date = date_parser(df['ONSET'].iloc[0])
        patient_birthdate = date_parser(df["BIRTHDATE"].iloc[0])
        # Age is the difference in calendar years (month/day are ignored).
        vector['age'] = abs(patient_birthdate.year - onset_date.year)
        vector['gender'] = 0 if df['GENDER'].iloc[0] == 'F' else 1
        vector['race'] = race_code[df['RACE'].iloc[0]]
        vector['label'] = condition_labels[df['CODE'].iloc[0]]

        # Flag every symptom observed for this condition occurrence.
        for symptom_code in df["SYMPTOM_CODE"]:
            vector[symptom_code] = 1

        for k, v in vector.items():
            design_matrix[k].append(v)

    with open(data_dump, 'w') as fp:
        json.dump(design_matrix, fp)

In [50]:
# Train/test split, cached on disk. When both cache files exist they are
# reloaded; otherwise a single stratified 90/10 shuffle split is drawn from the
# design matrix and written out for later runs.
train_dump = os.path.join(output_dir, "train.json")
test_dump = os.path.join(output_dir, "test.json")

if not (os.path.exists(test_dump) and os.path.exists(train_dump)):
    data_df = pd.DataFrame(design_matrix)

    # let's keep a test set which we would use for evaluation
    split_t = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
    data_X = data_df.drop(columns=['label'])
    data_Y = data_df['label']

    # n_splits=1, so the splitter produces exactly one pair of index arrays.
    train_index, test_index = next(split_t.split(data_X, data_Y))

    train_df, test_df = data_X.iloc[train_index], data_X.iloc[test_index]
    train_labels, test_labels = data_Y.iloc[train_index], data_Y.iloc[test_index]

    # Features and labels are stored together, with the labels in a "labels" column.
    train_data = train_df.assign(labels=train_labels)
    test_data = test_df.assign(labels=test_labels)

    train_data.to_json(train_dump)
    test_data.to_json(test_dump)
else:
    train_data = pd.read_json(train_dump)
    test_data = pd.read_json(test_dump)

    train_labels = train_data['labels']
    test_labels = test_data['labels']
    train_df = train_data.drop(columns=["labels"])
    test_df = test_data.drop(columns=["labels"])

In [51]:
# Random forest with 140 trees, trained on 2 worker processes.
#
# Only the non-default settings are spelled out. The original call also passed
# `min_impurity_split` (removed in scikit-learn 1.0) and `max_features='auto'`
# (removed in 1.3), both of which raise on current releases. For classifiers
# 'auto' meant 'sqrt', so the model is unchanged. A fixed random_state makes
# the reported accuracies reproducible across runs.
clf = RandomForestClassifier(
    n_estimators=140,
    criterion='gini',
    max_features='sqrt',
    bootstrap=True,
    n_jobs=2,
    random_state=0,
)

In [52]:
# Train the forest on the training split; the value returned by fit() is kept
# in `res`, which the following cells use for prediction.
res = clf.fit(train_df, train_labels)

In [53]:
# Score the fitted model on the data it was trained on.
train_predictions = res.predict(train_df)
# True wherever the predicted label differs from the actual one.
diff = train_labels != train_predictions
num_missed = diff.sum()
num_labels = len(train_predictions)
accuracy = 1.0 * (num_labels - num_missed) / num_labels

In [54]:
# Report the miss count, sample count and accuracy for the training split.
print(
    "Train set: Missed %d predictions out of %d samples for an accuracy of %.3f"
    % (num_missed, num_labels, accuracy)
)

Train set: Missed 23585 predictions out of 316650 samples for an accuracy of 0.926


In [55]:
# Score the fitted model on the held-out test split.
test_predictions = res.predict(test_df)
# True wherever the predicted label differs from the actual one.
diff = test_labels != test_predictions
num_missed = diff.sum()
num_labels = len(test_predictions)
accuracy = 1.0 * (num_labels - num_missed) / num_labels

In [56]:
# Report the miss count, sample count and accuracy for the test split.
print(
    "Test set: Missed %d predictions out of %d samples for an accuracy of %.3f"
    % (num_missed, num_labels, accuracy)
)

Test set: Missed 4811 predictions out of 35184 samples for an accuracy of 0.863


In [58]:
# Confusion matrix on the test split: rows are the true labels, columns the
# predicted ones.
cnf_mat = confusion_matrix(y_true=test_labels, y_pred=test_predictions)

In [59]:
# Scratch check: first confusion-matrix row as a plain list with one extra
# element appended.
# NOTE(review): the literal 11 looks like a leftover experiment -- confirm it can go.
[*cnf_mat[0].tolist(), 11]

[3584, 0, 0, 0, 3, 0, 0, 0, 179, 11]

In [60]:
def _abbreviate(name):
    """Shorten a condition name to two letters per word.

    Each whitespace-separated word contributes its first letter upper-cased
    and its second letter lower-cased, e.g. "Acute sinusitis" -> "AcSi".
    Slices are used instead of direct indexing so one-character words and
    repeated spaces no longer raise IndexError.
    """
    return "".join(word[:1].upper() + word[1:2].lower() for word in name.split())


# Short row/column names, in the same order as the integer class labels.
condition_names = [_abbreviate(conditions_db[code]) for code in condition_codes]

# One table row per true class: its short name followed by that class's row
# of the confusion matrix.
table = [
    [name] + cnf_mat[idx, :].tolist()
    for idx, name in enumerate(condition_names)
]

# The rows have one more cell than there are headers, so the leading
# row-name column is printed without a header.
print(tabulate(table, headers=condition_names))

        Ur    As    AcSi    Ph    Py    AcBr    StTh    ChSi    Cy
----  ----  ----  ------  ----  ----  ------  ------  ------  ----
Ur    3584     0       0     0     3       0       0       0   179
As       0  3607       3     5     0     327       1      11     0
AcSi     0    16    2400     8     0      63      77    1382     0
Ph       0    21      17  3857     0      21      53       8     0
Py       2     1       0     1  3747       0       0       0    75
AcBr     0   515      44    18     0    3252      48      90     0
StTh     0     7      18    17     0       6    3840      78     0
ChSi     0    48    1230    13     0      82     154    2439     0
Cy     113     0       0     0    56       0       0       0  3647


## Export Symptoms to txt. 
###### To run this, change the Jupyter cell type from "markdown" to "code".

```python
symptoms_file = pathutils.get_data_file("prob-synthea-1/data/symptoms_db.json")
with open(symptoms_file) as fp:
    d = json.load(fp)
values = list(d.values())
values.sort()

symptoms_txt = pathutils.get_data_file("prob-synthea-1/data/symptoms_list.txt")
with open(symptoms_txt, "w") as fp:
    fp.write("SLUG, DESCRIPTION\n")
    for val in values:
        parts = " ".join(val.split("-"))
        fp.write("%s, %s\n" % (val, parts))
```