This notebook prepares the data exported from Synthea using the "sensible" disease modules.

In [1]:
# So we can use the *thesislib* package
import sys
import os

# Resolve the notebook's parent directory and register it on the import path
# (only once) so the local package can be imported.
module_path = os.path.abspath(os.pardir)

already_importable = module_path in sys.path
if not already_importable:
    sys.path += [module_path]

In [2]:
import pandas as pd
import numpy as np
import json
from dateutil.parser import parse as date_parser
from dateutil.relativedelta import relativedelta
from tabulate import tabulate

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

In [40]:
from thesislib.utils import pathutils
from thesislib.utils import stringutils

In [5]:
# Locations of the three CSV exports used below: patients, the conditions
# recorded for them, and the symptoms attached to each condition record.
# NOTE(review): presumably get_data_file resolves these relative paths against
# the project's data directory -- confirm in thesislib.utils.pathutils.
patients_csv = pathutils.get_data_file("simple-synthea/data/patients.csv")
patient_conditions_csv = pathutils.get_data_file("simple-synthea/data/patient_conditions.csv")
condition_symptom_csv = pathutils.get_data_file("simple-synthea/data/patient_condition_symptoms.csv")

In [6]:
# Load the three exports into data frames, in the same order as the paths above.
patients, conditions, symptoms = (
    pd.read_csv(csv_path)
    for csv_path in (patients_csv, patient_conditions_csv, condition_symptom_csv)
)

In [10]:
# Inspect which columns the conditions export provides.
conditions.columns

Index(['Id', 'PATIENT', 'CODE', 'DESCRIPTION', 'ONSET', 'DIAGNOSED'], dtype='object')

In [21]:
# Distinct condition codes in ascending order.
condition_codes = sorted(conditions['CODE'].unique().tolist())

# Code -> description, taken from the first row seen for each code. A single
# de-duplication pass replaces re-filtering the whole frame once per code.
_first_condition_description = conditions.drop_duplicates(subset='CODE').set_index('CODE')['DESCRIPTION']
conditions_db = {code: _first_condition_description.loc[code] for code in condition_codes}

In [22]:
# Distinct symptom codes in ascending order; this ordering defines the
# position of every symptom in the feature vector.
symptom_vector = sorted(symptoms['SYMPTOM_CODE'].unique().tolist())

# Code -> display name, taken from the first row seen for each code. A single
# de-duplication pass replaces re-filtering the whole frame once per code.
_first_symptom_display = symptoms.drop_duplicates(subset='SYMPTOM_CODE').set_index('SYMPTOM_CODE')['SYMPTOM_DISPLAY']
symptoms_db = {code: _first_symptom_display.loc[code] for code in symptom_vector}

For groups of "similar" diseases — e.g. the different types of sinusitis, pharyngitis and streptococcal sore throat, and the urinary tract infections — the symptoms are exactly (or almost exactly) identical.

Merging them would be an option. So we'd end up having 5 conditions as opposed to 11.

In [18]:
# Number of condition records per description. One value_counts() pass replaces
# filtering the whole frame once per code.
_records_per_code = conditions['CODE'].value_counts()
condition_count = {conditions_db[code]: int(_records_per_code.loc[code]) for code in condition_codes}

In [19]:
# Show how many records each condition has (the classes are heavily imbalanced).
condition_count

{'Streptococcal sore throat (disorder)': 7669,
 'Acute viral pharyngitis (disorder)': 26275,
 'Viral sinusitis (disorder)': 45361,
 'Sinusitis (disorder)': 2388,
 'Acute bronchitis (disorder)': 8163,
 'Acute bacterial sinusitis (disorder)': 2650,
 'Asthma': 10,
 'Escherichia coli urinary tract infection': 1142,
 'Pyelonephritis': 21,
 'Cystitis': 672,
 'Childhood asthma': 226}

In [23]:
# Integer (ordinal) label encoding of the conditions: every code is mapped to
# its position in condition_codes. Note this is not a one-hot encoding.
condition_labels = dict(zip(condition_codes, range(len(condition_codes))))

In [24]:
# Integer codes for the RACE column; a race's position in this tuple is its code.
race_code = {
    race_name: code
    for code, race_name in enumerate(('white', 'black', 'asian', 'native', 'other'))
}

In [25]:
# Columns removed from the merged frame before the feature vectors are built;
# none of them is read when the design matrix is assembled.
to_drop = [
    # condition record fields
    'DESCRIPTION', 'DIAGNOSED',
    # patient identity and administrative fields
    'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT',
    'PREFIX', 'FIRST', 'LAST', 'SUFFIX', 'MAIDEN',
    'MARITAL', 'ETHNICITY', 'BIRTHPLACE',
    # location and cost fields
    'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'ZIP', 'LAT', 'LON',
    'HEALTHCARE_EXPENSES',
    # join keys left over from the merges
    'Id', 'Id_pat',
    # symptom display / value fields
    'SYMPTOM_DISPLAY', 'VALUE_CODE', 'VALUE_DISPLAY',
    'HEALTHCARE_COVERAGE',
]

In [26]:
# Build the design matrix (one row per condition instance: label, age, gender,
# race and a 0/1 flag per symptom code), or load it from the JSON cache if one
# exists. Delete the cache file to force a rebuild after changing any of the
# feature logic below.
data_dump = pathutils.get_data_file("simple-synthea/output/data.json")
if os.path.exists(data_dump):
    # NOTE(review): JSON object keys are always strings, so if the symptom codes
    # are not strings the cached matrix carries different column labels than a
    # freshly built one -- confirm the dtype of SYMPTOM_CODE.
    with open(data_dump) as fp:
        design_matrix = json.load(fp)
else:
    # combine the dataframes
    # conditions with patients
    combined = conditions.merge(patients, how='left', left_on='PATIENT', right_on='Id', suffixes=('', '_pat'))
    # symptoms with conditions: one row per (condition instance, symptom)
    complete = symptoms.merge(combined, how='left', left_on='CONDITION_ID', right_on='Id', suffixes=('_symp', ''))
    complete = complete.drop(columns=to_drop)

    design_matrix = {
        "label": [],
        "age": [],
        "gender": [],
        "race": [],
    }
    design_matrix.update({item: [] for item in symptom_vector})

    # group by the condition id: each group holds the symptoms of one instance
    for _, df in complete.groupby('CONDITION_ID'):
        vector = {code: 0 for code in symptom_vector}

        onset_date = date_parser(df['ONSET'].iloc[0])
        patient_birthdate = date_parser(df["BIRTHDATE"].iloc[0])
        # Age in completed years at onset. A plain difference of calendar years
        # over-counts by one whenever the onset falls before that year's birthday.
        birthday_passed = (onset_date.month, onset_date.day) >= (patient_birthdate.month, patient_birthdate.day)
        vector['age'] = max(onset_date.year - patient_birthdate.year - (0 if birthday_passed else 1), 0)
        vector['gender'] = 0 if df['GENDER'].iloc[0] == 'F' else 1
        # Races missing from race_code fall back to 'other' instead of raising KeyError.
        vector['race'] = race_code.get(df['RACE'].iloc[0], race_code['other'])
        vector['label'] = condition_labels[df['CODE'].iloc[0]]

        # fill in the observations: flag every symptom recorded for this instance
        for symptom_code in df["SYMPTOM_CODE"]:
            vector[symptom_code] = 1

        for k, v in vector.items():
            design_matrix[k].append(v)

    # Make sure the output directory exists before writing the cache.
    dump_dir = os.path.dirname(data_dump)
    if dump_dir:
        os.makedirs(dump_dir, exist_ok=True)
    with open(data_dump, 'w') as fp:
        json.dump(design_matrix, fp)

In [33]:
# Load the cached train/test split if both files exist; otherwise create a
# stratified 90/10 split of the design matrix and cache it.
test_dump = pathutils.get_data_file("simple-synthea/output/test.json")
train_dump = pathutils.get_data_file("simple-synthea/output/train.json")

if os.path.exists(test_dump) and os.path.exists(train_dump):
    train_data = pd.read_json(train_dump)
    test_data = pd.read_json(test_dump)

    # The cached frames store the target in a "labels" column next to the features.
    train_labels, train_df = train_data['labels'], train_data.drop(columns=["labels"])
    test_labels, test_df = test_data['labels'], test_data.drop(columns=["labels"])
else:
    data_df = pd.DataFrame(design_matrix)
    data_X, data_Y = data_df.drop(columns=['label']), data_df['label']

    # let's keep a test set which we would use for evaluation
    # (n_splits=1, so the splitter yields exactly one pair of index arrays)
    split_t = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
    train_index, test_index = next(split_t.split(data_X, data_Y))

    train_df, test_df = data_X.iloc[train_index], data_X.iloc[test_index]
    train_labels, test_labels = data_Y.iloc[train_index], data_Y.iloc[test_index]

    # Persist features and target together, with the target under "labels".
    train_data = train_df.assign(labels=train_labels)
    test_data = test_df.assign(labels=test_labels)

    train_data.to_json(train_dump)
    test_data.to_json(test_dump)

In [28]:
# Some ML
# Only the settings that matter are spelled out. `min_impurity_split` and
# `max_features='auto'` are no longer accepted by recent scikit-learn releases;
# 'sqrt' is what 'auto' meant for classifiers. A fixed random_state makes the
# reported accuracies reproducible across runs.
clf = RandomForestClassifier(
    n_estimators=140,
    criterion='gini',
    max_features='sqrt',
    bootstrap=True,
    n_jobs=2,
    random_state=0,
)

In [29]:
# Fit the forest on the training split; `res` is the object the following
# cells call predict() on.
res = clf.fit(train_df, train_labels)

In [30]:
# predictions on the train set
train_predictions = res.predict(train_df)
# True wherever the predicted label differs from the actual one.
diff = train_labels != train_predictions
num_missed = diff.sum()
num_labels = train_predictions.shape[0]
accuracy = 1.0 * (num_labels - num_missed) / num_labels

In [31]:
# Report the random forest's accuracy on the training split.
print("Train set: Missed %d predictions out of %d samples for an accuracy of %.3f" % (num_missed, num_labels, accuracy))

Train set: Missed 5096 predictions out of 85119 samples for an accuracy of 0.940


In [34]:
# predictions on the held-out test set
test_predictions = res.predict(test_df)
# True wherever the predicted label differs from the actual one.
diff = test_labels != test_predictions
num_missed = diff.sum()
num_labels = test_predictions.shape[0]
accuracy = 1.0 * (num_labels - num_missed) / num_labels

In [35]:
# Report the random forest's accuracy on the held-out test split.
print("Test set: Missed %d predictions out of %d samples for an accuracy of %.3f" % (num_missed, num_labels, accuracy))

Test set: Missed 585 predictions out of 9458 samples for an accuracy of 0.938


In [36]:
# Pass the full label set explicitly so the matrix always has one row/column per
# condition, in label order, even if a rare class is absent from the test split
# or never predicted.
cnf_mat = confusion_matrix(test_labels, test_predictions, labels=sorted(condition_labels.values()))

In [37]:
# Check which integer class labels are present in the training split.
train_labels.unique()

array([10,  6,  3,  0,  5,  1,  9,  8,  2,  4,  7])

In [38]:
# Abbreviate every condition description to the first two characters of each
# word (e.g. "Acute bronchitis (disorder)" -> "AcBr(d") so the table stays narrow.
# Slicing instead of indexing keeps one-character or empty words from raising
# IndexError.
condition_names = [
    "".join(word[:1].upper() + word[1:2].lower() for word in conditions_db[code].split(" "))
    for code in condition_codes
]

# Rows and columns of the matrix must line up one-to-one with the names.
assert cnf_mat.shape == (len(condition_names), len(condition_names)), \
    "confusion matrix does not cover every condition label"

# One table row per true condition: its short name followed by the counts.
table = [[name] + counts.tolist() for name, counts in zip(condition_names, cnf_mat)]

print("\t\t\t\tConfusion Matrix for Test Set\n\t\t\t\t=============================\n")
print(tabulate(table, headers=condition_names))

				Confusion Matrix for Test Set

              AcBr(d    Si(d    Cy    StSoTh(d    Py    AcBaSi(d    AcViPh(d    As    ChAs    EsCoUrTrIn    ViSi(d
----------  --------  ------  ----  ----------  ----  ----------  ----------  ----  ------  ------------  --------
AcBr(d           816       0     0           0     0           0           0     0       0             0         0
Si(d               0       0     0           0     0           0           0     0       0             0       239
Cy                 0       0     7           0     0           0           0     0       0            60         0
StSoTh(d           0       0     0         767     0           0           0     0       0             0         0
Py                 0       0     0           0     0           0           0     0       0             2         0
AcBaSi(d           0       0     0           0     0           0           0     0       0             0       265
AcViPh(d           0       0     0           

Results are in line with expectations. The confusion only arises from conditions that have the same symptoms:
- There is confusion amongst the sinusitis variants, with Viral Sinusitis (the more prevalent condition) more often than not getting the majority vote.
- There is also confusion amongst the urinary tract infections. More often than not the Escherichia coli urinary tract infection gets the nod.

In [39]:
# deterministic solution
# We do not need any ML for this. A simple for loop will do the trick.
# We know that every instance of a given condition presents the same symptoms, so we exploit this:
# encode the symptom vector as a binary number and use its integer value in an if statement.

In [41]:
num_symptoms = len(symptom_vector)
# One all-zero indicator list per condition code; a separate list is created
# for every code so that each can be filled in independently.
condition_symptom_binary = dict((code, [0] * num_symptoms) for code in condition_codes)

In [42]:
# fill them in
# The first recorded instance of each condition is used as the reference for
# that condition's symptom set.
for code, flags in condition_symptom_binary.items():
    reference_id = conditions.loc[conditions['CODE'] == code, 'Id'].iloc[0]
    reference_symptoms = set(symptoms.loc[symptoms['CONDITION_ID'] == reference_id, 'SYMPTOM_CODE'].tolist())
    for position, symptom_code in enumerate(symptom_vector):
        if symptom_code in reference_symptoms:
            flags[position] = 1

In [43]:
# Collapse each condition's 0/1 symptom list into a single integer via
# stringutils.binary_seq_to_decimal, giving one numeric signature per code.
# NOTE(review): the bit order used by binary_seq_to_decimal is not visible
# here -- confirm before relying on specific integer values.
condition_symptom_int = {code: stringutils.binary_seq_to_decimal(val) for code, val in condition_symptom_binary.items()}

In [44]:
# using the symptom score we can then determine which is the disease
# Symptom signature (integer encoding of the 0/1 symptom vector) -> condition code.
# Where several conditions share a signature, the most prevalent one is returned.
_SIGNATURE_TO_CONDITION = {
    # Asthma (195967001) or Childhood asthma: childhood asthma is more prevalent
    33818736: 233678006,
    # Acute bronchitis
    36563072: 10509002,
    # Cystitis, Pyelonephritis or E. coli urinary tract infection: the E. coli
    # urinary tract infection is more prevalent
    68162433: 301011002,
    # Sinusitis, Viral sinusitis or Acute bacterial sinusitis: viral sinusitis
    # is more prevalent
    193347716: 444814009,
    # strep throat
    272688266: 43878008,
    # acute viral pharyngitis
    272950410: 195662009,
}


def deterministic_rule_system(vector):
    """Return the condition code for a binary symptom vector.

    The vector is converted to an integer with
    ``stringutils.binary_seq_to_decimal`` and looked up in the table of known
    symptom signatures.

    Raises:
        ValueError: if the signature matches none of the known combinations.
    """
    int_val = stringutils.binary_seq_to_decimal(vector)
    condition_code = _SIGNATURE_TO_CONDITION.get(int_val)
    if condition_code is None:
        raise ValueError("Unknown symptom combination")
    return condition_code

def toy_predict(test_samples):
    """Predict an integer condition label for every row of ``test_samples``.

    Columns whose name is a known symptom code are copied (as ints) into that
    symptom's slot of a binary vector; all other columns are ignored and
    symptoms without a column stay 0. The vector is then classified with
    ``deterministic_rule_system`` and the resulting condition code translated
    through ``condition_labels``.

    Returns:
        list: one label per row, in row order.
    """
    slot_of = {code: slot for slot, code in enumerate(symptom_vector)}
    # (column position in the frame, slot in the symptom vector) for every
    # column that is a known symptom.
    tracked = [
        (col_pos, slot_of[col])
        for col_pos, col in enumerate(test_samples.columns)
        if col in slot_of
    ]

    predictions = []
    for values in test_samples.itertuples(index=False, name=None):
        bin_vector = [0] * len(symptom_vector)
        for col_pos, slot in tracked:
            bin_vector[slot] = int(values[col_pos])
        predictions.append(condition_labels[deterministic_rule_system(bin_vector)])
    return predictions

In [45]:
# Run the rule-based predictor on the same held-out samples used for the forest.
det_test_predictions = toy_predict(test_df)

In [46]:
# Score the rule-based predictions against the held-out labels.
# True wherever the predicted label differs from the actual one.
diff = test_labels != np.asarray(det_test_predictions)
num_missed = np.sum(diff)
# Count the deterministic predictions themselves (previously this used the
# random forest's `test_predictions`, a copy-paste slip).
num_labels = len(det_test_predictions)
accuracy = (num_labels - num_missed) * 1.0 / num_labels

In [47]:
# Report the rule-based predictor's accuracy on the held-out test split.
print("Test set: Missed %d predictions out of %d samples for an accuracy of %.3f" % (num_missed, num_labels, accuracy))

Test set: Missed 574 predictions out of 9458 samples for an accuracy of 0.939


In [48]:
# Pass the full label set explicitly: the rule system never predicts some
# classes, and a rare class may be absent from the test split, yet the matrix
# must keep one row/column per condition in label order.
det_cnf_mat = confusion_matrix(test_labels, det_test_predictions, labels=sorted(condition_labels.values()))

In [49]:
# Abbreviate every condition description to the first two characters of each
# word (e.g. "Acute bronchitis (disorder)" -> "AcBr(d") so the table stays narrow.
# Slicing instead of indexing keeps one-character or empty words from raising
# IndexError.
condition_names = [
    "".join(word[:1].upper() + word[1:2].lower() for word in conditions_db[code].split(" "))
    for code in condition_codes
]

# Rows and columns of the matrix must line up one-to-one with the names.
assert det_cnf_mat.shape == (len(condition_names), len(condition_names)), \
    "confusion matrix does not cover every condition label"

# One table row per true condition: its short name followed by the counts.
table = [[name] + counts.tolist() for name, counts in zip(condition_names, det_cnf_mat)]

print("\t\t\t\t Det. Confusion Matrix for Test Set\n\t\t\t\t=====================================\n")
print(tabulate(table, headers=condition_names))

				 Det. Confusion Matrix for Test Set

              AcBr(d    Si(d    Cy    StSoTh(d    Py    AcBaSi(d    AcViPh(d    As    ChAs    EsCoUrTrIn    ViSi(d
----------  --------  ------  ----  ----------  ----  ----------  ----------  ----  ------  ------------  --------
AcBr(d           816       0     0           0     0           0           0     0       0             0         0
Si(d               0       0     0           0     0           0           0     0       0             0       239
Cy                 0       0     0           0     0           0           0     0       0            67         0
StSoTh(d           0       0     0         767     0           0           0     0       0             0         0
Py                 0       0     0           0     0           0           0     0       0             2         0
AcBaSi(d           0       0     0           0     0           0           0     0       0             0       265
AcViPh(d           0       0     0     

As expected, we are able to achieve the same results as the random forest by following this simple rule!