In [1]:
# So we can use the *thesislib* package
import sys
import os

# Resolve the parent of the current working directory and make sure it is on
# the import path exactly once, so the local package can be imported below.
module_path = os.path.abspath("..")
already_importable = module_path in sys.path

if not already_importable:
    sys.path.append(module_path)

In [10]:
# train a random forest classifier on the train data
# read in the filtered dataset and prep for training
%matplotlib inline

import pandas as pd
import json
import time
import matplotlib.pyplot as plt
import numpy as np
import math

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

In [4]:
from thesislib.utils import pathutils
from thesislib.utils.imput import utils as tutils

In [16]:
# Directories of the plain-synthea dataset, both resolved relative to the
# project data directory reported by pathutils.
# OUTPUT_DIR is where the train.csv used by this notebook is read from.
OUTPUT_DIR = os.path.join(pathutils.get_data_directory(), "plain-synthea/output")
# NOTE(review): DATA_DIR is not referenced by any cell visible in this notebook;
# the observations lookup further down builds its own path instead.
DATA_DIR = os.path.join(pathutils.get_data_directory(), "plain-synthea/data")

In [17]:
# Path of the training split inside OUTPUT_DIR.
train_csv = os.path.join(OUTPUT_DIR, "train.csv")

In [18]:
# Load the training split through the project helper.
# NOTE(review): prep_data lives in thesislib and is not shown here; the cells
# below assume it returns a DataFrame with a 'condition_labels' column - confirm.
train_df = tutils.prep_data(train_csv)

In [19]:
# Handle the class imbalance with a very basic undersampling scheme:
# find the smallest class and use (a fraction of) its size as the number of
# training samples taken from every class. For example, if the smallest class
# has 10 samples we take ~10 samples from every other class and train on those.
# Whatever is left over in each class goes to the validation set.
LABEL_COLUMN = 'condition_labels'
TRAIN_FRACTION = 0.95  # share of the smallest class that is kept for training

# Count rows per class directly instead of assuming which label is the rarest.
# (The earlier `.count().mean()` averaged the per-column non-null counts, which
# only equals the number of rows when no column contains missing values.)
class_counts = train_df[LABEL_COLUMN].value_counts()
num_min = int(TRAIN_FRACTION * class_counts.min())

dfs = []
val_dfs = []
# Iterate over the labels actually present, in ascending order, rather than a
# hard-coded range(10).
for label in sorted(class_counts.index):
    class_rows = train_df.loc[train_df[LABEL_COLUMN] == label]
    # NOTE: rows are taken in their existing order (no shuffling); the first
    # `num_min` rows of each class are used for training, the rest for validation.
    dfs.append(class_rows.iloc[:num_min])
    val_dfs.append(class_rows.iloc[num_min:])

In [20]:
# Stack the per-class slices into one balanced training frame and one
# validation frame holding the remaining rows.
train_data, val_data = pd.concat(dfs), pd.concat(val_dfs)

In [21]:
# Separate the target column from the feature columns of the training frame.
train_labels = train_data['condition_labels']
train_vector = train_data.drop('condition_labels', axis=1)

In [22]:
# Separate the target column from the feature columns of the validation frame.
val_labels = val_data['condition_labels']
val_vector = val_data.drop('condition_labels', axis=1)

In [23]:
# Random forest used for the balanced-training experiment.
# - A fixed seed makes the fitted forest (and every number reported below)
#   reproducible across kernel restarts.
# - max_features='sqrt' is what 'auto' meant for classifiers; 'auto' and the
#   `min_impurity_split` argument are no longer accepted by recent
#   scikit-learn releases, so they are not passed here.
RANDOM_SEED = 42

clf = RandomForestClassifier(
    n_estimators=140,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='sqrt',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=2,
    random_state=RANDOM_SEED,
    verbose=0,
    warm_start=False,
    class_weight=None,
)

In [15]:
# Fit the forest on the balanced training subset. scikit-learn's fit() returns
# the fitted estimator, so `res` is used below for predictions and importances.
# NOTE(review): this cell's execution count (15) is lower than that of the cell
# defining `clf` (23), so the recorded outputs may come from an earlier
# classifier object - re-run the notebook top to bottom to confirm.
res = clf.fit(train_vector, train_labels)

In [24]:
# Read the observations lookup (used below to turn feature column names into
# readable observation names) from its JSON file.
observations_db_file = pathutils.get_data_file("plain-synthea/data/observations.json")
with open(observations_db_file) as handle:
    observations_db = json.loads(handle.read())

In [25]:
print("Feature Importances:\n")

# Patient-level columns are reported under their own column name.
meta_features = ['marital_status_code', 'gender_code', 'race_code', 'patient_age']
# Columns whose name carries a "_code" suffix that the observations lookup
# does not use as its key.
suffixed_observations = {
    '32465-7_code': '32465-7',
    '72166-2_code': '72166-2',
}

# Which feature is the most relevant for classification? Rank them by the
# forest's importance scores (argsort is ascending, so walk it backwards).
importances = res.feature_importances_
sorted_features_index = np.argsort(importances)

for idx in sorted_features_index[::-1]:
    feat_name = train_vector.columns[idx]
    if feat_name in meta_features:
        name = feat_name
    else:
        lookup_key = suffixed_observations.get(feat_name, feat_name)
        name = observations_db[lookup_key]

    # importance is printed as a percentage
    print("%s, %.4f" % (name, importances[idx] * 100))

Feature Importances:

patient_age, 52.6335
Oral temperature, 26.8771
marital_status_code, 6.0959
race_code, 3.5315
gender_code, 2.9724
Body Weight, 1.0379
Body Height, 0.9504
Body Mass Index, 0.9365
Systolic Blood Pressure, 0.8868
Diastolic Blood Pressure, 0.7990
Pain severity - 0-10 verbal numeric rating [Score] - Reported, 0.5893
Tobacco smoking status NHIS, 0.3077
Sodium, 0.2054
Carbon Dioxide, 0.1611
Hemoglobin A1c/Hemoglobin.total in Blood, 0.1553
Urea Nitrogen, 0.1473
Chloride, 0.1472
Glucose, 0.1421
Creatinine, 0.1360
Total Cholesterol, 0.1322
Calcium, 0.1307
Low Density Lipoprotein Cholesterol, 0.1289
Triglycerides, 0.1251
High Density Lipoprotein Cholesterol, 0.1193
Potassium, 0.1158
Hematocrit [Volume Fraction] of Blood by Automated count, 0.0590
Platelet mean volume [Entitic volume] in Blood by Automated count, 0.0474
Hemoglobin [Mass/volume] in Blood, 0.0440
MCH [Entitic mass] by Automated count, 0.0411
Platelets [#/volume] in Blood by Automated count, 0.0410
MCV [Entitic v

**Note**

- Now the age is more important for the classification than it was when we did nothing to handle class imbalance

In [26]:
# Sanity check: how well does the forest classify the data it was trained on?
train_predictions = res.predict(train_vector)

# A sample is missed whenever the predicted label differs from the true one.
diff = train_predictions != train_labels
num_missed = np.count_nonzero(diff)
num_labels = len(train_predictions)
accuracy = float(num_labels - num_missed) / num_labels

print("Train set: Missed %d predictions out of %d samples for an accuracy of %.3f" % (num_missed, num_labels, accuracy))

Train set: Missed 775 predictions out of 2470 samples for an accuracy of 0.686


**Note**:
- This is lower than what we got without touching the class imbalance!

In [27]:
# Same check on the held-out validation rows (the per-class leftovers).
val_predictions = res.predict(val_vector)

# A sample is missed whenever the predicted label differs from the true one.
diff = val_predictions != val_labels
num_missed = np.count_nonzero(diff)
num_labels = len(val_predictions)
accuracy = float(num_labels - num_missed) / num_labels

print("Validation set: Missed %d predictions out of %d samples for an accuracy of %.3f" % (num_missed, num_labels, accuracy))

Validation set: Missed 15717 predictions out of 24527 samples for an accuracy of 0.359


**Note**:
- Really terrible performance!!!!

In [28]:
# Confusion matrix on the training set.
# Rows/columns follow the order in which labels first appear in train_labels.
labels_index = train_labels.unique()
confusion_matrix(
    y_true=train_labels,
    y_pred=train_predictions,
    labels=labels_index,
)

array([[123,   0,  21,  18,  11,   0,  18,  17,  32,   7],
       [  0, 241,   0,   0,   0,   4,   0,   1,   0,   1],
       [ 14,   0, 127,  16,   9,   0,  15,  18,  42,   6],
       [  4,   0,  12, 144,   7,   0,  12,  18,  34,  16],
       [  7,   0,  11,   3, 145,   0,   3,  14,   7,  57],
       [  0,   4,   0,   1,   1, 241,   0,   0,   0,   0],
       [ 10,   0,  22,  17,   9,   0, 122,  29,  34,   4],
       [ 10,   0,  13,  10,   6,   0,  19, 141,  41,   7],
       [  2,   0,  14,   8,   0,   0,  10,   8, 205,   0],
       [  0,   0,   0,   2,  39,   0,   0,   0,   0, 206]])

**Note**:
- In general, the confusion matrix on the train set is much better than when nothing was done to handle the class imbalance. There are fewer misclassifications of similar sinusitis conditions as the Viral sinusitis (label 0) class.

In [29]:
# Confusion matrix on the validation set.
# Rows/columns follow the order in which labels first appear in val_labels.
labels_index = val_labels.unique()
confusion_matrix(
    y_true=val_labels,
    y_pred=val_predictions,
    labels=labels_index,
)

array([[1689,    0, 1589,  964,  542,    0, 1318, 1714, 1789,  241],
       [   3, 4262,    3,   14,   15, 1056,    2,    8,    4,    5],
       [ 765,    1,  717,  410,  278,    1,  642,  758,  755,  134],
       [ 178,    0,  161,  571,   57,    0,  168,  205,  249,  199],
       [  39,    0,   61,   47,  559,    0,   47,   90,   47,  323],
       [   1,  273,    0,    1,    9,  877,    0,    0,    0,    0],
       [  60,    0,   53,   33,   17,    0,   41,   63,   47,    7],
       [  48,    0,   56,   34,   15,    0,   40,   61,   52,    7],
       [   4,    0,    5,    1,    0,    0,    2,    7,   20,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,   13]])

**Note**:
- This is clearly a case of "overfitting" (which is even flattery, considering that accuracy on the train set was just 69%).

Not even going to bother checking the test set (haha)

# Observations
- The results do make sense in a way. In this case, 247 entries per class are simply not enough to discover the underlying characteristics of the data, and this explains why the model completely fails on the validation set.
- Also from the results of the confusion matrix in the train set it is clear that handling the class imbalance is very important and might be the key to pushing prediction somewhat higher