In [1]:
import numpy as np
import pandas as pd
import os
import sys
sys.path.append(os.path.join(os.path.abspath('../'), 'predictions_collapsed'))
sys.path.append(os.path.join(os.path.abspath('../'), 'src'))
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
pd.set_option('display.max_colwidth', None)
import glob
import datetime
from config_loader import (
    D_CONFIG, DATASET_TOP_PATH,
    DATASET_SITE_PATH, PROJECT_REPO_DIR, PROJECT_CONDA_ENV_YAML,
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
    RESULTS_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH
    )

sys.path.append(os.path.join(PROJECT_REPO_DIR, 'src'))
from feature_transformation import *
from utils import load_data_dict_json
# Root of the deployment code checkout; the root itself and a few of its
# sub-directories are appended to sys.path (in this order) so that modules
# located there can be imported by later cells.
deployment_code_dir = os.path.join('/home', 'prash', 'clinical_deterioration')
for _subdir_parts in ((), ('src_code',), ('data_example',), ('src_code', 'utils_specs')):
    # os.path.join with no extra parts returns the root directory unchanged.
    sys.path.append(os.path.join(deployment_code_dir, *_subdir_parts))


# Narrow the imported results path to one classifier's sub-directory.
# NOTE(review): the sub-directory name says logistic regression while the model
# loaded further down is a LightGBM .onnx file -- confirm this is intended.
RESULTS_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH = os.path.join(
    RESULTS_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
    'sklearn_logistic_regression',
)
# Directory containing the classifier train/test split files and feature spec.
CLF_TRAIN_TEST_SPLIT_PATH = os.path.join(
    DATASET_COLLAPSED_FEAT_DYNAMIC_INPUT_OUTPUT_PATH,
    'classifier_train_test_split',
)

## Load the raw data as well as featurized data for a test participant

In [7]:
# Admission used as the worked example throughout the rest of the notebook.
test_adm_id = 11801416

# Only the first 1000 rows of each split file are read; the example admission's
# rows are then selected from that slice and re-indexed from zero.
x_test_df = pd.read_csv(os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'x_test.csv.gz'), nrows=1000)
x_test_df = x_test_df.loc[x_test_df['hospital_admission_id'] == test_adm_id].reset_index(drop=True)

y_test_df = pd.read_csv(os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'y_test.csv.gz'), nrows=1000)
y_test_df = y_test_df.loc[y_test_df['hospital_admission_id'] == test_adm_id].reset_index(drop=True)

In [8]:
def _read_site_table_for_test_admission(csv_name):
    """Read one CSV from DATASET_SITE_PATH and keep only the rows of `test_adm_id`."""
    full_table = pd.read_csv(os.path.join(DATASET_SITE_PATH, csv_name))
    is_test_admission = full_table['hospital_admission_id'] == test_adm_id
    return full_table.loc[is_test_admission].reset_index(drop=True)


# Raw tables, restricted to the example admission.
vitals_df = _read_site_table_for_test_admission('vitals_before_icu.csv.gz')
labs_df = _read_site_table_for_test_admission('labs_before_icu.csv.gz')
medications_df = _read_site_table_for_test_admission('medications_before_icu.csv.gz')
demographics_df = _read_site_table_for_test_admission('demographics_before_icu.csv.gz')

# Data dictionaries (JSON specs) for each raw table.
demographics_dd = load_data_dict_json(os.path.join(DATASET_SITE_PATH, 'Spec-Demographics.json'))
vitals_dd = load_data_dict_json(os.path.join(DATASET_SITE_PATH, 'Spec-Vitals.json'))
labs_dd = load_data_dict_json(os.path.join(DATASET_SITE_PATH, 'Spec-Labs.json'))
medications_dd = load_data_dict_json(os.path.join(DATASET_SITE_PATH, 'Spec-Medications.json'))

# Spec and feature column names of the collapsed (featurized) representation.
collapsed_features_dd = load_data_dict_json(os.path.join(CLF_TRAIN_TEST_SPLIT_PATH, 'Spec_features.json'))
collapsed_features = parse_feature_cols(collapsed_features_dd)

# Column names taken from the specs; id and time columns come from the vitals spec.
vital_cols = parse_feature_cols(vitals_dd)
lab_cols = parse_feature_cols(labs_dd)
medication_cols = parse_feature_cols(medications_dd)
demographic_cols = parse_feature_cols(demographics_dd)
id_cols = parse_id_cols(vitals_dd)
# The last time column listed in the vitals spec is used as the time axis.
time_col = parse_time_cols(vitals_dd)[-1]

## Create an artificial JSON for this participant that can be passed to the deployment code

In [None]:
# Build the JSON-serializable payload (`api_dict`) for the example admission:
# patient id, start / prediction timestamps, and per-variable lists of
# {'value', 'datetime', 'code'} measurements taken from the raw tables.
curr_pid = vitals_df.patient_id.unique()[0]
curr_adm_ts = demographics_df.admission_timestamp.unique()[0]
curr_pred_start_ts = curr_adm_ts

prediction_window_end_hrs = 24
# Parse the admission timestamp once; every measurement datetime is an offset from it.
start_ts = pd.to_datetime(curr_adm_ts)
curr_pred_ts = str(start_ts + datetime.timedelta(hours=prediction_window_end_hrs))

api_dict = dict()

api_dict['patient_id'] = int(curr_pid)
api_dict['start_datetime'] = curr_adm_ts
api_dict['prediction_datetime'] = curr_pred_ts
api_dict['measurements_over_time_by_variable'] = dict()
measurements_by_variable = api_dict['measurements_over_time_by_variable']

for feature_cols, features_df, features_dd in [(vital_cols, vitals_df, vitals_dd),
                                               (lab_cols, labs_df, labs_dd),
                                               (medication_cols, medications_df, medications_dd),
                                               (demographic_cols, demographics_df, demographics_dd)]:

    # Index the spec fields by name once per table instead of scanning the
    # field list for every single observation. setdefault keeps the first
    # field when a name is repeated, matching a first-match lookup.
    field_by_name = dict()
    for field in features_dd['schema']['fields']:
        field_by_name.setdefault(field['name'], field)

    has_time_axis = time_col in features_df.columns

    for col in feature_cols:
        curr_feature = features_df[col].values
        # pd.isna (rather than np.isnan) also copes with integer/object columns.
        mask = np.logical_not(pd.isna(curr_feature))
        curr_feature = curr_feature[mask]

        if has_time_axis:
            curr_t = features_df[time_col].values[mask]

            # keep only measurements before the prediction time
            keep_t = curr_t <= prediction_window_end_hrs
            curr_t = curr_t[keep_t]
            curr_feature = curr_feature[keep_t]

            # Variables without any observation in the window are left out.
            if len(curr_t) > 0:
                curr_codes = field_by_name[col]['codes']
                measurements_by_variable[col] = [
                    {
                        'value': float(value),
                        # float() so numpy integer hour offsets are accepted by timedelta
                        'datetime': str(start_ts + datetime.timedelta(hours=float(hours))),
                        'code': curr_codes,
                    }
                    for hours, value in zip(curr_t, curr_feature)
                ]
        else:
            # Static (time-independent) feature: a missing value leaves an empty
            # array after masking, which cannot be converted to a float. Skip the
            # variable in that case, as is done for time-varying variables
            # without observations.
            if len(curr_feature) == 0:
                continue
            # Index explicitly: float() on a whole array fails for more than one
            # row and is deprecated for 1-d arrays in recent NumPy versions.
            # NOTE(review): assumes one row per admission in static tables; if
            # several rows exist, the first non-missing value is used.
            measurements_by_variable[col] = [
                {
                    'value': float(curr_feature[0]),
                    'datetime': str(start_ts),
                    'code': field_by_name[col]['codes'],
                }
            ]


In [99]:
## Make sure to save the JSON in a location that is not tracked by git
import json  # local import: `json` is not imported in the notebook's import cell

# os.path.join (not the non-existent os.path.json) with forward-slash path
# components; the backslash-separated string was not an absolute path on Linux.
with open(os.path.join('/home', 'prash', 'datasets', 'real_mock_1.json'), 'w') as fp:
    json.dump(api_dict, fp)

## Featurize and predict with the artificial JSON using the same code as in deployment

```console
python run_deploy_demo.py --input_data_ts_json_fpath data_example/real_mock_1.json --pretrained_model_path data_example/lightGBM_min_samples_per_leaf\=1024-max_leaves\=128-n_estimators\=100-frac_features_for_clf\=0.33-frac_training_samples_per_tree\=0.33.onnx --pretrained_watermark_path data_example/watermark.LR.onnx1.6.0_py3.8.8_darwin.txt
-------------------------------
Environment compatibility check
-------------------------------
      package  train_version    cur_version
 Architecture          64bit          64bit
      Machine         x86_64         x86_64
           OS         Darwin          Linux
       Python          3.8.8          3.8.2
        numpy         1.20.1         1.19.4
         onnx          1.6.0         1.10.1
  onnxruntime  not installed          1.8.1
     protobuf  not installed  not installed
 scikit-learn         0.24.1         0.22.1
        scipy          1.6.2          1.4.1
     skl2onnx          1.9.2          1.9.2

------------------------
Loading pretrained model
------------------------
Attempting load from file:
/home/prash/clinical_deterioration/data_example/lightGBM_min_samples_per_leaf=1024-max_leaves=128-n_estimators=100-frac_features_for_clf=0.33-frac_training_samples_per_tree=0.33.onnx
... done. Load complete.

-----------------------
Loading input data
-----------------------
For Patient #1539160 from 2022-08-23 01:44:00 - 2022-08-24 01:44:00
   3 obs. of body_temperature
   4 obs. of diastolic_blood_pressure
   4 obs. of heart_rate
   4 obs. of o2_sat
   4 obs. of systolic_blood_pressure
   1 obs. of basophils
   1 obs. of bicarbonate_venous_blood
   1 obs. of creatinine_in_serum
   1 obs. of eosinophils
   1 obs. of erithrocytes
   1 obs. of glucose_in_serum
   1 obs. of hemoglobin
   1 obs. of lymphocytes
   1 obs. of monocytes
   1 obs. of neutrophils
   1 obs. of oxygen_venous_blood
   1 obs. of ph_venous_blood
   1 obs. of potassium_in_serum
   1 obs. of sodium_in_serum
   1 obs. of age_at_admission
   1 obs. of gender_is_male
   1 obs. of gender_is_unknown

-----------------------
Predicting outcome output_probability
-----------------------

Asking for a probability of deterioration, we get:
0.1957433

Asking for a binary decision, we get:
0
```

## Compare predicted probabilities from the deployment code vs. the test-set predicted probabilities

In [12]:
# Path of the ONNX model file to score with
onx_file = '/home/prash/clinical_deterioration/data_example/lightGBM_min_samples_per_leaf=1024-max_leaves=128-n_estimators=100-frac_features_for_clf=0.33-frac_training_samples_per_tree=0.33.onnx'

# Collapsed feature rows of the example admission whose `stop` column equals 24.0
# (presumably the window ending at the 24-hour prediction time -- confirm).
is_stop_24 = x_test_df['stop'] == 24.0
x_test = x_test_df.loc[is_stop_24][collapsed_features].to_numpy()

# NOTE(review): `rt` is not imported in the visible cells; presumably
# `import onnxruntime as rt` ran elsewhere -- add it to the import cell so the
# notebook survives Restart & Run All.
sess = rt.InferenceSession(onx_file)
input_name = sess.get_inputs()[0].name
# The model's second output is requested (named as the probability output here).
proba_label_name = sess.get_outputs()[1].name
onx_outputs = sess.run([proba_label_name], {input_name: x_test.astype(np.float32)})
pred_probas_onx = onx_outputs[0]


pred_probas_onx

[{0: 0.8042566776275635, 1: 0.19574332237243652}]