In [0]:
# Import libraries
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
import sklearn
import joblib

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, precision_recall_curve
from tensorflow import keras

# Make pandas dataframes prettier
from IPython.display import display, HTML

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery
from google.colab import files


In [0]:
# authenticate
auth.authenticate_user()

In [0]:
# Set up environment variables
project_id='tdothealthhack-team'
os.environ["GOOGLE_CLOUD_PROJECT"]=project_id

In [0]:
# Helper function to read data from BigQuery into a DataFrame.
def run_query(query):
    return pd.io.gbq.read_gbq(query, project_id=project_id, dialect="standard")

In [0]:
!pip install tableone

Collecting tableone
  Downloading https://files.pythonhosted.org/packages/f7/3a/20f31d8467380762887eb79359bc6ae5e7b522d7f7569e6727b4da5b2b00/tableone-0.6.0-py2.py3-none-any.whl
Installing collected packages: tableone
Successfully installed tableone-0.6.0


In [0]:
query= """SELECT ie.subject_id, ie.hadm_id, ie.icustay_id, DRUG_NAME_GENERIC as drug  FROM `physionet-data.mimiciii_clinical.icustays` ie  
   LEFT JOIN `physionet-data.mimiciii_clinical.prescriptions` pr
   ON pr.subject_id = ie.subject_id AND pr.hadm_id = ie.hadm_id
   AND pr.startdate BETWEEN (DATETIME_SUB(ie.intime, INTERVAL 6 HOUR)) AND (DATETIME_ADD(ie.intime, INTERVAL 1 DAY))   WHERE DRUG_NAME_GENERIC is not null AND DRUG_TYPE = 'MAIN'
   """

pres_dayone = run_query(query)

In [0]:
query= """SELECT ie.subject_id, ie.hadm_id, ie.icustay_id, DRUG_NAME_GENERIC as drug  FROM `physionet-data.mimiciii_clinical.icustays` ie  
   LEFT JOIN `physionet-data.mimiciii_clinical.prescriptions` pr
   ON pr.subject_id = ie.subject_id AND pr.hadm_id = ie.hadm_id
   AND pr.startdate BETWEEN (DATETIME_ADD(ie.intime, INTERVAL 24 HOUR)) AND (DATETIME_ADD(ie.intime, INTERVAL 8 DAY))   WHERE DRUG_NAME_GENERIC is not null AND DRUG_TYPE = 'MAIN'
   """

pres_weekafter = run_query(query)

In [0]:
query = """
SELECT * EXCEPT(subject_id) FROM `physionet-data.mimiciii_derived.labsfirstday` 
"""

labs = run_query(query)

In [0]:
query = """
SELECT A.SUBJECT_ID, A.ADMISSION_TYPE, A.ADMISSION_LOCATION, B.ICUSTAY_ID, C.GENDER
FROM `physionet-data.mimiciii_clinical.admissions` A
JOIN `physionet-data.mimiciii_clinical.icustays` B
ON A.SUBJECT_ID = B.SUBJECT_ID 
AND A.HADM_ID = B.HADM_ID
AND B.INTIME BETWEEN A.ADMITTIME AND A.DISCHTIME
JOIN `physionet-data.mimiciii_clinical.patients` C
ON A.SUBJECT_ID = C.SUBJECT_ID;
"""

patient_data = run_query(query)

In [0]:
query = """
SELECT A.SUBJECT_ID, A.HADM_ID, A.TRANSFERTIME, A.CURR_SERVICE, B.ICUSTAY_ID FROM `physionet-data.mimiciii_clinical.services` A
JOIN `physionet-data.mimiciii_clinical.icustays` B
ON A.SUBJECT_ID = B.SUBJECT_ID 
AND A.HADM_ID = B.HADM_ID
AND A.TRANSFERTIME BETWEEN B.INTIME AND B.OUTTIME
ORDER BY A.SUBJECT_ID
"""

service_data = run_query(query)

service_data2 = service_data.drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID'], keep="first")

In [0]:
drugs = np.unique(np.union1d(pres_dayone.drug.values,pres_weekafter.drug.values))

In [0]:
drugs.shape

(2389,)

In [0]:
!pip install tqdm



In [0]:
np.concatenate((np.array([1,2,3,4]), np.array([1,2])),axis=0)

array([1, 2, 3, 4, 1, 2])

In [0]:
labs[labs.icustay_id == sub].iloc[:,2:].values[0]

NameError: ignored

In [0]:
patient_data[patient_data.ICUSTAY_ID==21234].iloc[:,1:5].empty

True

In [0]:
### Create dictionary for data (drugs)
d = dict.fromkeys(drugs, [])


from sklearn.feature_extraction.text import CountVectorizer


# CREATE VECTORIZER FOR MULTIHOT
cv = CountVectorizer(binary=True, token_pattern=".*").fit(drugs)

In [0]:
patient_data[patient_data.ICUSTAY_ID==sub].iloc[:,1:5].values[0]

NameError: ignored

In [0]:
from tqdm import tqdm

X = []
s = []
y = []


for sub in tqdm(pres_dayone.icustay_id.unique()[:2000]):
  #Find values in the first day (see dayone query)
  drugs_for_patient = pres_dayone[pres_dayone.icustay_id == sub].drug.values
  multihot_patient = cv.transform(drugs_for_patient).toarray().any(axis=0)[1:] * 1
  
  #SERVICE
  if service_data2[service_data2.ICUSTAY_ID == sub].empty:
    np.concatenate((multihot_patient,[" "]))
  else:
    multihot_patient = np.concatenate((multihot_patient, [service_data2[service_data2.ICUSTAY_ID == sub].CURR_SERVICE.values[0]]))
  #LABS
  multihot_patient = np.concatenate((multihot_patient, labs[labs.icustay_id == sub].iloc[:,2:].values[0]))
  #PATIENT
  if patient_data[patient_data.ICUSTAY_ID==sub].iloc[:,1:5].empty:
    continue
  else:
    multihot_patient = np.concatenate((multihot_patient, np.delete(patient_data[patient_data.ICUSTAY_ID==sub].iloc[:,1:5].values[0], 2)))
    
  if multihot_patient.shape[0] != 2273:
    continue

  X.append(multihot_patient)
  s.append(sub)
  
  #Extract values for the week after for the same patient (see week after query)
  drugs_for_patient = pres_weekafter[pres_weekafter.icustay_id == sub].drug.values
  multihot_patient = cv.transform(drugs_for_patient).toarray().any(axis=0)[1:] * 1
  y.append(multihot_patient)

100%|██████████| 2000/2000 [00:21<00:00, 92.08it/s]


In [0]:
np.stack(X).shape

(1327, 2273)

In [0]:
one_hot_slice = [2231]
one_hot_slice.extend([i for i in range(2270,2273)])
scaler_slice = [i for i in range(2232,2270)]
transformers = [
                ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore'), one_hot_slice),
                ('scaler', Pipeline([
                                                      ('imputer', SimpleImputer()),
                                                      ('minmaxscale', sklearn.preprocessing.MinMaxScaler())
                ]), scaler_slice)
]
ct = ColumnTransformer(transformers=transformers, remainder='passthrough')

In [0]:
Xt = ct.fit_transform(np.array(X))

In [0]:
np.set_printoptions(threshold=np.inf)
Xt[0]

array([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 1.0, 0.46428571428571425, 0.26530612244897955,
       0.9130434782608695, 0.9130434782608695, 0.17012946911047883,
       0.1825206272985385, 0.5384615384615384, 0.42105263157894735,
       0.050125313283208024, 0.045766590389016024, 0.028776978417266185,
       0.017241379310344827, 0.4736842105263157, 0.29508196721311464,
       0.2210796915167095, 0.04046242774566474, 0.6594059405940593,
       0.7034313725490196, 0.7065868263473054, 0.7957746478873238,
       0.17045454545454547, 0.04262295081967213, 0.22222222222222224,
       0.08918322295805739, 0.5238095238095238, 0.11926605504587151,
       0.10910458991723102, 0.09961977186311788, 0.18867924528301888,
       0.06410256410256411, 0.226044226044226, 0.0635260528194147,
       0.726190476190476, 0.4230769230769229, 0.07801418439716312,
       0.06701030927835051, 0.01296

In [0]:
X = np.stack(X)
y = np.stack(y)
icustay_ids = s

In [0]:
X_train, X_test, y_train, y_test, s_train, s_test = train_test_split(Xt,y,s)

In [0]:
model = keras.models.Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, 20-dimensional vectors.
model.add(keras.layers.Dense(64, activation='relu', input_dim=np.shape(X_train)[1]))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(np.shape(y_train)[1], activation='sigmoid'))

model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
Xt.shape

(1327, 2298)

In [0]:
model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[keras.callbacks.ReduceLROnPlateau(patience=3)])

Train on 995 samples, validate on 332 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7feb570643c8>

In [0]:
X_test.shape

(332, 2298)

In [0]:
preds = model.predict(X_test[:2,:])

In [0]:
drugs[preds > 0.05]

IndexError: ignored

In [0]:
preds = model.predict(X_test)
preds_dichot = (preds > 0.05) * 1
f1 = f1_score(y_test, preds_dichot, average='micro')
auroc = roc_auc_score(y_test, preds, average='micro')
precision = precision_score(y_test, preds_dichot, average='micro')
recall = recall_score(y_test, preds_dichot, average='micro')

In [0]:
print("""
Precision:{}\n
Recall:{}\n
Auroc:{}\n
f1:{}\n""".format(precision, recall, auroc, f1))


Precision:0.14884369009713325

Recall:0.6650871531568276

Auroc:0.9725561785723061

f1:0.24324923163323728



NameError: ignored

In [0]:
model.save('model.h5')
files.download('model.h5')

In [0]:
joblib.dump((X_test[:10], y_test[:10], s_test[:10]), 'data.joblib')
files.download('data.joblib')