In [1]:
import tensorflow as tf
import math
import numpy as np
from sklearn.metrics import roc_auc_score
import argparse, re, random, json
from collections import namedtuple

In [2]:
# Drop whatever graph is left over from an earlier run of this cell so that
# ops and variables are not created twice.
tf.reset_default_graph()

# ---- Hyperparameters -------------------------------------------------------
learning_rate = 0.001
n_input = 14     # features per timestep: one-hot vector of length 13 plus the measured value
n_steps = 100    # timesteps fed to the LSTM per session run
n_hidden = 100   # number of LSTM hidden units
n_classes = 1    # single output: in-hospital mortality (0 = lived, 1 = died)
batch_size = 30

# ---- Graph inputs ----------------------------------------------------------
# X: (patients, n_steps, n_input) window of measurements.
X = tf.placeholder(dtype=tf.float32, shape=[None, n_steps, n_input], name='X')
# y: one label per patient, shape (patients,). The label says whether the
# patient died in hospital.
y = tf.placeholder(dtype=tf.float32, shape=[None], name='y')
# Labels laid out with batch_size rows. The reshape fails at run time when the
# number of labels fed is not divisible by batch_size.
y2 = tf.reshape(y, [batch_size, -1])

# ---- Output projection: hidden state -> n_classes logits --------------------
w1 = tf.Variable(tf.random_normal(shape=[n_hidden, n_classes]), name='w1')
# NOTE(review): this is the bias term, yet its graph name is 'w2'. The name is
# left untouched here so that variable names in the graph stay the same.
b1 = tf.Variable(tf.random_normal(shape=[n_classes]), name='w2')

def length(sequence):
    """Count the non-zero timesteps of every sequence in a batch.

    A timestep is counted when the largest absolute value across its features
    (axis 2) is non-zero; timesteps whose features are all zero contribute
    nothing. The per-timestep flags are summed along axis 1.

    Args:
        sequence: rank-3 tensor indexed as (sequence, timestep, feature).

    Returns:
        int32 tensor with one count per sequence.
    """
    step_is_used = tf.sign(tf.reduce_max(tf.abs(sequence), axis=2))
    steps_per_sequence = tf.reduce_sum(step_is_used, axis=1)
    return tf.cast(steps_per_sequence, tf.int32)

# Number of non-zero timesteps per patient in the window currently fed to X.
X_l = length(X)

# LSTM cell whose state is a single tensor (state_is_tuple=False) rather than
# a tuple, so the whole recurrent state fits in the one variable below.
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=n_hidden, forget_bias=1.0, state_is_tuple=False)

# Non-trainable variable that holds the recurrent state between session runs.
# Its initial value is the cell's all-zero state for `batch_size` sequences.
state = tf.Variable(lstm_cell.zero_state(batch_size, tf.float32), trainable=False)

# Unroll the cell over X, starting from the stored state and honouring the
# per-patient sequence lengths.
outputs, states = tf.nn.dynamic_rnn(
        lstm_cell,
        X,
        sequence_length=X_l,
        initial_state=state,
        dtype=tf.float32)

# Running this op writes the final state of the current run back into `state`.
state_op = tf.assign(state, states)


def last_relevant(output, length):
    """Pick, for every sequence, the output at timestep `length - 1`.

    The (sequence, timestep, feature) tensor is flattened to
    (sequence * timestep, feature) and a single row per sequence is gathered.

    Args:
        output: rank-3 tensor; the size of its last axis must be statically
            known because it is converted with `int(...)`.
        length: integer tensor with one entry per sequence.

    Returns:
        Tensor with one row of features per sequence.
    """
    dynamic_shape = tf.shape(output)
    num_sequences = dynamic_shape[0]
    steps_per_sequence = dynamic_shape[1]
    feature_size = int(output.get_shape()[2])

    # Offset of each sequence's first row in the flattened tensor, moved
    # forward to the row of its last used timestep.
    first_row = tf.range(0, num_sequences) * steps_per_sequence
    wanted_row = first_row + (length - 1)

    flattened = tf.reshape(output, [-1, feature_size])
    return tf.gather(flattened, wanted_row)


# Drop the patients that have no measurements in the current window (sequence
# length zero); only the remaining rows take part in prediction and loss.
mask = tf.greater(X_l, 0)
outputs2 = tf.boolean_mask(outputs, mask)
X_l2 = tf.boolean_mask(X_l, mask)
y3 = tf.boolean_mask(y2, mask)
# Number of patients kept by the mask.
masksum = tf.reduce_sum(tf.to_float(mask))

# LSTM output at the last used timestep of every remaining patient.
rel = last_relevant(outputs2, X_l2)

# Linear projection to logits, then sigmoid for the predicted probability.
pred1 = tf.matmul(rel, w1)
pred2 = pred1 + b1
sigmoid_pred = tf.sigmoid(pred2)

# Mean cross-entropy over the remaining patients; pos_weight is 1.
per_patient_loss = tf.nn.weighted_cross_entropy_with_logits(targets=y3, logits=pred2, pos_weight=1)
cost = tf.reduce_mean(per_patient_loss, name='cost')

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost, name='optimizer')

# max_to_keep=None is passed explicitly to the checkpoint saver.
saver = tf.train.Saver(max_to_keep=None)




  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [3]:
# Shards xN0 .. xN69 are used for training and xN70 .. xN79 for testing.
# Earlier experiments (now disabled) also drew extra training and validation
# shards from '../data/xC*.tfrecord'.
train_filenames = ['../data/xN' + str(shard) + '.tfrecord' for shard in range(70)]
test_filenames = ['../data/xN' + str(shard) + '.tfrecord' for shard in range(70, 80)]

# One TFRecord stream per split, reading the shard files listed above in order.
train_dataset = tf.data.TFRecordDataset(filenames=train_filenames)
test_dataset = tf.data.TFRecordDataset(filenames=test_filenames)

def _parse_function(example_proto):
  """Parse one serialized example into its three stored fields.

  Args:
    example_proto: scalar string tensor holding a serialized example.

  Returns:
    A 3-tuple (pat_n, label, tlen): the "train/pat_n" string, the
    "train/label" int64 and the "train/tlen" int64. tlen is the number of
    measurements for the patient. Missing fields fall back to "" / 0 / 0.
  """
  feature_spec = {
      "train/pat_n": tf.FixedLenFeature((), tf.string, default_value=""),
      "train/label": tf.FixedLenFeature((), tf.int64, default_value=0),
      "train/tlen": tf.FixedLenFeature((), tf.int64, default_value=0),
  }
  example = tf.parse_single_example(example_proto, feature_spec)
  pat_n = example["train/pat_n"]
  label = example["train/label"]
  tlen = example["train/tlen"]
  return pat_n, label, tlen

# Stage 1: turn every serialized record into (pat_n, label, tlen) for both splits.
train_dataset, test_dataset = (
    split.map(_parse_function, num_parallel_calls=10)
    for split in (train_dataset, test_dataset)
)

def _parse_function2(patd_str, label, tlen):
    """Decode one patient's raw bytes and build its per-group labels.

    Measurements are later consumed in groups of `n_steps` timesteps (e.g. with
    n_steps = 100, 637 measurements form 7 groups). Every group except the
    last is labelled 0, because the patient is still alive while measurements
    are being taken; the last group carries the patient's label.

    Args:
        patd_str: scalar string tensor holding raw float64 values.
        label: scalar integer tensor, the patient's label.
        tlen: scalar integer tensor, number of measurements for the patient.

    Returns:
        (patd, label, tlen, numgroup, labellist) where patd is a float32
        tensor with 14 columns, label and tlen are int32, numgroup is the
        number of groups before the last one, and labellist is an int32
        vector of length numgroup + 1.
    """
    patd = tf.to_float(tf.decode_raw(patd_str, out_type=tf.float64))
    label = tf.to_int32(label)
    tlen = tf.to_int32(tlen)

    # ceil(tlen / n_steps) - 1, clamped at 0, computed with integer arithmetic
    # so it does not depend on how `/` behaves for integer tensors.
    numgroup = tf.maximum((tlen + n_steps - 1) // n_steps - 1, 0)

    # Zeros for every group before the last, then the real label.
    labellist = tf.concat([tf.zeros([numgroup], tf.int32), [label]], 0)

    # Each timestep is stored as 15 values; column 13 is the time, which is
    # not used here, so keep columns 0-12 and 14.
    patd = tf.reshape(patd, [-1, 15])
    kept_columns = list(range(13)) + [14]
    patd = tf.gather(patd, indices=kept_columns, axis=1)
    return patd, label, tlen, numgroup, labellist

# Padded shapes for (patd, label, tlen, numgroup, labellist). Every patient's
# measurements are zero-padded to 300000 rows, a length chosen to exceed the
# largest number of measurements per patient and to be divisible by n_steps.
# labellist is padded to the longest one in the batch.
batch_padded_shapes = ([300000,14],[],[],[],[None])

# Training: decode, shuffle the order of examples, repeat indefinitely, then
# batch several patients together.
train_dataset = (
    train_dataset
    .map(_parse_function2, num_parallel_calls=10)
    .shuffle(buffer_size=1500)
    .repeat()
    .padded_batch(batch_size, padded_shapes=batch_padded_shapes)
)

# Testing: decode and batch only; a single pass in file order.
test_dataset = (
    test_dataset
    .map(_parse_function2, num_parallel_calls=10)
    .padded_batch(batch_size, padded_shapes=batch_padded_shapes)
)


def _parse_function3(patt, label, tlen, numgroup, labellist):
    """Cut a padded batch into groups of `n_steps` timesteps for the RNN.

    Args:
        patt: padded measurements, indexed (patient, timestep, feature).
        label: per-patient labels (passed through unchanged).
        tlen: per-patient number of measurements.
        numgroup: per-patient group counts (passed through unchanged).
        labellist: per-patient label vectors, indexed (patient, group).

    Returns:
        (patt, label, tlen, numgroup, labellist) where patt is reshaped to
        (number of groups, batch_size, n_steps, n_input) and labellist is
        transposed to (group, patient).
    """
    # Largest number of measurements in the batch, rounded up to a multiple of
    # n_steps with integer arithmetic. Everything after that index should be
    # zeros from the padding, so it is cut off.
    mlen = tf.reduce_max(tlen)
    rmlen = ((mlen + n_steps - 1) // n_steps) * n_steps
    patt = patt[:, 0:rmlen]

    # Split the time axis into groups of n_steps measurements each. The
    # reshape requires the batch to hold exactly batch_size patients.
    patt = tf.reshape(patt, [batch_size, -1, n_steps, n_input])

    # Put the group axis first so that each group can be fed to the RNN in turn.
    patt = tf.transpose(patt, perm=[1, 0, 2, 3])
    labellist = tf.transpose(labellist, perm=[1, 0])

    return patt, label, tlen, numgroup, labellist

# Final stage: split each batch into groups, then prefetch. Dataset
# transformations return new datasets, so prefetch has to be part of the
# pipeline *before* an iterator is created from it; applying it afterwards
# leaves the already-built iterators without any prefetching.
train_dataset = train_dataset.map(_parse_function3, num_parallel_calls=10).prefetch(5)
test_dataset = test_dataset.map(_parse_function3, num_parallel_calls=10).prefetch(5)

# The training stream repeats forever, so a one-shot iterator is enough.
train_iterator = train_dataset.make_one_shot_iterator()
train_next_element = train_iterator.get_next()

# The test stream is re-read from the start at every evaluation, which needs
# an iterator that can be re-initialized.
test_iterator = test_dataset.make_initializable_iterator()
test_next_element = test_iterator.get_next()

In [4]:
# Op that resets the recurrent state variable to its initial (zero) value.
# It is built once here: creating it inside the loops would add a new op to
# the graph on every iteration.
reset_state_op = tf.variables_initializer([state])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    AUC = []
    m = 0
    for i in range(6000):
        print(i)
        train_value = sess.run(train_next_element)
        # Start every patient batch from a zero hidden/cell state.
        sess.run(reset_state_op)

        for j in range(len(train_value[0])):
            # Inputs and labels of the j-th group of this patient batch.
            batch_x, batch_y = train_value[0][j], train_value[4][j]
            feed = {X: batch_x, y: batch_y}

            # The optimizer updates the parameters; state_op stores the final
            # hidden/cell state so the next group continues from it.
            sess.run([optimizer, state_op], feed_dict=feed)

        if i % 300 == 0:
            # To checkpoint at every evaluation, re-enable:
            # save_path = saver.save(sess, "../models/model5" + str(m) + ".ckpt")
            # m += 1

            # Predictions and labels are collected per group and joined once.
            pred_chunks = []
            label_chunks = []
            # Restart the test iterator from the beginning.
            sess.run(test_iterator.initializer)
            while True:
                try:
                    test_value = sess.run(test_next_element)
                    sess.run(reset_state_op)

                    for k in range(len(test_value[0])):
                        # Inputs and labels of the k-th group of this batch.
                        batch_x, batch_y = test_value[0][k], test_value[4][k]
                        feed = {X: batch_x, y: batch_y}

                        # No optimizer here: only predict and carry the state.
                        batch_pred, batch_label, _ = sess.run(
                            [sigmoid_pred, y3, state_op], feed_dict=feed)
                        pred_chunks.append(batch_pred.flatten())
                        label_chunks.append(batch_label.flatten())
                except (tf.errors.OutOfRangeError, tf.errors.InvalidArgumentError):
                    # OutOfRangeError: the test set is exhausted (happens when
                    # its size is a multiple of batch_size).
                    # InvalidArgumentError: the final batch holds fewer than
                    # batch_size patients and cannot be reshaped; it is
                    # skipped, which also ends the evaluation.
                    break

            preds = np.concatenate(pred_chunks) if pred_chunks else np.array([])
            labs = np.concatenate(label_chunks) if label_chunks else np.array([])
            # Area under the ROC curve over all test predictions so far seen
            # in this evaluation; the history of all evaluations is printed.
            AUC.append(roc_auc_score(labs, preds))
            print('Test AUC:' + str(AUC))
            

0


KeyboardInterrupt: 