# Reproduction of Chet baseline model for comparison: CGL model (Lu et al., 2021)

**UIUC, CS598 DL4H, Spring 2023**

**Authors:** Shiyu (Sherry) Li and Wei-Lun (Will) Tsai; {shiyuli2, wltsai2}@illinois.edu

**Original paper:** Chang Lu, Chandan K. Reddy, Prithwish Chakraborty, Samantha
Kleinberg, and Yue Ning. 2021. [Collaborative Graph Learning with Auxiliary Text
for Temporal Event Prediction in Healthcare](https://arxiv.org/pdf/2105.07542.pdf) In 
*Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence, IJCAI-21*,
pages 3529–3535. International Joint Conferences on Artificial Intelligence
Organization. Main Track.

**Original codebase:** [github.com/LuChang-CS/CGL](https://github.com/LuChang-CS/CGL)

We reused the CGL model code from the original repository, with the following modifications:
1. Combined the code from the original modules `layers.py`, `model.py`, `loss.py`, `metrics.py`, `utils.py`, `train_codes.py`, and `train_hf.py` into a single notebook `cgl.ipynb` for ease of sequential execution.
1. Modified the model definition and training code to remove the use of clinical notes in the CGL baseline to ensure a fair baseline comparison, as specified in the Chet paper (Lu et al., 2022).
1. Modified the training code to use the same preprocessed data and train/valid/test splits as the Chet paper.

## 1. Model architecture

### 1.1. Layers

In [1]:
import tensorflow as tf

from keras.layers import Layer
from keras.layers import Dense, BatchNormalization, GRUCell
from keras.layers import Concatenate, Activation, RNN, StackedRNNCells
from keras.initializers import GlorotUniform


class HierarchicalEmbedding(Layer):
    """Code embedding built by concatenating one trainable table per hierarchy level.

    `code_levels` is indexed as `code_levels[:, level]` to pick, for every
    code, its row in the table of that level; the per-level lookups are
    concatenated into the final embedding.
    """

    def __init__(self, code_levels, code_num_in_levels, code_dims, name='hierarchical_embedding'):
        super().__init__(name=name)
        self.level_num = len(code_num_in_levels)
        self.code_levels = code_levels  # (leaf code num * level_num)
        # Tables are created in level order, one (code_num, code_dim) weight per level.
        tables = []
        for level, (code_num, code_dim) in enumerate(zip(code_num_in_levels, code_dims)):
            tables.append(self.add_weight(name='hier_emb_level_%d' % level,
                                          shape=(code_num, code_dim),
                                          initializer=GlorotUniform(),
                                          trainable=True))
        self.level_embeddings = tables

    def call(self, inputs=None):
        """
            return: (code_num, embedding_size)
        """
        per_level = []
        for level in range(self.level_num):
            ids_at_level = self.code_levels[:, level]
            per_level.append(tf.nn.embedding_lookup(self.level_embeddings[level], ids_at_level))
        return Concatenate()(per_level)


class PatientEmbedding(Layer):
    """Holds a single trainable (patient_num, patient_dim) embedding table."""

    def __init__(self, patient_num, patient_dim, name='patient_embedding'):
        super().__init__(name=name)
        table_shape = (patient_num, patient_dim)
        self.patient_embeddings = self.add_weight(
            name='p_emb',
            shape=table_shape,
            initializer=GlorotUniform(),
            trainable=True,
        )

    def call(self, inputs=None):
        """Return the whole embedding table; `inputs` is ignored."""
        return self.patient_embeddings


class GraphConvBlock(Layer):
    """One graph-convolution step: aggregate neighbours, then Dense -> BatchNorm -> ReLU.

    `node_type == 'code'` additionally mixes in `weight_decay @ embedding`,
    so a `weight_decay` matrix is required for code blocks.
    """

    def __init__(self, node_type, dim, adj, name='graph_conv_block'):
        super().__init__(name=name)
        self.node_type = node_type
        self.adj = adj
        self.dense = Dense(dim, activation=None, name=name + '_dense')
        self.activation = Activation('relu', name=name + '_activation')
        self.bn = BatchNormalization(name=name + 'bn')

    def call(self, embedding, embedding_neighbor, weight_decay=None):
        """Return the updated embedding for this node type."""
        # Self term plus the adjacency-weighted neighbour embeddings.
        aggregated = embedding + tf.matmul(self.adj, embedding_neighbor)
        if self.node_type == 'code':
            assert weight_decay is not None
            aggregated += tf.matmul(weight_decay, embedding)
        projected = self.dense(aggregated)
        normalized = self.bn(projected)
        return self.activation(normalized)


def norm_no_nan(x):
    """Divide `x` by its sum over the last axis; all-zero rows stay zero instead of NaN."""
    last_axis_total = tf.reduce_sum(x, axis=-1, keepdims=True)
    return tf.math.divide_no_nan(x, last_axis_total)


class GraphConvolution(Layer):
    """Stacked patient/code graph convolutions over the patient-code and code-code graphs.

    Each round projects code embeddings into the patient block's input size
    (and vice versa) with a bias-carrying Dense layer, then runs one
    `GraphConvBlock` per node type. The patient-code adjacency is normalized
    over its last axis in both directions; the code-code adjacency is kept
    as given and only used to build the learned `weight_decay` matrix.
    """

    def __init__(self, patient_dim, code_dim,
                 patient_code_adj, code_code_adj,
                 patient_hidden_dims, code_hidden_dims, name='graph_convolution'):
        super().__init__(name=name)
        self.patient_code_adj = norm_no_nan(patient_code_adj)  # (patient_num, code_num)
        self.code_patient_adj = norm_no_nan(tf.transpose(patient_code_adj))  # (code_num, patient_num)
        self.code_code_adj = code_code_adj  # (code_num, code_num)

        # One block per hidden size; block i outputs patient_hidden_dims[i] / code_hidden_dims[i].
        self.patient_blocks = [
            GraphConvBlock('patient', dim, self.patient_code_adj, name='patient_graph_block_%d' % layer)
            for layer, dim in enumerate(patient_hidden_dims)]
        self.code_blocks = [GraphConvBlock('code', dim, self.code_patient_adj, name='code_graph_block_%d' % layer)
                            for layer, dim in enumerate(code_hidden_dims)]

        # Input size of each block: the initial embedding size followed by all
        # hidden sizes except the last. The cross-type Dense layers project the
        # neighbour embeddings to these sizes so they can be added to the node's own embedding.
        c2p_dims = ([patient_dim] + patient_hidden_dims)[:-1]
        p2c_dims = ([code_dim] + code_hidden_dims)[:-1]
        self.c2p_denses = [Dense(dim, activation=None, name='code_to_patient_dense_%d' % layer)
                           for layer, dim in enumerate(c2p_dims)]
        self.p2c_denses = [Dense(dim, activation=None, name='patient_to_code_dense_%d' % layer)
                           for layer, dim in enumerate(p2c_dims)]

        # Trainable per-code scale (miu) and offset (theta) used in call() to
        # turn the code-code adjacency into the weight_decay matrix.
        code_num = code_code_adj.shape[0]
        self.miu = self.add_weight(name='miu', shape=(code_num,), trainable=True)
        self.theta = self.add_weight(name='theta', shape=(code_num,), trainable=True)

    def call(self, patient_embeddings, code_embeddings):
        """Return the (patient_embeddings, code_embeddings) pair after all graph rounds."""
        # sigmoid(miu * A + theta), then normalized over the last axis.
        weight_decay = tf.nn.sigmoid(self.miu * self.code_code_adj + self.theta)
        weight_decay = norm_no_nan(weight_decay)
        # weight_decay = None
        # zip() stops at the shortest of the four lists, so the number of joint
        # rounds is bounded by the shorter of the patient/code layer lists.
        for c2p_dense, p2c_dense, patient_block, code_block in zip(self.c2p_denses, self.p2c_denses,
                                                                   self.patient_blocks, self.code_blocks):
            code_embeddings_p = c2p_dense(code_embeddings)
            # Kept in a temporary so the code update below still sees the
            # patient embeddings from before this round.
            patient_embeddings_new = patient_block(patient_embeddings, code_embeddings_p)
            patient_embeddings_c = p2c_dense(patient_embeddings)
            code_embeddings = code_block(code_embeddings, patient_embeddings_c, weight_decay)
            patient_embeddings = patient_embeddings_new
        # One extra code-side update using the LAST patient->code dense and code block.
        # NOTE(review): this matches configs where code_hidden_dims has one more
        # entry than patient_hidden_dims (as in this notebook); with equal-length
        # lists the last layers would be applied a second time — confirm before
        # changing the hidden-dim configuration.
        patient_embeddings_c = self.p2c_denses[-1](patient_embeddings)
        code_embeddings = self.code_blocks[-1](code_embeddings, patient_embeddings_c, weight_decay)
        return patient_embeddings, code_embeddings


class VisitEmbedding(Layer):
    """Turns the codes of each visit into one vector: the mean of its non-padding code embeddings."""

    def __init__(self, max_seq_len, name='visit_embedding'):
        super().__init__(name=name)
        self.max_seq_len = max_seq_len

    def call(self, code_embeddings, visit_codes, visit_lens):
        """
            visit_codes: (batch_size, max_seq_len, max_code_num_in_a_visit)
            return: (batch_size, max_seq_len, code_dim); visits past `visit_lens` are zeroed
        """
        looked_up = tf.nn.embedding_lookup(code_embeddings, visit_codes)  # (batch_size, max_seq_len, max_code_num_in_a_visit, code_dim)
        # Code id 0 is treated as padding: 1.0 for real codes, 0.0 otherwise.
        is_real_code = tf.cast(visit_codes > 0, looked_up.dtype)  # (batch_size, max_seq_len, max_code_num_in_a_visit)
        real_only = looked_up * tf.expand_dims(is_real_code, axis=-1)
        codes_per_visit = tf.reduce_sum(is_real_code, axis=-1, keepdims=True)  # (batch_size, max_seq_len, 1)
        # divide_no_nan keeps visits without any real code at zero.
        visit_means = tf.math.divide_no_nan(tf.reduce_sum(real_only, axis=-2), codes_per_visit)  # (batch_size, max_seq_len, code_dim)
        in_sequence = tf.sequence_mask(visit_lens, self.max_seq_len, dtype=visit_means.dtype)  # (batch_size, max_seq_len)
        return visit_means * tf.expand_dims(in_sequence, axis=-1)


def masked_softmax(inputs, mask):
    """Softmax over the last axis that gives zero weight wherever `mask` is zero."""
    # Shift by the per-row maximum before exponentiating.
    shifted = inputs - tf.reduce_max(inputs, keepdims=True, axis=-1)
    unnormalized = tf.exp(shifted) * mask
    normalizer = tf.reduce_sum(unnormalized, keepdims=True, axis=-1)
    # Fully masked rows divide 0 by 0; divide_no_nan returns 0 for them.
    return tf.math.divide_no_nan(unnormalized, normalizer)


class Attention(Layer):
    """Attention pooling over the sequence axis using a projection `w_omega` and a context vector `u_omega`."""

    def __init__(self, attention_dim, name='attention'):
        super().__init__(name=name)
        self.attention_dim = attention_dim
        self.u_omega = self.add_weight(name=name + '_u', shape=(attention_dim,), initializer=GlorotUniform())
        # Created in build(), once the size of the incoming features is known.
        self.w_omega = None

    def build(self, input_shape):
        feature_size = input_shape[-1]
        self.w_omega = self.add_weight(name=self.name + '_w',
                                       shape=(feature_size, self.attention_dim),
                                       initializer=GlorotUniform())

    def call(self, x, mask=None):
        """
            x: (batch_size, max_seq_len, rnn_dim[-1] / hidden_size)
            return: pooled output (batch_size, hidden_size) and weights (batch_size, max_seq_len)
        """
        projected = tf.matmul(x, self.w_omega)
        scores = tf.tensordot(projected, self.u_omega, axes=1)  # (batch_size, max_seq_len)
        if mask is None:
            alphas = tf.nn.softmax(scores)  # (batch_size, max_seq_len)
        else:
            # Scores are zeroed at masked steps before the masked softmax.
            alphas = masked_softmax(scores * mask, mask)
        weighted = x * tf.expand_dims(alphas, -1)
        pooled = tf.reduce_sum(weighted, axis=-2)  # (batch_size, rnn_dim[-1] / hidden_size)
        return pooled, alphas


class TemporalEmbedding(Layer):
    """Stacked RNN over a sequence followed by masked attention pooling."""

    def __init__(self, rnn_dims, attention_dim, max_seq_len, cell_type=GRUCell, name='code_ra'):
        super().__init__(name=name)
        # One cell per entry of rnn_dims, stacked in the given order.
        stacked_cells = StackedRNNCells([cell_type(units) for units in rnn_dims])
        self.rnn_layers = RNN(stacked_cells, return_sequences=True, name=name + 'rnn')
        self.attention = Attention(attention_dim, name=name + '_attention')
        self.max_seq_len = max_seq_len

    def call(self, embeddings, lens):
        """Return the pooled sequence representation and the attention weights."""
        seq_mask = tf.sequence_mask(lens, self.max_seq_len, dtype=embeddings.dtype)
        step_outputs = self.rnn_layers(embeddings)
        # Zero the RNN outputs of steps beyond each sequence's length.
        step_outputs = step_outputs * tf.expand_dims(seq_mask, axis=-1)  # (batch_size, max_seq_len, rnn_dim[-1])
        pooled, alphas = self.attention(step_outputs, seq_mask)  # (batch_size, rnn_dim[-1])
        return pooled, alphas


def log_no_nan(x):
    """Element-wise log that maps exact zeros to 0 (log(1)) instead of -inf."""
    zero_as_one = tf.cast(x == 0, dtype=x.dtype)
    return tf.math.log(x + zero_as_one)

### 1.2 Model

In [2]:
import numpy as np
import tensorflow as tf
from keras import Model
from keras.layers import Layer, Dense


class CGLFeatureExtractor(Layer):
    """CGL feature pipeline without clinical notes.

    Hierarchical code embeddings and patient embeddings are refined by graph
    convolution; the refined code embeddings are averaged per visit and the
    visit sequence is pooled by an RNN with attention.
    """

    def __init__(self, config, hyper_params, name='cgl_feature'):
        super().__init__(name=name)
        self.config = config
        self.hyper_params = hyper_params
        max_visits = config['max_visit_seq_len']
        patient_dim = hyper_params['patient_dim']

        self.hierarchical_embedding_layer = HierarchicalEmbedding(
            code_levels=config['code_levels'],
            code_num_in_levels=config['code_num_in_levels'],
            code_dims=hyper_params['code_dims'])
        self.patient_embedding_layer = PatientEmbedding(
            patient_num=config['patient_num'],
            patient_dim=patient_dim)
        # The concatenated hierarchical embedding has the summed per-level size.
        self.graph_convolution_layer = GraphConvolution(
            patient_dim=patient_dim,
            code_dim=np.sum(hyper_params['code_dims']),
            patient_code_adj=config['patient_code_adj'],
            code_code_adj=config['code_code_adj'],
            patient_hidden_dims=hyper_params['patient_hidden_dims'],
            code_hidden_dims=hyper_params['code_hidden_dims'])
        self.visit_embedding_layer = VisitEmbedding(max_seq_len=max_visits)
        self.visit_temporal_embedding_layer = TemporalEmbedding(
            rnn_dims=hyper_params['visit_rnn_dims'],
            attention_dim=hyper_params['visit_attention_dim'],
            max_seq_len=max_visits,
            name='visit_temporal')

    def call(self, inputs, training=True):
        """Return the pooled visit representation for the batch in `inputs`.

        `inputs` must provide 'visit_codes' and 'visit_lens'; other keys are ignored.
        """
        visit_codes = inputs['visit_codes']  # (batch_size, max_seq_len, max_code_num_in_a_visit)
        visit_lens = tf.reshape(inputs['visit_lens'], (-1, ))  # (batch_size, )

        code_table = self.hierarchical_embedding_layer(None)
        patient_table = self.patient_embedding_layer(None)
        # Only the refined code embeddings are used downstream.
        patient_table, code_table = self.graph_convolution_layer(
            patient_embeddings=patient_table, code_embeddings=code_table)

        visits_embeddings = self.visit_embedding_layer(
            code_embeddings=code_table,
            visit_codes=visit_codes,
            visit_lens=visit_lens)
        visit_output, alpha_visit = self.visit_temporal_embedding_layer(visits_embeddings, visit_lens)
        return visit_output


class Classifier(Layer):
    """Dropout (rate 0.2) followed by a single Dense output layer."""

    def __init__(self, output_dim, activation=None, name='classifier'):
        super().__init__(name=name)
        self.dense = Dense(output_dim, activation=activation)
        self.dropout = tf.keras.layers.Dropout(0.2)

    def call(self, x):
        """Apply dropout, then the dense projection."""
        return self.dense(self.dropout(x))


class CGL(Model):
    """CGL model: feature extractor followed by a classifier head.

    The head size and activation come from config['output_dim'] and
    config['activation'].
    """

    def __init__(self, config, hyper_params, name='cgl'):
        super().__init__(name=name)
        self.cgl_feature_extractor = CGLFeatureExtractor(config, hyper_params)
        self.classifier = Classifier(config['output_dim'], activation=config['activation'])

    def call(self, inputs, training=True):
        """Return the classifier output for a batch of inputs."""
        features = self.cgl_feature_extractor(inputs, training=training)
        return self.classifier(features)

## 2. Loss function and evaluation metrics

In [3]:
import tensorflow as tf
from keras.callbacks import Callback
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score


def medical_codes_loss(y_true, y_pred):
    """Multi-label loss: sigmoid cross-entropy summed over axis 1, averaged over the batch.

    `y_pred` is passed as logits.
    """
    per_code = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=y_pred)
    per_sample = tf.reduce_sum(per_code, axis=1)
    return tf.reduce_mean(per_sample)

def f1(y_true_hot, y_pred, metrics='weighted'):
    """F1 score where each sample predicts as many codes as it truly has.

    `y_pred[i]` is a ranking of code indices; its first `n` entries are taken
    as the prediction, with `n` the number of ones in `y_true_hot[i]`.
    `metrics` is forwarded to sklearn's `f1_score` as `average`.
    """
    predicted_hot = np.zeros_like(y_true_hot)
    for i, true_row in enumerate(y_true_hot):
        n_true = np.sum(true_row == 1)
        top_ranked = y_pred[i][:n_true]
        predicted_hot[i][top_ranked] = 1
    return f1_score(y_true=y_true_hot, y_pred=predicted_hot, average=metrics)


def top_k_prec_recall(y_true_hot, y_pred, ks):
    """Mean precision@k and recall@k over all samples, for every k in `ks`.

    Args:
        y_true_hot: multi-hot ground truth, one row per sample.
        y_pred: per-sample ranking of code indices, best first.
        ks: cut-offs to evaluate.

    Returns:
        (precision, recall): two arrays of length len(ks).

    A sample without any positive label contributes 0 to recall instead of
    raising ZeroDivisionError; it still counts in the averaging denominator.
    """
    precision_sum = np.zeros((len(ks), ))
    recall_sum = np.zeros((len(ks), ))
    for pred, true_hot in zip(y_pred, y_true_hot):
        true_codes = set(np.where(true_hot == 1)[0].tolist())
        for i, k in enumerate(ks):
            hits = len(true_codes.intersection(pred[:k]))
            precision_sum[i] += hits / k
            if true_codes:  # recall is undefined without positives; count it as 0
                recall_sum[i] += hits / len(true_codes)
    sample_num = len(y_true_hot)
    return precision_sum / sample_num, recall_sum / sample_num


def calculate_occurred(historical, y, preds, ks):
    """Split recall@k into codes that already occurred in the history and codes that did not.

    For every k, the top-k predictions are turned into a multi-hot matrix and
    intersected with `y`; hits are divided by each sample's number of true
    codes and averaged over samples.

    Returns:
        (r1, r2): recall over previously occurred codes and over new codes,
        one value per k.
    """
    occurred_recall = np.zeros((len(ks), ))
    new_recall = np.zeros((len(ks),))
    true_counts = np.sum(y, axis=-1)
    not_historical = np.logical_not(historical)
    for slot, k in enumerate(ks):
        # Multi-hot encoding of each sample's k best-ranked codes.
        top_k_hot = np.zeros_like(y)
        for row_id, row in enumerate(top_k_hot):
            row[preds[row_id][:k]] = 1
        hit_occurred = np.logical_and(np.logical_and(historical, top_k_hot), y)
        hit_new = np.logical_and(np.logical_and(not_historical, top_k_hot), y)
        occurred_recall[slot] = np.mean(np.sum(hit_occurred, axis=-1) / true_counts)
        new_recall[slot] = np.mean(np.sum(hit_new, axis=-1) / true_counts)
    return occurred_recall, new_recall


class EvaluateCodesCallBack(Callback):
    """Prints diagnosis-prediction metrics on a held-out set after every epoch.

    Args:
        data_gen: indexable batch source; `data_gen[i]` must start with
            (visit_codes, visit_lens). Extra items are ignored, so both
            two-input generators and the original four-input (notes-aware)
            generators work.
        y: multi-hot ground truth aligned with the generator's sample order.
        historical: optional multi-hot matrix of codes seen in each patient's
            history; when given, recall is also split into occurred / not
            occurred codes.
    """

    def __init__(self, data_gen, y, historical=None):
        super().__init__()
        self.data_gen = data_gen
        self.y = y
        self.historical = historical

    def on_epoch_end(self, epoch, logs=None):
        ks = [10, 20, 30, 40]
        rankings = []
        for i in range(len(self.data_gen)):
            # Clinical notes are not part of this model, so only the first two
            # batch items are read; unpacking four values would fail for the
            # two-input generators used in this notebook.
            batch_codes_x, batch_visit_lens = self.data_gen[i][:2]
            output = self.model(inputs={
                'visit_codes': batch_codes_x,
                'visit_lens': batch_visit_lens
            }, training=False)
            probs = tf.math.sigmoid(output)
            # Code indices sorted from most to least likely.
            ranked = tf.argsort(probs, axis=-1, direction='DESCENDING')
            rankings.append(ranked.numpy())
        preds = np.vstack(rankings)
        f1_value = f1(self.y, preds)
        prec, recall = top_k_prec_recall(self.y, preds, ks=ks)
        if self.historical is not None:
            r1, r2 = calculate_occurred(self.historical, self.y, preds, ks=ks)
            print('\t', 'f1_score:', f1_value, '\t', 'top_k_recall:', recall, '\t', 'occurred:', r1, '\t', 'not occurred:', r2)
        else:
            print('\t', 'f1_score:', f1_value, '\t', 'top_k_recall:', recall)


class EvaluateHFCallBack(Callback):
    """Prints AUC and F1 for the heart-failure task on a held-out set after every epoch.

    Args:
        data_gen: indexable batch source; `data_gen[i]` must start with
            (visit_codes, visit_lens). Extra items are ignored.
        y: binary ground truth aligned with the generator's sample order.

    The model output is thresholded at 0.5, i.e. it is expected to already be
    a probability (the heart-failure config in this file uses a sigmoid head).
    """

    def __init__(self, data_gen, y):
        super().__init__()
        self.data_gen = data_gen
        self.y = y

    def on_epoch_end(self, epoch, logs=None):
        preds, outputs = [], []
        for i in range(len(self.data_gen)):
            # Only codes and lengths are model inputs here; unpacking four
            # values would fail for the two-input generators in this notebook.
            batch_codes_x, batch_visit_lens = self.data_gen[i][:2]
            output = self.model(inputs={
                'visit_codes': batch_codes_x,
                'visit_lens': batch_visit_lens
            }, training=False)
            # Flatten with reshape rather than squeeze: squeeze turns a final
            # batch of size 1 into a scalar, which np.concatenate rejects.
            scores = tf.reshape(output, (-1,))
            outputs.append(scores.numpy())
            preds.append(tf.cast(scores > 0.5, tf.int32).numpy())
        outputs = np.concatenate(outputs)
        preds = np.concatenate(preds)
        auc = roc_auc_score(self.y, outputs)
        f1_score_ = f1_score(self.y, preds)
        print('\t', 'auc:', auc, '\t', 'f1_score:', f1_score_)

## 3. Data generation and loading

### 3.1. Generate data unique to CGL

We need additional functions to generate data for the CGL model that was
neither used nor generated in Chet.

In [10]:
import numpy as np

class DataGenerator:
    """Serves aligned mini-batches from several equally long arrays.

    Args:
        inputs: non-empty list of indexable arrays sharing their first dimension.
        shuffle: when True, samples are served in a random order that is
            re-drawn by every `on_epoch_end()` call (using `np.random`).
        batch_size: number of samples per batch; the last batch may be smaller.
    """

    def __init__(self, inputs, shuffle=True, batch_size=32):
        assert len(inputs) > 0
        self.inputs = inputs
        self.idx = np.arange(len(inputs[0]))
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.on_epoch_end()

    def data_length(self):
        """Number of samples."""
        return len(self.idx)

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        n = self.data_length()
        full_batches, remainder = divmod(n, self.batch_size)
        return full_batches + 1 if remainder else full_batches

    def __getitem__(self, index):
        """Return batch `index` as a list with one slice per input array."""
        start = index * self.batch_size
        end = start + self.batch_size
        if not self.shuffle:
            # Order is the identity, so plain slicing is enough.
            return [x[start:end] for x in self.inputs]
        # Previously the shuffled order was computed and then ignored, so
        # shuffle=True served the data unshuffled. Select through the
        # permutation instead.
        batch_idx = self.idx[start:end]
        return [np.asarray(x)[batch_idx] for x in self.inputs]

    def on_epoch_end(self):
        """Draw a new sample order when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.idx)

    def set_batch_size(self, batch_size):
        self.batch_size = batch_size

def generate_patient_code_adjacent(code_x, code_num):
    """Build the binary (patient, code) adjacency matrix.

    Entry [i, c] is 1 when code id `c` (> 0; 0 is skipped as padding) appears
    anywhere in `code_x[i]`.
    """
    print('generating patient code adjacent matrix ...')
    adjacency = np.zeros((len(code_x), code_num), dtype=int)
    for patient_row, patient_codes in zip(adjacency, code_x):
        present = patient_codes[patient_codes > 0]
        patient_row[present] = 1
    return adjacency

### 3.2. Loading and generating data

Load the same data used in training Chet and generate the additional data needed
for CGL.

In [11]:
import os
import pickle

def load_sparse(path):
    """Load a sparse matrix stored as an .npz archive and return it as a dense array.

    The archive must contain:
        idx: coordinates of the stored entries, one row per matrix dimension.
        values: the stored entries, in the same order as the columns of `idx`.
        shape: the shape of the dense matrix.

    Returns:
        Dense ndarray with dtype of `values`; entries not listed are zero.
    """
    # Context manager closes the underlying file; np.load keeps it open otherwise.
    with np.load(path) as data:
        idx, values, shape = data['idx'], data['values'], data['shape']
    mat = np.zeros(shape, dtype=values.dtype)
    mat[tuple(idx)] = values
    return mat

def _load_npz_key(path, key):
    """Read one array from an .npz archive and close the file afterwards."""
    with np.load(path) as archive:
        return archive[key]


all_data = {}
datasets = ['mimic3', 'mimic4']

# For every dataset, load the preprocessed train/valid/test splits from
# data/<dataset>/standard plus the code vocabulary and hierarchy, then derive
# the patient-code adjacency matrix that only CGL needs.
for dataset in datasets:
    data_path = os.path.join('data', dataset)
    parsed_path = os.path.join(data_path, 'parsed')
    encoded_path = os.path.join(data_path, 'encoded')
    standard_path = os.path.join(data_path, 'standard')
    train_path = os.path.join(standard_path, 'train')
    valid_path = os.path.join(standard_path, 'valid')
    test_path = os.path.join(standard_path, 'test')

    train_codes_x = load_sparse(os.path.join(train_path, 'code_x_cgl.npz'))
    train_codes_y = load_sparse(os.path.join(train_path, 'code_y.npz'))
    train_hf_y = _load_npz_key(os.path.join(train_path, 'hf_y.npz'), 'hf_y')
    train_visit_lens = _load_npz_key(os.path.join(train_path, 'visit_lens.npz'), 'lens')
    valid_codes_x = load_sparse(os.path.join(valid_path, 'code_x_cgl.npz'))
    valid_codes_y = load_sparse(os.path.join(valid_path, 'code_y.npz'))
    valid_hf_y = _load_npz_key(os.path.join(valid_path, 'hf_y.npz'), 'hf_y')
    valid_visit_lens = _load_npz_key(os.path.join(valid_path, 'visit_lens.npz'), 'lens')
    test_codes_x = load_sparse(os.path.join(test_path, 'code_x_cgl.npz'))
    test_codes_y = load_sparse(os.path.join(test_path, 'code_y.npz'))
    test_hf_y = _load_npz_key(os.path.join(test_path, 'hf_y.npz'), 'hf_y')
    test_visit_lens = _load_npz_key(os.path.join(test_path, 'visit_lens.npz'), 'lens')
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project's own preprocessing. Files are opened with `with` so the
    # handles are closed instead of being left to the garbage collector.
    with open(os.path.join(encoded_path, 'code_map.pkl'), 'rb') as code_map_file:
        code_map = pickle.load(code_map_file)
    with open(os.path.join(parsed_path, 'code_levels.pkl'), 'rb') as code_levels_file:
        code_levels = pickle.load(code_levels_file)['code_levels']
    code_code_adj = load_sparse(os.path.join(standard_path, 'code_adj.npz'))
    code_num = len(code_map)
    # The patient-code graph is built from the training patients only.
    patient_code_adj = generate_patient_code_adjacent(train_codes_x, code_num)

    all_data[dataset] = {
        'train_codes_x': train_codes_x,
        'train_codes_y': train_codes_y,
        'train_hf_y': train_hf_y,
        'train_visit_lens': train_visit_lens,
        'valid_codes_x': valid_codes_x,
        'valid_codes_y': valid_codes_y,
        'valid_hf_y': valid_hf_y,
        'valid_visit_lens': valid_visit_lens,
        'test_codes_x': test_codes_x,
        'test_codes_y': test_codes_y,
        'test_hf_y': test_hf_y,
        'test_visit_lens': test_visit_lens,
        'code_map': code_map,
        'code_num': code_num,
        'patient_code_adj': patient_code_adj,
        'code_levels': code_levels,
        'code_code_adj': code_code_adj
    }

print("*** data loaded ***")

generating patient code adjacent matrix ...
generating patient code adjacent matrix ...
*** data loaded ***


## 4. Training and evaluation

We train the CGL model on the same train/valid datasets as the one we used to
train Chet, with the same seed and epochs.

In [12]:
import random
import pickle as pickle

import tensorflow as tf
from keras.callbacks import LearningRateScheduler
import numpy as np


# Mirror our Chet experiment: same epoch budget, and the same seed applied to
# Python's, NumPy's and TensorFlow's random number generators.
num_epochs = 20
seed = 6669
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(seed)

### 4.1. Train and evaluate the model for diagnosis prediction task

In [17]:
def historical_hot(code_x, code_num):
    """Multi-hot encode which codes appear anywhere in each patient's history.

    Args:
        code_x: per-patient code ids (any nesting of visits/codes); id 0 is
            padding and real ids start at 1.
        code_num: number of columns of the result.

    Returns:
        (len(code_x), code_num) int array; column `c - 1` is 1 when code id
        `c` occurs for that patient.

    Padding is skipped explicitly: without the filter, id 0 maps to index -1
    and wrongly marks the last code as present for every padded patient.
    """
    result = np.zeros((len(code_x), code_num), dtype=int)
    for i, x in enumerate(code_x):
        codes = np.asarray(x).reshape(-1)
        codes = codes[codes > 0]
        if codes.size:
            result[i, codes - 1] = 1
    return result

# Train one CGL model per dataset on the diagnosis prediction task and print
# test-set metrics after every epoch through EvaluateCodesCallBack.
for dataset in datasets:
    print("\n******************Training diagnosis prediction task on {}******************\n".format(dataset))
    ds = all_data[dataset]
    train_x, train_lens = ds['train_codes_x'], ds['train_visit_lens']

    # Static model configuration derived from the loaded data.
    config = {
        'patient_code_adj': tf.constant(ds['patient_code_adj'], dtype=tf.float32),
        'code_code_adj': tf.constant(ds['code_code_adj'], dtype=tf.float32),
        'code_levels': tf.constant(ds['code_levels'], dtype=tf.int32),
        # Largest id per hierarchy level plus one gives each level's table size.
        'code_num_in_levels': np.max(ds['code_levels'], axis=0) + 1,
        'patient_num': train_x.shape[0],
        'max_visit_seq_len': train_x.shape[1],
        'output_dim': len(ds['code_map']),
        'lambda': 0.3,
        # No output activation: medical_codes_loss consumes logits.
        'activation': None
    }

    test_historical = historical_hot(ds['test_codes_x'], config['output_dim'])

    visit_rnn_dims = [200]
    hyper_params = {
        'code_dims': [32, 32, 32, 32],
        'patient_dim': 16,
        'word_dim': 16,
        'patient_hidden_dims': [32],
        'code_hidden_dims': [64, 128],
        'visit_rnn_dims': visit_rnn_dims,
        'visit_attention_dim': 32,
    }

    test_codes_gen = DataGenerator([ds['test_codes_x'], ds['test_visit_lens']], shuffle=False)

    def lr_schedule_fn(epoch, lr):
        """Step-wise learning rate; the incoming `lr` is ignored."""
        if epoch < 20:
            return 0.01
        if epoch < 100:
            return 0.001
        if epoch < 200:
            return 0.0001
        return 0.00001

    lr_scheduler = LearningRateScheduler(lr_schedule_fn)
    test_callback = EvaluateCodesCallBack(test_codes_gen, ds['test_codes_y'], historical=test_historical)

    cgl_model = CGL(config, hyper_params)
    cgl_model.compile(optimizer='adam', loss=medical_codes_loss)

    train_inputs = {
        'visit_codes': train_x,
        'visit_lens': train_lens,
    }
    train_labels = ds['train_codes_y'].astype(float)
    valid_inputs = {
        'visit_codes': ds['valid_codes_x'],
        'visit_lens': ds['valid_visit_lens'],
    }
    valid_labels = ds['valid_codes_y'].astype(float)
    cgl_model.fit(x=train_inputs, y=train_labels,
                  validation_data=(valid_inputs, valid_labels),
                  epochs=num_epochs, batch_size=32,
                  callbacks=[lr_scheduler, test_callback])
    cgl_model.summary()


******************Training diagnosis prediction task on mimic3******************

Epoch 1/20
 17/188 [=>............................] - ETA: 7:03 - loss: 519.3796

KeyboardInterrupt: 

### 4.2. Train and evaluate the model for heart failure prediction task

In [18]:
# Train one CGL model per dataset on the heart failure prediction task and
# print test-set AUC/F1 after every epoch through EvaluateHFCallBack.
# NOTE(review): the recorded run of this cell stopped in fit() with
# "Data cardinality is ambiguous" (x sizes 6000, y sizes 1000), i.e.
# train_hf_y did not have one label per training patient — verify the
# train/hf_y.npz file before relying on this cell.
for dataset in datasets:
    print("\n******************Training heart failure prediction task on {}******************\n".format(dataset))
    ds = all_data[dataset]
    train_x, train_lens = ds['train_codes_x'], ds['train_visit_lens']

    # Static model configuration derived from the loaded data.
    config = {
        'patient_code_adj': tf.constant(ds['patient_code_adj'], dtype=tf.float32),
        'code_code_adj': tf.constant(ds['code_code_adj'], dtype=tf.float32),
        'code_levels': tf.constant(ds['code_levels'], dtype=tf.int32),
        # Largest id per hierarchy level plus one gives each level's table size.
        'code_num_in_levels': np.max(ds['code_levels'], axis=0) + 1,
        'patient_num': train_x.shape[0],
        'max_visit_seq_len': train_x.shape[1],
        # Single sigmoid output: the model emits a probability directly.
        'output_dim': 1,
        'lambda': 0.1,
        'activation': 'sigmoid'
    }

    visit_rnn_dims = [200]
    hyper_params = {
        'code_dims': [32, 32, 32, 32],
        'patient_dim': 16,
        'word_dim': 16,
        'patient_hidden_dims': [32],
        'code_hidden_dims': [64, 128],
        'visit_rnn_dims': visit_rnn_dims,
        'visit_attention_dim': 32,
        'note_attention_dim': visit_rnn_dims[-1]
    }

    test_codes_gen = DataGenerator([ds['test_codes_x'], ds['test_visit_lens']], shuffle=False)

    def lr_schedule_fn(epoch, lr):
        """Step-wise learning rate; the incoming `lr` is ignored."""
        if epoch < 8:
            return 0.1
        if epoch < 20:
            return 0.01
        if epoch < 50:
            return 0.001
        return 0.0001

    lr_scheduler = LearningRateScheduler(lr_schedule_fn)
    test_callback = EvaluateHFCallBack(test_codes_gen, ds['test_hf_y'])

    cgl_model = CGL(config, hyper_params)
    cgl_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[tf.metrics.AUC()])

    train_inputs = {
        'visit_codes': train_x,
        'visit_lens': train_lens,
    }
    train_labels = ds['train_hf_y'].astype(float)
    valid_inputs = {
        'visit_codes': ds['valid_codes_x'],
        'visit_lens': ds['valid_visit_lens'],
    }
    valid_labels = ds['valid_hf_y'].astype(float)
    cgl_model.fit(x=train_inputs, y=train_labels,
                  validation_data=(valid_inputs, valid_labels),
                  epochs=num_epochs, batch_size=32,
                  callbacks=[lr_scheduler, test_callback])
    cgl_model.summary()


******************Training heart failure prediction task on mimic3******************



ValueError: Data cardinality is ambiguous:
  x sizes: 6000, 6000
  y sizes: 1000
Make sure all arrays contain the same number of samples.