In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the pickles read
# by the later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os
import numpy as np
#from gensim.models import Word2Vec, FastText
#import glove
#from glove import Corpus

import collections
import gc 

import keras
from keras import backend as K
from keras import regularizers
from keras.models import Sequential, Model
from keras.layers import Flatten, Dense, Dropout, Input, concatenate, merge, Activation, Concatenate, LSTM, GRU
from keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, Conv1D, BatchNormalization, GRU, Convolution1D, LSTM
from keras.layers import UpSampling1D, MaxPooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D,MaxPool1D, merge

#from keras.optimizers import Adam

from keras.callbacks import EarlyStopping, ModelCheckpoint, History, ReduceLROnPlateau
from keras.utils import np_utils
#from keras.backend.tensorflow_backend import set_session, clear_session, get_session
from keras.backend import set_session, clear_session, get_session
import tensorflow as tf


from sklearn.utils import class_weight
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import warnings
# Suppress everything raised through the `warnings` module from this point on.
# NOTE(review): this is a blanket filter, so genuine problems reported as
# warnings will not be shown either.
warnings.filterwarnings('ignore')

In [None]:
# Reset Keras Session
def reset_keras(model):
    """Tear down the current Keras backend session and run the garbage collector.

    The active backend session is fetched, the Keras global state is cleared,
    the fetched session is closed, and a session is requested from the backend
    again. Afterwards this function's reference to ``model`` is dropped and a
    full garbage collection is triggered.

    Args:
        model: The model that is no longer needed. Only the reference held by
            this function is released; the caller must drop (or rebind) its
            own variable for the object to become collectable.

    Returns:
        int: The value returned by ``gc.collect()`` (number of unreachable
        objects found). Earlier versions returned ``None``; callers that
        ignore the return value are unaffected.
    """
    sess = get_session()
    clear_session()
    sess.close()
    # Request a session again after closing the old one (the result itself is
    # not needed here, only the call).
    get_session()

    # `model` is a bound parameter, so `del` cannot fail and needs no
    # try/except. It removes only this local name, not the caller's variable.
    del model

    return gc.collect()

In [None]:
def create_dataset(dict_of_ner):
    """Average the vectors stored under each key into one row per key.

    Keys are processed in ascending sorted order, so row ``i`` of the result
    corresponds to the ``i``-th smallest key of ``dict_of_ner``.

    Args:
        dict_of_ner: Mapping from a sortable key to an iterable of
            equally-shaped vectors.

    Returns:
        numpy.ndarray: One element-wise mean (``axis=0``) per key, stacked in
        sorted-key order.
    """
    averaged_rows = [
        np.mean(list(dict_of_ner[key]), axis=0)
        for key in sorted(dict_of_ner)
    ]
    return np.asarray(averaged_rows)

def make_prediction_multi_avg(model, test_data):
    """Run ``model.predict`` and binarise its scores at 0.5.

    Args:
        model: Any object exposing ``predict(test_data)``.
        test_data: Passed through to ``model.predict`` unchanged.

    Returns:
        tuple: ``(probs, y_pred)`` where ``probs`` is exactly what
        ``model.predict`` returned and ``y_pred`` is a list holding 1 for
        every score that is >= 0.5 and 0 otherwise.
    """
    threshold = 0.5
    probs = model.predict(test_data)

    y_pred = []
    for score in probs:
        if score >= threshold:
            y_pred.append(1)
        else:
            y_pred.append(0)

    return probs, y_pred

def save_scores_multi_avg(predictions, probs, ground_truth,
                          embed_name, problem_type, iteration, hidden_unit_size,
                          sequence_name, type_of_ner, result_path=None):
    """Compute AUC, AUPRC, accuracy and F1, pickle them, and print them.

    Args:
        predictions: Hard 0/1 predictions, used for accuracy and F1.
        probs: Predicted scores, used for ROC AUC and average precision.
        ground_truth: True labels.
        embed_name: Embedding identifier, part of the output file name.
        problem_type: Target problem identifier, part of the output file name.
        iteration: Run number, part of the output file name.
        hidden_unit_size: Recurrent unit count, part of the output file name.
        sequence_name: Recurrent layer identifier, part of the output file name.
        type_of_ner: NER variant identifier, part of the output file name.
        result_path: Directory that receives the pickle. Defaults to the
            notebook's results folder on the mounted Drive. The directory is
            created when it does not exist yet.

    Returns:
        None. The scores are written to
        ``<sequence>-<units>-<embed>-<problem>-<iteration>-<ner>-avg-.p`` inside
        ``result_path`` as a dict with keys ``auc``, ``auprc``, ``acc``, ``F1``
        and are printed in that order.
    """
    if result_path is None:
        result_path = "/content/drive/MyDrive/Colab Notebooks/ConvolutionMedicalNer-master/results/"

    auc = roc_auc_score(ground_truth, probs)
    auprc = average_precision_score(ground_truth, probs)
    acc = accuracy_score(ground_truth, predictions)
    F1 = f1_score(ground_truth, predictions)

    result_dict = {'auc': auc, 'auprc': auprc, 'acc': acc, 'F1': F1}

    # Every name component is formatted, so non-string identifiers no longer
    # raise a TypeError during concatenation.
    file_name = (f"{sequence_name}-{hidden_unit_size}-{embed_name}"
                 f"-{problem_type}-{iteration}-{type_of_ner}-avg-.p")

    # Without this, to_pickle fails when the results folder is missing.
    os.makedirs(result_path, exist_ok=True)
    pd.to_pickle(result_dict, os.path.join(result_path, file_name))

    print(auc, auprc, acc, F1)
    
def avg_ner_model(layer_name, number_of_unit, embedding_name, avg_input_dim=None):
    """Build and compile the two-input binary classifier.

    The network has a sequence input of shape ``(24, 104)`` that goes through
    one recurrent layer, and a flat input named ``"avg"`` that is concatenated
    with the recurrent output. The merged vector passes through
    ``Dense(256, relu)`` and ``Dropout(0.2)`` and ends in a single sigmoid unit
    (no bias, Glorot-normal initialiser, L2(0.01) kernel regulariser). The
    model is compiled with binary cross-entropy, the ``"adam"`` optimizer and
    the ``acc`` metric.

    Args:
        layer_name: ``"GRU"`` or ``"LSTM"``; selects the recurrent layer.
        number_of_unit: Number of units of the recurrent layer.
        embedding_name: Name of the embedding in use. It does not influence the
            architecture: the previous code computed 200 for ``"concat"`` but
            then unconditionally overwrote the value with 100, so every
            embedding got a 100-wide ``"avg"`` input. That effective behaviour
            is kept as the default.
        avg_input_dim: Width of the ``"avg"`` input. ``None`` (default) means
            100. Pass the real vector width explicitly when feeding vectors of
            another size.

    Returns:
        The compiled Keras ``Model`` taking ``[sequence_input, avg_input]``.

    Raises:
        ValueError: If ``layer_name`` is neither ``"GRU"`` nor ``"LSTM"``
            (previously this surfaced later as an unbound-variable error).
    """
    if layer_name not in ("GRU", "LSTM"):
        raise ValueError(
            "layer_name must be 'GRU' or 'LSTM', got %r" % (layer_name,))

    if avg_input_dim is None:
        avg_input_dim = 100

    sequence_input = Input(shape=(24, 104))
    input_avg = Input(shape=(avg_input_dim,), name="avg")

    recurrent_cls = GRU if layer_name == "GRU" else LSTM
    x = recurrent_cls(number_of_unit)(sequence_input)

    # Merge the recurrent summary with the averaged embedding vector.
    x = keras.layers.Concatenate()([x, input_avg])

    x = Dense(256, activation='relu')(x)
    x = Dropout(0.2)(x)

    logits_regularizer = tf.keras.regularizers.L2(0.01)
    preds = Dense(1, activation='sigmoid', use_bias=False,
                  kernel_initializer=tf.keras.initializers.glorot_normal(),
                  kernel_regularizer=logits_regularizer)(x)

    model = Model(inputs=[sequence_input, input_avg], outputs=preds)
    model.compile(loss='binary_crossentropy',
                  optimizer="adam",
                  metrics=['acc'])

    return model

In [None]:
# Read the lvl2_imputer_* tables and the Ys* label tables for each split from
# the data folder on the mounted Drive.
data_dir = "/content/drive/MyDrive/Colab Notebooks/ConvolutionMedicalNer-master/data/"

lvl2_train = pd.read_pickle(data_dir + "lvl2_imputer_train_los.pkl")
lvl2_dev = pd.read_pickle(data_dir + "lvl2_imputer_dev_los.pkl")
lvl2_test = pd.read_pickle(data_dir + "lvl2_imputer_test_los.pkl")

Ys = pd.read_pickle(data_dir + "Ys_los.pkl")
Ys_train = pd.read_pickle(data_dir + "Ys_train_los.pkl")
Ys_dev = pd.read_pickle(data_dir + "Ys_dev_los.pkl")
Ys_test = pd.read_pickle(data_dir + "Ys_test_los.pkl")

In [None]:
# Collect the first component of every index entry of each label table.
all_train_ids = {row_index[0] for row_index in Ys_train.index}
all_dev_ids = {row_index[0] for row_index in Ys_dev.index}
all_test_ids = {row_index[0] for row_index in Ys_test.index}

# Print the positive-label fraction per target for train, dev and test,
# with a "====" line between consecutive targets.
label_splits = (Ys_train, Ys_dev, Ys_test)
for position, label_name in enumerate(("mort_icu", "mort_hosp", "los_3", "los_7")):
    if position > 0:
        print("====")
    for split_labels in label_splits:
        column_values = split_labels[label_name].values
        print(sum(column_values) * 1.0 / len(column_values))

0.07159904534606205
0.06725146198830409
0.07432150313152401
====
0.10680190930787589
0.09649122807017543
0.10855949895615867
====
0.43323389021479713
0.42105263157894735
0.4246346555323591
====
0.07696897374701671
0.07268170426065163
0.07954070981210856


In [None]:
type_of_ner = "new"

data_dir = "/content/drive/MyDrive/Colab Notebooks/ConvolutionMedicalNer-master/data/"

# Sequence inputs and labels for each split; file names are prefixed with
# type_of_ner.
x_train_lstm = pd.read_pickle(f"{data_dir}{type_of_ner}_x_train_los.pkl")
x_dev_lstm = pd.read_pickle(f"{data_dir}{type_of_ner}_x_dev_los.pkl")
x_test_lstm = pd.read_pickle(f"{data_dir}{type_of_ner}_x_test_los.pkl")

y_train = pd.read_pickle(f"{data_dir}{type_of_ner}_y_train_los.pkl")
y_dev = pd.read_pickle(f"{data_dir}{type_of_ner}_y_dev_los.pkl")
y_test = pd.read_pickle(f"{data_dir}{type_of_ner}_y_test_los.pkl")

# Embedding dictionaries (these file names are fixed to the "new_" prefix).
ner_word2vec = pd.read_pickle(data_dir + "new_ner_word2vec_limited_dict_los.pkl")
ner_fasttext = pd.read_pickle(data_dir + "new_ner_fasttext_limited_dict_los.pkl")
ner_concat = pd.read_pickle(data_dir + "new_ner_combined_limited_dict_los.pkl")

# Keep only the ids of each split that also have a word2vec entry, in sorted
# order. (An earlier variant loaded <type_of_ner>_train_ids.pkl,
# <type_of_ner>_dev_ids.pkl and <type_of_ner>_test_ids.pkl instead.)
new_keys = set(ner_word2vec)
train_ids = sorted(all_train_ids & new_keys)
dev_ids = sorted(all_dev_ids & new_keys)
test_ids = sorted(all_test_ids & new_keys)

print("train_ids = ", train_ids)
print("dev_ids = ", dev_ids)
print("test_ids = ", test_ids)

train_ids =  [3, 9, 12, 13, 17, 19, 21, 25, 30, 31, 35, 41, 45, 52, 55, 56, 61, 62, 64, 65, 68, 71, 78, 81, 83, 85, 88, 97, 99, 100, 101, 103, 105, 106, 114, 115, 123, 124, 127, 129, 130, 133, 134, 135, 137, 140, 141, 143, 144, 146, 147, 149, 152, 160, 163, 169, 170, 171, 173, 174, 175, 177, 178, 186, 191, 195, 201, 202, 205, 208, 209, 211, 212, 214, 222, 225, 226, 228, 234, 238, 242, 245, 249, 251, 253, 255, 256, 261, 262, 265, 267, 268, 269, 270, 271, 272, 273, 274, 275, 279, 281, 287, 290, 298, 301, 302, 305, 306, 307, 309, 310, 313, 314, 315, 318, 319, 321, 323, 326, 329, 330, 335, 344, 345, 346, 347, 350, 351, 353, 354, 356, 360, 364, 366, 367, 370, 371, 379, 383, 389, 391, 394, 395, 396, 397, 400, 402, 406, 407, 409, 417, 418, 420, 421, 422, 424, 429, 434, 436, 437, 438, 439, 440, 445, 448, 452, 453, 458, 462, 464, 466, 468, 471, 472, 477, 481, 482, 485, 487, 489, 491, 492, 498, 503, 507, 510, 518, 523, 533, 536, 538, 539, 540, 543, 545, 554, 557, 558, 559, 561, 564, 569, 577, 58

In [10]:
# Experiment grid. Alternative settings the author had left commented out:
#   embedding_types = ['concat'], embedding_dict = [ner_concat],
#   target_problems = ['mort_hosp'], num_epoch = 2, iter_num = 2,
#   unit_sizes = [256] or [128], layers = ["LSTM", "GRU"]
embedding_types = ['word2vec', 'fasttext', 'concat']
embedding_dict = [ner_word2vec, ner_fasttext, ner_concat]
target_problems = ['mort_hosp', 'mort_icu', 'los_3', 'los_5', 'los_7']

num_epoch = 100
model_patience = 3
monitor_criteria = 'val_loss'
batch_size = 64
iter_num = 11  # range(1, iter_num) below yields iterations 1..10
unit_sizes = [128, 256]
layers = ["GRU"]

for each_layer in layers:
    print ("Layer: ", each_layer)
    for each_unit_size in unit_sizes:
        print ("Hidden unit: ", each_unit_size)

        for embed_dict, embed_name in zip(embedding_dict, embedding_types):
            print ("Embedding: ", embed_name)
            print("=============================")

            # NOTE(review): the averaged features below are always taken from
            # ner_word2vec; embed_dict (the dictionary paired with embed_name)
            # is never read. The 'fasttext' and 'concat' runs therefore train
            # on word2vec vectors while their checkpoints and scores are saved
            # under their own names. Switching to embed_dict also requires the
            # model's "avg" input width to match that embedding's vectors, so
            # it is flagged here rather than changed in isolation.
            temp_train_ner = {k: ner_word2vec[k] for k in train_ids}
            temp_dev_ner = {k: ner_word2vec[k] for k in dev_ids}
            temp_test_ner = {k: ner_word2vec[k] for k in test_ids}

            x_train_ner = create_dataset(temp_train_ner)
            x_dev_ner = create_dataset(temp_dev_ner)
            x_test_ner = create_dataset(temp_test_ner)

            for iteration in range(1, iter_num):
                print ("Iteration number: ", iteration)

                for each_problem in target_problems:
                    print ("Problem type: ", each_problem)
                    print ("__________________")

                    early_stopping_monitor = EarlyStopping(monitor=monitor_criteria, patience=model_patience)
                    best_model_name = "avg-"+str(embed_name)+"-"+str(each_problem)+"-"+"best_model.hdf5"
                    # Monitor the same quantity as early stopping. The
                    # deprecated `period=1` argument is omitted: the default
                    # save_freq='epoch' already evaluates the checkpoint after
                    # every epoch.
                    checkpoint = ModelCheckpoint(best_model_name, monitor=monitor_criteria, verbose=1,
                        save_best_only=True, mode='min')

                    callbacks = [early_stopping_monitor, checkpoint]

                    model = avg_ner_model(each_layer, each_unit_size, embed_name)

                    model.fit([x_train_lstm, x_train_ner], y_train[each_problem], epochs=num_epoch, verbose=1,
                              validation_data=([x_dev_lstm, x_dev_ner], y_dev[each_problem]), callbacks=callbacks,
                              batch_size=batch_size )

                    # Evaluate the best checkpoint (lowest monitored value),
                    # not the weights of the last epoch.
                    model.load_weights(best_model_name)

                    probs, predictions = make_prediction_multi_avg(model, [x_test_lstm, x_test_ner])

                    save_scores_multi_avg(predictions, probs, y_test[each_problem],
                                embed_name, each_problem, iteration, each_unit_size,
                                each_layer, type_of_ner)

                    # Release backend state between runs so that many
                    # consecutive model builds do not accumulate.
                    reset_keras(model)
                    clear_session()
                    gc.collect()

Layer:  GRU
Hidden unit:  128
Embedding:  word2vec
Iteration number:  1
Problem type:  mort_hosp
__________________
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.25383, saving model to avg-word2vec-mort_hosp-best_model.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.25383 to 0.23085, saving model to avg-word2vec-mort_hosp-best_model.hdf5
Epoch 3/100
Epoch 3: val_loss improved from 0.23085 to 0.22995, saving model to avg-word2vec-mort_hosp-best_model.hdf5
Epoch 4/100
Epoch 4: val_loss did not improve from 0.22995
Epoch 5/100
Epoch 5: val_loss did not improve from 0.22995
Epoch 6/100
Epoch 6: val_loss did not improve from 0.22995
0.8731375011014186 0.5735515285398531 0.9139834406623735 0.49184782608695654
Problem type:  mort_icu
__________________
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.17729, saving model to avg-word2vec-mort_icu-best_model.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.17729 to 0.17002, saving model to avg-word2vec-mort_icu-best_model.hdf5
Ep