## CNN with Attention for 5K records, 5 epochs

In [1]:
%matplotlib inline
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import random
from collections import Counter, defaultdict
from operator import itemgetter
import matplotlib.pyplot as plt


#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.models import load_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Custom functions
%load_ext autoreload
%autoreload 2
import database_selection
import vectorization
import helpers
import icd9_cnn_model
import lstm_model


Using TensorFlow backend.


## Read Input File

In [2]:
# Load the discharge notes together with their ICD-9 codes. The column names
# are supplied explicitly through `names`.
full_df = pd.read_csv(
    '../data/disch_notes_all_icd9.csv',
    names=['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT'],
)


In [3]:
print 'full shape: ', full_df.shape
# taking just a subset of the records for developing models
df = full_df.sample(frac=0.1).reset_index(drop=True)
#df = full_df
print 'shapeto process: ', df.shape

full shape:  (52696, 5)
shapeto process:  (5270, 5)


## Pre processing ICD 9 codes

In [4]:
# First-level ICD-9 code ranges used as the label set.
# Source: https://github.com/sirrice/icd9 plus doing queries with it
# NOTE(review): the ranges 280-289 and 740-759 do not appear in this list --
# confirm that the omission is intended.
ICD9_FIRST_LEVEL = [
    '001-139', '140-239', '240-279', '290-319', '320-389',
    '390-459', '460-519', '520-579', '580-629', '630-679',
    '680-709', '710-739', '760-779', '780-789', '790-796',
    '797', '798', '799', '800-999',
]
N_TOP = len(ICD9_FIRST_LEVEL)  # later passed to the model as num_classes

# Replace the leaf ICD-9 codes of every record with their grandparent codes
# from the list above (see helpers.replace_with_grandparent_codes).
df['ICD9'] = df['ICD9'].apply(
    lambda codes: helpers.replace_with_grandparent_codes(codes, ICD9_FIRST_LEVEL)
)


In [5]:
# Convert the ICD9 column into label vectors over `top_codes`.
# The printed sample (first record) shows one 0/1 entry per first-level code.
top_codes = ICD9_FIRST_LEVEL
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)
print 'sample of vectorized icd9 labels: ', labels[0]


sample of vectorized icd9 labels:  [0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1]


## Pre process Notes

In [6]:
# Clean the note text, vectorize it, then pad the resulting sequences.
MAX_VOCAB = None        # cap on the original number of words (None = no limit)
MAX_SEQ_LENGTH = 5000   # cap on the word-sequence length (None = no limit)

df['TEXT'] = vectorization.clean_notes(df, 'TEXT')

# Both helpers also return the limit that ends up in effect; it overwrites the
# requested value set above and is what gets printed below.
data_vectorized, dictionary, MAX_VOCAB = vectorization.vectorize_notes(
    df['TEXT'], MAX_VOCAB, verbose=True
)
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data_vectorized, MAX_SEQ_LENGTH)

print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Vocabulary size: 45047
Average note length: 1641.51442125
Max note length: 10924
Final Vocabulary: 45047
Final Max Sequence Length: 5000


In [7]:
# Load the external embeddings and build the embedding matrix for our vocabulary.
EMBEDDING_DIM = 100  # dimension of the GloVe vectors that we chose
EMBEDDING_LOC = '../data/notes.100.txt'  # location of the embedding file
EMBEDDING_MATRIX = []  # placeholder only; replaced by the call just below
EMBEDDING_MATRIX, embedding_dict = vectorization.embedding_matrix(
    EMBEDDING_LOC,
    dictionary,
    EMBEDDING_DIM,
    verbose=True,
    sigma=True,
)


('Vocabulary in notes:', 45047)
('Vocabulary in original embedding:', 21056)
('Vocabulary intersection:', 19980)


## Split Files

In [8]:
# Split notes and labels into train / validation / test sets (fixed seed).
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data,
    labels,
    val_size=0.2,
    test_size=0.1,
    random_state=101,
)

# Report the shape of every split.
for split_name, X_split, y_split in (
        ("Train: ", X_train, y_train),
        ("Validation: ", X_val, y_val),
        ("Test: ", X_test, y_test)):
    print(split_name, X_split.shape, y_split.shape)

('Train: ', (3688, 5000), (3688, 19))
('Validation: ', (1054, 5000), (1054, 19))
('Test: ', (528, 5000), (528, 19))


In [9]:
# Delete temporary variables to free some memory.
# Only these three names are dropped; full_df and data_vectorized stay bound.
# After this cell, the cells above that use df/data/labels cannot be re-run
# without first re-running the notebook from the top.
del df, data, labels

## CNN and attention

In [10]:
import icd9_cnn_att

In [11]:
# Re-import the module so that edits made to icd9_cnn_att since the first
# import are picked up.
reload(icd9_cnn_att)

# Build the CNN-with-attention model on top of the external embeddings.
cnn_att_model = icd9_cnn_att.build_icd9_cnn_model(
    input_seq_length=MAX_SEQ_LENGTH,
    max_vocab=MAX_VOCAB,
    external_embeddings=True,
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=EMBEDDING_MATRIX,
    num_filters=100,
    filter_sizes=[2, 3, 4, 5],
    training_dropout=0.5,
    num_classes=N_TOP,
)

  bias=False))(u_it)
  s_i =merge([att, inputs], mode='dot', dot_axes=(1,1), name='s_i_dot'+i)
  name=name)


____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 5000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 5000, 100)     4504800     input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 4999, 100)     20100       embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 4998, 100)     30100       embedding_1[0][0]                
___________________________________________________________________________________________

## Tuning parameters

The original run (see below) overfit: it ran well for the first 5 epochs, and actually got a better F1 score than the other models, but after that it overfit the training data.   

Starting from epoch 7, neither the dev accuracy nor the F1 score improved, while the training accuracy kept improving. As a reference, the simple CNN model can run for 20 epochs with its validation accuracy still increasing.  

So this CNN_ATT model performs much better during the first 5 epochs, but after that it just keeps memorizing the training data.

Here we applied the following hyperparameter tuning, which improved the fit a little: the F1 score still improved after 10 epochs and the accuracy kept improving for 1 more epoch, but there was no major improvement.
* dropout in the output layer
* two dropouts in attention layer
* dropout value = 0.5 (we tried higher values; the runs took longer and still did not improve the F1 score)
* L2 regularizations
* default learning rate (we tried a smaller one; it didn't work)

In [25]:
# Train for 5 epochs (model variant with two dropouts in the attention layer),
# validating on the dev split after every epoch.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f031083b4d0>

In [26]:
# Predict on the training and dev splits with the current model.
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation: prints training and dev F1 scores for a range of thresholds
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.495      0.509
0.030:      0.503      0.517
0.040:      0.512      0.526
0.050:      0.523      0.537
0.055:      0.529      0.542
0.058:      0.533      0.545
0.060:      0.535      0.548
0.080:      0.557      0.568
0.100:      0.575      0.587
0.200:      0.646      0.653
0.300:      0.701      0.706
0.400:      0.721      0.721
0.500:      0.698      0.698
0.600:      0.640      0.642
0.700:      0.538      0.541


In [27]:
# Continue training the same model object for another 5 epochs.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f03f1aaffd0>

In [28]:
# Re-evaluate after the additional epochs: predict on the training and dev splits.
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation: prints training and dev F1 scores for a range of thresholds
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.515      0.528
0.030:      0.532      0.543
0.040:      0.547      0.557
0.050:      0.560      0.570
0.055:      0.567      0.576
0.058:      0.570      0.580
0.060:      0.572      0.582
0.080:      0.593      0.599
0.100:      0.611      0.615
0.200:      0.686      0.681
0.300:      0.741      0.730
0.400:      0.762      0.746
0.500:      0.744      0.720
0.600:      0.704      0.683
0.700:      0.628      0.607


In [29]:
# Save the tuned model after the two 5-epoch fits above.
# The path is relative to the working directory; presumably the models/
# folder has to exist already -- confirm.
cnn_att_model.save('models/cnn_att_10_epochs.h5')

## Original run
* 5K records   
* no regularization; it overfits after 5 epochs

In [52]:
# Evaluate the model currently held in `cnn_att_model` on the training and dev splits.
# NOTE(review): no fit call for this run is visible in this section, so the
# scores depend on kernel state; presumably the unregularized model had been
# trained for 5 epochs before this cell -- confirm.
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation: prints training and dev F1 scores for a range of thresholds
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.530      0.526
0.030:      0.545      0.540
0.040:      0.560      0.555
0.050:      0.574      0.569
0.055:      0.581      0.577
0.058:      0.585      0.580
0.060:      0.587      0.582
0.080:      0.608      0.603
0.100:      0.625      0.620
0.200:      0.689      0.675
0.300:      0.734      0.717
0.400:      0.753      0.728
0.500:      0.741      0.717
0.600:      0.705      0.683
0.700:      0.645      0.618


In [53]:
# Save the original-run model at this checkpoint (file name says 5 epochs, 5K records).
# The path is relative to the working directory; presumably the models/
# folder has to exist already -- confirm.
cnn_att_model.save('models/cnn_att_5_epochs_5k.h5')

In [54]:
# Continue training the same model object: 5 more epochs.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f721039df90>

In [55]:
# Re-evaluate after the 5 additional epochs: predict on the training and dev splits.
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation: prints training and dev F1 scores for a range of thresholds
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.585      0.566
0.030:      0.609      0.587
0.040:      0.629      0.604
0.050:      0.646      0.617
0.055:      0.655      0.621
0.058:      0.659      0.625
0.060:      0.662      0.627
0.080:      0.687      0.643
0.100:      0.709      0.656
0.200:      0.788      0.704
0.300:      0.830      0.727
0.400:      0.845      0.729
0.500:      0.839      0.720
0.600:      0.813      0.695
0.700:      0.763      0.651


In [56]:
# Save the original-run model at this checkpoint (file name says 10 epochs, 5K records).
# The path is relative to the working directory; presumably the models/
# folder has to exist already -- confirm.
cnn_att_model.save('models/cnn_att_10_epochs_5k.h5')

In [57]:
# Continue training the same model object for yet another 5 epochs.
cnn_att_model.fit(
    X_train,
    y_train,
    batch_size=50,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Train on 3688 samples, validate on 1054 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6eb1b74e90>

Here we start to see the dev F1 score going down.

In [58]:
# Re-evaluate after the latest 5 epochs: predict on the training and dev splits.
pred_train = cnn_att_model.predict(X_train, batch_size=100)
pred_dev = cnn_att_model.predict(X_val, batch_size=100)
# perform evaluation: prints training and dev F1 scores for a range of thresholds
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.668      0.613
0.030:      0.697      0.630
0.040:      0.719      0.645
0.050:      0.738      0.654
0.055:      0.747      0.660
0.058:      0.751      0.663
0.060:      0.754      0.664
0.080:      0.779      0.676
0.100:      0.801      0.686
0.200:      0.865      0.713
0.300:      0.898      0.720
0.400:      0.913      0.719
0.500:      0.913      0.714
0.600:      0.903      0.696
0.700:      0.876      0.671


In [59]:
# Save the original-run model at this checkpoint (file name says 15 epochs, 5K records).
# The path is relative to the working directory; presumably the models/
# folder has to exist already -- confirm.
cnn_att_model.save('models/cnn_att_15_epochs_5k.h5')