In [None]:
# Binary classification using GRU with general text pre-trained embeddings. 

### MIMIC-III discharge summary, for ICD-9 level 1 `circ' label where `circ' includes ICD-9 codes between 390-459: diseases of the circulatory system.
### Percentage of occurrence is of `circ'in unique hospital admissions in MIMIC-III is 78.4%. 
### The total number of hospital admissions with a recorded discharge summary is 52,710.

In [1]:
import os
import pandas as pd
import numpy as np
import re
#import gensim
import string
import codecs

from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 

import tensorflow as tf
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report

In [2]:
np.random.seed(0)

stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

In [3]:
df = pd.read_csv("circ_binary_mimic_dis.csv")
df = df.drop(['HADM_ID'],axis=1)
df.head()

Unnamed: 0,TEXT,label
0,admission date discharge date date of birth se...,1
1,admission date discharge date date of birth se...,0
2,admission date discharge date date of birth se...,1
3,admission date discharge date date of birth se...,1
4,admission date discharge date date of birth se...,0


In [4]:
df['split'] = np.random.randn(df.shape[0], 1)

msk = np.random.rand(len(df)) <= 0.7
train = df[msk]
test = df[~msk]
train = train.drop(['split'],axis=1)
test = test.drop(['split'],axis=1)
texts = train.TEXT


In [5]:
tokenizer = RegexpTokenizer(r'\w+')

MAX_NB_WORDS = 100000
max_seq_len = 3000

raw_docs_train = train['TEXT'].tolist()
raw_docs_test = test['TEXT'].tolist() 
num_classes = 1

print("pre-processing train data...")
processed_docs_train = []
for doc in tqdm(raw_docs_train):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_train.append(" ".join(filtered))


processed_docs_test = []
for doc in tqdm(raw_docs_test):
    tokens = tokenizer.tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    processed_docs_test.append(" ".join(filtered))


print("tokenizing input data...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, lower=True, char_level=False)
tokenizer.fit_on_texts(processed_docs_train + processed_docs_test)  #leaky
word_seq_train = tokenizer.texts_to_sequences(processed_docs_train)
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)
word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
word_seq_train = sequence.pad_sequences(word_seq_train, maxlen=max_seq_len)
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_seq_len)


  1%|          | 270/36692 [00:00<00:13, 2685.90it/s]

pre-processing train data...


100%|██████████| 36692/36692 [00:14<00:00, 2585.24it/s]
100%|██████████| 16030/16030 [00:06<00:00, 2554.18it/s]


tokenizing input data...
dictionary size:  150352


In [6]:
#load pre-trained embeddings 
print('loading word embeddings...')
embeddings_index = {}
f = codecs.open('cc.en.300.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('found %s word vectors' % len(embeddings_index))


embed_dim = 300 


1335it [00:00, 13347.12it/s]

loading word embeddings...


2000001it [02:19, 14387.17it/s]

found 2000000 word vectors





In [7]:
#embedding matrix
print('preparing embedding matrix...')
words_not_found = []
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_dim))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)
print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))



preparing embedding matrix...
number of null word embeddings: 52460


In [8]:
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model=Sequential()
optimzer=Adam(clipvalue=0.5)

embedding=Embedding(nb_words, embed_dim,
          weights=[embedding_matrix], input_length=max_seq_len, trainable=False)

model.add(embedding)
model.add(GRU(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer=optimzer, loss='binary_crossentropy', metrics=['acc'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 3000, 300)         30000000  
_________________________________________________________________
gru (GRU)                    (None, 128)               165120    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 30,165,249
Trainable params: 165,249
Non-trainable params: 30,000,000
_________________________________________________________________


In [10]:
## Modify the number of epochs. This is only for an example.
history=model.fit(word_seq_train,train.label, callbacks=callback, 
                  batch_size=64, epochs=100, 
                  validation_split=0.1, 
                  verbose=2)

Epoch 1/100
516/516 - 73s - loss: 0.2209 - acc: 0.9075 - val_loss: 0.2034 - val_acc: 0.9101
Epoch 2/100
516/516 - 73s - loss: 0.2111 - acc: 0.9122 - val_loss: 0.2058 - val_acc: 0.9079
Epoch 3/100
516/516 - 73s - loss: 0.2027 - acc: 0.9164 - val_loss: 0.2200 - val_acc: 0.8984
Epoch 4/100
516/516 - 73s - loss: 0.1948 - acc: 0.9190 - val_loss: 0.1931 - val_acc: 0.9144
Epoch 5/100
516/516 - 73s - loss: 0.1878 - acc: 0.9206 - val_loss: 0.2103 - val_acc: 0.9068
Epoch 6/100
516/516 - 73s - loss: 0.1832 - acc: 0.9233 - val_loss: 0.1912 - val_acc: 0.9169
Epoch 7/100
516/516 - 73s - loss: 0.1789 - acc: 0.9269 - val_loss: 0.1925 - val_acc: 0.9210
Epoch 8/100
516/516 - 73s - loss: 0.1734 - acc: 0.9289 - val_loss: 0.1917 - val_acc: 0.9172
Epoch 9/100
516/516 - 73s - loss: 0.1653 - acc: 0.9317 - val_loss: 0.2001 - val_acc: 0.9131
Epoch 10/100
516/516 - 73s - loss: 0.1613 - acc: 0.9331 - val_loss: 0.2002 - val_acc: 0.9158
Epoch 11/100
516/516 - 73s - loss: 0.1534 - acc: 0.9374 - val_loss: 0.2039 - va

In [11]:
y_out = model.predict(word_seq_test,batch_size=64)
y_pred = np.where(y_out > 0.5, 1, 0)


print(classification_report(test.label, y_pred,digits=4))

              precision    recall  f1-score   support

           0     0.7788    0.8011    0.7898      3494
           1     0.9441    0.9366    0.9403     12536

    accuracy                         0.9070     16030
   macro avg     0.8615    0.8688    0.8651     16030
weighted avg     0.9081    0.9070    0.9075     16030

