In [1]:
import numpy as np
import pandas as pd
import random
from collections import Counter, defaultdict
from operator import itemgetter
import matplotlib.pyplot as plt

#keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.models import load_model
from tensorflow.keras import regularizers
from sklearn.metrics import classification_report

In [2]:
#reading file
df = pd.read_csv('18label_mimic_dis.csv') ### pre-processed text with 18 labels
print('df shape: ', df.shape)

df.head(3)

df shape:  (52722, 19)


Unnamed: 0,TEXT,blood,circulatory,congenital,digestive,endocrine,genitourinary,infectious,injury,mental,muscular,neoplasms,nervous,pregnancy,prenatal,respiratory,skin,symptoms,E and V
0,admission date discharge date date of birth se...,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,1,1,0
1,admission date discharge date date of birth se...,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,1
2,admission date discharge date date of birth se...,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1


In [3]:
df.columns

Index(['TEXT', 'blood', 'circulatory', 'congenital', 'digestive', 'endocrine',
       'genitourinary', 'infectious', 'injury', 'mental', 'muscular',
       'neoplasms', 'nervous', 'pregnancy', 'prenatal', 'respiratory', 'skin',
       'symptoms', 'E and V'],
      dtype='object')

In [4]:
### This file contains the functions necessary to vectorize the ICD labels and text inputs
import numpy as np
import pandas as pd
import re
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Vectorize ICD codes

def vectorize_icd_string(x, code_list):
    """Takes a string with ICD codes and returns an array of the right of 0/1"""
    r = []
    for code in code_list:
        if code in x: r.append(1)
        else: r.append(0)
    return np.asarray(r)

def vectorize_icd_column(df, col_name, code_list):
    """Takes a column and applies the """
    r = df[col_name].apply(lambda x: vectorize_icd_string(x, code_list))
    r = np.transpose(np.column_stack(r))
    return r


# Vectorize and Pad notes Text

def vectorize_notes(col, MAX_NB_WORDS, verbose = True):
    """Takes a note column and encodes it into a series of integer
        Also returns the dictionnary mapping the word to the integer"""
    tokenizer = Tokenizer(num_words = MAX_NB_WORDS)
    tokenizer.fit_on_texts(col)
    data = tokenizer.texts_to_sequences(col)
    note_length =  [len(x) for x in data]
    vocab = tokenizer.word_index
    MAX_VOCAB = len(vocab)
    if verbose:
        print('Vocabulary size: %s' % MAX_VOCAB)
        print('Average note length: %s' % np.mean(note_length))
        print('Max note length: %s' % np.max(note_length))
    return data, vocab, MAX_VOCAB

def pad_notes(data, MAX_SEQ_LENGTH):
    data = pad_sequences(data, maxlen = MAX_SEQ_LENGTH)
    return data, data.shape[1]


# Creates an embedding Matrix
# Based on https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

def embedding_matrix(f_name, dictionary, EMBEDDING_DIM, verbose = True, sigma = None):
    """Takes a pre-trained embedding and adapts it to the dictionary at hand
        Words not found will be all-zeros in the matrix"""

    # Dictionary of words from the pre trained embedding
    pretrained_dict = {}
    with open(f_name, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            pretrained_dict[word] = coefs

    # Default values for absent words
    if sigma:
        pretrained_matrix = sigma * np.random.rand(len(dictionary) + 1, EMBEDDING_DIM)
    else:
        pretrained_matrix = np.zeros((len(dictionary) + 1, EMBEDDING_DIM))
    
    # Substitution of default values by pretrained values when applicable
    for word, i in dictionary.items():
        vector = pretrained_dict.get(word)
        if vector is not None:
            pretrained_matrix[i] = vector

    if verbose:
        print('Vocabulary in notes:', len(dictionary))
        print('Vocabulary in original embedding:', len(pretrained_dict))
        inter = list( set(dictionary.keys()) & set(pretrained_dict.keys()) )
        print('Vocabulary intersection:', len(inter))

    return pretrained_matrix, pretrained_dict

def train_val_test_split(X, y, val_size=0.2, test_size=0.2, random_state=101):
    """Splits the input and labels into 3 sets"""
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(val_size+test_size), random_state=random_state)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_size/(val_size+test_size), random_state=random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test


In [5]:
#preprocess notes
MAX_VOCAB = None # to limit original number of words (None if no limit)
MAX_SEQ_LENGTH = 2500 # to limit length of word sequence (None if no limit)
##TEXT is pre-processed
data_vectorized, dictionary, MAX_VOCAB = vectorize_notes(df.TEXT, MAX_VOCAB, verbose = True)
data, MAX_SEQ_LENGTH = pad_notes(data_vectorized, MAX_SEQ_LENGTH)

print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Vocabulary size: 150475
Average note length: 1513.407647661318
Max note length: 10500
Final Vocabulary: 150475
Final Max Sequence Length: 2500


In [6]:
EMBEDDING_DIM = 50 
EMBEDDING_MATRIX= []

In [7]:

EMBEDDING_LOC = 'Emb50.txt' # pretrained 50 dimensional embeddings
EMBEDDING_MATRIX, embedding_dict = embedding_matrix(EMBEDDING_LOC,
                                                                  dictionary, EMBEDDING_DIM, verbose = True, sigma=True)


Vocabulary in notes: 150475
Vocabulary in original embedding: 911413
Vocabulary intersection: 59823


In [8]:
labels = ['blood', 'circulatory', 'congenital', 'digestive', 'endocrine',
       'genitourinary', 'infectious', 'injury', 'mental', 'muscular',
       'neoplasms', 'nervous', 'pregnancy', 'prenatal', 'respiratory', 'skin',
       'symptoms', 'E and V']

len(labels)

18

In [9]:
X = data
Y = df[labels]

In [10]:
#split sets
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X, Y, val_size=0.2, test_size=0.1, random_state=101)
print("Train: ", X_train.shape, y_train.shape)
print("Validation: ", X_val.shape, y_val.shape)
print("Test: ", X_test.shape, y_test.shape)

Train:  (36905, 2500) (36905, 18)
Validation:  (10544, 2500) (10544, 18)
Test:  (5273, 2500) (5273, 18)


In [11]:
# Delete temporary variables to free some memory
del df, data, X, Y

## CNNText classification

Based on:
* "Convolutional Neural Networks for Sentence Classification"   
* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras/blob/master/sentiment_cnn.py
* http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/
* https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py

In [12]:
def cnntext_model(input_seq_length, 
                         max_vocab, external_embeddings, embedding_dim, embedding_matrix,
                         num_filters, filter_sizes,
                         training_dropout_keep_prob,
                         num_classes):
    #Embedding
    model_input = Input(shape=(input_seq_length, ))
    if external_embeddings:
        # use embedding_matrix 
        z = Embedding(max_vocab + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=input_seq_length,
                            trainable=True)(model_input)
    else:
        # train embeddings 
        z =  Embedding(max_vocab + 1, 
                   embedding_dim, 
                   input_length=input_seq_length, embeddings_regularizer=regularizers.l2(0.0001),
                   name="embedding")(model_input)

    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Convolution1D(filters=num_filters,                         
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
        window_pool_size =  input_seq_length  - sz + 1 
        conv = MaxPooling1D(pool_size=window_pool_size)(conv)  
        conv = Flatten()(conv)
        conv_blocks.append(conv)

    #concatenate
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    z = Dropout(training_dropout_keep_prob)(z)

    model_output = Dense(num_classes, activation="sigmoid")(z)

    model = Model(model_input, model_output)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    
    print(model.summary())

    return model

In [13]:
#### build model
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model = cnntext_model (input_seq_length=MAX_SEQ_LENGTH, max_vocab = MAX_VOCAB,
                             external_embeddings = False,
                             embedding_dim=EMBEDDING_DIM,embedding_matrix=EMBEDDING_MATRIX,
                             num_filters = 100, filter_sizes=[2,3,4,5],
                             training_dropout_keep_prob=0.5,
                             num_classes=18 )

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2500)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 2500, 50)     7523800     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 2499, 100)    10100       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 2498, 100)    15100       embedding[0][0]                  
______________________________________________________________________________________________

In [14]:
# Train the model
model.fit(X_train, y_train, batch_size=16, epochs=200, validation_data=(X_val, y_val), verbose=2, callbacks=callback)

Epoch 1/200
2307/2307 - 284s - loss: 0.4517 - accuracy: 0.4383 - val_loss: 0.3903 - val_accuracy: 0.4189
Epoch 2/200
2307/2307 - 282s - loss: 0.3893 - accuracy: 0.3973 - val_loss: 0.3607 - val_accuracy: 0.4132
Epoch 3/200
2307/2307 - 283s - loss: 0.3725 - accuracy: 0.3909 - val_loss: 0.3515 - val_accuracy: 0.3947
Epoch 4/200
2307/2307 - 283s - loss: 0.3648 - accuracy: 0.3872 - val_loss: 0.3460 - val_accuracy: 0.3674
Epoch 5/200
2307/2307 - 283s - loss: 0.3579 - accuracy: 0.3822 - val_loss: 0.3426 - val_accuracy: 0.4008
Epoch 6/200
2307/2307 - 283s - loss: 0.3546 - accuracy: 0.3819 - val_loss: 0.3408 - val_accuracy: 0.3980
Epoch 7/200
2307/2307 - 282s - loss: 0.3524 - accuracy: 0.3799 - val_loss: 0.3394 - val_accuracy: 0.3957
Epoch 8/200
2307/2307 - 283s - loss: 0.3502 - accuracy: 0.3812 - val_loss: 0.3370 - val_accuracy: 0.3624
Epoch 9/200
2307/2307 - 283s - loss: 0.3487 - accuracy: 0.3781 - val_loss: 0.3377 - val_accuracy: 0.4144
Epoch 10/200
2307/2307 - 283s - loss: 0.3475 - accuracy

<tensorflow.python.keras.callbacks.History at 0x7fcb1c41d2b0>

In [15]:
pred = model.predict(X_test, batch_size=50)
y_pred = np.where(pred > 0.5, 1, 0)

In [16]:
print(classification_report(y_test, y_pred,digits=4, target_names=labels))

               precision    recall  f1-score   support

        blood     0.7688    0.5699    0.6546      1809
  circulatory     0.9407    0.9436    0.9421      4132
   congenital     0.7883    0.3942    0.5255       274
    digestive     0.8525    0.7473    0.7965      2042
    endocrine     0.8732    0.8987    0.8858      3534
genitourinary     0.8671    0.7910    0.8273      2120
   infectious     0.8172    0.7168    0.7637      1416
       injury     0.8233    0.5969    0.6920      2225
       mental     0.8227    0.6294    0.7132      1592
     muscular     0.7225    0.4420    0.5485       966
    neoplasms     0.8802    0.7730    0.8232       846
      nervous     0.7776    0.6090    0.6831      1573
    pregnancy     1.0000    0.4444    0.6154        18
     prenatal     0.8090    0.5542    0.6578       848
  respiratory     0.8709    0.8107    0.8397      2404
         skin     0.7640    0.4474    0.5643       608
     symptoms     0.6695    0.4776    0.5575      1629
      E a

  _warn_prf(average, modifier, msg_start, len(result))
