In [1]:
import warnings
import importlib
warnings.filterwarnings('ignore')
from loader import Loader
from preprocessor import Preprocessor
from spliter import Spliter 
import utils

In [2]:
#Constants
# file_name = ["data/ae.sas7bdat"]
# dict_name = ["data/meddra_dict_v21","data/meddra_dict_v22", "data/meddra_dict_v23"]
dict_name = ["data/meddra_dict_v22"]

In [3]:
loader = Loader(dict_name)

In [4]:
loader.rawdf

Unnamed: 0,Verbatim Term,LLT Name,Version
0,contact dermatitis,contact dermatitis,17.0
1,cataracts,cataracts,17.0
2,occasional lightheadedness,lightheadedness,17.0
3,occassional neurologic dizziness,dizziness,17.0
4,swollen lymph nodes bilateral neck,swollen lymph nodes,17.0
...,...,...,...
32768,right hemicolectomy,right hemicolectomy,23.0
32769,hepatic lobectomy,liver lobectomy,23.0
32770,colon biopsy,colon biopsy,23.0
32771,laparoscopic low anterior resection,lower anterior resection,23.0


In [5]:
processor = Preprocessor(loader.rawdf, loader.dict)

In [6]:
medra,raw = processor.pipe_line()

In [7]:
raw

Unnamed: 0,TERM,LLT
0,contact dermatitis,contact dermatitis
1,cataract,cataracts
2,occasional lightheadedness,lightheadedness
3,occassional neurologic dizziness,dizziness
4,swollen lymph node bilateral neck,swollen lymph nodes
...,...,...
32638,abdominal hysterectomy,abdominal hysterectomy
32639,right hemicolectomy,right hemicolectomy
32640,hepatic lobectomy,liver lobectomy
32641,colon biopsy,colon biopsy


In [8]:
X_train, X_test, Y_train, Y_test, X_ls, X_testls = Spliter(raw, medra).get_train_test()

In [9]:
X_train

0              exacerbation herpes simplex
1           painful - l foot joint big toe
2         high creatinine level 106 umol l
3            absent reflex lower extremity
4        interstitial nodular opacity lung
                       ...                
30026                        uti infection
30027                     burnt right hand
30028                       hypomagnasemia
30029                            sore limb
30030                   allergy penicillin
Name: TERM, Length: 30031, dtype: object

In [10]:
X_train.shape, X_test.shape

((30031,), (2612,))

In [11]:
%%time
# WARNING: Time consuming
word_to_index, index_to_word, word_to_vec_map = utils.read_emb_vecs('./ri-3gram-400-tsv/vocab.tsv', './ri-3gram-400-tsv/vectors.tsv')

CPU times: user 7min 6s, sys: 18.1 s, total: 7min 24s
Wall time: 7min 33s


In [12]:
# Constants
CLASSES=len(set(medra['LLT']))
WINDOWS_Size=6

Count potential spelling errors, i.e. words that cannot be found in the embeddings.

In [13]:
# Number of tokens (within the first WINDOWS_Size of each training record)
# that have no entry in the embedding table.
cnt = sum(
    1
    for record in X_ls
    for token in record[0:WINDOWS_Size]
    if token not in word_to_vec_map
)

In [14]:
print("There are " + str(cnt) + " number of individual training words NOT found in the word embedding vectors" )

There are 7752 number of individual training words NOT found in the word embedding vectors


In [15]:
%%time
# WARNING: Time consuming cell
from spellchecker import SpellChecker

spell = SpellChecker()
# Each distinct unknown word is corrected once; the lookup is by far the slowest step.
_correction_cache = {}


def _correct_window(records, window=WINDOWS_Size):
    """Spell-correct, in place, the out-of-vocabulary tokens of every record.

    Only the first ``window`` tokens of each record are inspected. A token is
    replaced at its own position (not at the first position holding an equal
    string), and is left unchanged when the spell checker has no suggestion.

    :param records: list of token lists; modified in place
    :param window: number of leading tokens per record to inspect
    """
    for record in records:
        for pos, word in enumerate(record[0:window]):
            if word in word_to_vec_map:
                continue
            if word not in _correction_cache:
                # correction() may return None when it finds no candidate;
                # keep the original token rather than storing None in the record.
                _correction_cache[word] = spell.correction(word) or word
            record[pos] = _correction_cache[word]


_correct_window(X_ls)
_correct_window(X_testls)

CPU times: user 16min 51s, sys: 1.07 s, total: 16min 52s
Wall time: 16min 53s


In [16]:
# Same tally as before the spell-correction step: window tokens of the
# training records that still have no embedding vector.
cnt2 = sum(
    1
    for record in X_ls
    for token in record[0:WINDOWS_Size]
    if token not in word_to_vec_map
)

In [17]:
print("There are " + str(cnt2) + " number of individual training words NOT found after spell correction and other corrections" )

There are 5501 number of individual training words NOT found after spell correction and other corrections


## Encode and Decode

In [18]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [19]:
# Map every distinct LLT to an integer class id (encoder) and back (decoder).
# The labels are sorted first: iterating a bare set of strings gives a different
# order in every Python process, which would make the ids - and therefore the
# meaning of a saved model's output units - change from one kernel to the next.
llt_labels = sorted(set(medra['LLT']))
encoder = {pt: i for i, pt in enumerate(llt_labels)}
decoder = {i: pt for pt, i in encoder.items()}

# Persist both mappings; `with` closes the files even if pickling fails.
with open(os.path.join('data', 'lltcoder.pkl'), 'wb') as outfile:
    pickle.dump(encoder, outfile)
with open(os.path.join('data', 'lltdecoder.pkl'), 'wb') as outfile:
    pickle.dump(decoder, outfile)

The following cells check whether any train/test LLT labels are missing from the target MedDRA version. Even though several dictionary versions are used to train the model, it does not make sense to include every version in the output, since the output is version-specific.

In [20]:
# Build the lookup set once; rebuilding it inside the comprehension repeats the
# full pass over medra['LLT'] for every label being checked.
llt_in_target = set(medra['LLT'].tolist())
print("extra in test: ",[i for i in set(Y_test.tolist()) if i not in llt_in_target])

extra in test:  []


In [21]:
# Build the lookup set once; rebuilding it inside the comprehension repeats the
# full pass over medra['LLT'] for every label being checked (this is what made
# the cell slow).
llt_in_target = set(medra['LLT'].tolist())
print("extra in train: ",[i for i in set(Y_train.tolist()) if i not in llt_in_target])

extra in train:  []


In [22]:
y_train =np.array([[encoder[i]] for i in Y_train])
y_test = np.array([[encoder[i]] for i in Y_test])

In [23]:
N=len(max(X_ls,key=len))
N

31

In [24]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30031,), (30031, 1), (2612,), (2612, 1))

## Embeddings

In [25]:
def emdlayer(window, textlst, dim, vec_map=None):
    '''Turn tokenised records into one dense array of embedding rows.

    Each record is cut to its first ``window`` tokens. A token found in the
    embedding table is replaced by its vector; any other token gets a freshly
    drawn random vector. Records shorter than ``window`` are padded at the
    FRONT with random rows, so every record ends up with ``window`` rows.

    Random rows are ``np.random.randn(...) * 10000`` cast to float32, i.e. a
    standard deviation of 10000 (the earlier docstring's "0 to 0.01" did not
    match the code).
    NOTE(review): presumably much larger than the embedding values themselves -
    confirm that this scale is intended.

    :param window: number of token slots per record
    :param textlst: list of records, each a list of tokens
    :param dim: embedding width; table vectors are expected to have this length
    :param vec_map: optional token -> vector mapping; defaults to the
        notebook-level ``word_to_vec_map``
    :return: array of shape (len(textlst), window, dim)
    '''
    if vec_map is None:
        vec_map = word_to_vec_map

    def _random_rows(*shape):
        return (np.random.randn(*shape) * 10000).astype('float32')

    # Pass 1: look up (or randomise) the tokens of every record. The two passes
    # are kept separate so the order of random draws matches the original code.
    looked_up = []
    for record in textlst:
        rows = [vec_map[tok] if tok in vec_map else _random_rows(dim)
                for tok in record[0:window]]
        # An empty record would otherwise become a 1-D array that cannot be
        # concatenated with the 2-D padding below.
        looked_up.append(np.array(rows) if rows else np.empty((0, dim), dtype='float32'))

    # Pass 2: front-pad every record up to `window` rows.
    padded = [np.concatenate((_random_rows(window - rows.shape[0], dim), rows), axis=0)
              for rows in looked_up]

    if not padded:
        # Keep the result 3-D for an empty input so callers can still read .shape[2].
        return np.empty((0, window, dim), dtype='float32')
    return np.array(padded)

In [26]:
# Augmentation: for every training record, build a copy with its tokens in random order.
X_ls_perm = [np.random.permutation(x).tolist() for x in X_ls]

In [27]:
X_ls_all = [*X_ls, *X_ls_perm]

In [28]:
y_train_all = np.concatenate((y_train, y_train), axis=0)

In [29]:
%%time
# WARNING: Time consuming Cell
# X=emdlayer(WINDOWS_Size, X_ls, 400)
# X=emdlayer(WINDOWS_Size, X_ls_perm, 400)
X=emdlayer(WINDOWS_Size, X_ls_all, 400)
Xtest=emdlayer(WINDOWS_Size, X_testls, 400)
INPUT_DIM = X.shape[2]
SINGLE_ATTENTION_VECTOR = False

CPU times: user 4 s, sys: 1.05 s, total: 5.05 s
Wall time: 5.16 s


## Model

In [30]:
X.shape, y_train.shape, Xtest.shape, y_test.shape

((60062, 6, 400), (30031, 1), (2612, 6, 400), (2612, 1))

In [31]:
import keras.backend as K
import numpy as np
from keras.layers import concatenate, Bidirectional, Dropout, MaxPooling1D, Conv1D
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *
from keras.optimizers import Adam
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
import tensorflow as tf

Using TensorFlow backend.


In [None]:
callbacks=utils.callback_()
filename = str('./model.m0.LLT.M22.w6.6-5-2021(test)')
checkpoint = ModelCheckpoint(filename, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

In [None]:
def get_activations(model, inputs, print_shape_only=False, layer_name=None):
    """Evaluate and print the outputs of the model's layers for ``inputs``.

    :param model: Keras model whose layers are inspected
    :param inputs: input batch fed to ``model.input``
    :param print_shape_only: print only each activation's shape instead of its values
    :param layer_name: if given, only layers with exactly this name are evaluated;
        otherwise every layer is
    :return: list with one activation array per selected layer

    The learning-phase flag is fed as ``1.`` (presumably Keras' training mode,
    i.e. dropout active - confirm).
    """
    print('----- activations -----')
    inp = model.input
    if layer_name is None:
        selected = model.layers  # all layer outputs
    else:
        selected = [layer for layer in model.layers if layer.name == layer_name]
    targets = [layer.output for layer in selected]

    # One backend evaluation function per selected layer output.
    evaluators = [K.function([inp] + [K.learning_phase()], [target]) for target in targets]
    results = [evaluate([inputs, 1.])[0] for evaluate in evaluators]

    activations = []
    for result in results:
        activations.append(result)
        print(result.shape if print_shape_only else result)
    return activations

def attention_3d_block(inputs):
    """Append per-timestep softmax weights to ``inputs`` along the feature axis.

    A Dense softmax over the time axis is computed for every feature, optionally
    averaged into a single shared vector when SINGLE_ATTENTION_VECTOR is set,
    and the result is concatenated with the original ``inputs``.

    NOTE(review): the layer is named 'attention_mul' but the weights are
    concatenated, not multiplied, with the inputs - confirm this is intended.

    :param inputs: tensor of shape (batch_size, time_steps, input_dim); time_steps
        must equal WINDOWS_Size because of the Reshape below
    :return: the concatenated tensor
    """
    feature_dim = int(inputs.shape[2])
    weights = Permute((2, 1))(inputs)
    # No-op reshape, kept only to make explicit which dimension is which.
    weights = Reshape((feature_dim, WINDOWS_Size))(weights)
    weights = Dense(WINDOWS_Size, activation='softmax')(weights)
    if SINGLE_ATTENTION_VECTOR:
        weights = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(weights)
        weights = RepeatVector(feature_dim)(weights)
    attention = Permute((2, 1), name='attention_vec')(weights)

    return concatenate([inputs, attention], name='attention_mul')



def model_attention_applied_after_lstm():
    """Build the input-dropout -> LSTM -> attention -> softmax classifier.

    Reads the notebook-level WINDOWS_Size, INPUT_DIM and CLASSES.

    :return: an uncompiled Keras ``Model`` mapping (WINDOWS_Size, INPUT_DIM)
        inputs to a CLASSES-way softmax
    """
    inputs = Input(shape=(WINDOWS_Size, INPUT_DIM,))
    # YD 6/4/2021: dropout on the input layer dramatically reduces overfitting.
    # The rate matters (earlier note: "0.3 > 0.2"); 0.4 is the current setting.
    drop = Dropout(0.4)(inputs)
    lstm_units1 = 256
    lstm_out = LSTM(lstm_units1, return_sequences=True,
                    recurrent_regularizer=regularizers.l2(0.01))(drop)
    attention_mul = attention_3d_block(lstm_out)
    attention_mul = Flatten()(attention_mul)
    output = Dense(CLASSES, activation='softmax')(attention_mul)
    # `inputs=`/`outputs=` are the Keras 2 keyword names; the singular
    # `input=`/`output=` spellings are a deprecated legacy alias.
    model = Model(inputs=[inputs], outputs=output)
    return model

def model_conv_2():
    """Build the attention -> three parallel Conv1D branches -> dense classifier.

    Three Conv1D branches (kernel widths 7, 5 and 3) run over the attention
    output, are max-pooled, concatenated pairwise, and fed through dense layers
    to a CLASSES-way softmax. Reads the notebook-level WINDOWS_Size, INPUT_DIM
    and CLASSES.

    :return: an uncompiled Keras ``Model``
    :raises ValueError: if WINDOWS_Size is smaller than the widest kernel
    """
    kernel_sizes = (7, 5, 3)
    # Conv1D is used with its default padding, so a kernel wider than the
    # sequence cannot be applied; fail early with a readable message.
    # NOTE(review): with WINDOWS_Size = 6 (the value set in this notebook) this
    # model cannot be built; pooling (pool_size=2) may need an even larger window.
    if WINDOWS_Size < max(kernel_sizes):
        raise ValueError(
            "model_conv_2 needs WINDOWS_Size >= %d (widest Conv1D kernel); got %d"
            % (max(kernel_sizes), WINDOWS_Size))

    inputs = Input(shape=(WINDOWS_Size, INPUT_DIM,))
    attention_mul = attention_3d_block(inputs)

    # One conv + dropout branch per kernel width (layers are created in the
    # same order as before so auto-generated layer names are unchanged).
    conv_branches = []
    for width in kernel_sizes:
        conv = Conv1D(64, width, activation='relu',
                      kernel_regularizer=regularizers.l2(0.01))(attention_mul)
        conv_branches.append(Dropout(0.2)(conv))

    pooled = [MaxPooling1D(pool_size=2)(branch) for branch in conv_branches]

    # Pairwise combinations of the pooled branches: (0,1), (1,2), (0,2).
    merged = [concatenate([pooled[a], pooled[b]], axis=1)
              for a, b in ((0, 1), (1, 2), (0, 2))]
    flattened = [Flatten()(tensor) for tensor in merged]
    dense_heads = [
        Dense(256, activation=tf.nn.relu, kernel_regularizer=regularizers.l2(0.01))(tensor)
        for tensor in flattened
    ]

    hidden = concatenate(dense_heads, axis=1)
    for _ in range(3):
        hidden = Dense(128, activation=tf.nn.relu,
                       kernel_regularizer=regularizers.l2(0.01))(hidden)
    output = Dense(CLASSES, activation='softmax')(hidden)
    # `inputs=`/`outputs=` are the Keras 2 keyword names; `input=`/`output=`
    # are a deprecated legacy alias.
    model = Model(inputs=[inputs], outputs=output)
    return model

In [None]:
%%time
from keras import optimizers
# Build the LSTM + attention classifier and compile it with RMSprop.
m0 = model_attention_applied_after_lstm()
rms = optimizers.RMSprop(lr=0.001)
# adam = optimizers.Adam(learning_rate = 0.001, beta_1 = 0.9)
m0.compile(optimizer=rms, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
m0.summary()


# X holds the original records followed by their shuffled copies, which is why the
# labels are the doubled y_train_all rather than y_train.
# NOTE(review): the test split is also passed as validation data, and `checkpoint`
# keeps the model with the best val_accuracy, so accuracy later measured on
# Xtest/y_test is not an independent estimate.
h0=m0.fit([X], y_train_all, epochs=20, batch_size=128, validation_data=[[Xtest], y_test], callbacks=[checkpoint])

In [None]:
y_p = m0.predict(Xtest)

####obtain LLT
y_pred = [decoder[i] for i in y_p.argmax(axis=1)]
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_test.tolist(), y_pred)

In [None]:
def plotresult(hist, title, outputfile):
    """Plot training vs. validation accuracy per epoch, save the figure and show it.

    :param hist: object whose ``history`` dict holds the 'accuracy' and
        'val_accuracy' lists (the value returned by ``model.fit`` in this notebook)
    :param title: text appended to 'Training/Validation: ' in the figure title
    :param outputfile: file name of the image, written under ``images/``
    """
    import os  # local import so the function does not depend on an earlier cell

    acc = hist.history['accuracy']
    val_acc = hist.history['val_accuracy']
    epochs = range(len(acc))

    fig, ax = plt.subplots()
    ax.plot(epochs, acc, marker='.', label='acc')
    ax.plot(epochs, val_acc, marker='.', label='val_acc')
    ax.legend(loc='best')
    ax.grid()
    ax.set_xlabel('epoch')
    ax.set_ylabel('acc')
    ax.set_title('Training/Validation: '+ title)

    target = 'images/'+outputfile
    # savefig does not create missing directories; make sure the folder exists.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    fig.savefig(target)
    plt.show()

In [None]:
plotresult(h0, '400D + LSTM + Attention', 'm0_model_attention_applied_after_lstm.png')

In [None]:
data = {"Test AE Term ":X_test, "Predicted AE LLT": y_pred, "Actual AE LLT": Y_test}
s = pd.DataFrame(data)

In [None]:
s

In [None]:
len(s.loc[s["Predicted AE LLT"] == s["Actual AE LLT"]])

In [None]:
s.loc[s["Predicted AE LLT"] == s["Actual AE LLT"]]

In [None]:
# pd.set_option('display.max_rows', None)
s.loc[s["Predicted AE LLT"] != s["Actual AE LLT"]]

## Model 2

In [None]:
def model_conv_sm():
    """Build the attention -> 3x Conv1D -> BiLSTM -> dense classifier.

    Three Conv1D branches (kernel widths 1, 2, 3) over the attention output are
    max-pooled and concatenated along the time axis; the merged sequence goes
    through a bidirectional LSTM, is flattened, and passes a 64-unit dense layer
    before the CLASSES-way softmax. Reads the notebook-level WINDOWS_Size,
    INPUT_DIM and CLASSES.

    Previously the merged tensor and the 64-unit dense layer were created but
    never connected: the LSTM read only the first pooled branch and the softmax
    read the flattened LSTM output directly, so two conv branches and the dense
    layer were silently left out of the model. They are wired in here.
    NOTE(review): weights saved from the old graph will not load into this one.

    :return: an uncompiled Keras ``Model``
    """
    inputs = Input(shape=(WINDOWS_Size, INPUT_DIM,))
    attention_mul = attention_3d_block(inputs)
    conv_0 = Conv1D(64, 1,activation='relu',kernel_regularizer=regularizers.l2(0.01))(attention_mul)
    conv_1 = Conv1D(64, 2,activation='relu',kernel_regularizer=regularizers.l2(0.01))(attention_mul)
    conv_2 = Conv1D(64, 3,activation='relu',kernel_regularizer=regularizers.l2(0.01))(attention_mul)

    maxpool_0 = MaxPooling1D(pool_size=3)(conv_0)
    maxpool_1 = MaxPooling1D(pool_size=3)(conv_1)
    maxpool_2 = MaxPooling1D(pool_size=3)(conv_2)

    # Stack the pooled branches along the time axis and feed ALL of them to the LSTM.
    merged_tensor = concatenate([maxpool_0, maxpool_1,maxpool_2], axis=1)
    lstm_out = Bidirectional(LSTM(256, activation='tanh',recurrent_regularizer=regularizers.l2(0.01),return_sequences=True))(merged_tensor)
    flat = Flatten()(lstm_out)
    dnn_out=Dense(64, activation=tf.nn.relu)(flat)
    # The classifier head reads the dense layer's output, not the raw flattened sequence.
    output = Dense(CLASSES, activation='softmax')(dnn_out)
    # `inputs=`/`outputs=` are the Keras 2 keyword names; `input=`/`output=`
    # are a deprecated legacy alias.
    model = Model(inputs=[inputs], outputs=output)
    return model

In [None]:
%%time
m3 = model_conv_sm()

m3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
m3.summary()

# `checkpoint2` was referenced here without ever being created. Give this model
# its own checkpoint file so that training it does not overwrite the file the
# first model's checkpoint writes to.
checkpoint2 = ModelCheckpoint('./model.m3.LLT.M22.w6', monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
# validation_data is passed as the documented (x_val, y_val) tuple.
h1=m3.fit([X], y_train_all, epochs=35, batch_size=32, validation_data=([Xtest], y_test), callbacks=[checkpoint2])

In [None]:
plotresult(h1, '400D + BiLSTM + Attention', 'm1_model_attention_applied_after_bilstm.png')

In [None]:
plt.plot(h1.history['loss'])
plt.plot(h1.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

## Test

In [None]:
# The notebook imports names *from* keras but never binds `keras` itself, so
# `keras.models.load_model(...)` raised NameError. Import the loader explicitly.
from keras.models import load_model
test = load_model('./model.m0.LLT.M22.w6.6-4-2021-dropout-0.4')

In [None]:
import pandas as pd
test = pd.read_excel('')

In [None]:
test = test[['Verbatim Term', 'LLT Name']]

In [None]:
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# Stop-word set, stemmer and lemmatizer are built once and reused for every row.
_TEXT_TOOLS = {}


def _text_tools():
    """Return the shared stop-word set, stemmer and lemmatizer, building them on first use."""
    if not _TEXT_TOOLS:
        try:
            nltk.data.find('corpora/stopwords')
            # WordNet lives under corpora/ as well; the bare name 'wordnet' is not
            # a valid resource path and made the download branch run on every call.
            nltk.data.find('corpora/wordnet')
        except LookupError:
            # If a resource does not exist, download it.
            nltk.download('stopwords', quiet=True)
            nltk.download("wordnet", quiet=True)
            # Kept from the original: a second copy in ./nltk_packages
            # (presumably on nltk.data.path elsewhere in the project - verify).
            nltk.download('stopwords', download_dir='nltk_packages', quiet=True)
        _TEXT_TOOLS['stopwords'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = SnowballStemmer("english")
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_TOOLS


def string_processor(x, grammer):
    """
    Clean one term string:
    1. replace punctuation with spaces ('stem' keeps letters only; every other
       mode also keeps digits and hyphens, e.g. for the medra dictionary)
    2. lower-case
    3. remove English stop words
    4. stem ('stem') or lemmatize ('lemma') each remaining word; any other value
       of ``grammer`` (including 'medra' and None) leaves the words as they are.
    For the difference between lemmatization and stemming, see
    https://blog.bitext.com/what-is-the-difference-between-stemming-and-lemmatization/

    Lower-casing now happens BEFORE stop-word removal and stemming/lemmatizing;
    previously capitalised stop words such as "The" slipped through the filter.

    :param x: raw term (i.e. AETERM in AE); None/NaN is treated as an empty string
    :param grammer: "stem", "lemma", "medra" or anything else (no normalisation)
    :return: the cleaned string, words separated by single spaces
    """
    if not isinstance(x, str):
        # Empty spreadsheet cells arrive as NaN (NaN != NaN); treat them as empty text.
        x = '' if x is None or x != x else str(x)

    tools = _text_tools()
    sw = tools['stopwords']

    if grammer == 'stem':
        tokens = re.sub(r'[^a-zA-Z]', ' ', x).lower().split()
        words = [tools['stemmer'].stem(tok) for tok in tokens if tok not in sw]
    else:
        # keep the hyphen and numbers
        tokens = re.sub(r'[^a-zA-Z0-9-]', ' ', x).lower().split()
        kept = [tok for tok in tokens if tok not in sw]
        if grammer == 'lemma':
            words = [tools['lemmatizer'].lemmatize(tok) for tok in kept]
        else:
            words = kept
    return ' '.join(words)


In [None]:
test['Verbatim Term'] = test['Verbatim Term'].apply(lambda x: string_processor(x, "lemma"))
test['LLT Name'] = test['LLT Name'].apply(lambda x: string_processor(x, None))

In [None]:
# `new` is used from here on but is never assigned in this notebook (NameError on
# a fresh kernel). Bind it to the preprocessed evaluation frame.
# NOTE(review): presumably what the deleted cell did - confirm it was not meant
# to be the de-duplicated frame instead.
new = test
no_dup = new.drop_duplicates()

In [None]:
len(test)

In [None]:
len(no_dup)

In [None]:
X_testls_new = [w.split() for w in new['Verbatim Term']]
len(X_testls_new)

In [None]:
cnt3 = 0
WINDOWS_Size=6
for record in X_testls_new:
    for i in record[0:WINDOWS_Size]:
        if i not in word_to_vec_map:
            cnt3 += 1
cnt3

In [None]:
Xtest_new=emdlayer(WINDOWS_Size, X_testls_new, 400)

In [None]:
# By this point `test` has been rebound to the evaluation DataFrame (pd.read_excel
# and the column selection above), which has no predict(). Load the saved model
# under its own name and predict with that.
from keras.models import load_model
test_model = load_model('./model.m0.LLT.M22.w6.6-4-2021-dropout-0.4')
y_p_test = test_model.predict(Xtest_new)
# y_p_test2 = test2.predict(Xtest_new)

In [None]:
y_pred_test = [decoder[i] for i in y_p_test.argmax(axis=1)]
data0 = {"Test AE Term ": new['Verbatim Term'], "Predicted AE LLT": y_pred_test, "Actual AE LLT": new['LLT Name']}
s_0 = pd.DataFrame(data0)
s0 = s_0.loc[s_0["Predicted AE LLT"] == s_0["Actual AE LLT"]]

In [None]:
# Decode the second model's predictions and keep the rows where prediction == actual LLT.
# NOTE(review): y_p_test2 is not assigned anywhere above (the only `test2.predict`
# line is commented out and no `test2` model is loaded), so this cell raises
# NameError on a fresh run until a second model is loaded and used to predict.
y_pred_test2 = [decoder[i] for i in y_p_test2.argmax(axis=1)]
data1 = {"Test AE Term ": new['Verbatim Term'], "Predicted AE LLT": y_pred_test2, "Actual AE LLT": new['LLT Name']}
s_1 = pd.DataFrame(data1)
s1 = s_1.loc[s_1["Predicted AE LLT"] == s_1["Actual AE LLT"]]

In [None]:
s0

In [None]:
len(s0)

In [None]:
len(s1)

In [None]:
acc0 = len(s0)/len(new)
acc0

In [None]:
acc1 = len(s1)/len(new)
acc1

In [None]:
print("Test accuracy for new test data is with model 1 is " + str(acc0 * 100) + "%")

In [None]:
print("Test accuracy for new test data is with model 2 is " + str(acc1 * 100) + "%")

In [None]:
# s_0.to_excel("output.xlsx")  