
# Entity Recognition using Deep Learning
### Task: Train a deep neural network model using the provided training dataset to identify adverse events and SSI from drug reviews. 

## Import packages

In [288]:
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn_crfsuite.metrics import flat_classification_report
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

import tensorflow as tf
import tensorflow.keras.layers 
import random
import pandas as pd
import numpy as np

## Define functions

In [289]:
def read_file(f):
    data = open(f,'r').readlines()[1:]
    row_id = [i.split('\t')[0].strip() for i in data]
    data = [i.split('\t')[1].strip().split(' ') for i in data]
    return row_id,data

def reset_random_seeds(x):
   os.environ['PYTHONHASHSEED']=str(1)
   tf.random.set_seed(x)
   np.random.seed(x)
   random.seed(x)

## Read in the data

In [290]:

os.chdir(r"/Users/szoriac/OneDrive/Michigan/=WN 2021/LHS 712/Assignment 3 CRF LSTM") 
print(os.getcwd())

row_id_text, texts = read_file('./REVIEW_TEXT.txt')
row_id_tags, tags = read_file('./REVIEW_LABELSEQ.txt')
row_id_finaltext, finaltexts = read_file('./TEST_REVIEW_TEXT.txt')

combined_text = texts + finaltexts

#For this demo, let's just use the first 100 sentences 
texts = texts
tags = tags

/Users/szoriac/OneDrive/Michigan/=WN 2021/LHS 712/Assignment 3 CRF LSTM


## Input representation (converting words to vectors - one hot encoding)

### Setting up vocabulary of words and tags

In [291]:
unique_words = list(set([j for i in combined_text for j in i]))
word2idx = {j:i+1 for i,j in enumerate(unique_words)}
word2idx["PAD"] = 0

unique_tags = list(set([j for i in tags for j in i]))
label2idx = {j:i for i,j in enumerate(unique_tags)}
idx2label = {j:i for i,j in label2idx.items()}



### Padding

In [292]:
X = [[word2idx[j] for j in i] for i in texts]
X = pad_sequences(maxlen = 160, sequences = X, padding = "post", value = word2idx["PAD"])
y = [[label2idx[j] for j in i] for i in tags]
y = pad_sequences(maxlen = 160, sequences = y, padding = "post", value = label2idx["O"])
y = [to_categorical(i, num_classes = len(unique_tags)) for i in y]

### Use Glove pretrained 

In [293]:
path_to_glove_file = './glove.6B.100d.txt'

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [294]:
num_tokens = len(word2idx) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word2idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 4296 words (5543 misses)


In [297]:
from tensorflow.keras.layers import Embedding

glove_embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tensorflow.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

### Train and Validation sets

In [298]:
X_train, X_validation, y_train, y_validation  = train_test_split(X, y, test_size = 0.2)

### Deep Learning

In [300]:
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.layers import Input, Dropout, Flatten, Conv2D, MaxPooling2D, Dense, Activation

reset_random_seeds(1)
model = Sequential()
model.add(glove_embedding_layer)
model.add(Bidirectional(LSTM(units=80,return_sequences=True,dropout=0.4), merge_mode = 'concat'))
model.add(Dense(40, activation='tanh'))
model.add(Dense(len(label2idx.keys()), activation="sigmoid"))
opt = tensorflow.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
print(model.summary())






Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_26 (Embedding)     (None, None, 100)         984100    
_________________________________________________________________
bidirectional_23 (Bidirectio (None, None, 160)         115840    
_________________________________________________________________
dense_64 (Dense)             (None, None, 40)          6440      
_________________________________________________________________
dense_65 (Dense)             (None, None, 5)           205       
Total params: 1,106,585
Trainable params: 122,485
Non-trainable params: 984,100
_________________________________________________________________
None


In [266]:
#run1()
#run3()

### Training & Prediction

In [301]:
history = model.fit(X_train,np.array(y_train),batch_size=100,epochs=30,validation_split=0.1)

y_pred = model.predict(X_validation)
y_pred = np.argmax(y_pred, axis=-1)
y_validation = np.argmax(y_validation, -1)
y_pred = [[idx2label[i] for i in row] for row in y_pred]
y_validation = [[idx2label[i] for i in row] for row in y_validation]

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### Result

In [309]:
nopaddingy_pred = [item[:len(texts[idx])] for idx,item in enumerate(y_pred)]
nopaddingy_validation = [item[:len(texts[idx])] for idx,item in enumerate(y_validation)]

In [310]:
report = flat_classification_report(y_pred=nopaddingy_pred, y_true=nopaddingy_validation)
print(report)

              precision    recall  f1-score   support

        B-AE       0.79      0.66      0.72       583
       B-SSI       0.78      0.60      0.68       121
        I-AE       0.81      0.65      0.72       924
       I-SSI       1.00      0.01      0.03        67
           O       0.96      0.99      0.98     13133

    accuracy                           0.95     14828
   macro avg       0.87      0.58      0.63     14828
weighted avg       0.95      0.95      0.94     14828



In [304]:
X_final = [[word2idx[j] for j in i] for i in finaltexts]
X_final = pad_sequences(maxlen =160, sequences = X_final, padding = "post", value = word2idx["PAD"])

In [306]:
y_test_pred = model.predict(X_final)
y_test_pred = np.argmax(y_test_pred, axis=-1)
y_test_pred = [[idx2label[i] for i in row] for row in y_test_pred]
nopaddingy_test_pred = [item[:len(finaltexts[idx])] for idx,item in enumerate(y_test_pred)]



In [307]:
foroutput = []

for idx,item in enumerate(row_id_finaltext):
    withinfo = [item, ' '.join(nopaddingy_test_pred[idx])]
    foroutput.append(withinfo)


a = [['ID', 'TAGSEQ']]

xout = a +foroutput

In [308]:
with open('TEST_REVIEW_LABELSEQ_DEEP_run11.txt','w',encoding='utf-8-sig') as out:
    for line in xout:
        out.write(str('\t'.join(line)) +'\n') 