In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### LOAD BIO files

In [3]:
BIO_PATH ='C:/Projects/try1/bio_folder'

In [4]:
all_files =[]
for i in os.listdir(BIO_PATH):
    f = os.path.join(BIO_PATH,i)
    if i.endswith('.bio'):
        all_files.append(f)

In [5]:
len(all_files)

200

## Data Preprocessing:

### Remove stopwords.

In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')
STOPWORDS = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tannn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Separate the data into labels and sentences.

In [7]:
sentences =[]
labels =[]

In [8]:
import re
import spacy
nlp = spacy.load('en_core_web_sm')

In [9]:
for files in all_files:
    with open(files, 'r',encoding = 'utf-8') as f:
        current_sentence = []
        current_label =[]
        for line in f:
            if line.strip() == '':
                sentences.append(current_sentence)
                labels.append(current_label)

                current_sentence =[]
                current_label =[]
                continue
            #print(line)
            word = line.strip().split("\t")[0]
            tag = line.strip().split("\t")[1]

            cleaned_word = re.sub(r'[^a-zA-Z]', '', word)
        
            if cleaned_word in STOPWORDS:
                cleaned_word = ' '
            else:
                doc = nlp(cleaned_word)
                lemmatized_word = " ".join([token.lemma_ for token in doc])
                cleaned_word = lemmatized_word

            if cleaned_word.strip():

                current_sentence.append(cleaned_word)
                if len(current_label) > 0:
                    if tag[2:] == current_label[-1][2:] and tag[:2] == "B-":
                        tag = f"I-{tag[2:]}"
                current_label.append(tag)           
     
                
                
        

In [10]:
print(f"Dataset contains {len(sentences)} examples\n")

Dataset contains 4741 examples



In [11]:
print(f"Dataset contains {len(labels)} examples\n")

Dataset contains 4741 examples



## Prepare training Data.

In [12]:
combined_data = list(zip(sentences, labels))
random.shuffle(combined_data)
sentences, categorical_labels = zip(*combined_data)

sentences_train, sentences_temp, labels_train, labels_temp = train_test_split(sentences, categorical_labels, test_size=0.2, random_state=42, shuffle=True)
sentences_validation, sentences_test, labels_validation, labels_test = train_test_split(sentences_temp, labels_temp, test_size=0.5, random_state=42, shuffle=True)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)

max_sequence_length = 200  
train_sequences = tokenizer.texts_to_sequences(sentences_train)
validation_sequences = tokenizer.texts_to_sequences(sentences_validation)
test_sequences = tokenizer.texts_to_sequences(sentences_test)
train_padded_sequences = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
validation_padded_sequences = pad_sequences(validation_sequences, maxlen=max_sequence_length, padding='post', truncating='post')
test_padded_sequences = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post', truncating='post')

In [14]:
from tensorflow.keras.utils import to_categorical

flat_labels_train = [label for sublist in labels_train for label in sublist]
flat_labels_validation = [label for sublist in labels_validation for label in sublist]
flat_labels_test = [label for sublist in labels_test for label in sublist]

unique_labels_set = set(flat_labels_train).union(set(flat_labels_test))
print(len(unique_labels_set))
# Map labels to their one-hot encoded index
label_to_index = {label: i for i, label in enumerate(unique_labels_set)}

82


In [15]:
index_to_label = {id: label for label, id in label_to_index.items()}

In [16]:
label_to_index['<PAD>'] = 0
index_to_label[0] = '<PAD>'

In [17]:
NUM_CLASSES = len(index_to_label)
NUM_CLASSES

82

In [18]:
MAX_LENGTH = 200
train_labels = [[label_to_index[label] for label in labels] for labels in labels_train]
train_labels = pad_sequences(train_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
train_labels = to_categorical(train_labels, num_classes=NUM_CLASSES)

valid_labels = [[label_to_index[label] for label in labels] for labels in labels_validation]
valid_labels = pad_sequences(valid_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
valid_labels = to_categorical(valid_labels, num_classes=NUM_CLASSES)

test_labels = [[label_to_index[label] for label in labels] for labels in labels_test]
test_labels = pad_sequences(test_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
test_labels = to_categorical(test_labels, num_classes=NUM_CLASSES)

In [19]:
train_labels.shape

(3792, 200, 82)

In [20]:
train_padded_sequences.shape

(3792, 200)

In [21]:
directory = "C:/Projects/try1"  # Replace with your desired directory
file_name = "preprocessed_data_2.npz"  # Replace with your desired file name
file_path = os.path.join(directory, file_name)
file_path

'C:/Projects/try1\\preprocessed_data_2.npz'

In [22]:
np.savez(file_path,
         train_padded_sequences=train_padded_sequences,
         test_padded_sequences=test_padded_sequences,
         validation_padded_sequences=validation_padded_sequences,
         train_labels=train_labels,
         test_labels=test_labels,
         valid_labels=valid_labels)

In [23]:
num_words = 100000
EMBEDDING_DIM = 128
max_sequence_length = 200
num_labels = 82
NUM_EPOCHS = 10
num_classes = 82

In [24]:
INPUT_DIM = len(tokenizer.word_index)+1
EMBEDDING_DIM = 64
NUM_CLASSES = len(index_to_label)
MAX_LENGTH = train_padded_sequences.shape[1]
LSTM_UNITS = 64
DENSE_UNITS = 64
DROPOUT_RATE = 0.2
BATCH_SIZE = 32
EPOCHS = 20

In [26]:
NUM_CLASSES

82

## Model 1: CNN-BILSTM with word Word2Vec Embedding.

In [27]:
model = tf.keras.models.Sequential([    
    tf.keras.layers.Embedding(input_dim=INPUT_DIM, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=LSTM_UNITS, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=DENSE_UNITS, activation='relu')),
    tf.keras.layers.Dropout(rate=DROPOUT_RATE),
    tf.keras.layers.Dense(units=NUM_CLASSES, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 64)           442688    
                                                                 
 conv1d (Conv1D)             (None, 200, 32)           6176      
                                                                 
 bidirectional (Bidirection  (None, 200, 128)          49664     
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 200, 64)           8256      
 ributed)                                                        
                                                                 
 dropout (Dropout)           (None, 200, 64)           0         
                                                                 
 dense_1 (Dense)             (None, 200, 82)           5

In [28]:
history = model.fit(
    train_padded_sequences, 
    train_labels, 
    epochs=EPOCHS, 
    validation_data=(validation_padded_sequences, valid_labels)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
loss, accuracy = model.evaluate(test_padded_sequences, test_labels)
print(f'Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}')


Test loss: 0.3024, Test accuracy: 0.9594


In [34]:
predictions = model.predict(test_padded_sequences)



## Model 2: BiLSTM with BioVector  Embedding

In [36]:
import gensim
from gensim.models import Word2Vec

### Load BioVector Embeddings

In [37]:
vec_file = "C:\Projects\try1"

In [39]:
path_to_vectors = r'C:\Projects\try1\BioWordVec_PubMed_MIMICIII_d200.vec.bin'

In [40]:
biowv = gensim.models.KeyedVectors.load_word2vec_format(path_to_vectors, binary=True)


In [41]:
biowv

<gensim.models.keyedvectors.KeyedVectors at 0x1fd3ed78bd0>

### Create  BioVector Embedding Matrix

In [43]:
embedding_dim = 200  
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in biowv:
        embedding_matrix[i] = biowv[word]


In [44]:
from tensorflow.keras.layers import Embedding
embedding_layer = Embedding(
    input_dim=len(tokenizer.word_index) + 1,
    output_dim=embedding_dim,
    weights=[embedding_matrix],
    input_length=max_sequence_length,
    trainable=False 
)


In [45]:
class BioVectorEmbeddingLayer(Layer):
    def __init__(self, biowv, input_dim, output_dim, trainable=False, **kwargs):
        super(BioVectorEmbeddingLayer, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.trainable = trainable
        self.embeddings = Embedding(input_dim=input_dim, output_dim=output_dim, trainable=trainable,
                                   embeddings_initializer=self.get_initializer(biowv))

    def get_initializer(self, biowv):
        embedding_matrix = np.zeros((self.input_dim, self.output_dim))
        for word, i in biowv.index2word:
            if i < self.input_dim:
                embedding_matrix[i] = biowv[word]
        return tf.constant_initializer(embedding_matrix)

    def call(self, inputs):
        return self.embeddings(inputs)


### Model Archtecture:

In [46]:
model = tf.keras.models.Sequential([
    embedding_layer,    
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=LSTM_UNITS, return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=DENSE_UNITS, activation='relu')),
    tf.keras.layers.Dropout(rate=DROPOUT_RATE),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(units=NUM_CLASSES, activation='softmax'))
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 200)          1383400   
                                                                 
 bidirectional_1 (Bidirecti  (None, 200, 128)          135680    
 onal)                                                           
                                                                 
 time_distributed_1 (TimeDi  (None, 200, 64)           8256      
 stributed)                                                      
                                                                 
 dropout_1 (Dropout)         (None, 200, 64)           0         
                                                                 
 time_distributed_2 (TimeDi  (None, 200, 82)           5330      
 stributed)                                                      
                                                      

In [47]:
history = model.fit(
    train_padded_sequences, 
    train_labels, 
    epochs=EPOCHS, 
    validation_data=(validation_padded_sequences, valid_labels)
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [48]:
loss, accuracy = model.evaluate(test_padded_sequences, test_labels)
print(f'Test loss: {loss:.4f}, Test accuracy: {accuracy:.4f}')

Test loss: 0.1758, Test accuracy: 0.9643
