In [1]:
import tensorflow as tf
import numpy as np
from transformers import TFBertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [5]:
#Define the maximum sequence length (should match the pre-trained model)
max_seq_length = 128

In [6]:
#Define a model
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(max_seq_length,), name='attention_mask', dtype='int32')
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
output_layer = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(bert_embds)
# Freeze BERT layers
# model.bert.trainable = False
final_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
final_model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [7]:
# Define the loss function and optimizer
loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [8]:
# Define training loop
epochs = 10
batch_size = 1

In [9]:
# Train the model using online learning
dataset = {
    "abominable": "Negative",
    "assure": "A bit positive",
    "atrocious": "Negative",
    "average": "Neutral",
    "awful": "Negative"
}

In [10]:
for epoch in range(epochs):
    print('Epoch:', epoch+1)
    for i, (word, label) in enumerate(dataset.items()):
        # Prepare the input data
        inputs = tokenizer.encode_plus(word, add_special_tokens=True, max_length=max_seq_length, padding='max_length', truncation=True, return_tensors='tf')
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        y_true = np.zeros((1,3))
        if label == "Negative":
            y_true[0,0] = 1
        elif label == "A bit positive":
            y_true[0,1] = 1
        else:
            y_true[0,2] = 1
        
        # Perform a single online learning update
        with tf.GradientTape() as tape:
            logits = final_model([input_ids, attention_mask])
            loss_value = loss_function(y_true, logits)
        gradients = tape.gradient(loss_value, final_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, final_model.trainable_variables))
        
        # Print progress
        if (i+1) % batch_size == 0:
            print('  Batch:', i+1, '/', len(dataset))
            print('  Loss:', loss_value.numpy())

Epoch: 1


  output, from_logits = _get_logits(


  Batch: 1 / 5
  Loss: 1.5004086
  Batch: 2 / 5
  Loss: 2.3184364
  Batch: 3 / 5
  Loss: 1.1669056
  Batch: 4 / 5
  Loss: 0.7315926
  Batch: 5 / 5
  Loss: 0.9992112
Epoch: 2
  Batch: 1 / 5
  Loss: 0.4491219
  Batch: 2 / 5
  Loss: 1.93696
  Batch: 3 / 5
  Loss: 0.44368446
  Batch: 4 / 5
  Loss: 0.49495578
  Batch: 5 / 5
  Loss: 0.50102514
Epoch: 3
  Batch: 1 / 5
  Loss: 0.21192998
  Batch: 2 / 5
  Loss: 1.6279416
  Batch: 3 / 5
  Loss: 0.21872535
  Batch: 4 / 5
  Loss: 0.48704636
  Batch: 5 / 5
  Loss: 0.26618335
Epoch: 4
  Batch: 1 / 5
  Loss: 0.12260947
  Batch: 2 / 5
  Loss: 1.1525233
  Batch: 3 / 5
  Loss: 0.13130438
  Batch: 4 / 5
  Loss: 0.422589
  Batch: 5 / 5
  Loss: 0.1656648
Epoch: 5
  Batch: 1 / 5
  Loss: 0.08321707
  Batch: 2 / 5
  Loss: 0.65288585
  Batch: 3 / 5
  Loss: 0.0897254
  Batch: 4 / 5
  Loss: 0.36421108
  Batch: 5 / 5
  Loss: 0.107005976
Epoch: 6
  Batch: 1 / 5
  Loss: 0.060469553
  Batch: 2 / 5
  Loss: 0.2790986
  Batch: 3 / 5
  Loss: 0.06254729
  Batch: 4 / 5
  

<h2>Online Training </h2>

In [None]:
import tensorflow as tf
import numpy as np
from transformers import TFBertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertModel.from_pretrained('bert-base-cased')

#Define the maximum sequence length (should match the pre-trained model)
max_seq_length = 128

#Define a model
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(max_seq_length,), name='attention_mask', dtype='int32')
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
output_layer = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(bert_embds)
# Freeze BERT layers
# model.bert.trainable = False
final_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)

# Define the loss function and optimizer
loss_function = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

# Define training loop
epochs = 10
batch_size = 1

# Train the model using online learning
dataset = {
    "abominable": "Negative",
    "assure": "A bit positive",
    "atrocious": "Negative",
    "average": "Neutral",
    "awful": "Negative"
}

for epoch in range(epochs):
    print('Epoch:', epoch+1)
    for i, (word, label) in enumerate(dataset.items()):
        # Prepare the input data
        inputs = tokenizer.encode_plus(word, add_special_tokens=True, max_length=max_seq_length, padding='max_length', truncation=True, return_tensors='tf')
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        y_true = np.zeros((1,3))
        if label == "Negative":
            y_true[0,0] = 1
        elif label == "A bit positive":
            y_true[0,1] = 1
        else:
            y_true[0,2] = 1
        
        # Perform a single online learning update
        with tf.GradientTape() as tape:
            logits = final_model([input_ids, attention_mask])
            loss_value = loss_function(y_true, logits)
        gradients = tape.gradient(loss_value, final_model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, final_model.trainable_variables))
        
        # Print progress
        if (i+1) % batch_size == 0:
            print('  Batch:', i+1, '/', len(dataset))
            print('  Loss:', loss_value.numpy())

<h2>Full Training on PHEE_dataset using Bio_ClinicalBERT</h2>

In [1]:
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the JSON file as a Python object
with open('PHEE_dataset/train.json') as f:
    data = json.loads(f.read())
# Initialize empty lists to store the extracted features
contexts = []
event_types = []
# Extract the features for each entry in the object
for entry in data:
    contexts.append(entry["context"])
    events = entry["annotations"][0]["events"]
    event_types.append(events[0]["event_type"])
# Create a pandas dataframe using the extracted features
df = pd.DataFrame({"context": contexts, "event_type": event_types})

In [3]:
event_type_counts = df['event_type'].value_counts()
print(event_type_counts)

event_type
Adverse_event                  3509
Potential_therapeutic_event     350
Name: count, dtype: int64


In [4]:
# Get the unique event_type values from your dataframe
event_types = df['event_type'].unique()
print(event_types)

['Adverse_event' 'Potential_therapeutic_event']


In [5]:
# Convert the event_type column to a categorical column
df['event_type_categorical'] = pd.Categorical(df['event_type'], categories=event_types)
# Convert the categorical column to one-hot encoding using tf.keras.utils.to_categorical()
labels = tf.keras.utils.to_categorical(df['event_type_categorical'].cat.codes, num_classes=len(event_types))

In [6]:
df.head()

Unnamed: 0,context,event_type,event_type_categorical
0,OBJECTIVE: To test the hypothesis that tumor n...,Adverse_event,Adverse_event
1,An evaluation of ovarian structure and functio...,Adverse_event,Adverse_event
2,Phenobarbital hepatotoxicity in an 8-month-old...,Adverse_event,Adverse_event
3,The authors report a case of Balint syndrome w...,Adverse_event,Adverse_event
4,"According to the Naranjo probability scale, fl...",Adverse_event,Adverse_event


In [7]:
# Creating a tokenizer object from the BERT-base-cased model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Setting the maximum sequence length for the tokenized input
maxLength = 128

In [4]:
# Defining a function to generate the training data
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['context'])):
        # Tokenizing the input text using the BERT tokenizer
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=maxLength,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Storing the tokenized input and attention masks in numpy arrays
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [9]:
# Creating numpy arrays to store the tokenized input and attention masks
X_input_ids = np.zeros((len(df), maxLength), dtype=np.int32)
X_attn_masks = np.zeros((len(df), maxLength), dtype=np.int32)

In [10]:
# Generating the training data using the generate_training_data function
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

3859it [00:02, 1804.13it/s]


In [11]:
# Creating a TensorFlow dataset object from the input and target data
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [12]:
# Defining a function to map the input and target data to the required format for the TensorFlow dataset
def MapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [13]:
# Mapping the dataset to the required format using the SentimentDatasetMapFunction
dataset = dataset.map(MapFunction)

In [14]:
print(dataset.element_spec)

({'input_ids': TensorSpec(shape=(128,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(128,), dtype=tf.int32, name=None)}, TensorSpec(shape=(2,), dtype=tf.float32, name=None))


In [15]:
# Shuffling and batching the dataset for training the model
dataset = dataset.shuffle(1000).batch(32, drop_remainder=True)
# Splitting the dataset into training and validation sets
p = 0.8
train_size = int((len(df) // 32) * p)
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [16]:
# Creating a BERT model object and defining the input layers
BertModel = TFAutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
input_ids = tf.keras.layers.Input(shape=(maxLength,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(maxLength,), name='attention_mask', dtype='int32')

# Feeding the input layers to the BERT model and adding a dense layer and output layer
bert_embds = BertModel.bert(input_ids, attention_mask=attn_masks)[1]
output_layer = tf.keras.layers.Dense(2, activation='softmax', name='output_layer')(bert_embds)
# Creating a TensorFlow model object with the input and output layers
model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)


Some layers from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [17]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [None]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs = 10
)

Epoch 1/10
 6/96 [>.............................] - ETA: 1:21 - loss: 0.3331 - accuracy: 0.9167

In [20]:
model.save('phee_model.h5')

<h2>Full Training on PHEE_dataset using Bio_ClinicalBERT(FULL Code)</h2>

In [None]:
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

# Load the JSON file as a Python object
with open('PHEE_dataset/train.json') as f:
    data = json.loads(f.read())
# Initialize empty lists to store the extracted features
contexts = []
event_types = []
# Extract the features for each entry in the object
for entry in data:
    contexts.append(entry["context"])
    events = entry["annotations"][0]["events"]
    event_types.append(events[0]["event_type"])
# Create a pandas dataframe using the extracted features
df = pd.DataFrame({"context": contexts, "event_type": event_types})

# Get the unique event_type values from your dataframe
event_types = df['event_type'].unique()
# event_types_sorted = sorted(event_types)

print(event_types)
#Savng event_types attribute inside model
custom_att = tf.Variable(['custom_att1', 'custom_att2'], trainable=False)

# Convert the event_type column to a categorical column
df['event_type_categorical'] = pd.Categorical(df['event_type'], categories=event_types)
# Convert the categorical column to one-hot encoding using tf.keras.utils.to_categorical()
labels = tf.keras.utils.to_categorical(df['event_type_categorical'].cat.codes, num_classes=len(event_types))

# Creating a tokenizer object from the BERT-base-cased model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Setting the maximum sequence length for the tokenized input
maxLength = 128
# Defining a function to generate the training data
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['context'])):
        # Tokenizing the input text using the BERT tokenizer
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=maxLength,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Storing the tokenized input and attention masks in numpy arrays
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

# Creating numpy arrays to store the tokenized input and attention masks
X_input_ids = np.zeros((len(df), maxLength), dtype=np.int32)
X_attn_masks = np.zeros((len(df), maxLength), dtype=np.int32)

# Generating the training data using the generate_training_data function
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

# Creating a TensorFlow dataset object from the input and target data
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

# Defining a function to map the input and target data to the required format for the TensorFlow dataset
def MapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

# Mapping the dataset to the required format using the SentimentDatasetMapFunction
dataset = dataset.map(MapFunction)
# Shuffling and batching the dataset for training the model
dataset = dataset.shuffle(1000).batch(32, drop_remainder=True)
# Splitting the dataset into training and validation sets
p = 0.8
train_size = int((len(df) // 32) * p)
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# Creating a BERT model object and defining the input layers
BertModel = TFAutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
input_ids = tf.keras.layers.Input(shape=(maxLength,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(maxLength,), name='attention_mask', dtype='int32')

# Feeding the input layers to the BERT model and adding a dense layer and output layer
bert_embds = BertModel.bert(input_ids, attention_mask=attn_masks)[1]
output_layer = tf.keras.layers.Dense(2, activation='softmax', name='output_layer')(bert_embds)
# Creating a TensorFlow model object with the input and output layers
model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])


hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs = 10
)

# model.save('phee_model.h5')

  from .autonotebook import tqdm as notebook_tqdm


['Adverse_event' 'Potential_therapeutic_event']


3859it [00:01, 2553.09it/s]
Some layers from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/10
 4/96 [>.............................] - ETA: 1:24 - loss: 0.8247 - accuracy: 0.4219

In [1]:
# Load the model from the saved file
from tensorflow.keras.models import load_model
lmodel = load_model('phee_model.h5',custom_objects={"TFBertMainLayer":TFBertMainLayer})

  from .autonotebook import tqdm as notebook_tqdm


<h3> Evaluating model accurcay  on full test set </h3>

In [5]:
# Load the JSON file as a Python object
with open('PHEE_dataset/test.json') as f:
    data = json.loads(f.read())
# Initialize empty lists to store the extracted features
contexts = []
event_types = []
maxLength = 128
# Extract the features for each entry in the object
for entry in data:
    contexts.append(entry["context"])
    events = entry["annotations"][0]["events"]
    event_types.append(events[0]["event_type"])
# Create a pandas dataframe using the extracted features
df_test = pd.DataFrame({"context": contexts, "event_type": event_types})
# Get the unique event_type values from your dataframe
event_types = df_test['event_type'].unique()
# Convert the event_type column to a categorical column
df_test['event_type_categorical'] = pd.Categorical(df_test['event_type'], categories=event_types)
# Convert the categorical column to one-hot encoding using tf.keras.utils.to_categorical()
labels_test = tf.keras.utils.to_categorical(df_test['event_type_categorical'].cat.codes, num_classes=len(event_types))

# Creating numpy arrays to store the tokenized input and attention masks
X_input_ids_test = np.zeros((len(df_test), maxLength), dtype=np.int32)
X_attn_masks_test = np.zeros((len(df_test), maxLength), dtype=np.int32)


# Generating the test data using the generate_training_data function
X_input_ids_test, X_attn_masks_test = generate_training_data(df_test, X_input_ids_test, X_attn_masks_test, tokenizer)

# Creating a TensorFlow dataset object from the test input and target data
test_dataset = tf.data.Dataset.from_tensor_slices((X_input_ids_test, X_attn_masks_test, labels_test))

# Mapping the test dataset to the required format using the MapFunction
test_dataset = test_dataset.map(MapFunction)
# Batching the dataset for evaluation
test_dataset = test_dataset.batch(32)
# Evaluate the model on the test dataset
loss, accuracy = model.evaluate(test_dataset, verbose=1)
print(f'Test loss: {loss}, Test accuracy: {accuracy}')

NameError: name 'tokenizer' is not defined

<h2>Full Training on Dictionary training data (low dataset) using Bio_ClinicalBERT(FULL Code)</h2>

In [1]:
# Training data dictionary
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
training_data = {
        "OBJECTIVE: To test the hypothesis that tumor necrosis factor (TNF)-alpha may mediate the loss and the dedifferentiation of subcutaneous fat tissue in the insulin-induced lipoatrophies of a diabetic patient who presented extensive lesions": "Adverse_event",
        "An evaluation of ovarian structure and function should be considered in women of reproductive age being treated with valproate for epilepsy, especially if they develop menstrual cycle disturbances during treatment": "Adverse_event",
        "Phenobarbital hepatotoxicity in an 8-month-old infant": "Adverse_event",
        "The authors report a case of Balint syndrome with irreversible posterior leukoencephalopathy on MRI following intrathecal methotrexate and cytarabine": "Adverse_event",
        "According to the Naranjo probability scale, flecainide was the probable cause of the patient's delirium; the Horn Drug Interaction Probability Scale indicates a possible pharmacokinetic drug interaction between flecainide and paroxetine": "Adverse_event",
        "Contact dermatitis due to budesonide: report of five cases and review of the Japanese literature": "Adverse_event",
        "Prolongation of the QT interval observed in a Japanese patient with vivax malaria following treatment with halofantrine": "Adverse_event",
        "We report three cases of severe hepatotoxicity related to benzarone, a benzofuran derivative": "Adverse_event",
        "Four patients who manifested symptoms of the antiepileptic drug (AED) hypersensitivity syndrome during therapy with carbamazepine are reported": "Adverse_event",
        "Minocycline as a cause of drug-induced autoimmune hepatitis": "Adverse_event",
        "Secondary acute myeloid leukemia after etoposide therapy for haemophagocytic lymphohistiocytosis": "Potential_therapeutic_event",
        "Allergic contact angioedema to benzoyl peroxide": "Potential_therapeutic_event",
        "A 60-year-old white man with chronic bronchitis was noted to develop acute respiratory failure and metabolic acidosis four days after being started on methazolamide (Neptazane) for an ophthalmologic problem": "Adverse_event",
        "L-DOPA-induced excessive daytime sleepiness in PD: a placebo-controlled case with MSLT assessment": "Adverse_event",
        "CONCLUSIONS: It is probable that foscarnet contributed to the electrolyte disorders and symptomatology in this patient": "Adverse_event",
        "Acute drug induced hepatitis due to erlotinib": "Adverse_event",
        "We observed 3 diabetic patients with intolerable dizziness followed by nausea and vomiting immediately after an initial administration of the alpha-glucosidase inhibitor, voglibose": "Adverse_event",
        "Although risk factors for MTX-induced pulmonary toxicity are poorly understood, the presence in 3 out of 5 of our patients of pre-existing lung disease, represented by diffuse interstitial changes on chest X-ray, and mild bronchial asthma in two RA patients and by pulmonary silicosis in the patient with PsA may account for a predisposition to the development of MTX pneumonitis": "Adverse_event",
        "Acute esmolol toxicity may be self-limiting because of its extremely short half-life": "Adverse_event",
        "The association with prolonged unopposed estrogen-like stimulation with tamoxifen as a possible factor in the development of ovarian endometrioid carcinoma is discussed": "Adverse_event",
        "The other woman had rheumatoid arthritis and developed acute tubular necrosis after treatment with gentamicin and cefoxitin": "Adverse_event",
        "One patient who received clindamycin had liver biopsy findings of marked cholestasis, portal inflammation, bile duct injury and bile duct paucity (ductopenia)": "Adverse_event",
        "A MEDLINE search (1966-January 2009) revealed one in vivo pharmacokinetic study on the interaction between flecainide, a CYP2D6 substrate, and paroxetine, a CYP2D6 inhibitor, as well as 3 case reports of flecainide-induced delirium": "Adverse_event",
        "As these cases revealed, close monitoring of blood chemistry is mandatory after starting spironolactone, and patients should be advised to stop spironolactone immediately if diarrhoea develops": "Adverse_event",
        "When these cells are exposed to nicotinic acid, an exaggerated immune response is produced that may lead to pain, redness, and swelling at the injection site": "Adverse_event",
        "A case of heatstroke is reported in a 32-year-old man diagnosed with schizophrenia and on clozapine monotherapy": "Adverse_event",
        "Paraplegia following intrathecal cytosine arabinoside": "Potential_therapeutic_event",
        "Cimetidine-induced fever": "Potential_therapeutic_event",
        "This case report illustrates the neurotoxicity unique to HDARAC": "Adverse_event",
        "Agranulocytosis associated with ticlopidine: a possible benefit with filgastim": "Adverse_event",
        "Since SS is a clinical diagnosis, heightened clinician awareness of the possibility of SS among patients receiving SSRI or mirtazapine in combination with opioids may lead to earlier detection and avoidance of potentially lethal consequences": "Adverse_event",
        "The mechanism by which sunitinib induces gynaecomastia is thought to be associated with an unknown direct action on breast hormonal receptors": "Adverse_event",
        "Intravitreal triamcinolone may have had an influence on the exacerbation of retinochoroiditis in the posterior pole of the patient": "Adverse_event",
        "We evaluated a patient who developed a psychotic disorder after 4 months of isoniazid prophylaxis for a positive tuberculosis tine test": "Adverse_event"}

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Create a pandas dataframe using the dictionary
df = pd.DataFrame(training_data.items(), columns=['context', 'event_type'])

# Get the unique event_type values from your dataframe
event_types = df['event_type'].unique()
# event_types_sorted = sorted(event_types)

print(event_types)

# Convert the event_type column to a categorical column
df['event_type_categorical'] = pd.Categorical(df['event_type'], categories=event_types)
# Convert the categorical column to one-hot encoding using tf.keras.utils.to_categorical()
labels = tf.keras.utils.to_categorical(df['event_type_categorical'].cat.codes, num_classes=len(event_types))

# Creating a tokenizer object from the BERT-base-cased model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Setting the maximum sequence length for the tokenized input
maxLength = 128
# Defining a function to generate the training data
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['context'])):
        # Tokenizing the input text using the BERT tokenizer
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=maxLength,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Storing the tokenized input and attention masks in numpy arrays
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

# Creating numpy arrays to store the tokenized input and attention masks
X_input_ids = np.zeros((len(df), maxLength), dtype=np.int32)
X_attn_masks = np.zeros((len(df), maxLength), dtype=np.int32)

# Generating the training data using the generate_training_data function
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

# Creating a TensorFlow dataset object from the input and target data
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

# Defining a function to map the input and target data to the required format for the TensorFlow dataset
def MapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

# Mapping the dataset to the required format using the SentimentDatasetMapFunction
dataset = dataset.map(MapFunction)
# Shuffling and batching the dataset for training the model
dataset = dataset.shuffle(len(df)).batch(4, drop_remainder=False)
# Splitting the dataset into training and validation sets
p = 0.9
train_size = int((len(df) // 4) * p)
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

# Creating a BERT model object and defining the input layers
BertModel = TFAutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
input_ids = tf.keras.layers.Input(shape=(maxLength,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(maxLength,), name='attention_mask', dtype='int32')

# Feeding the input layers to the BERT model and adding a dense layer and output layer
bert_embds = BertModel.bert(input_ids, attention_mask=attn_masks)[1]
output_layer = tf.keras.layers.Dense(len(event_types), activation='softmax', name='output_layer')(bert_embds)
# Creating a TensorFlow model object with the input and output layers
model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optim, loss=loss_func, metrics=[acc])
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs = 10
)


['Adverse_event' 'Potential_therapeutic_event']


34it [00:01, 28.11it/s]
Some layers from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10

<h2>Full Training on Dictionary training data (low dataset) using Bio_ClinicalBERT and sub class method(Full Code)</h2>

In [1]:
# Training data dictionary
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
training_data = {
        "OBJECTIVE: To test the hypothesis that tumor necrosis factor (TNF)-alpha may mediate the loss and the dedifferentiation of subcutaneous fat tissue in the insulin-induced lipoatrophies of a diabetic patient who presented extensive lesions": "Adverse_event",
        "An evaluation of ovarian structure and function should be considered in women of reproductive age being treated with valproate for epilepsy, especially if they develop menstrual cycle disturbances during treatment": "Adverse_event",
        "Phenobarbital hepatotoxicity in an 8-month-old infant": "Adverse_event",
        "The authors report a case of Balint syndrome with irreversible posterior leukoencephalopathy on MRI following intrathecal methotrexate and cytarabine": "Adverse_event",
        "According to the Naranjo probability scale, flecainide was the probable cause of the patient's delirium; the Horn Drug Interaction Probability Scale indicates a possible pharmacokinetic drug interaction between flecainide and paroxetine": "Adverse_event",
        "Contact dermatitis due to budesonide: report of five cases and review of the Japanese literature": "Adverse_event",
        "Prolongation of the QT interval observed in a Japanese patient with vivax malaria following treatment with halofantrine": "Adverse_event",
        "We report three cases of severe hepatotoxicity related to benzarone, a benzofuran derivative": "Adverse_event",
        "Four patients who manifested symptoms of the antiepileptic drug (AED) hypersensitivity syndrome during therapy with carbamazepine are reported": "Adverse_event",
        "Minocycline as a cause of drug-induced autoimmune hepatitis": "Adverse_event",
        "Secondary acute myeloid leukemia after etoposide therapy for haemophagocytic lymphohistiocytosis": "Potential_therapeutic_event",
        "Allergic contact angioedema to benzoyl peroxide": "Potential_therapeutic_event",
        "A 60-year-old white man with chronic bronchitis was noted to develop acute respiratory failure and metabolic acidosis four days after being started on methazolamide (Neptazane) for an ophthalmologic problem": "Adverse_event",
        "L-DOPA-induced excessive daytime sleepiness in PD: a placebo-controlled case with MSLT assessment": "Adverse_event",
        "CONCLUSIONS: It is probable that foscarnet contributed to the electrolyte disorders and symptomatology in this patient": "Adverse_event",
        "Acute drug induced hepatitis due to erlotinib": "Adverse_event",
        "We observed 3 diabetic patients with intolerable dizziness followed by nausea and vomiting immediately after an initial administration of the alpha-glucosidase inhibitor, voglibose": "Adverse_event",
        "Although risk factors for MTX-induced pulmonary toxicity are poorly understood, the presence in 3 out of 5 of our patients of pre-existing lung disease, represented by diffuse interstitial changes on chest X-ray, and mild bronchial asthma in two RA patients and by pulmonary silicosis in the patient with PsA may account for a predisposition to the development of MTX pneumonitis": "Adverse_event",
        "Acute esmolol toxicity may be self-limiting because of its extremely short half-life": "Adverse_event",
        "The association with prolonged unopposed estrogen-like stimulation with tamoxifen as a possible factor in the development of ovarian endometrioid carcinoma is discussed": "Adverse_event",
        "The other woman had rheumatoid arthritis and developed acute tubular necrosis after treatment with gentamicin and cefoxitin": "Adverse_event",
        "One patient who received clindamycin had liver biopsy findings of marked cholestasis, portal inflammation, bile duct injury and bile duct paucity (ductopenia)": "Adverse_event",
        "A MEDLINE search (1966-January 2009) revealed one in vivo pharmacokinetic study on the interaction between flecainide, a CYP2D6 substrate, and paroxetine, a CYP2D6 inhibitor, as well as 3 case reports of flecainide-induced delirium": "Adverse_event",
        "As these cases revealed, close monitoring of blood chemistry is mandatory after starting spironolactone, and patients should be advised to stop spironolactone immediately if diarrhoea develops": "Adverse_event",
        "When these cells are exposed to nicotinic acid, an exaggerated immune response is produced that may lead to pain, redness, and swelling at the injection site": "Adverse_event",
        "A case of heatstroke is reported in a 32-year-old man diagnosed with schizophrenia and on clozapine monotherapy": "Adverse_event",
        "Paraplegia following intrathecal cytosine arabinoside": "Potential_therapeutic_event",
        "Cimetidine-induced fever": "Potential_therapeutic_event",
        "This case report illustrates the neurotoxicity unique to HDARAC": "Adverse_event",
        "Agranulocytosis associated with ticlopidine: a possible benefit with filgastim": "Adverse_event",
        "Since SS is a clinical diagnosis, heightened clinician awareness of the possibility of SS among patients receiving SSRI or mirtazapine in combination with opioids may lead to earlier detection and avoidance of potentially lethal consequences": "Adverse_event",
        "The mechanism by which sunitinib induces gynaecomastia is thought to be associated with an unknown direct action on breast hormonal receptors": "Adverse_event",
        "Intravitreal triamcinolone may have had an influence on the exacerbation of retinochoroiditis in the posterior pole of the patient": "Adverse_event",
        "We evaluated a patient who developed a psychotic disorder after 4 months of isoniazid prophylaxis for a positive tuberculosis tine test": "Adverse_event"}

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class BertClassifier(tf.keras.Model):
    def __init__(self, num_classes,event_types, bert_model_name):
        super(BertClassifier, self).__init__()
        self.bert = TFAutoModel.from_pretrained(bert_model_name)
        self.dense = tf.keras.layers.Dense(num_classes, activation='softmax')
        self.custom_att = tf.Variable(event_types,trainable=False)
        
    def call(self, inputs, training=False):
        input_ids = inputs['input_ids']
        attn_masks = inputs['attention_mask']
        bert_embds = self.bert(input_ids, attention_mask=attn_masks)[1]
        outputs = self.dense(bert_embds)
        return outputs
    
# Create a pandas dataframe using the dictionary
df = pd.DataFrame(training_data.items(), columns=['context', 'event_type'])

# Get the unique event_type values from your dataframe
event_types = df['event_type'].unique()
# event_types_sorted = sorted(event_types)

print(event_types)

# Convert the event_type column to a categorical column
df['event_type_categorical'] = pd.Categorical(df['event_type'], categories=event_types)
# Convert the categorical column to one-hot encoding using tf.keras.utils.to_categorical()
labels = tf.keras.utils.to_categorical(df['event_type_categorical'].cat.codes, num_classes=len(event_types))

# Creating a tokenizer object from the BERT-base-cased model
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Setting the maximum sequence length for the tokenized input
maxLength = 128
# Defining a function to generate the training data
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['context'])):
        # Tokenizing the input text using the BERT tokenizer
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=maxLength,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Storing the tokenized input and attention masks in numpy arrays
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

# Creating numpy arrays to store the tokenized input and attention masks
X_input_ids = np.zeros((len(df), maxLength), dtype=np.int32)
X_attn_masks = np.zeros((len(df), maxLength), dtype=np.int32)

# Generating the training data using the generate_training_data function
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

# Creating a TensorFlow dataset object from the input and target data
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

# Defining a function to map the input and target data to the required format for the TensorFlow dataset
def MapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

# Mapping the dataset to the required format using the SentimentDatasetMapFunction
dataset = dataset.map(MapFunction)
# Shuffling and batching the dataset for training the model
dataset = dataset.shuffle(len(df)).batch(4, drop_remainder=False)
# Splitting the dataset into training and validation sets
p = 0.9
train_size = int((len(df) // 4) * p)
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)
# Create model instance and compile
model = BertClassifier(num_classes=len(event_types),event_types = event_types, bert_model_name='emilyalsentzer/Bio_ClinicalBERT')
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.CategoricalCrossentropy()
accuracy_metric = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss_func, metrics=[accuracy_metric])
# Train the model
hist = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1
)
# save the model using saved_model
# tf.saved_model.save(model, 'PHEE_model')

['Adverse_event' 'Potential_therapeutic_event']


34it [00:01, 32.96it/s]
Some layers from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.




<h3>Saving the Model</h3>

In [3]:
# save the model using saved_model
tf.saved_model.save(model, 'PHEE_model')



INFO:tensorflow:Assets written to: PHEE_model\assets


INFO:tensorflow:Assets written to: PHEE_model\assets


<h3>Loading the Model</h3>

In [1]:
import tensorflow as tf
# Load the saved model
model = tf.saved_model.load('PHEE_model')

<h2>Prediction</h2>

In [2]:
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer
# Load the saved model
event_model = tf.saved_model.load('PHEE_model')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=maxLength, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.int32),
        'attention_mask': tf.cast(token.attention_mask, tf.int32)
    }
def make_prediction(model, processed_data, event_types):
    probs = model({
    'input_ids': processed_data['input_ids'],
    'attention_mask': processed_data['attention_mask']})
    #Note change the threshold based on perfromance of model recognizing the event
    threshold = 0.55
    if np.max(probs) < threshold:
        prediction = 'Fallback Class'
    else:
        prediction = event_types[np.argmax(probs)]
    return prediction

In [4]:
event_types_encoded = event_model.custom_att.numpy()
event_types = [event.decode('utf-8') for event in event_types_encoded]
print(event_types)
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
maxLength = 128
input_text = "A 60-year-old white man with chronic bronchitis was noted to develop acute respiratory failure and metabolic acidosis four days after being started on methazolamide (Neptazane) for an ophthalmologic problem"
processed_data = prepare_data(input_text, tokenizer)
nlp_predicted_event = make_prediction(event_model, processed_data=processed_data, event_types=event_types)
print(nlp_predicted_event)


['Adverse_event', 'Potential_therapeutic_event']
Adverse_event


<h2>Prediction on Prepare Model</h2>

In [1]:
#Import necessary libraries and load odel
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
maxLength = 128
# Load the saved model
event_model = tf.saved_model.load('Models/131/131scenario1_v1')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=maxLength, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.int32),
        'attention_mask': tf.cast(token.attention_mask, tf.int32)
    }
def make_prediction(model, processed_data, event_types):
    probs = model({
    'input_ids': processed_data['input_ids'],
    'attention_mask': processed_data['attention_mask']})
    #Note change the threshold based on perfromance of model recognizing the event
    threshold = 0.55
    # if np.max(probs) < threshold:
    #     prediction = 'Fallback Class'
    # else:
    #     prediction = event_types[np.argmax(probs)]
    # return prediction
    return probs

In [6]:
event_types_encoded = event_model.custom_att.numpy()
event_types = [event.decode('utf-8') for event in event_types_encoded]
print(event_types)
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
maxLength = 128
input_text = "sepsis is a medical condition"
processed_data = prepare_data(input_text, tokenizer)
nlp_predicted_event = make_prediction(event_model, processed_data=processed_data, event_types=event_types)
print(nlp_predicted_event)

['sepsis', 'medical history', 'cardiac activity']
Fallback Class


<h2>Prediction on Sentiment Model</h2>

In [2]:
#Import necessary libraries and load odel
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
maxLength = 128
# Load the saved model
sentiment_model = tf.saved_model.load('sentimentModelPrepare')

In [8]:
def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=maxLength, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.int32),
        'attention_mask': tf.cast(token.attention_mask, tf.int32)
    }
def make_sentiment_prediction(model, processed_data, sentiment_labels):
    probs = model({
    'input_ids': processed_data['input_ids'],
    'attention_mask': processed_data['attention_mask']})
    #Note change the threshold based on perfromance of model recognizing the event
    threshold = 0.55
    if np.max(probs) < threshold:
        prediction = 'Fallback Class'
    else:
        prediction = sentiment_labels[np.argmax(probs)]
    return prediction
    # return probs

In [10]:
sentiment_labels_encoded = sentiment_model.custom_att.numpy()
sentiment_labels = [label.decode('utf-8') for label in sentiment_labels_encoded]
print(sentiment_labels)
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
maxLength = 128
input_text = "I am really angry with you"
processed_data = prepare_data(input_text, tokenizer)
predicted_sentiment =make_sentiment_prediction(sentiment_model, processed_data=processed_data, sentiment_labels = sentiment_labels)
print(predicted_sentiment)

['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
anger
