#### Imports

In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

tf.get_logger().setLevel('ERROR')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import datetime

#for BERT
import transformers

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
print(tf.__version__)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3231291913769821192
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 22723493888
locality {
  bus_id: 1
  links {
  }
}
incarnation: 17866220592368552074
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6"
]
2.5.0


#### Data loading

In [5]:
ner_labelled = pd.read_csv('training_data/training_chars.csv', index_col=0)

In [6]:
labels = ner_labelled.columns[1:].to_list()

In [7]:
len(labels)

35

#### Train/val/test splits

In [8]:
temp_df, test_df = train_test_split(ner_labelled, test_size=0.08)

In [9]:
train_df, val_df = train_test_split(temp_df, test_size = 0.1)

#### Create TF datasets

In [10]:
train_ds = tf.data.Dataset.from_tensor_slices((train_df['text'].to_numpy().reshape(-1,1),
                                              train_df[labels].to_numpy()))

In [11]:
val_ds = tf.data.Dataset.from_tensor_slices((val_df['text'].to_numpy().reshape(-1,1),
                                             val_df[labels].to_numpy()))

In [13]:
test_ds = tf.data.Dataset.from_tensor_slices((test_df['text'].to_numpy().reshape(-1,1),
                                              test_df[labels].to_numpy().reshape(-1,1,35)))

In [14]:
train_ds

<TensorSliceDataset shapes: ((1,), (35,)), types: (tf.string, tf.int64)>

In [15]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_for_performance(ds):
    ds = ds.cache()
    ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds

In [16]:
batch_size = 8

train_ds_batched = configure_for_performance(train_ds)
val_ds_batched = configure_for_performance(val_ds)
test_ds_batched = configure_for_performance(test_ds)

#### Configure BERT Models for preprocessing and vectorisation

In [17]:
# BERT model for vectorization

bert_vec_model = 'experts_pubmed'

map_name_to_handle = {
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
}

tfhub_handle_encoder = map_name_to_handle[bert_vec_model]

bert_model = hub.KerasLayer(tfhub_handle_encoder) #wraps this as a Keras layer

INFO:absl:Using C:\Users\JOEZ~1\AppData\Local\Temp\tfhub_modules to cache modules.


In [18]:
#BERT model for pre-processing

map_model_to_preprocess = {
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}

tfhub_handle_preprocess = map_model_to_preprocess[bert_vec_model]

bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess) #wraps this as a Keras layer

In [19]:
print(f'BERT model selected: {tfhub_handle_encoder}')
print(f'Pre-process model selected: {tfhub_handle_preprocess}')

BERT model selected: https://tfhub.dev/google/experts/bert/pubmed/2
Pre-process model selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


#### Construct the multilabel classifier

In [20]:
def build_classifier_model(seq_length=256):

    # Define input layer
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text_input")
    
    # Load the pretrained preprocessor
    bert_preprocessor = hub.load(tfhub_handle_preprocess)
    
    # Tokenize the input text
    tokenizer = hub.KerasLayer(bert_preprocessor.tokenize, name='tokenizer')
    tokenized_inputs = [tokenizer(text_input)]

    # Pack the tokenized input for the encoder
    bert_pack_inputs = hub.KerasLayer(bert_preprocessor.bert_pack_inputs,
                                      arguments=dict(seq_length=seq_length), name='packer')
    
    encoder_inputs = bert_pack_inputs(tokenized_inputs)
    
    #BERT encoding layer
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    
    #Output layers
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(len(labels), activation='sigmoid', name='classifier')(net)
    
    return tf.keras.Model(text_input, net)

In [21]:
classifier_model = build_classifier_model()

classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_input (InputLayer)         [(None,)]            0                                            
__________________________________________________________________________________________________
tokenizer (KerasLayer)          (None, None, None)   0           text_input[0][0]                 
__________________________________________________________________________________________________
packer (KerasLayer)             {'input_mask': (None 0           tokenizer[0][0]                  
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'sequence_output':  109482241   packer[0][0]                     
                                                                 packer[0][1]                 

In [22]:
#loss
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)

#metrics
metrics = ['BinaryAccuracy']

#epochs
epochs = 4

#optimization
steps_per_epoch = tf.data.experimental.cardinality(train_ds_batched).numpy()
print(steps_per_epoch)
num_train_steps = steps_per_epoch * epochs
print(num_train_steps)
num_warmup_steps = int(0.1*num_train_steps)
print(num_warmup_steps)
init_lr = 1e-5

optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


# models.optimization dependent on tf-official-models, which depends on pycococo which does not install on windows without workaround:
# https://github.com/philferriere/cocoapi
# 1) upgrade visual basic to 2019 and install C++ tools in that library
# 2) install pycococo using direct from git installation in the github link (may need git library first)
# 3) then install tf-official-models using pip

##alternative optimizer
#optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
#                                        name='Adam'
#                                        )

INFO:absl:using Adamw optimizer
INFO:absl:gradient_clip_norm=1.000000


3361
13444
1344


In [23]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [None]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=train_ds_batched,
                               validation_data=val_ds_batched,
                               epochs=epochs
                               )

Training model with https://tfhub.dev/google/experts/bert/pubmed/2
Epoch 1/4


In [None]:
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)
fig = plt.figure(figsize=(10, 6))
fig.tight_layout()

plt.subplot(2, 1, 1)
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'r', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
# plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

In [None]:
test_ds

In [None]:
##re-test on test_ds

loss, accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

In [31]:
y_pred = classifier_model.predict(test_df['text'])

In [32]:
y_pred = np.round(y_pred)

In [33]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [34]:
print(classification_report(test_df[labels], y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1276
           1       1.00      0.92      0.95       449
           2       0.96      0.86      0.91       176
           3       0.95      0.82      0.88       258
           4       0.96      0.83      0.89        54
           5       0.00      0.00      0.00        24
           6       1.00      0.90      0.94        67
           7       0.00      0.00      0.00         1
           8       0.91      0.80      0.85        51
           9       0.57      0.64      0.60        70
          10       0.94      0.89      0.91      1020
          11       0.98      1.00      0.99       107
          12       0.91      0.97      0.94       290
          13       0.97      0.98      0.97       362
          14       0.97      0.99      0.98       171
          15       0.97      0.91      0.94       101
          16       0.95      0.92      0.93       151
          17       0.96    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
labels

['algo_neural_net',
 'algo_support_vector',
 'algo_regression',
 'algo_decision_tree',
 'algo_discriminant',
 'algo_naive_bayes',
 'algo_transfer',
 'algo_federated',
 'algo_k_nearest',
 'algo_unsupervised',
 'feat_imaging',
 'feat_xr',
 'feat_ct',
 'feat_mri',
 'feat_eeg',
 'feat_ecg',
 'feat_us',
 'feat_echo',
 'feat_histo',
 'feat_oct',
 'feat_mamm',
 'feat_endoscop',
 'feat_derm',
 'feat_gene',
 'feat_bio',
 'feat_nlp',
 'feat_ehr',
 'feat_sensor',
 'feat_phone',
 'subspec_icu',
 'subspec_ed',
 'spec_id',
 'subspec_sepsis',
 'subspec_hiv',
 'subspec_cov19',
 'subspec_tb',
 'subspec_malaria',
 'spec_derm',
 'subspec_dermca',
 'spec_onc',
 'subspec_rx',
 'subspec_gynonc',
 'subspec_lungca',
 'subspec_brainca',
 'subspec_gica',
 'subspec_hepca',
 'subspec_prosca',
 'subspec_renalca',
 'subspec_haemonc',
 'subspec_breast',
 'spec_psych',
 'subspec_suicide',
 'spec_msk',
 'subspec_frac',
 'spec_rheum',
 'spec_gi',
 'spec_hep',
 'spec_resp',
 'subspec_pneum',
 'spec_neuro',
 'subspec_epi

In [50]:
#EXPORT MODEL

saved_model_path = 'models/multilabel_charactersitics_bert'

classifier_model.save(saved_model_path, include_optimizer=False)

