In [3]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# Restrict TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import datetime

#for BERT
import transformers

In [None]:
# GPU options to limit OOM errors: enable memory growth on every visible GPU.
# Looping (rather than indexing [0]) keeps this cell working on CPU-only
# machines, where the list is empty, and applies the same setting to all GPUs
# on multi-GPU hosts.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
from tensorflow.python.client import device_lib

# Print the local devices TensorFlow reports, followed by the TensorFlow version.
print(device_lib.list_local_devices(), tf.__version__, sep='\n')

In [None]:
# The CSV must contain labelled examples only.
# Title and abstract were concatenated in Excel into one text block: the 'feature' column.
labelled = pd.read_csv(
    'training_data/training_maturity.csv',
    index_col=0,
)

In [None]:
# Work on a separate frame holding only the label and text columns, so the
# pipeline is independent from the CSV from here on; the label is cast to int64.
analysisdf = (
    labelled.loc[:, ['include', 'feature']]
    .assign(include=lambda frame: frame['include'].astype(np.int64))
)
analysisdf.head()

In [None]:
#Significant imbalance -> but apparently not too much of a problem when using BERT?
#Could try stratified k-folds in future iterations?

# Count each label once and reuse the result for both the printout and the plot.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
label_counts = analysisdf['include'].value_counts()

print("labels:")
print(label_counts)

plt.figure()
label_counts.plot.bar(title="inclusion 0 vs 1")
plt.xlabel("inclusion")
plt.ylabel("number")
plt.show()

In [None]:
# Display the rows whose 'feature' text is missing.
analysisdf.loc[analysisdf['feature'].isna()]

In [None]:
# Enable the line below if the previous cell lists any rows with a missing 'feature'.
#analysisdf.dropna(subset=['feature'], inplace=True)

## Creating training, validation and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled data for validation, keeping the label ratio
# (stratify). A fixed random_state makes the split identical on every run.
tempdf, valdf = train_test_split(
    analysisdf,
    test_size=0.1,
    stratify=analysisdf['include'],
    random_state=42,
)

# Same bar chart of label counts for each side of the split.
for split_name, split_df in (("tempdf", tempdf), ("valdf", valdf)):
    print(split_name)
    plt.figure()
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    split_df['include'].value_counts().plot.bar(title="inclusion 0 vs 1")
    plt.xlabel("inclusion")
    plt.ylabel("number")
    plt.show()

In [None]:
# Split the remaining data into training and test sets (12% test), keeping the
# label ratio (stratify). A fixed random_state makes the split identical on every run.
traindf, testdf = train_test_split(
    tempdf,
    test_size=0.12,
    stratify=tempdf['include'],
    random_state=42,
)

# Same bar chart of label counts for each side of the split.
for split_name, split_df in (("traindf", traindf), ("testdf", testdf)):
    print(split_name)
    plt.figure()
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    split_df['include'].value_counts().plot.bar(title="inclusion 0 vs 1")
    plt.xlabel("inclusion")
    plt.ylabel("number")
    plt.show()

In [None]:
# Remove the column-width limit so long text cells are displayed in full.
pd.options.display.max_colwidth = None

traindf.head(10)

## Putting dataframes into tensor wrappers

https://www.tensorflow.org/guide/data

`tf.data.Dataset.from_tensor_slices` creates a dataset wrapper that pairs each feature with its label.

In [None]:
##TRAINING WRAPPER
# Features and labels each get a trailing axis of length 1 before being sliced
# row by row into (feature, label) pairs.
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        traindf['feature'].to_numpy()[:, np.newaxis],
        traindf['include'].to_numpy()[:, np.newaxis],
    )
)

In [None]:
##VALIDATION WRAPPER

In [None]:
# Features and labels each get a trailing axis of length 1 before being sliced
# row by row into (feature, label) pairs.
val_ds = tf.data.Dataset.from_tensor_slices(
    (
        valdf['feature'].to_numpy()[:, np.newaxis],
        valdf['include'].to_numpy()[:, np.newaxis],
    )
)

In [None]:
##TEST SET WRAPPER

In [None]:
# Features and labels each get a trailing axis of length 1 before being sliced
# row by row into (feature, label) pairs.
test_ds = tf.data.Dataset.from_tensor_slices(
    (
        testdf['feature'].to_numpy()[:, np.newaxis],
        testdf['include'].to_numpy()[:, np.newaxis],
    )
)

In [None]:
##now have three tensor data wrappers - train_ds and val_ds that can be fed into tf pipe, and test_ds for evaluation

## Batch and optimise datasets

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

def configure_for_performance(ds):
    """Cache, shuffle, batch and prefetch ``ds``, in that order.

    The batch size is read from the module-level ``batch_size`` at call time,
    so that name must be defined before this function is called.

    Args:
        ds: dataset object exposing ``cache``/``shuffle``/``batch``/``prefetch``.

    Returns:
        The dataset produced by chaining those four calls.
    """
    return (
        ds.cache()
        .shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(buffer_size=AUTOTUNE)
    )

In [None]:
# Read by configure_for_performance() when it batches each dataset.
batch_size = 8

# Apply the same cache/shuffle/batch/prefetch pipeline to all three splits.
train_ds_batched, val_ds_batched, test_ds_batched = (
    configure_for_performance(split_ds) for split_ds in (train_ds, val_ds, test_ds)
)

## Configuring BERT: 1) preprocessing 2) vectorization

Code is largely adapted from here (also has links to other BERT libraries): https://www.tensorflow.org/text/tutorials/classify_text_with_bert

In [None]:
# BERT model for vectorization

# Key used to look up both the encoder handle and the matching preprocessing handle.
bert_vec_model = 'experts_pubmed'

# Model name -> TF Hub encoder handle.
map_name_to_handle = {'experts_pubmed': 'https://tfhub.dev/google/experts/bert/pubmed/2'}

tfhub_handle_encoder = map_name_to_handle[bert_vec_model]

# Wrap the encoder as a Keras layer.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
# BERT model for pre-processing

# Model name -> TF Hub handle of the preprocessing model used with that encoder.
map_model_to_preprocess = {'experts_pubmed': 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'}

tfhub_handle_preprocess = map_model_to_preprocess[bert_vec_model]

# Wrap the preprocessing model as a Keras layer.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
# Report which TF Hub handles were selected.
print(
    f'BERT model selected: {tfhub_handle_encoder}',
    f'Pre-process model selected: {tfhub_handle_preprocess}',
    sep='\n',
)

In [None]:
# Check bert_preprocess_model on a sample sentence - preprocessing should split
# tokens into word ids / mask / type.

text_test = ['This artificial intelligence model predicts cardiovascular risk from echocardiogram images']
text_preprocessed = bert_preprocess_model(text_test)

# One line per item; only the first 12 positions of each tensor are shown.
print(
    f'Keys       : {list(text_preprocessed.keys())}',
    f'Shape      : {text_preprocessed["input_word_ids"].shape}',
    f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}',
    f'Input Mask : {text_preprocessed["input_mask"][0, :12]}',
    f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}',
    sep='\n',
)

In [None]:
# Check that bert_model runs on the preprocessed sample.

bert_results = bert_model(text_preprocessed)

# Shapes plus a 12-element slice of the pooled and per-token outputs.
print(
    f'Loaded BERT: {tfhub_handle_encoder}',
    f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}',
    f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}',
    f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}',
    f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}',
    sep='\n',
)

## Building BERT classifier pipeline

This pipeline will: take the raw data wrappers -> pre-process and encode with BERT (using Keras layers built from the TF Hub handles defined above) -> classify using a simple net

In [None]:
def build_classifier_model(seq_length=512):
    """Build the end-to-end text classifier as a Keras functional model.

    The graph is: string input -> TF Hub tokenizer -> input packer ->
    trainable BERT encoder -> dropout -> single un-activated Dense unit.
    The TF Hub handles are read from the module-level
    ``tfhub_handle_preprocess`` and ``tfhub_handle_encoder``.

    Args:
        seq_length: sequence length passed to the preprocessor's
            ``bert_pack_inputs``.

    Returns:
        A ``tf.keras.Model`` mapping the text input to the Dense layer output.
    """
    # Scalar string input, named so it can be identified in the model summary.
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text_input")

    # Load the preprocessor once; its tokenize and pack steps are wrapped separately.
    preprocessor = hub.load(tfhub_handle_preprocess)

    tokenize_layer = hub.KerasLayer(preprocessor.tokenize, name='tokenizer')
    token_segments = [tokenize_layer(text_input)]

    pack_layer = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments={'seq_length': seq_length},
        name='packer',
    )
    packed_inputs = pack_layer(token_segments)

    # trainable=True: the encoder weights are updated during fitting.
    bert_layer = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    bert_outputs = bert_layer(packed_inputs)

    # Classification head on the pooled output; no activation on the final layer.
    pooled = bert_outputs['pooled_output']
    dropped = tf.keras.layers.Dropout(0.1)(pooled)
    head_output = tf.keras.layers.Dense(1, activation=None, name='classifier')(dropped)

    return tf.keras.Model(text_input, head_output)

In [None]:
# Build the classifier with the default sequence length, then list its layers.
classifier_model = build_classifier_model(seq_length=512)

classifier_model.summary()

In [None]:
# Loss: the classifier head emits raw logits, hence from_logits=True.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# Metrics
#metrics = ['BinaryAccuracy']
#metrics = ['accuracy']

# BinaryAccuracy compares the raw model output against `threshold`. The model
# outputs logits, so the 50% decision boundary sits at 0.0 (sigmoid(0.0) == 0.5);
# this matches the sigmoid-then-round rule used when predicting on the test set.
metrics = tf.keras.metrics.BinaryAccuracy(name="binary_accuracy", dtype=None, threshold=0.0)

# Epochs
epochs = 3

# Optimization: the warm-up covers the first 10% of all training steps.
steps_per_epoch = tf.data.experimental.cardinality(train_ds_batched).numpy()
print(steps_per_epoch)
num_train_steps = steps_per_epoch * epochs
print(num_train_steps)
num_warmup_steps = int(0.1*num_train_steps)
print(num_warmup_steps)
init_lr = 1e-5

optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


# official.nlp.optimization comes from tf-models-official, which depends on pycocotools;
# pycocotools does not install on Windows without a workaround:
# https://github.com/philferriere/cocoapi
# 1) upgrade Visual Studio to 2019 and install its C++ build tools
# 2) install pycocotools directly from git as described in the GitHub link (may need git installed first)
# 3) then install tf-models-official using pip

## Alternative optimizer
#optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
#                                        name='Adam'
#                                        )

In [None]:
# Attach the optimizer, loss and metric configured above to the model.
classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

## Fit classifier model

In [None]:
print(f'Training model with {tfhub_handle_encoder}')

# Fit on the batched training set, validating against the batched validation set.
history = classifier_model.fit(
    train_ds_batched,
    validation_data=val_ds_batched,
    epochs=epochs,
)

In [None]:
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
# Plot-specific names: the module-level `loss` (loss object) and `epochs`
# (epoch count) are used by the compile/fit cells and must not be overwritten,
# otherwise re-running those cells after this one would fail.
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epoch_range = range(1, len(acc) + 1)
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# Top panel: loss per epoch (red = training, blue = validation).
loss_ax.plot(epoch_range, train_loss, 'r', label='Training loss')
loss_ax.plot(epoch_range, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Bottom panel: accuracy per epoch (red = training, blue = validation).
acc_ax.plot(epoch_range, acc, 'r', label='Training acc')
acc_ax.plot(epoch_range, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# Called after both axes exist so the spacing is actually adjusted.
fig.tight_layout()
plt.show()

In [None]:
## Evaluate on the held-out test set

# Use the batched test dataset (the unbatched one is evaluated one example at a
# time), and keep the results in their own names so the module-level `loss`
# object used by compile() is not overwritten.
test_loss, test_binary_accuracy = classifier_model.evaluate(test_ds_batched)

print(f'Loss: {test_loss}')
print(f'Accuracy: {test_binary_accuracy}')

In [None]:
# Model outputs (logits) for every test text.
y_pred = classifier_model.predict(testdf['feature'])
# Flatten to 1-D whatever the test set size is (previously hard-coded to 463 rows).
y_pred = y_pred.reshape(-1)
# Logits -> probabilities -> hard 0/1 labels.
y_pred = np.asarray(tf.round(tf.nn.sigmoid(y_pred)))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Per-class precision / recall / F1 on the test set.
print(classification_report(y_true=testdf['include'], y_pred=y_pred))

In [None]:
# Confusion matrix of true vs predicted test labels, rendered as a plot.
cm = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=testdf['include'], y_pred=y_pred)
)

cm.plot()

In [None]:
# Store the predicted labels next to the true labels in the test frame.
testdf['predicted'] = np.asarray(y_pred)

In [None]:
# Shape of the prediction array.
np.shape(y_pred)

In [None]:
# Test rows whose predicted label differs from the true label, written to CSV.
misclassified = testdf.loc[testdf['include'] != testdf['predicted']]
misclassified.to_csv('bert_misclassified_maturity.csv')

In [None]:
# NOTE(review): presumably a deliberately failing import (no `finish_early` module
# is visible here) used to halt "Run All" before the export cell - confirm.
import finish_early as now

## Save model

In [None]:
# EXPORT MODEL

saved_model_path = 'models/maturity_bert'

# Save the trained classifier without its optimizer.
classifier_model.save(
    saved_model_path,
    include_optimizer=False,
)