In [1]:
import os
from os.path import dirname, realpath
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import EarlyStopping

sys.path.insert(0, dirname(realpath('')))
from helper_functions import regular_encode, build_model

## Create tokenizer

In [4]:
# Name of the pretrained checkpoint. The same value is passed to
# AutoTokenizer here and to TFAutoModel when the model is built.
MODEL = 'roberta-base'  # alternative: 'xlnet-base-cased'

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [5]:
# Seed for the training-set shuffle so that a fresh kernel run reproduces the
# same row order.
SEED = 42

# Source corpora: only the `sentence` and `target` columns are read.
train1 = pd.read_csv('../data/bioscope_abstract.csv', usecols=['sentence', 'target'])
train2 = pd.read_csv('../data/bioscope_full.csv', usecols=['sentence', 'target'])
train3 = pd.read_csv('../data/sfu_negation.csv', usecols=['sentence', 'target'])

# Corpus name -> frame. Insertion order fixes the concatenation order below.
CORPORA = {
    'Bioscope Abstract': train1,
    'Bioscope Full': train2,
    'SFU': train3,
}

# use Bioscope Full for validation
valid = train2

# Annotated test set: drop rows with missing values and exact duplicate rows,
# cast the label to uint8 and rename it to `target`. Any sentence that still
# occurs more than once after that carries conflicting labels, so every copy
# of such a sentence is removed.
test = (
    pd.read_excel(
        '../data/DrugVisData - All Annotations.xlsx',
        sheet_name='DrugVisData - Copy',
        usecols=['sentence', 'annotation_expert_1'],
    )
    .dropna()
    .drop_duplicates()
    .astype({'annotation_expert_1': 'uint8'})
    .rename(columns={'annotation_expert_1': 'target'})
    .loc[lambda frame: ~frame.sentence.duplicated(keep=False)]
)

TRAINING_CORPUS = ['Bioscope Abstract'] # options: 'Bioscope Abstract', 'Bioscope Full', 'SFU'

selected_corpora = [frame for name, frame in CORPORA.items() if name in TRAINING_CORPUS]
if not selected_corpora:
    # Fail here with a clear message instead of later on an empty frame.
    raise ValueError(
        'TRAINING_CORPUS selects no corpus; choose from {}'.format(list(CORPORA))
    )
if 'Bioscope Full' in TRAINING_CORPUS:
    # `valid` is Bioscope Full, so validation rows would also be trained on.
    print('WARNING: Bioscope Full is used for both training and validation; '
          'validation metrics and early stopping will be optimistic.')

# Concatenate once (no empty seed frame, so column dtypes come only from the
# real data), then shuffle deterministically and renumber the rows.
train = (
    pd.concat(selected_corpora)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

In [7]:
# Length statistics of the test sentences, measured in characters (not
# tokens). The encoding calls in this notebook use MAX_LEN, not these values.
max_len = int(test.sentence.str.len().max())
avg_len = int(test.sentence.str.len().median())  # median, despite the name

# Configuration
EPOCHS = 8     # upper bound on epochs for the first training step
MAX_LEN = 512  # sequence length handed to regular_encode and build_model

In [8]:
%%time 

# Encode each split's sentences with the shared tokenizer at MAX_LEN and pull
# out the matching label arrays.

# Training split
x_train = regular_encode(train.sentence.values, tokenizer, maxlen=MAX_LEN)
y_train = train.target.values

# Validation split
x_valid = regular_encode(valid.sentence.values, tokenizer, maxlen=MAX_LEN)
y_valid = valid.target.values

# Test split
x_test = regular_encode(test.sentence.values, tokenizer, maxlen=MAX_LEN)
y_test = test.target.values

Wall time: 3.98 s


In [9]:
# Early stopping on validation accuracy: an epoch counts as an improvement
# only if val_accuracy rises by at least 0.001; after 2 epochs without one,
# training stops and the best weights seen so far are restored.
es = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.001,
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

In [14]:
# Synchronous data-parallel training with hierarchical-copy all-reduce as the
# cross-device op.
all_reduce = tf.distribute.HierarchicalCopyAllReduce()
strategy = tf.distribute.MirroredStrategy(cross_device_ops=all_reduce)

# Global batch size: two samples for every replica in the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 2

# Load the pretrained transformer and build the classifier inside the
# strategy scope so the model is created under the distribution strategy.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)

model.summary()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_roberta_model_2 (TFRobert ((None, 512, 768), (None, 124645632 
_________________________________________________________________
tf_op_layer_strided_slice_2  [(None, 768)]             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 769       
Total params: 124,646,401
Trainable params: 124,646,401
Non-trainable params: 0
_________________________________________________________________


# Step 1: train on the selected corpus (early stopping on the validation set)

In [15]:
# First training step: fit on the training split for at most EPOCHS epochs,
# scoring on the validation split each epoch so `es` can stop early.
train_history = model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[es],
)

Train on 11993 samples, validate on 2469 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 00006: early stopping


In [16]:
# Score the model on the test split.
# NOTE(review): the unpacking order (loss, recall, precision, accuracy) has to
# match the metric order configured in build_model, which is not visible
# here - confirm against helper_functions.
test_loss, test_recall, test_precision, test_accuracy = model.evaluate(x_test, y_test)
print(
    'test_loss: {:.3f}, test_recall: {:.3f} ,test_precision: {:.3f} ,test_accuracy: {:.3f}'.format(
        test_loss,
        test_recall,
        test_precision,
        test_accuracy,
    )
)

test_loss: 1.346, test_recall: 0.103 ,test_precision: 0.125 ,test_accuracy: 0.803


# Step 2: continue training on the annotated test set

In [23]:
# Second training step: keep fitting the same model for 3 epochs, this time on
# the test split itself, with no validation data and no callbacks.
# NOTE: from here on the test split is no longer held out, so any metric
# computed on x_test / y_test after this cell is not an unbiased estimate.
train_history_2 = model.fit(
    x_test,
    y_test,
    epochs=3,
    batch_size=BATCH_SIZE,
)

Train on 238 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:
# Persist the model after both training steps to the relative path
# 'negator_model' (the captured log shows an `assets` sub-directory there).
model.save('negator_model')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: negator_model\assets
