<a href="https://colab.research.google.com/github/isaacmg/task-vt/blob/biobert_finetune/drug_treatment_extraction/notebooks/BioBERT_RE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finetuning BioBERT for RE
This is a fine-tuning notebook that we used to finetune BioBERT for relation classification (on our own data, GAD and Euadr) and then convert the resulting model checkpoint to PyTorch HuggingFace library for model inference. This was done for the vaccine and therapeutics task in order to identify drug treatment relations.


In [5]:
!git clone https://github.com/dmis-lab/biobert 
from google.colab import auth
from datetime import datetime
auth.authenticate_user()
!pip install tensorflow==1.15


Successfully installed gast-0.2.2 tensorboard-1.15.0 tensorflow-1.15.0 tensorflow-estimator-1.15.1


In [0]:
import os
os.chdir('biobert')

### Downloading data

In [0]:
!./download.sh
!fileid="1GJpGjQj6aZPV-EfbiQELpBkvlGtoKiyA"
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1GJpGjQj6aZPV-EfbiQELpBkvlGtoKiyA' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1GJpGjQj6aZPV-EfbiQELpBkvlGtoKiyA" -O biobert_w.tar.gz && rm -rf /tmp/cookies.txt
!tar -xvf biobert_w.tar.gz
%set_env RE_DIR datasets/RE/GAD/1
%set_env TASK_NAME=gad
%set_env OUTPUT_DIR=./re_outputs_1

In [3]:
%set_env BIOBERT_DIR=biobert_large

env: BIOBERT_DIR=biobert_large


In [4]:
!python run_re.py --task_name=$TASK_NAME --do_train=true --do_eval=true --do_predict=true --vocab_file=$BIOBERT_DIR/vocab_cased_pubmed_pmc_30k.txt --bert_config_file=$BIOBERT_DIR/bert_config_bio_58k_large.json --init_checkpoint=$BIOBERT_DIR/bio_bert_large_1000k.ckpt.index --max_seq_length=128 --train_batch_size=32 --learning_rate=2e-5 --num_train_epochs=3.0 --do_lower_case=false --data_dir=$RE_DIR --output_dir=$OUTPUT_DIR

2020-04-08 17:53:02.882983: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Traceback (most recent call last):
  File "run_re.py", line 25, in <module>
    import optimization
  File "/content/biobert/optimization.py", line 87, in <module>
    class AdamWeightDecayOptimizer(tf.train.Optimizer):
AttributeError: module 'tensorflow._api.v2.train' has no attribute 'Optimizer'


In [0]:
#Uncomment this if you want to temporarily stash weights on GCS also collect garbage
#!gsutil -m cp -r ./re_outputs_1/model.ckpt-0.data-00000-of-00001 gs://coronaviruspublicdata/new_data .
#import gc 
#gc.collect()


### Converting the model to HuggingFace

In [0]:
!pip install transformers
import logging
import torch
logger = logging.getLogger('spam_application')

def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model.
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    excluded = ['BERTAdam','_power','global_step']
    init_vars = list(filter(lambda x:all([True if e not in x[0] else False for e in excluded]),init_vars))
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)
    print("A name",names)
    for name, array in zip(names, arrays):
        if name in ['output_weights', 'output_bias']:
          name = 'classifier/' + name
        name = name.split("/")
        # if name in ['output_weights', 'output_bias']:
        #   name = 'classifier/' + name
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info("Skipping {}".format("/".join(name)))
            continue
        pointer = model
        # if name in ['output_weights' , 'output_bias']:
        #   name = 'classifier/' + name
        for m_name in name:

            print("model",m_name)
            #print(scope_names)
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                print(scope_names)
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
            # elif scope_names[0] == "beta":
            #     print(scope_names)
                pointer = getattr(pointer, "bias")
            # elif scope_names[0] == "output_bias":
            #     print(scope_names)
            #   pointer = getattr(pointer, "cls")
            elif scope_names[0] == "output_weights":
                print(scope_names)
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "squad":
                print(scope_names)
                pointer = getattr(pointer, "classifier")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info("Skipping {}".format("/".join(name)))
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model


In [2]:
from transformers import BertConfig, BertForSequenceClassification, BertForPreTraining
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    config.num_labels = 2
    model = BertForSequenceClassification(config)
    #model = BertForSequenceClassification(config)
    # Load "weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    model.save_pretrained(pytorch_dump_path)
    return model
# Alternatevely you can download existing stashed data
#!gsutil cp -r gs://coronaviruspublicdata/re_outputs_1 .

In [7]:

import os
!mkdir pytorch_output_temp
model2 = convert_tf_checkpoint_to_pytorch("re_outputs_1", "biobert_large/bert_config_bio_58k_large.json", "pytorch_output_temp")

Building PyTorch model from configuration: BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "model_type": "bert",
  "no_repeat_ngram_size": 0,
  "num_attention_heads": 16,
  "num_beams": 1,
  "num_hidden_layers": 24,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pa

### Upload converted checkpoint and test inference
If everything goes smoothly we should be able to upload weights and use the converted model.

In [12]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('biobert_large/vocab_cased_pubmed_pmc_30k.txt')
model2.eval()
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model2(input_ids)

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


In [13]:
outputs = model2(input_ids)
outputs

(tensor([[-0.2204,  0.6557]], grad_fn=<AddmmBackward>),)

In [0]:
input_ids = torch.tensor(tokenizer.encode("All our results indicate that the presence of the @GENE$ genotype (++) in patients with structural @DISEASE$, severe left ventricular dysfunction and malignant ventricular arrhythmias increases the risk for these patients of hemodynamic collapse during these arrhythmias"))

In [32]:
outputs = model2(input_ids.unsqueeze(0))
outputs

(tensor([[-0.4003,  0.4932]], grad_fn=<AddmmBackward>),)

In [17]:
values, indices = torch.max(outputs[0], 1, keepdim=False)
indices


tensor([1])

**Lets refactor this into something nicer**

In [0]:
from transformers import BertConfig, BertForSequenceClassification, BertForPreTraining
from transformers import BertTokenizer
class InferSequenceClassifier(object):
  def __init__(self, pytorch_model_path, token_path, add_special_tokens=False):
    self.tokenizer = BertTokenizer.from_pretrained(token_path)
    self.model = BertForSequenceClassification.from_pretrained(pytorch_model_path)
    self.add_special_tokens = add_special_tokens

  def make_prediction(self, text):
    input_ids = torch.tensor(self.tokenizer.encode(text, add_special_tokens=self.add_special_tokens))
    outputs = self.model(input_ids.unsqueeze(0))
    print(outputs)
    values, indices = torch.max(outputs[0], 1, keepdim=False)
    return indices

In [0]:
!cp biobert_large/vocab_cased_pubmed_pmc_30k.txt pytorch_output_temp/vocab.txt
!cp biobert_large/bert_config_bio_58k_large.json pytorch_output_temp/config.json

In [0]:
seq_infer = InferSequenceClassifier("pytorch_output_temp", "pytorch_output_temp", True)

In [22]:
seq_infer.make_prediction("@GENE$ influences brain beta-@DISEASE$ load, cerebrospinal fluid levels of beta-amyloid peptides and phosphorylated tau, and the genetic risk of late-onset sporadic AD.")

(tensor([[-0.3223,  0.5159]], grad_fn=<AddmmBackward>),)


tensor([1])

In [33]:
seq_infer.make_prediction("All our results indicate that the presence of the @GENE$ genotype (++) in patients with structural @DISEASE$, severe left ventricular dysfunction and malignant ventricular arrhythmias increases the risk for these patients of hemodynamic collapse during these arrhythmias")

(tensor([[-0.4003,  0.4932]], grad_fn=<AddmmBackward>),)


tensor([1])

In [37]:
seq_infer.make_prediction("Functional studies to unravel the biological significance of this region in regulating @GENE$ production is clearly indicated, which may lead to new strategies to modify the disease course of severe @DISEASE$.")

(tensor([[-0.3648,  0.4784]], grad_fn=<AddmmBackward>),)


tensor([1])

In [0]:
!gsutil cp -r pytorch_output_temp gs://coronavirusqa/re_convert