# **R-BERT Notebook**


## Imports and environment configuration

In [None]:
!pip install transformers==3.0.0
!pip install ipython-autotime

%load_ext autotime

In [None]:
import os
import sys
import json
import random
import numpy as np
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from sklearn.metrics import f1_score, precision_score, recall_score

# Seed Python's built-in random number generator.
random.seed(42)

# Choose the project root: the mounted Drive folder when running on Colab,
# otherwise the parent of the current working directory.
if 'google.colab' not in str(get_ipython()):
  print('Running locally')
  root = Path.cwd().parent
else:
  print('Running on Google Colab')
  root = '/content/drive/My Drive/Colab Notebooks/'

# Make the project-local R-BERT modules (imported below) importable.
basepath = os.path.join(root, 'relation-extraction/')
sys.path.append(
    os.path.join(basepath, 'R-BERT/code')
)

from transformers import (BertConfig, BertTokenizer)
from transformers import AdamW, get_linear_schedule_with_warmup

from model import BertForSequenceClassification
from utils import (convert_examples_to_features, InputExample)

# Entity marker tokens. They are registered with the tokenizer as additional
# special tokens, and FewrelProcessor wraps the head entity in [E11]/[E12] and
# the tail entity in [E21]/[E22] when it builds the example text.
additional_special_tokens = ["[E11]", "[E12]", "[E21]", "[E22]"]

Switch for data selection: if `True`, the FewRel data is used; if `False`, the Future Engineering data is used.

In [None]:
# Dataset switch: True selects the FewRel data, False the Future Engineering data.
use_fewrel_data = False

In [None]:
# Map every relation label found in the data files to the integer class id the
# classifier is trained on, and pick the matching output directory.
if use_fewrel_data:
    rel2id_map = {'P105': 0, 'P135': 1, 'P155': 2, 'P31': 3, 'P800': 4, 'P921': 5, 'NOTA': 6}
    output_dir = os.path.join(basepath, 'R-BERT/output_fewrel')
else:
    # The backslashes in the "operates [something]" key are written as '\\' so
    # the string value stays exactly what it was, without relying on the
    # invalid escape sequence '\['.
    # NOTE(review): the key must match the label text in the data files
    # character for character - confirm the backslashes are really present there.
    rel2id_map = {
        'NOTA': 0,
        'A manufactures product B': 1,
        'A operates B': 2,
        'A operates \\[something\\] in location B': 3,
        'A orders B': 4,
        'A uses/employs charging technology B': 5,
        'A orders something from B': 6,
    }
    output_dir = os.path.join(basepath, 'R-BERT/output_fe')

# Derive the class count from the mapping so the two cannot drift apart
# (both mappings above contain 7 relations).
num_labels = len(rel2id_map)

Definition of fine-tuning parameters

In [None]:
# --- Reproducibility -------------------------------------------------------
seed = 12345

# --- Model -----------------------------------------------------------------
pretrained_model_name = 'bert-base-uncased'

# --- Optimisation ----------------------------------------------------------
num_train_epochs = 5
learning_rate = 3e-5
train_batch_size = 16
eval_batch_size = 8
no_cuda = False            # True forces CPU even when CUDA is available

max_seq_len = 128

evaluate_during_training = True   # evaluate after every epoch

gradient_accumulation_steps = 1

weight_decay = 1e-3
adam_epsilon = 1e-8
max_grad_norm = 1.0        # gradient clipping threshold

max_steps = -1             # > 0 overrides num_train_epochs with a step budget
warmup_steps = 0
logging_steps = 40         # print the running loss every N batches

local_rank = -1            # -1 means single-process (non-distributed) run

# Apply the seed to every random number generator in use, so that model
# initialisation and batch shuffling are repeatable between runs.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [None]:
# Resolve the data directory and the train/test file names for the chosen dataset.
if not use_fewrel_data:
    # Future Engineering data: fixed file names.
    data_dir = os.path.join(root, 'fe-training-data')
    train_data_file = 'train_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json'
    test_data_file = 'test_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json'
else:
    # FewRel data: the file names encode the number of classes.
    data_dir = os.path.join(root, 'fewrel-training-data/fewrel/')
    train_data_file = "dev_%d_classes_disjoint.json" % num_labels
    test_data_file = "test_%d_classes_disjoint.json" % num_labels

`FewrelProcessor` class for extracting examples and input features from the FewRel and Future Engineering input files.

In [None]:
class FewrelProcessor():
    """Builds entity-marked ``InputExample`` objects from JSON data files.

    Each JSON file holds a list of records. A record provides ``text`` (the
    sentence), ``ents`` (at least two entries, head first and tail second;
    elements 1 and 2 of an entry are used as start/end character offsets into
    ``text``) and ``label`` (a key of the module-level ``rel2id_map``).
    """

    def get_train_examples(self, data_dir, file_name):
        """Load ``file_name`` from ``data_dir``; guids are prefixed with "train".

        Returns:
            A tuple ``(examples, labels)`` where ``labels`` is the sorted list
            of distinct label ids occurring in the examples.
        """
        return self._load_examples(data_dir, file_name, "train")

    def get_dev_examples(self, data_dir, file_name):
        """Load ``file_name`` from ``data_dir``; guids are prefixed with "dev".

        Returns:
            A tuple ``(examples, labels)`` where ``labels`` is the sorted list
            of distinct label ids occurring in the examples.
        """
        return self._load_examples(data_dir, file_name, "dev")

    def get_test_examples(self, data_dir, file_name):
        """Load ``file_name`` from ``data_dir``; guids are prefixed with "test".

        Returns:
            A tuple ``(examples, labels)`` where ``labels`` is the sorted list
            of distinct label ids occurring in the examples.
        """
        return self._load_examples(data_dir, file_name, "test")

    def _load_examples(self, data_dir, file_name, set_type):
        """Shared implementation of the three public loaders."""
        records = self._read_json(os.path.join(data_dir, file_name))
        examples = self._create_examples(records, set_type)
        # Sorted so the label order never depends on set iteration order.
        labels = sorted({example.label for example in examples})
        return examples, labels

    def _create_examples(self, lines, set_type):
        """Turn raw records into ``InputExample`` objects with entity markers.

        The head entity is wrapped in ``[E11] ... [E12]`` and the tail entity
        in ``[E21] ... [E22]``; the markers are inserted in text order. The
        records in ``lines`` are not modified.

        Args:
            lines: list of record dicts as returned by ``_read_json``.
            set_type: prefix for the example guid ("train", "dev" or "test").

        Raises:
            KeyError: if a record's label is not a key of ``rel2id_map``.
        """
        examples = []
        for i, line in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text = line['text']
            head, tail = line['ents'][0], line['ents'][1]
            h_start, h_end = self._adjust_start(head[1]), head[2]
            t_start, t_end = self._adjust_start(tail[1]), tail[2]
            h_name = text[h_start:h_end]
            t_name = text[t_start:t_end]
            if h_start < t_start:
                marked = (text[:h_start] + " [E11] " + h_name + " [E12] "
                          + text[h_end:t_start] + " [E21] " + t_name + " [E22] "
                          + text[t_end:])
            else:
                marked = (text[:t_start] + " [E21] " + t_name + " [E22] "
                          + text[t_end:h_start] + " [E11] " + h_name + " [E12] "
                          + text[h_end:])
            label = rel2id_map[line['label']]
            examples.append(InputExample(guid=guid, text_a=marked, text_b=None, label=label))
        return examples

    @staticmethod
    def _adjust_start(start):
        """Map a start offset of 1 to 0 and leave every other value unchanged.

        NOTE(review): the reason for this special case is not visible here;
        presumably it works around off-by-one offsets in the data - confirm.
        """
        return 0 if start == 1 else start

    @classmethod
    def _read_json(cls, input_file):
        """Read ``input_file`` as UTF-8 and return the parsed JSON content."""
        with open(input_file, "r", encoding='utf-8') as f:
            return json.load(f)

Initialization of model and tokenizer

In [None]:
# Select the compute device.
if not (local_rank == -1 or no_cuda):
    # Distributed run: bind this process to its GPU and initialise the
    # distributed backend, which takes care of synchronizing nodes/GPUs.
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    torch.distributed.init_process_group(backend='nccl')
else:
    # Single-process run: use CUDA when it is available and not disabled.
    if torch.cuda.is_available() and not no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
# The GPU count is fixed to 1 in either mode.
n_gpu = 1

# Model configuration with the classification head sized to the label set.
bertconfig = BertConfig.from_pretrained(pretrained_model_name, num_labels=num_labels)
do_lower_case = "-uncased" in pretrained_model_name

# Tokenizer that also knows the entity marker tokens.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name,
    do_lower_case=do_lower_case,
    additional_special_tokens=additional_special_tokens,
)

# Load the pretrained weights, grow the embedding matrix to cover the added
# marker tokens, and move the model to the selected device.
model = BertForSequenceClassification.from_pretrained(pretrained_model_name, config=bertconfig)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

print('Loaded model & tokenizer')

Helper function to calculate accuracy, precision, recall and f1 values

In [None]:
def accuracy_precision_recall_f1(outputs, labels):
    """Compute accuracy and macro-averaged precision, recall and F1.

    The macro averages are taken over the classes that occur in ``labels``
    only; classes that are predicted but never occur in ``labels`` do not get
    their own term in the average.

    Args:
        outputs: predicted class ids (array-like, one per example).
        labels: true class ids (array-like, same length as ``outputs``).

    Returns:
        A tuple ``(accuracy, precision, recall, f1, outputs)``; ``outputs`` is
        the object that was passed in, returned unchanged.
    """
    # Coerce to arrays so that `==` compares element-wise even for plain lists
    # (two lists would compare as a single bool and give a wrong accuracy).
    predicted = np.asarray(outputs)
    expected = np.asarray(labels)
    present_labels = np.unique(expected)

    accuracy = np.sum(predicted == expected) / len(predicted)
    precision = precision_score(expected, predicted, average='macro', labels=present_labels)
    recall = recall_score(expected, predicted, average='macro', labels=present_labels)
    f1 = f1_score(expected, predicted, average='macro', labels=present_labels)
    return accuracy, precision, recall, f1, outputs

Helper function for evaluating the model during the training process

In [None]:
def evaluate(model, tokenizer, eval_dataset, prefix=""):
    """Evaluate ``model`` on ``eval_dataset``, print the metrics and return them.

    Uses the module-level ``local_rank``, ``eval_batch_size`` and ``device``.

    Args:
        model: model whose output starts with ``(loss, logits)`` when called
            with labels.
        tokenizer: unused; kept for interface compatibility.
        eval_dataset: dataset yielding 6-tuples ordered as (input_ids,
            attention_mask, token_type_ids, labels, e1_mask, e2_mask).
        prefix: unused; kept for interface compatibility.

    Returns:
        dict with the keys ``loss``, ``accuracy``, ``precision``, ``recall``
        and ``f1`` (precision/recall/F1 are macro-averaged).

    Raises:
        ValueError: if ``eval_dataset`` yields no batches.
    """
    eval_sampler = SequentialSampler(eval_dataset) if local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=eval_batch_size, shuffle=False)

    print("***** Running evaluation *****")
    print("  Num examples = %d" % len(eval_dataset))
    print("  Batch size = %d" % eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    # Collect per-batch arrays and concatenate once at the end instead of
    # re-copying the growing array on every batch.
    logit_chunks = []
    label_chunks = []

    model.eval()
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':      batch[3],
                      'e1_mask': batch[4],
                      'e2_mask': batch[5],
                      }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(inputs['labels'].detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("evaluate() needs a non-empty eval_dataset")

    eval_loss = eval_loss / nb_eval_steps
    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    eval_accuracy, eval_precision, eval_recall, eval_f1, _ = accuracy_precision_recall_f1(preds, out_label_ids)

    print("***** Eval results *****")
    print("   Loss: %f" % eval_loss)
    print("   Accuracy: %f" % eval_accuracy)
    print("   Precision (macro-averaged): %f" % eval_precision)
    print("   Recall (macro-averaged): %f" % eval_recall)
    print("   F1-Score (macro-averaged): %f" % eval_f1)

    results = {
        'loss': eval_loss,
        'accuracy': eval_accuracy,
        'precision': eval_precision,
        'recall': eval_recall,
        'f1': eval_f1,
    }
    return results

Helper function for loading and extracting input features from training data files

In [None]:
def load_and_cache_fewrel_examples(tokenizer, evaluate=False):
    """Load one data split and convert it into a ``TensorDataset``.

    Reads ``train_data_file`` (or ``test_data_file`` when ``evaluate`` is
    true) from the module-level ``data_dir``. Despite the name, nothing is
    cached: the file is re-read and re-tokenized on every call.

    Args:
        tokenizer: tokenizer handed to ``convert_examples_to_features``.
        evaluate: False loads the training split, True the evaluation split.

    Returns:
        TensorDataset whose items are ordered as (input_ids, input_mask,
        segment_ids, label_ids, e1_mask, e2_mask).
    """
    processor = FewrelProcessor()

    # Use the loader that matches the split so the example guids carry the
    # right prefix ("test-<i>" for evaluation data, "train-<i>" for training data).
    if evaluate:
        examples, _ = processor.get_test_examples(data_dir, test_data_file)
    else:
        examples, _ = processor.get_train_examples(data_dir, train_data_file)

    # Always pass the full label set rather than only the labels that happen to
    # occur in this split, so both splits share one label list.
    # NOTE(review): presumably convert_examples_to_features maps a label to its
    # position in this list - confirm in utils.
    label_list = sorted(set(rel2id_map.values()))

    features = convert_examples_to_features(examples, label_list, max_seq_len, tokenizer, "classification", use_entity_indicator=True)

    # Convert to tensors and build the dataset.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_e1_mask = torch.tensor([f.e1_mask for f in features], dtype=torch.long)  # first entity mask
    all_e2_mask = torch.tensor([f.e2_mask for f in features], dtype=torch.long)  # second entity mask

    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_e1_mask, all_e2_mask)
    return dataset

## Fine-Tuning

Loading training and validation data

In [None]:
# Build the training and evaluation datasets from the configured data files.
train_dataset = load_and_cache_fewrel_examples(tokenizer)
eval_dataset = load_and_cache_fewrel_examples(tokenizer, evaluate=True)

# Shuffle in single-process mode; shard across processes in distributed mode.
# The distributed sampler must be assigned too, otherwise `train_sampler` is
# undefined when the DataLoader is built below.
if local_rank == -1:
    train_sampler = RandomSampler(train_dataset)
else:
    train_sampler = DistributedSampler(train_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)

Preparing optimizer and scheduler for training process

In [None]:
# Total number of optimizer steps. A positive max_steps is taken as the step
# budget and the epoch count is derived from it; otherwise the budget follows
# from the configured number of epochs.
if max_steps <= 0:
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
else:
    t_total = max_steps
    num_train_epochs = max_steps // (len(train_dataloader) // gradient_accumulation_steps) + 1

# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
named_params = list(model.named_parameters())
decay_params = [
    p for n, p in named_params if not any(nd in n for nd in no_decay)
]
no_decay_params = [
    p for n, p in named_params if any(nd in n for nd in no_decay)
]
optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': weight_decay},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total,
)

# Wrap the model for multi-GPU and/or distributed training when configured.
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

if local_rank != -1:
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        output_device=local_rank,
        find_unused_parameters=True,
    )

Training process of the approach

In [None]:
print("***** Running training *****")
print("  Num examples = %d" % len(train_dataset))
print("  Num Epochs = %d" % num_train_epochs)
print("  Train batch size = %d" % train_batch_size)
print("  Gradient Accumulation steps = %d" % gradient_accumulation_steps)
print("  Total optimization steps = %d" % t_total)

global_step = 0
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()

batches_per_epoch = len(train_dataloader)

for epoch in range(num_train_epochs):
    print("------------- Epoch %d -------------" % (epoch+1))
    for step, batch in enumerate(train_dataloader):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2],
                  'labels':      batch[3],
                  'e1_mask': batch[4],
                  'e2_mask': batch[5],
                  }

        outputs = model(**inputs)
        loss = outputs[0]
        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        loss.backward()

        tr_loss += loss.item()

        if (step + 1) % logging_steps == 0:
            # Average over the batches actually processed so far. tr_loss holds
            # losses already divided by gradient_accumulation_steps, so scale
            # back to get a per-batch value.
            batches_seen = epoch * batches_per_epoch + step + 1
            print('[Epoch: %d, Step: %d] average loss: %.3f' % (epoch + 1, (step + 1), tr_loss * gradient_accumulation_steps / batches_seen))

        if (step + 1) % gradient_accumulation_steps == 0:
            # Clip once per optimizer step, on the fully accumulated gradient.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

        # Honour the step budget when max_steps is set (> 0).
        if 0 < max_steps <= global_step:
            break

    if local_rank == -1 and evaluate_during_training:
        evaluate(model, tokenizer, eval_dataset)

    if 0 < max_steps <= global_step:
        break

print('-------------------')
print('Training results')
# max(..., 1) avoids a division by zero when no optimizer step was taken.
print(" global_step = %s, average loss = %s" % (global_step, tr_loss/max(global_step, 1)))

# Unwrap a DataParallel / DistributedDataParallel container before saving, and
# make sure the target directory exists.
model_to_save = model.module if hasattr(model, 'module') else model
os.makedirs(output_dir, exist_ok=True)
model_to_save.save_pretrained(output_dir)

***** Running training *****
  Num examples = 1068
  Num Epochs = 5
  Train batch size = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 335
------------- Epoch 1 -------------
[Epoch: 1, Step: 40] average loss: 1.570
***** Running evaluation *****
  Num examples = 356
  Batch size = 8
***** Eval results *****
   Loss: 0.815295
   Accuracy: 0.747191
   Precision (macro-averaged): 0.742199
   Recall (macro-averaged): 0.742868
   F1-Score (macro-averaged): 0.738781
------------- Epoch 2 -------------
[Epoch: 2, Step: 40] average loss: 1.017
***** Running evaluation *****
  Num examples = 356
  Batch size = 8
***** Eval results *****
   Loss: 0.653049
   Accuracy: 0.792135
   Precision (macro-averaged): 0.800121
   Recall (macro-averaged): 0.787660
   F1-Score (macro-averaged): 0.791628
------------- Epoch 3 -------------
[Epoch: 3, Step: 40] average loss: 0.754
***** Running evaluation *****
  Num examples = 356
  Batch size = 8
***** Eval results *****
   Loss: 0.70455