# **ERNIE Relation Extraction Notebook**

## Imports and environment configuration

In [None]:
!pip install ipython-autotime
!pip install boto3
!pip install simplejson

%load_ext autotime

Imports

In [None]:
import sys
import os
import re
import random
import time
from pathlib import Path
import numpy as np
import json
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from sklearn.metrics import f1_score, precision_score, recall_score

if 'google.colab' in str(get_ipython()):
  print('Running on Google Colab')
  root = '/content/drive/My Drive/Colab Notebooks/'
else:
  print('Running locally')
  root = Path(os.getcwd()).parent

basepath = os.path.join(root, 'relation-extraction/')
sys.path.append(os.path.join(basepath, 'ERNIE/code'))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [None]:
from knowledge_bert.tokenization import BertTokenizer
from knowledge_bert.modeling import BertForSequenceClassification
from knowledge_bert.optimization import BertAdam
from knowledge_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

Switch for data usage: If True FewRel data will be used, if False Future Engineering data is used

In [None]:
use_fewrel_data=False

Initialization of parameters and data paths

In [None]:
ernie_model = os.path.join(basepath, 'ERNIE/ernie_base')

max_seq_length = 256
do_train = True
do_eval = False
do_lower_case = True
train_batch_size = 32
eval_batch_size = 8
learning_rate = 2e-5
num_train_epochs = 4
warmup_proportion = 0.1
local_rank = -1
seed = 42
gradient_accumulation_steps = 1
fp16 = False
loss_scale = 128
threshold = 0.3

num_labels_task = 7

if (use_fewrel_data):
    output_dir = os.path.join(basepath, 'ERNIE/output_fewrel_test')

    data_dir = os.path.join(root, 'fewrel-training-data/fewrel/')
    test_data_file = "test_%d_classes_disjoint.json" % num_labels_task
    train_data_file = "dev_%d_classes_disjoint.json" % num_labels_task

    data_type = 'fewrel'
else:
    output_dir = os.path.join(basepath, 'ERNIE/output_fe')
    
    data_dir = os.path.join(root, 'fe-training-data')
    test_data_file = 'test_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json'
    train_data_file = 'train_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json'

    data_type = 'fe'

Loading of pre-trained entity embeddings which are provided by the authors of ERNIE approach.

In [None]:
vecs = []
vecs.append([0]*100)
with open(os.path.join(basepath, "ERNIE/kg_embed/entity2vec.vec"), 'r') as fin:
    for i, line in enumerate(fin):
        vec = line.strip().split('\t')
        vec = [float(x) for x in vec]
        vecs.append(vec)
        if (i % 1000000==0):
            print('Processed %d lines' % i)

embed = torch.FloatTensor(vecs)
embed = torch.nn.Embedding.from_pretrained(embed)

print("Shape of entity embedding: " + str(embed.weight.size()))
del vecs

Processed 0 lines
Processed 1000000 lines
Processed 2000000 lines
Processed 3000000 lines
Processed 4000000 lines
Processed 5000000 lines
Shape of entity embedding: torch.Size([5040987, 100])
time: 6min 44s


## Helper functions and commonly needed elements

Helper function for running evaluation while fine-tuning

In [None]:
def run_evaluation(dataloader, model):
    eval_loss, eval_accuracy, eval_precision, eval_recall, eval_f1 = 0, 0, 0, 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for input_ids, input_mask, segment_ids, input_ent, ent_mask, label_ids in dataloader:
        input_ent = embed(input_ent+1) # -1 -> 0
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        input_ent = input_ent.to(device)
        ent_mask = ent_mask.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, input_ent, ent_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask, input_ent, ent_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy, tmp_eval_precision, tmp_eval_recall, tmp_eval_f1, pred = accuracy_precision_recall_f1(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        eval_precision += tmp_eval_precision
        eval_recall += tmp_eval_recall
        eval_f1 += tmp_eval_f1

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    eval_precision = eval_precision / nb_eval_steps
    eval_recall = eval_recall / nb_eval_steps
    eval_f1 = eval_f1 / nb_eval_steps

    print("***** Eval results *****")
    print("   Loss: %f" % eval_loss)
    print("   Accuracy: %f" % eval_accuracy)
    print("   Precision (macro-averaged): %f" % eval_precision)
    print("   Recall (macro-averaged): %f" % eval_recall)
    print("   F1-Score (macro-averaged): %f" % eval_f1)

Helper function to calculate accuracy while fine-tuning

In [None]:
def accuracy_precision_recall_f1(out, labels):
    outputs = np.argmax(out, axis=1)
    accuracy = np.sum(outputs == labels)
    precision = precision_score(labels, outputs, average='macro', labels=np.unique(labels))
    recall = recall_score(labels, outputs, average='macro', labels=np.unique(labels))
    f1 = f1_score(labels, outputs, average='macro', labels=np.unique(labels))
    return accuracy, precision, recall, f1, outputs

## Fine-Tuning

In [None]:
from run_fewrel import InputExample, InputFeatures, DataProcessor, FewrelProcessor
from run_fewrel import convert_examples_to_features, accuracy, warmup_linear

Preparing needed information for fine-tuning and setting random seeds

In [None]:
train_batch_size = int(train_batch_size / gradient_accumulation_steps)

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

os.makedirs(output_dir, exist_ok=True)

Preparing tokenizer and model for fine-tuning

In [None]:
tokenizer = BertTokenizer.from_pretrained(ernie_model, do_lower_case=do_lower_case)

# Prepare model
model, _ = BertForSequenceClassification.from_pretrained(ernie_model,
          cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank),
          num_labels = num_labels_task)

model.to(device)

if n_gpu > 1:
    model = torch.nn.DataParallel(model)

Loading training and evaluation data

In [None]:
processor = FewrelProcessor()

# Prepare train data for fine-tuning
train_examples, label_list = processor.get_train_examples(data_dir, train_data_file)

train_features, label_map = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer, threshold, os.path.join(basepath, "ERNIE/kg_embed/entity2id.txt"))

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
all_ent = torch.tensor([f.input_ent for f in train_features], dtype=torch.long)
all_ent_masks = torch.tensor([f.ent_mask for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_ent, all_ent_masks, all_label_ids)

if local_rank == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

print('Number of training examples: %d' % len(train_examples))

#Prepare evaluation data for fine-tuning
dev_examples = processor.get_dev_examples(data_dir, test_data_file)
dev_features,_ = convert_examples_to_features(dev_examples, label_list, max_seq_length, tokenizer, threshold,os.path.join(basepath, "ERNIE/kg_embed/entity2id.txt"))

all_input_ids_dev = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
all_input_mask_dev = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
all_segment_ids_dev = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
all_label_ids_dev = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)
all_ent_dev = torch.tensor([f.input_ent for f in dev_features], dtype=torch.long)
all_ent_masks_dev = torch.tensor([f.ent_mask for f in dev_features], dtype=torch.long)

dev_data = TensorDataset(all_input_ids_dev, all_input_mask_dev, all_segment_ids_dev, all_ent_dev, all_ent_masks_dev, all_label_ids_dev)

# Run prediction for full data
dev_sampler = SequentialSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=train_batch_size)

print('Number of evaluation examples: %d' % len(dev_examples))

Number of training examples: 1068
Number of evaluation examples: 356
time: 9.34 s


Preparing optimizer

In [None]:
num_train_steps = int(
    len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_grad = ['bert.encoder.layer.11.output.dense_ent', 'bert.encoder.layer.11.output.LayerNorm_ent']
param_optimizer = [(n, p) for n, p in param_optimizer if not any(nd in n for nd in no_grad)]
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
t_total = num_train_steps

optimizer = BertAdam(optimizer_grouped_parameters,
                      lr=learning_rate,
                      warmup=warmup_proportion,
                      t_total=t_total)

Actual training process for fine-tuning

In [None]:
global_step = 0

with open(os.path.join(output_dir, 'label_map_%s_%d_classes.json' %(data_type, num_labels_task)), 'w') as f:
    json.dump(label_map, f)

print("***** Running training *****")
print("  Num examples = %d" % len(train_examples))
print("  Batch size = %d" % train_batch_size)
print("  Num steps = %d" % num_train_steps)

all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
all_ent = torch.tensor([f.input_ent for f in train_features], dtype=torch.long)
all_ent_masks = torch.tensor([f.ent_mask for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_ent, all_ent_masks, all_label_ids)

if local_rank == -1:
    train_sampler = RandomSampler(train_data)
else:
    train_sampler = DistributedSampler(train_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

train_len = len(train_data)
update_size = len(train_dataloader)//10

model.train()

for epoch in range(num_train_epochs):
    print("------------- Epoch %d -------------" % (epoch+1))
    start_time = time.time()
    
    tr_loss = 0
    tr_steps = 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
        input_ids, input_mask, segment_ids, input_ent, ent_mask, label_ids = batch
        input_ent = embed(input_ent+1).to(device) # -1 -> 0

        loss = model(input_ids, segment_ids, input_mask, input_ent.half(), ent_mask, label_ids)
        
        if n_gpu > 1:
            loss = loss.mean() # mean() to average on multi-gpu.
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps

        loss.backward()

        tr_loss += loss.item()
        tr_steps += 1

        if (step + 1) % gradient_accumulation_steps == 0:
            # modify learning rate with special warm up BERT uses
            lr_this_step = learning_rate * warmup_linear(global_step/t_total, warmup_proportion)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_this_step
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        if (step % update_size) == (update_size - 1):
            batch_loss = tr_loss/(update_size * step)
            print('[Epoch: %d, %5d/ %d points] loss for batch: %.3f' % (epoch + 1, (step + 1)*train_batch_size, train_len, batch_loss))
            
    print("Epoch finished, took %.2f seconds." % (time.time() - start_time))
    print("Cumulated loss for epoch: %f" % (tr_loss/tr_steps))

    print("***** Running evaluation *****")
    print("  Num examples = %d", len(dev_examples))
    print("  Batch size = %d", train_batch_size)
    run_evaluation(dev_dataloader, model)

# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = os.path.join(output_dir, "pytorch_model_%s_%d_classes.bin" % (data_type, num_labels_task))
torch.save(model_to_save.state_dict(), output_model_file)

***** Running training *****
  Num examples = 1068
  Batch size = 32
  Num steps = 133
------------- Epoch 1 -------------


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:766.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


[Epoch: 1,    96/ 1068 points] loss for batch: 1.017
[Epoch: 1,   192/ 1068 points] loss for batch: 0.802
[Epoch: 1,   288/ 1068 points] loss for batch: 0.740
[Epoch: 1,   384/ 1068 points] loss for batch: 0.709
[Epoch: 1,   480/ 1068 points] loss for batch: 0.689
[Epoch: 1,   576/ 1068 points] loss for batch: 0.681
[Epoch: 1,   672/ 1068 points] loss for batch: 0.677
[Epoch: 1,   768/ 1068 points] loss for batch: 0.672
[Epoch: 1,   864/ 1068 points] loss for batch: 0.667
[Epoch: 1,   960/ 1068 points] loss for batch: 0.662
[Epoch: 1,  1056/ 1068 points] loss for batch: 0.659
Epoch finished, took 35.92 seconds.
Cumulated loss for epoch: 1.914726
***** Running evaluation *****
  Num examples = %d 356
  Batch size = %d 32


  _warn_prf(average, modifier, msg_start, len(result))


***** Eval results *****
   Loss: 1.929131
   Accuracy: 0.230337
   Precision (macro-averaged): 0.062378
   Recall (macro-averaged): 0.141251
   F1-Score (macro-averaged): 0.064004
------------- Epoch 2 -------------
[Epoch: 2,    96/ 1068 points] loss for batch: 0.985
[Epoch: 2,   192/ 1068 points] loss for batch: 0.780
[Epoch: 2,   288/ 1068 points] loss for batch: 0.722
[Epoch: 2,   384/ 1068 points] loss for batch: 0.694
[Epoch: 2,   480/ 1068 points] loss for batch: 0.681
[Epoch: 2,   576/ 1068 points] loss for batch: 0.677
[Epoch: 2,   672/ 1068 points] loss for batch: 0.670
[Epoch: 2,   768/ 1068 points] loss for batch: 0.663
[Epoch: 2,   864/ 1068 points] loss for batch: 0.660
[Epoch: 2,   960/ 1068 points] loss for batch: 0.656
[Epoch: 2,  1056/ 1068 points] loss for batch: 0.654
Epoch finished, took 35.67 seconds.
Cumulated loss for epoch: 1.900312
***** Running evaluation *****
  Num examples = %d 356
  Batch size = %d 32
***** Eval results *****
   Loss: 1.890708
   Accurac