# **BERT PAIR Relation Extraction Notebook**


## Imports and environment configuration

In [None]:
# Install the notebook's dependencies; transformers is pinned to 3.0.0.
!pip install transformers==3.0.0
!pip install ipython-autotime

# Load the autotime extension installed above
# (presumably reports the run time of each cell — confirm).
%load_ext autotime

In [None]:
import os
import sys
import json
import random
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from transformers import BertTokenizer, BertForSequenceClassification

# Pick the project root: a fixed Google Drive folder when running on Colab,
# otherwise the parent of the current working directory.
if 'google.colab' not in str(get_ipython()):
  print('Running locally')
  root = Path.cwd().parent
else:
  print('Running on Google Colab')
  root = '/content/drive/My Drive/Colab Notebooks/'

# Extend the import path with the project's bert-pair code directory.
basepath = os.path.join(root, 'relation-extraction/')
sys.path.append(os.path.join(basepath, 'bert-pair/code'))

In [None]:
from pair import Pair
from fewshot_re_kit.data_loader import FewRelDatasetPair
from fewshot_re_kit.framework import FewShotREFramework

Switch for data selection: if `True`, the FewRel data is used; if `False`, the Future Engineering data is used.

In [None]:
# Dataset switch read by the configuration cells below (True -> FewRel branch).
use_fewrel_data = False

Defining relevant_relations and paths to data files

In [None]:
# Dataset-dependent configuration: the relation names to classify, the data
# files (given WITHOUT the ".json" extension, which is appended where the files
# are opened) and the output path of the generated support set.
if use_fewrel_data:
    # Relation identifiers; they are used as keys into the loaded JSON data.
    relevant_relations = ['P105', 'P135', 'P155', 'P31', 'P800', 'P921']

    data_dir = os.path.join(root, 'fewrel-training-data/bert-pair/')

    train_file = os.path.join(data_dir, 'train_wiki')
    val_file = os.path.join(data_dir, 'val_wiki')
    test_file = os.path.join(data_dir, 'val_pubmed')

    # File name encodes <number of relations>_<3>.
    # NOTE(review): the 3 presumably mirrors K = 3 in the support-set generation cell — keep in sync.
    support_set_file_name = os.path.join(basepath, 'bert-pair/support_sets/support_fewrel_%d_%d.json' %(len(relevant_relations), 3))
else:
    relevant_relations = [
        'A manufactures product B',
        'A operates B',
        # The backslashes are spelled as explicit `\\` escapes: the previous
        # `\[` / `\]` were invalid escape sequences (deprecated, SyntaxWarning
        # on recent Python) that only worked by accident. The runtime string is
        # unchanged and still contains literal backslashes.
        # NOTE(review): the backslashes may be a markdown-export artefact —
        # confirm this matches the relation key in the data files.
        'A operates \\[something\\] in location B',
        'A orders B',
        'A uses/employs charging technology B',
        'A orders something from B',
    ]

    data_dir = os.path.join(root, 'fe-training-data/')

    train_file = os.path.join(data_dir, 'train_examples_nota_manufact_operate_operatesth_order_uses_ordersth_per_label')
    # NOTE(review): validation and test point at the same file in this branch.
    val_file = os.path.join(data_dir, 'test_examples_nota_manufact_operate_operatesth_order_uses_ordersth_per_label')
    test_file = os.path.join(data_dir, 'test_examples_nota_manufact_operate_operatesth_order_uses_ordersth_per_label')

    # File name encodes <number of relations>_<3>.
    # NOTE(review): the 3 presumably mirrors K = 3 in the support-set generation cell — keep in sync.
    support_set_file_name = os.path.join(basepath, 'bert-pair/support_sets/support_fe_%d_%d.json' %(len(relevant_relations), 3))

Sentence encoder class for the BERT Pair approach; it manages both the model and the tokenizer.

In [None]:
class BERTPAIRSentenceEncoder(nn.Module):
    """Two-label BERT sequence classifier bundled with its tokenizer.

    Holds a ``BertForSequenceClassification`` model (``num_labels=2``) and a
    ``BertTokenizer``, and converts entity-annotated token lists into
    vocabulary ids with marker tokens around the head and tail entities.
    """

    def __init__(self, pretrain_path, max_length):
        """Load the classifier from ``pretrain_path`` and create the tokenizer.

        Args:
            pretrain_path: model name or path handed to ``from_pretrained``.
            max_length: stored on the instance as ``self.max_length``; it is
                not read by any method of this class.
        """
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(pretrain_path, num_labels=2)
        self.max_length = max_length
        # NOTE(review): the tokenizer is always 'bert-base-uncased', independent
        # of pretrain_path — confirm this is intended.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def forward(self, inputs):
        """Run BERT on a batch and return the first element of its output.

        Args:
            inputs: mapping with the entries ``'word'`` (passed positionally),
                ``'seg'`` (token type ids) and ``'mask'`` (attention mask).

        Returns:
            Element 0 of the model output (presumably the classification
            logits, since no labels are passed — confirm against the pinned
            transformers version).
        """
        outputs = self.bert(
            inputs['word'],
            token_type_ids=inputs['seg'],
            attention_mask=inputs['mask'],
        )
        return outputs[0]

    def tokenize(self, raw_tokens, pos_head, pos_tail):
        """Convert a token list into vocabulary ids with entity markers.

        Each token is lower-cased and split by the tokenizer. ``[unused0]`` /
        ``[unused2]`` are placed before / after the head entity and
        ``[unused1]`` / ``[unused3]`` before / after the tail entity. No
        ``[CLS]`` token is added and nothing is truncated here.

        Args:
            raw_tokens: sequence of token strings.
            pos_head: token positions of the head entity; only the first and
                last entries are used.
            pos_tail: token positions of the tail entity; only the first and
                last entries are used.

        Returns:
            The result of ``convert_tokens_to_ids`` on the marked-up pieces.
        """
        pieces = []
        for position, raw_token in enumerate(raw_tokens):
            lowered = raw_token.lower()
            # Opening markers go in front of the entity's first token.
            if position == pos_head[0]:
                pieces.append('[unused0]')
            if position == pos_tail[0]:
                pieces.append('[unused1]')
            pieces.extend(self.tokenizer.tokenize(lowered))
            # Closing markers go behind the entity's last token.
            if position == pos_head[-1]:
                pieces.append('[unused2]')
            if position == pos_tail[-1]:
                pieces.append('[unused3]')

        return self.tokenizer.convert_tokens_to_ids(pieces)

## Fine-Tuning

Defining some parameters for training of the model

In [None]:
# Episode / model settings passed to the data loaders, the encoder and the
# training framework further down.
# NOTE(review): N / K / Q presumably mean N-way, K-shot and Q queries per class
# — confirm against fewshot_re_kit.
trainN = N = 6
K = Q = 1
batch_size = 2
max_length = 100
hidden_size = 768
na_rate = 5

# Iteration counts; test_iter is not used by any cell visible here.
val_step = val_iter = test_iter = 1000
train_iter = 10_000

# Checkpoint path and name prefix for the selected dataset.
ckpt = os.path.join(
    basepath,
    'bert-pair/checkpoint/bert-pair-fewrel-N6-K1.pth.tar' if use_fewrel_data
    else 'bert-pair/checkpoint/bert-pair-fe-N6-K1.pth.tar',
)
prefix = 'bert-pair-fewrel-N6-K1.pth.tar' if use_fewrel_data else 'bert-pair-fe-N6-K1.pth.tar'

Initializing sentence encoder and model for BERT Pair

In [None]:
# Build the encoder from the stock 'bert-base-uncased' weights and wrap it in
# the project's Pair model.
sentence_encoder = BERTPAIRSentenceEncoder('bert-base-uncased', max_length=max_length)
model = Pair(sentence_encoder, hidden_size=hidden_size)

# Move the model to the GPU whenever CUDA is available.
cuda_available = torch.cuda.is_available()
if cuda_available:
    model.cuda()

Loading the training, validation and test data and initializing the FewShotREFramework with the corresponding data loaders

In [None]:
def collate_fn_pair(data):
    """Merge dataset items into one batch of stacked tensors.

    Args:
        data: non-empty sequence of ``(fusion_set, query_label)`` pairs, where
            ``fusion_set`` maps ``'word'``, ``'seg'`` and ``'mask'`` to lists
            of tensors and ``query_label`` is a list of labels.

    Returns:
        ``(batch_set, batch_label)``: a dict with one tensor per key, stacked
        along dimension 0, and a tensor holding all labels in order.
    """
    fusion_sets, query_labels = zip(*data)

    gathered = {'word': [], 'seg': [], 'mask': []}
    labels = []
    for fusion_set, query_label in zip(fusion_sets, query_labels):
        for key in fusion_set:
            gathered[key].extend(fusion_set[key])
        labels.extend(query_label)

    stacked = {key: torch.stack(tensors, 0) for key, tensors in gathered.items()}
    return stacked, torch.tensor(labels)

def get_loader_pair(name, encoder, N, K, Q, batch_size, 
        num_workers=0, collate_fn=collate_fn_pair, na_rate=0, root='./data', encoder_name='bert'):
    """Create a ``FewRelDatasetPair`` and return an iterator over its batches.

    Args:
        name, encoder, N, K, Q, na_rate, root, encoder_name: forwarded
            unchanged (in that positional order: name, encoder, N, K, Q,
            na_rate, root, encoder_name) to ``FewRelDatasetPair``.
        batch_size: number of dataset items per batch.
        num_workers: worker processes used by the ``DataLoader``.
        collate_fn: function that merges dataset items into a batch.

    Returns:
        ``iter(DataLoader)`` — an iterator, not the loader itself. Batches are
        drawn without shuffling and with pinned memory enabled.
    """
    pair_dataset = FewRelDatasetPair(name, encoder, N, K, Q, na_rate, root, encoder_name)
    loader_options = dict(
        dataset=pair_dataset,
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )
    return iter(torch.utils.data.DataLoader(**loader_options))

In [None]:
# One loader per split; they differ only in the data file and the number of
# classes (trainN for training, N for validation and test).
train_data_loader, val_data_loader, test_data_loader = (
    get_loader_pair(split_file, sentence_encoder, N=split_n, K=K, Q=Q,
                    na_rate=na_rate, batch_size=batch_size, encoder_name='bert')
    for split_file, split_n in ((train_file, trainN), (val_file, N), (test_file, N))
)

framework = FewShotREFramework(train_data_loader, val_data_loader, test_data_loader)

Training the model using the provided FewShotREFramework from the authors of the FewRel dataset

In [None]:
# Fine-tune the pair model; the checkpoint location is passed as save_ckpt.
framework.train(
    model, prefix, batch_size, trainN, N, K, Q,
    pytorch_optim=optim.SGD,
    na_rate=na_rate,
    val_step=val_step,
    pair=True,
    train_iter=train_iter,
    val_iter=val_iter,
    bert_optim=True,
    save_ckpt=ckpt,
)

## Generation of support set for BERT Pair approach

In [None]:
# Build a support set with K randomly chosen, tokenized example sentences per
# relevant relation and write it to support_set_file_name as JSON.

# Number of support sentences drawn per relation.
# NOTE: this rebinds the global K that the training cells set to 1.
K = 3

# Read both data files with context managers so the handles are closed again
# (the previous json.load(open(...)) left them open).
with open(train_file + ".json") as train_json:
    json_data_train = json.load(train_json)
with open(val_file + ".json") as val_json:
    json_data_val = json.load(val_json)

# NOTE(review): for relations present in both files the validation entries
# replace the training entries (dict merge, no concatenation) — confirm intended.
json_data = {**json_data_train, **json_data_val}

labels_support = []
support = []

# building support set out of random example sentences from the dataset
for i, class_name in enumerate(relevant_relations):
    examples = json_data[class_name]
    # K distinct example indices; raises ValueError if the relation has fewer
    # than K examples. No seed is set, so the selection differs between runs.
    indices = np.random.choice(len(examples), K, replace=False)

    for j in indices:
        item = examples[j]
        # item['h'][2][0] / item['t'][2][0] are handed to tokenize() as the
        # head / tail token positions (presumably the first mention of each
        # entity — confirm against the data format).
        word = sentence_encoder.tokenize(item['tokens'], item['h'][2][0], item['t'][2][0])
        support.append(word)

    labels_support.append((i, class_name))

with open(support_set_file_name, 'w') as support_file:
    support_obj = {
        'labels_support': labels_support,
        'support_set': support,
    }
    json.dump(support_obj, support_file)