<a href="https://colab.research.google.com/github/utkarshk30/Relation-extraction/blob/main/20ucs215_RelationExtraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Necessary Libraries

In [None]:
import os
import csv
import torch
import transformers
import numpy as np
import pandas as pd
import torch.utils.data as Data

from transformers import AutoTokenizer,WEIGHTS_NAME,CONFIG_NAME,XLNetForSequenceClassification,AdamW
from sklearn.model_selection import train_test_split

In [None]:
from spacy.training.example import Example
from transformers import AutoModelForTokenClassification, AutoTokenizer
!pip install spacy_transformers
import spacy_transformers

Loading the TACRED dataset and defining a function to read its examples

In [None]:
import json
import pickle
from pprint import pprint
from google.colab import drive
# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive/')

# Change the working directory; relative paths such as 'data/refactored/...' used by later
# cells resolve against it.
%cd '/content/drive/My Drive/Utkarsh_Khurana/src'

# Load the three TACRED splits. These are read from absolute paths under /content,
# not from the Drive directory selected above, so the files must be present in the runtime.
with open('/content/train.json', 'r') as f:
  tacred_train = json.load(f)

with open('/content/dev.json', 'r') as f:
  tacred_dev = json.load(f)

with open('/content/test.json', 'r') as f:
  tacred_test = json.load(f)

def read_tacred(ex):
  """Print a human-readable summary of one raw TACRED example.

  Shows the subject tokens, the object tokens and the relation label, then
  pretty-prints the whole sentence (tokens joined by single spaces).

  Args:
    ex: a TACRED example dict with 'token', 'subj_start', 'subj_end',
      'obj_start', 'obj_end' (inclusive end indices) and 'relation'.
  """
  subject_tokens = ex['token'][ex['subj_start']:ex['subj_end'] + 1]
  print('subject: ', ' '.join(subject_tokens))

  object_tokens = ex['token'][ex['obj_start']:ex['obj_end'] + 1]
  print('object: ', ' '.join(object_tokens))

  print('relation: ', ex['relation'], '\n')

  sentence = ' '.join(ex['token'])
  pprint(sentence)

In [None]:
read_tacred(tacred_train[21404])

subject:  Marco Contiero
object:  Greenpeace European Unit
relation:  per:employee_of 

('`` We look forward to the day when the European Commission also puts defence '
 'of the public interest before the interests of US agribusiness and its '
 "lobbyists in Brussels and at the WTO , '' said Marco Contiero , policy "
 'adviser on GMOs at Greenpeace European Unit .')


Preprocessing step to convert the TACRED dataset into a format suitable for a relation extraction task

In [None]:
def get_entities_from_example(ex, entity_label):
    """Return the tokens of `ex` whose label equals `entity_label`, joined by spaces.

    Args:
        ex: dict with parallel sequences under 'token' and 'label'.
        entity_label: label value to select.

    Returns:
        A single string; empty when no token carries the label.
    """
    selected_tokens = []
    for token, label in zip(ex['token'], ex['label']):
        if label == entity_label:
            selected_tokens.append(token)
    return ' '.join(selected_tokens)

def get_indices_from_per_title_reln(ex):
    """Derive person/organisation token indices for an example using its NER tags.

    Scans ex['stanford_ner'] for maximal runs of consecutive 'ORGANIZATION'
    tags. If exactly one such run exists, it is taken as the organisation and
    the subject span as the person.

    Args:
        ex: dict with 'stanford_ner' (one tag per token) and inclusive
            'subj_start' / 'subj_end' token indices.

    Returns:
        (person_indices, org_indices) as two sets of token indices when exactly
        one organisation run is found, otherwise None.
    """
    ner_tags = ex['stanford_ner']
    candidate_orgs = []
    next_free = 0  # first index not yet covered by an already-recorded run

    for idx, ent_type in enumerate(ner_tags):
        # `>=` (not `>`) so that a run starting at token 0 is not skipped.
        if ent_type != 'ORGANIZATION' or idx < next_free:
            continue
        end = idx
        while end < len(ner_tags) and ner_tags[end] == 'ORGANIZATION':
            end += 1
        candidate_orgs.append((idx, end))  # end is exclusive
        next_free = end  # don't consider the same span twice

    if len(candidate_orgs) != 1:
        return None

    comp_start, comp_end = candidate_orgs[0]
    person_indices = set(range(ex['subj_start'], ex['subj_end'] + 1))
    org_indices = set(range(comp_start, comp_end))
    return person_indices, org_indices

def convert_tacred(examples):
    """Convert raw TACRED examples into person/organisation token-labelling examples.

    Only examples whose relation is in RELATIONS are kept. For each kept example
    the person and organisation token indices are taken from the subject/object
    spans (which one is the person depends on the relation), and a per-token
    label list is built: 0 = neither, 1 = person token, 2 = organisation token.
    Examples whose person text (lower-cased) is in `bad_subjects` are dropped.

    Args:
        examples: iterable of TACRED example dicts. Keys read here: 'id',
            'relation', 'token', 'stanford_pos', 'stanford_ner',
            'stanford_head', 'stanford_deprel', 'subj_start', 'subj_end',
            'obj_start', 'obj_end'.

    Returns:
        A list of dicts with keys 'id', 'relation', 'token', 'pos', 'ner',
        'dep_head', 'dep_reln', 'per_span', 'org_span' and 'label'.
    """
    RELATIONS = {'org:founded_by', 'org:shareholders', 'org:top_members/employees', 'per:employee_of'}
    bad_subjects = {'he', 'his', 'she', 'mom', 'her'}
    task_split = []

    for example in examples:
        relation = example['relation']
        if relation not in RELATIONS:
            continue

        if relation == 'per:title':
            # NOTE(review): unreachable while 'per:title' is absent from RELATIONS above;
            # confirm whether that relation was meant to be included.
            # The helper returns None when it cannot pick a single organisation; skip
            # those explicitly instead of swallowing every exception with a bare except.
            indices = get_indices_from_per_title_reln(example)
            if indices is None:
                continue
            person_indices, org_indices = indices
        elif relation == 'per:employee_of':
            # Subject is the person, object is the organisation.
            person_indices = set(range(example['subj_start'], example['subj_end'] + 1))
            org_indices = set(range(example['obj_start'], example['obj_end'] + 1))
        else:
            # org:* relations: subject is the organisation, object is the person.
            person_indices = set(range(example['obj_start'], example['obj_end'] + 1))
            org_indices = set(range(example['subj_start'], example['subj_end'] + 1))

        new_ex = {
            'id': example['id'],
            'relation': relation,
            'token': example['token'],
            'pos': example['stanford_pos'],
            'ner': example['stanford_ner'],
            'dep_head': [idx - 1 for idx in example['stanford_head']],  # convert from 1-based index
            'dep_reln': example['stanford_deprel'],
            'per_span': list(person_indices),
            'org_span': list(org_indices),
            # Person takes precedence if an index were ever in both sets.
            'label': [1 if i in person_indices else 2 if i in org_indices else 0
                      for i in range(len(example['token']))]
        }

        # Drop examples whose "person" is only a pronoun or similar uninformative word.
        if get_entities_from_example(new_ex, 1).lower() in bad_subjects:
            continue

        task_split.append(new_ex)

    return task_split

In [None]:

# Sanity check on the first five training examples: the object span of tacred_train[0]
# must equal the tokens labelled 1 (person) in the first converted example.
# NOTE(review): this only holds if tacred_train[0] passes the relation filter in convert_tacred
# and is a relation whose object is treated as the person — confirm against the data.
task_sanity = convert_tacred(tacred_train[:5])
assert tacred_train[0]['token'][tacred_train[0]['obj_start']:tacred_train[0]['obj_end']+1] == [t for t, l in zip(task_sanity[0]['token'], task_sanity[0]['label']) if l == 1]

In [None]:
# Process and serialize the refactored dataset.

train, dev, test = convert_tacred(tacred_train), convert_tacred(tacred_dev), convert_tacred(tacred_test)

# Create the output directory first so that a fresh working directory does not make
# the open(..., 'wb') calls below fail with FileNotFoundError.
os.makedirs('data/refactored', exist_ok=True)

with open('data/refactored/train.pkl', 'wb') as d:
  pickle.dump(train, d)

with open('data/refactored/dev.pkl', 'wb') as d:
  pickle.dump(dev, d)

with open('data/refactored/test.pkl', 'wb') as d:
  pickle.dump(test, d)

Evaluation Metrics

In [None]:
!pip install --quiet seqeval

from seqeval.metrics import classification_report
from seqeval.scheme import IOB2

def schemify(labels, person_span, org_span):
  """Convert 0/1/2 token labels to IOB2 tags for evaluating the naive token-class approach.

  The smallest index in `person_span` becomes 'B-PER' and the smallest index in
  `org_span` becomes 'B-ORG'. Every other token is tagged from its label:
  1 -> 'I-PER', 2 -> 'I-ORG', anything else -> 'O'.

  Args:
    labels: per-token labels (0, 1 or 2).
    person_span: collection of person token indices.
    org_span: collection of organisation token indices.

  Returns:
    A list of IOB2 tag strings, one per label. If either span is empty the
    result is all 'O'.
  """
  if not len(person_span) or not len(org_span):
    return ['O'] * len(labels)

  first_per = min(person_span)
  first_org = min(org_span)

  tags = []
  for position, label in enumerate(labels):
    if position == first_per:
      tag = 'B-PER'
    elif position == first_org:
      tag = 'B-ORG'
    else:
      tag = 'I-PER' if label == 1 else 'I-ORG' if label == 2 else 'O'
    tags.append(tag)
  return tags

# Gold IOB2 tag sequences for each split, built from the converted examples' labels and spans.
TC_TRUTH_TRAIN, TC_TRUTH_DEV, TC_TRUTH_TEST = (
    [schemify(ex['label'], set(ex['per_span']), set(ex['org_span'])) for ex in split]
    for split in (train, dev, test)
)

[?25l[K     |███████▌                        | 10 kB 37.5 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 43.2 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 41.1 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 24.6 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.9 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


 **Fine-tuning XLNet for Relation Extraction**


In [None]:
!pip install -U pip setuptools wheel
!pip install spacy
!python -m spacy download en_core_web_trf
!pip install spacy transformers

import pickle
import numpy as np
from pprint import pprint
from google.colab import drive
from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import spacy

drive.mount('/content/drive/')
# NOTE(review): this working directory ('Riley/src') differs from the one selected by the
# earlier `%cd` ('Utkarsh_Khurana/src'), which is where the pickles read below were written.
# Confirm that data/refactored/*.pkl exists here, or align the two paths.
%cd '/content/drive/My Drive/Riley/src'

# Reload the converted splits from disk (paths are relative to the directory set above).
# pickle.load can execute arbitrary code from the file; only load pickles you produced yourself.
with open('data/refactored/train.pkl', 'rb') as p:
  train = pickle.load(p)

with open('data/refactored/dev.pkl', 'rb') as p:
  dev = pickle.load(p)

with open('data/refactored/test.pkl', 'rb') as p:
  test = pickle.load(p)


# Relation label name; the same string is used as the key in the relation dicts
# built by convert_to_spacy.
LABELS = ['PERSON_AND_COMPANY']

# Sets an environment variable for the runtime (5368709120 bytes = 5 GiB); presumably this
# raises tcmalloc's large-allocation report threshold to quiet warnings — confirm.
%env TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD=5368709120

In [None]:
# Blank English pipeline; convert_to_spacy uses its vocab to build Doc objects.
nlp = spacy.blank("en")
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE(review): `tokenizers` and `model1` are not referenced by any other visible cell, and this
# loads the checkpoint through AutoModelForCausalLM while the classifier trained later is created
# with XLNetForSequenceClassification — confirm whether this download is still needed.
tokenizers = AutoTokenizer.from_pretrained("xlnet-base-cased")
model1 = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Silence transformers warnings/info messages and pick the tokenizer and compute device.
transformers.logging.set_verbosity_error()
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training hyper-parameters / bookkeeping.
best_score = 0
batch_size = 32
classes_list = list()  # placeholder; overwritten from data_processor below

# Target paths for saving model weights and config.
output_dir = './models/'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# NOTE(review): DataProcessor is not defined in any visible cell of this notebook; this cell
# fails on a fresh kernel unless it is defined/imported elsewhere.
# 'train.json' is a relative path (resolved against the current working directory), whereas
# the TACRED file was earlier read from '/content/train.json' — confirm the intended file.
filename = 'train.json'
data_processor = DataProcessor(filename)
name_list = data_processor.name_list
sentence_list = data_processor.sentence_list
label_list = data_processor.label_list
classes_list = data_processor.classes_list
num_classes = data_processor.num_classes

In [None]:
# NOTE(review): DataConverter is not defined in any visible cell of this notebook.
# Its attributes are used below as the model inputs (ids, token types, attention mask) and targets;
# presumably it tokenizes the sentences with the XLNet tokenizer — confirm.
data_converter = DataConverter(name_list, sentence_list, label_list)
input_ids = data_converter.input_ids
token_type_ids = data_converter.token_type_ids
attention_mask = data_converter.attention_mask
labels = data_converter.target

In [None]:
# Split all four aligned arrays in ONE call so the same shuffled indices are applied to each.
# (Previously three separate calls relied on an identical random_state to stay aligned.)
# train_test_split returns a train/validation pair per input array, in input order.
(train_inputs, val_inputs,
 train_token, val_token,
 train_mask, val_mask,
 train_labels, val_labels) = train_test_split(
    input_ids, token_type_ids, attention_mask, labels, random_state=1, test_size=0.1)

# Each dataset item is (input_ids, token_type_ids, attention_mask, label); the training loop
# indexes batches in that order.
train_data = Data.TensorDataset(train_inputs, train_token, train_mask, train_labels)
train_dataloader = Data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

validation_data = Data.TensorDataset(val_inputs, val_token, val_mask, val_labels)
validation_dataloader = Data.DataLoader(validation_data, batch_size=batch_size, shuffle=True)

# Pretrained XLNet with a sequence-classification head sized to the number of relation classes.
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=num_classes).to(device)

**Converting Data to Spacy Format**

In [None]:
def convert_to_spacy(dataset, outf):
  """Convert examples into spaCy Docs with entities and relation labels, and save them.

  For every example a Doc is built from its tokens, its entity spans are set as
  doc.ents, and doc._.rel is filled with one entry per ordered pair of entity
  start indices: {'PERSON_AND_COMPANY': 1.0} for the (person start, organisation
  start) pair and 0.0 for every other pair. Docs with fewer than two entities are
  dropped. The number of entity spans that could not be created is printed.

  Entities are created directly from their token indices. The earlier version
  searched for the entity text inside doc.text, which could match an earlier or
  partial occurrence of the same string and so miss or misplace the entity.

  Args:
    dataset: iterable of converted examples (dicts with 'token', 'per_span',
      'org_span', plus whatever get_span2entity reads).
    outf: path of the DocBin file to write.
  """
  misses = 0
  Doc.set_extension('rel', default={}, force=True)

  docs = []

  for ex in dataset:
    span_starts, entities, relations = set(), [], {}
    doc = Doc(nlp.vocab, words=ex['token'])

    # Parse the entities. get_span2entity (not defined in the visible cells) is iterated as a
    # mapping {(start, end): label}; `end` is exclusive, matching the range(sb, se) token
    # selection used by the previous implementation.
    for (sb, se), ent in get_span2entity(ex).items():
      if not 0 <= sb < se <= len(doc):
        # Empty or out-of-range span: cannot be turned into an entity.
        misses += 1
        continue
      entities.append(Span(doc, sb, se, label=ent))
      span_starts.add(sb)
    doc.ents = entities

    # Parse the relations: only (person start, organisation start) is a positive pair.
    for s1 in span_starts:
      for s2 in span_starts:
        is_target_pair = s1 == min(ex['per_span']) and s2 == min(ex['org_span'])
        relations[(s1, s2)] = {'PERSON_AND_COMPANY': 1.0 if is_target_pair else 0.0}
    doc._.rel = relations

    # A relation needs at least two entities.
    if len(doc.ents) > 1:
      docs.append(doc)

  print(misses)
  # store_user_data=True keeps the custom `rel` extension in the serialized file.
  docbin = DocBin(docs=docs, store_user_data=True)
  docbin.to_disk(outf)

In [None]:
# Write one DocBin per split (paths are relative to the current working directory).
# Each call prints the number of entity spans that could not be converted.
convert_to_spacy(train, 'data/train.spacy')
convert_to_spacy(dev, 'data/dev.spacy')
convert_to_spacy(test, 'data/test.spacy')

6
1
0


**Evaluation on Training Dataset**

In [None]:
# Environment variables for the spaCy project commands. The recorded output shows the
# TRF_PATH / MODEL_STRING values appearing in the `spacy train` command line
# (transformer model name and output directory); project.yml itself is not visible here.
# NOTE(review): the saved output of this cell was produced with
# "philschmid/distilroberta-base-ner-conll2003", not the xlnet values set here — re-run to refresh.
%env TRF_PATH="model/xlnet-base-cased"
%env MODEL_STRING ="xlnet-base-cased"
%env TRAIN_BIN="train.spacy"
%env DEV_BIN="dev.spacy"
%env TEST_BIN="test.spacy"

# Runs the project's `train` command (this trains the pipeline; it is not only an evaluation).
!spacy project run train

env: TRF_PATH="philschmid/distilroberta-base-ner-conll2003"
env: MODEL_STRING="distilroberta-base-ner-conll2003"
env: TRAIN_BIN="train.spacy"
env: DEV_BIN="dev.spacy"
env: TEST_BIN="test.spacy"
[1m
Running command: /usr/bin/python3 -m spacy train configs/rel_trf.cfg --output models/distilroberta-base-ner-conll2003 --components.transformer.model.name philschmid/distilroberta-base-ner-conll2003 --paths.train data/train.spacy --paths.dev data/dev.spacy -c ./scripts/custom_functions.py --gpu-id 0
[38;5;4mℹ Saving to output directory:
models/distilroberta-base-ner-conll2003[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2022-06-20 13:12:52,176] [INFO] Set up nlp object from config
[2022-06-20 13:12:52,189] [INFO] Pipeline: ['transformer', 'relation_extractor']
[2022-06-20 13:12:52,195] [INFO] Created vocabulary
[2022-06-20 13:12:52,196] [INFO] Finished initializing nlp object
Some weights of the model checkpoint at philschmid/distilroberta-base-ner-conll2003 were not used when initializing Roberta

**Evaluation On Test Dataset**

In [None]:
# Same environment variables as for training; the recorded output shows MODEL_STRING appearing
# in the path of the evaluated model (models/<MODEL_STRING>/model-best).
# NOTE(review): the saved output of this cell was produced with the distilroberta model,
# not the xlnet values set here — re-run to refresh.
%env TRF_PATH="model/xlnet-base-cased"
%env MODEL_STRING ="xlnet-base-cased"
%env TRAIN_BIN="train.spacy"
%env DEV_BIN="dev.spacy"
%env TEST_BIN="test.spacy"

# Runs the project's `evaluate` command; per the recorded output it evaluates on data/test.spacy.
!spacy project run evaluate

env: TRF_PATH="philschmid/distilroberta-base-ner-conll2003"
env: MODEL_STRING="distilroberta-base-ner-conll2003"
env: TRAIN_BIN="train.spacy"
env: DEV_BIN="dev.spacy"
env: TEST_BIN="test.spacy"
[1m
Running command: /usr/bin/python3 ./scripts/evaluate.py models/distilroberta-base-ner-conll2003/model-best data/test.spacy False

Random baseline:
threshold 0.00 	 {'rel_micro_p': '3.50', 'rel_micro_r': '100.00', 'rel_micro_f': '6.77'}
threshold 0.05 	 {'rel_micro_p': '3.51', 'rel_micro_r': '95.09', 'rel_micro_f': '6.77'}
threshold 0.10 	 {'rel_micro_p': '3.54', 'rel_micro_r': '90.75', 'rel_micro_f': '6.81'}
threshold 0.20 	 {'rel_micro_p': '3.52', 'rel_micro_r': '80.35', 'rel_micro_f': '6.75'}
threshold 0.30 	 {'rel_micro_p': '3.56', 'rel_micro_r': '71.39', 'rel_micro_f': '6.78'}
threshold 0.40 	 {'rel_micro_p': '3.54', 'rel_micro_r': '61.27', 'rel_micro_f': '6.69'}
threshold 0.50 	 {'rel_micro_p': '3.37', 'rel_micro_r': '48.55', 'rel_micro_f': '6.31'}
threshold 0.60 	 {'rel_micro_p': '3.4

Finding Validation Accuracy

In [None]:
# Fine-tune the sequence classifier for `epoch` passes over the training dataloader.
# NOTE(review): `optimizer` and `ModelEvaluator` are not defined in any visible cell of this
# notebook; this cell fails on a fresh kernel unless they are defined elsewhere.
epoch = 2
for _ in range(epoch):
    for i, batch in enumerate(train_dataloader):
        # Batch layout follows the TensorDataset: (input_ids, token_type_ids, attention_mask, labels).
        batch = tuple(t.to(device) for t in batch)
        # With labels supplied, the first element of the model output is used as the loss.
        loss = model(batch[0], token_type_ids=batch[1], attention_mask=batch[2], labels=batch[3])[0]
        print(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # A new evaluator is created on every step, and `i % 1 == 0` is always true, so the
        # whole validation set is evaluated after every training batch.
        # NOTE(review): if evaluator.eval switches the model to eval mode, nothing visible here
        # switches it back with model.train() — confirm.
        evaluator = ModelEvaluator(device)
        if i % 1 == 0:
            evaluator.eval(model, validation_dataloader)

2.593186378479004
Validation Accuracy: 0.1347934472934473
3.0099892616271973
Validation Accuracy: 0.114494301994302
2.777106523513794
Validation Accuracy: 0.1398682336182336
2.742886543273926
Validation Accuracy: 0.16310541310541313
2.644493579864502
Validation Accuracy: 0.17094017094017092
2.7937374114990234
Validation Accuracy: 0.14841524216524216
2.5341219902038574
Validation Accuracy: 0.13417022792022792
2.4992127418518066
Validation Accuracy: 0.1339031339031339
2.587545871734619
Validation Accuracy: 0.15411324786324787
3.078270435333252
Validation Accuracy: 0.1607905982905983
2.791926860809326
Validation Accuracy: 0.17556980056980057
2.6393654346466064
Validation Accuracy: 0.17405626780626782
2.7735531330108643
Validation Accuracy: 0.17405626780626782
2.620316743850708
Validation Accuracy: 0.16452991452991453
2.4722647666931152
Validation Accuracy: 0.16773504273504275
2.6068339347839355
Validation Accuracy: 0.18046652421652423
2.6746883392333984
Validation Accuracy: 0.187678062678