# Data preparation & evaluation Notebook

This notebook provides a collection of functions for generating and evaluating training data from Inception data, as well as for selecting training data from the FewRel dataset.

In [None]:
import json
import os
import random
import copy
from nltk.tokenize import sent_tokenize
from pathlib import Path

# Resolve the project root depending on where the notebook is executed.
running_on_colab = 'google.colab' in str(get_ipython())
if not running_on_colab:
  print('Running locally')
  root = Path.cwd().parent
else:
  print('Running on Google Colab')
  root = '/content/drive/My Drive/Colab Notebooks/'

# Location of the relation-extraction code relative to the resolved root.
basepath = os.path.join(root, 'relation-extraction/')

## Preparation of annotated data generated with Inception 

In [None]:
# Directory (joined onto `root` by the following cells) that holds the example files.
data_dir = "fe-training-data/"

Read JSON file with all extracted examples

In [None]:
# Load every extracted example from the JSON file into `lines`.
inputfile = os.path.join(root, data_dir, "all_examples.json")
with open(inputfile, "r") as in_file:
    lines = json.loads(in_file.read())

In [None]:
# File with all filtered examples, followed by its train / test / validation splits.
outputfile = os.path.join(root, data_dir, "examples_nota_manufact_operate_operatesth_order_uses_ordersth.json")
train_file = os.path.join(root, data_dir, "train_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json")
test_file = os.path.join(root, data_dir, "test_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json")
val_file = os.path.join(root, data_dir, "val_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json")

# Relation labels that are kept for training.
# The "operates [something] in location" label contains literal backslashes.
# They are written as escaped backslashes here because a bare `\[` is an
# invalid escape sequence that Python warns about; the string value itself is
# unchanged.
relevant_relations = [
    'NOTA',
    'A manufactures product B',
    'A operates B',
    'A operates \\[something\\] in location B',
    'A orders B',
    'A uses/employs charging technology B',
    'A orders something from B',
]

# Extended label set: the seven labels above plus two additional relations.
relevant_relations_9 = relevant_relations + [
    'A researches/develops technology or product B',
    'A delivers something to B',
]

### Data evaluation and insights

Helper function for cleaning the incoming examples from inception data.

In [None]:
def _strip_trailing_noise(ent, noise_chars):
    """Strip trailing noise characters from one entity, in place.

    ``ent`` is a mutable sequence whose item 0 is the entity name and whose
    item 2 is the end offset. Every character removed from the end of the name
    also decrements the end offset, so the span keeps matching the name.
    """
    name, end = ent[0], ent[2]
    # Repeat until the name no longer ends in a noise character, so the result
    # does not depend on the order in which those characters appear.
    while name.endswith(noise_chars):
        name, end = name[:-1], end - 1
        # Removing a noise character may expose blanks that preceded it.
        while name.endswith(' '):
            name, end = name[:-1], end - 1
    ent[0], ent[2] = name, end


def clean_examples(data):
    """Return cleaned deep copies of the usable examples in ``data``.

    Each example is a dict with an ``'ents'`` list holding two entities; for
    every entity, item 0 is its name and item 2 its end offset. The input is
    never modified.

    Cleaning rules:
    * Examples in which either entity name is numeric are dropped silently.
    * Trailing ``(``, ``-`` and ``,`` characters (and blanks exposed by their
      removal) are stripped from both entity names, adjusting the end offsets.
    * Examples whose entity name is empty or numeric after stripping are
      printed and dropped.

    Returns:
        list: the cleaned examples, in input order.
    """
    noise_chars = ('(', '-', ',')  # single characters only: one is removed per step
    cleaned_examples = []
    for sent in copy.deepcopy(data):
        head, tail = sent['ents'][0], sent['ents'][1]
        if head[0].isnumeric() or tail[0].isnumeric():
            continue

        _strip_trailing_noise(head, noise_chars)
        _strip_trailing_noise(tail, noise_chars)

        still_valid = (
            head[0] and tail[0]
            and not head[0].isnumeric()
            and not tail[0].isnumeric()
        )
        if still_valid:
            cleaned_examples.append(sent)
        else:
            # Show what was rejected so the annotation can be inspected.
            print(sent)
    return cleaned_examples

Counting the number of examples per relation and the number of distinct origin sentences.

In [None]:
# Keep only the examples of the relevant relations and clean their entities.
valid_training_data = clean_examples(
    [example for example in lines if example['label'] in relevant_relations]
)

distinct_sentences = {example['text'] for example in valid_training_data}
print('Number of relevant examples in dataset: %d' % len(valid_training_data))
print('Number of distinct origin sentences: %d' % len(distinct_sentences))

# Group the cleaned examples by their relation label.
valid_exp_per_label = {}
for line in valid_training_data:
    valid_exp_per_label.setdefault(line['label'], []).append(line)

print('\nOverview of examples per relation type:')
valid_x = {label: len(group) for label, group in valid_exp_per_label.items()}
# Most frequent relation first; labels with equal counts keep their first-seen order.
for key in sorted(valid_x, key=valid_x.get, reverse=True):
    print('%d examples for relation %s' %(valid_x[key], key))

Number of relevant examples in dataset: 1780
Number of distinct origin sentences: 707

Overview of examples per relation type:
396 examples for relation A manufactures product B
345 examples for relation A orders B
286 examples for relation NOTA
236 examples for relation A operates B
200 examples for relation A operates \[something\] in location B
160 examples for relation A uses/employs charging technology B
157 examples for relation A orders something from B


Dumping the cleaned, valid examples to the output file.

In [None]:
# Write the list of cleaned examples to `outputfile` as one JSON array.
with open(outputfile, "w") as out_file:
    json.dump(valid_training_data, out_file)

### Train-Test-Split of data

Splitting the generated examples into disjoint training, test and validation datasets (60/20/20, stratified by relation label).

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed so that re-running the notebook reproduces exactly the same split.
SPLIT_SEED = 42

# Read through a context manager so the file handle is closed again.
with open(outputfile, "r") as in_file:
    df = pd.read_json(in_file)
y = df[['label']]

# First split off 20% as test data, then 25% of the remainder as validation
# data, i.e. 60/20/20 overall. Both splits are stratified by relation label.
train, test, y_train, _ = train_test_split(
    df, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)
train, val, _, _ = train_test_split(
    train, y_train, test_size=0.25, stratify=y_train, random_state=SPLIT_SEED
)

# Convert the frames back to plain lists of dicts (one per example).
train_examples = json.loads(train.to_json(orient="records"))
test_examples = json.loads(test.to_json(orient="records"))
val_examples = json.loads(val.to_json(orient="records"))

for split_path, split_records in (
    (train_file, train_examples),
    (test_file, test_examples),
    (val_file, val_examples),
):
    with open(split_path, "w") as out_file:
        json.dump(split_records, out_file)

In [None]:
# Report how many examples ended up in each split.
split_reports = (
    ('Training dataset length: %d', train),
    ('Test dataset length: %d', test),
    ('Validation dataset length: %d', val),
)
for report_template, split_frame in split_reports:
    print(report_template % len(split_frame))

Training dataset length: 1068
Test dataset length: 356
Validation dataset length: 356


### Generation of Per-label-format (needed for BERT Pair)

Helper function for converting a sentence with entities into the format required by BERT Pair, in which each entity is identified by the indices of the tokens that contain it rather than by fixed character positions in the sentence.

In [None]:
def _entity_token_indices(tokens, name, start, end):
    """Return the indices of the tokens attributed to one entity.

    A token is attributed to the entity when its running character position
    lies in ``[start - 1, end]``, or when it lies in ``[start - 2, end + 2]``
    and the token contains ``name``.
    """
    indices = []
    # Running position; assumes tokens are separated by exactly one character.
    pos = 0
    for i, token in enumerate(tokens):
        starts_in_span = start - 1 <= pos <= end
        starts_near_span = start - 2 <= pos <= end + 2 and name in token
        if starts_in_span or starts_near_span:
            indices.append(i)
        pos += len(token) + 1
    return indices


def convert_sentence_to_query(sentence, ent1, ent2):
    """Convert a sentence and its two entities into a token-based query object.

    Args:
        sentence (str): the sentence text; it is tokenized with ``str.split()``.
        ent1, ent2: entity descriptions; only item 1 (start offset) and item 2
            (end offset) are read.

    Returns:
        dict: ``'tokens'`` holds the whitespace tokens; ``'h'`` and ``'t'``
        hold ``[name, '', [token_indices]]`` for the first and second entity,
        where ``name`` is the lower-cased, left-stripped text sliced from the
        sentence.

    The object is printed when no token could be attributed to one of the
    entities, so such cases can be inspected.
    """
    def entity_name(ent):
        # Clamp the slice start: for a start offset of 0, `start - 1` would be
        # -1 and slice from the end of the sentence, yielding an empty name.
        return sentence[max(ent[1] - 1, 0):ent[2]]

    ent1_name = entity_name(ent1)
    ent2_name = entity_name(ent2)

    tokens = sentence.split()
    ent1_tokens = _entity_token_indices(tokens, ent1_name, ent1[1], ent1[2])
    ent2_tokens = _entity_token_indices(tokens, ent2_name, ent2[1], ent2[2])

    sentence_obj = {
        'tokens': tokens,
        'h': [ent1_name.lower().lstrip(), '', [ent1_tokens]],
        't': [ent2_name.lower().lstrip(), '', [ent2_tokens]],
    }

    if not ent1_tokens or not ent2_tokens:
        print (sentence_obj)

    return sentence_obj

Generation of the BERT Pair specific format for train-, test- and validation-dataset

In [None]:
data_files = [train_file, test_file, val_file]
for data_file in data_files:
    # Load into a dedicated name so that the `lines` variable holding the
    # examples from all_examples.json is not overwritten by this loop.
    with open(data_file, "r") as in_file:
        split_examples = json.load(in_file)

    # Group the examples by relation label; NOTA examples are left out.
    exp_per_label = {}
    for example in split_examples:
        if example['label'] != 'NOTA':
            exp_per_label.setdefault(example['label'], []).append(example)

    # Convert every example of every label into the token-based query format.
    output_examples = {
        label: [
            convert_sentence_to_query(item['text'], item['ents'][0], item['ents'][1])
            for item in group
        ]
        for label, group in exp_per_label.items()
    }

    # Written next to the input file, with "_per_label" appended to its stem.
    with open(os.path.splitext(data_file)[0] + "_per_label.json", "w") as out_file:
        json.dump(output_examples, out_file)

## Generation of Test and Train data from FewRel containing NOTA class

Definition of file names and relevant relation classes

In [None]:
# NOTE: `data_dir` and `relevant_relations` replace the values defined for the
# Inception data further up. Re-run the notebook from the top before going back
# to the earlier cells.
data_dir = os.path.join(root, 'fewrel-training-data/fewrel/')

in_train_data_file = os.path.join(data_dir,"train_80_classes.json")
in_test_data_file = os.path.join(data_dir,"test_80_classes.json")
in_dev_data_file = os.path.join(data_dir,"dev_80_classes.json")

out_train_data_file = os.path.join(data_dir,"train_7_classes_disjoint.json")
out_test_data_file = os.path.join(data_dir,"test_7_classes_disjoint.json")
out_dev_data_file = os.path.join(data_dir,"dev_7_classes_disjoint.json")

# FewRel relation ids that keep their own label in the generated datasets.
relevant_relations = ['P105', 'P135', 'P155', 'P31', 'P800', 'P921']

# FewRel relation ids whose examples may be relabelled as NOTA.
nota_train_classes = ['P740', 'P26', 'P710', 'P86', 'P931', 'P361', 'P3450', 'P57', 'P6', 'P175', 'P22', 'P1877', 'P1411', 'P178', 'P127', 'P1923', 'P412', 'P2094', 'P1408', 'P137', 'P39', 'P974', 'P118', 'P136', 'P27', 'P364', 'P150', 'P40', 'P176', 'P1303', 'P495', 'P1346', 'P937', 'P25', 'P264', 'P102', 'P460', 'P159', 'P400', 'P991']

# Fixed seed so that the same 25 classes are drawn on every run. A dedicated
# generator is used so the state of the global `random` module is not touched.
NOTA_CLASS_SEED = 42
nota_inference_candidates = ['P407', 'P403', 'P449', 'P551', 'P59', 'P177', 'P674', 'P706', 'P527', 'P84', 'P306', 'P123', 'P750', 'P413', 'P206', 'P466', 'P131', 'P463', 'P3373', 'P58', 'P1344', 'P1435', 'P410', 'P140', 'P355', 'P101', 'P156', 'P241', 'P641', 'P276', 'P4552', 'P1001', 'P106', 'P17']
nota_inference_classes = random.Random(NOTA_CLASS_SEED).sample(nota_inference_candidates, 25)

Reading FewRel data from inputfile

In [None]:
# The handle is named `in_file` so that the `inputfile` path variable defined
# earlier in the notebook is not replaced by a (closed) file object.
with open(in_train_data_file, 'r', encoding='utf-8') as in_file:
    data = json.load(in_file)

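Helper function for generating a dataset from a given input file: it keeps the examples of the relevant relations and adds an artificially created NOTA class built from randomly sampled examples of other relations.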

In [None]:
def generate_dataset_with_nota(input_data_file, relevant_relations, nota_classes, seed=None):
    """Build a dataset of relevant examples plus an artificial NOTA class.

    Args:
        input_data_file: path of a UTF-8 JSON file containing a list of
            example dicts, each with a ``'label'`` key.
        relevant_relations: labels whose examples are kept unchanged.
        nota_classes: labels whose examples are sampled and relabelled
            ``'NOTA'``.
        seed: optional seed for a private random generator. With the default
            ``None`` the global ``random`` module is used, as before.

    Returns:
        list: all relevant examples followed by the sampled NOTA examples.
        From every NOTA source label found in the file,
        ``(len(relevant) // len(relevant_relations)) // len(nota_classes)``
        examples are drawn. If either label collection is empty, no NOTA
        examples are added.

    Raises:
        ValueError: if a NOTA source label has fewer examples than the number
            to draw (raised by ``sample``).
    """
    rng = random if seed is None else random.Random(seed)

    # Only keep the file open while reading it.
    with open(input_data_file, 'r', encoding='utf-8') as in_file:
        data = json.load(in_file)

    relevant_data = [x for x in data if x['label'] in relevant_relations]

    # Guard against division by zero for empty label collections.
    if relevant_relations and nota_classes:
        examples_per_class = (len(relevant_data) // len(relevant_relations)) // len(nota_classes)
    else:
        examples_per_class = 0

    # Group the NOTA source examples by their original label (file order).
    nota_data_per_rel = {}
    for example in data:
        if example['label'] in nota_classes:
            nota_data_per_rel.setdefault(example['label'], []).append(example)

    for candidates in nota_data_per_rel.values():
        for example in rng.sample(candidates, examples_per_class):
            # The examples come from the freshly loaded file, so relabelling
            # them in place does not affect any caller-owned data.
            example['label'] = 'NOTA'
            relevant_data.append(example)

    print('Length of generated dataset: %d' % len(relevant_data))
    return relevant_data

Generating and saving train-, test- and validation-dataset from FewRel data containing NOTA-examples as well.

In [None]:
# Each dataset is generated before its output file is opened, so a failure
# during generation does not leave an empty, truncated file behind. The handle
# is named `out_file` so the `outputfile` path variable defined earlier in the
# notebook is not replaced by a (closed) file object.
# NOTE(review): train and test both draw their NOTA examples from
# `nota_inference_classes`, while dev uses `nota_train_classes` - confirm that
# this assignment is intended.
train_data = generate_dataset_with_nota(in_train_data_file, relevant_relations, nota_inference_classes)
with open(out_train_data_file, 'w') as out_file:
    json.dump(train_data, out_file)

test_data = generate_dataset_with_nota(in_test_data_file, relevant_relations, nota_inference_classes)
with open(out_test_data_file, 'w') as out_file:
    json.dump(test_data, out_file)

dev_data = generate_dataset_with_nota(in_dev_data_file, relevant_relations, nota_train_classes)
with open(out_dev_data_file, 'w') as out_file:
    json.dump(dev_data, out_file)

Length of generated dataset: 700
Length of generated dataset: 1400
Length of generated dataset: 1400
