In [2]:
import json
from random import seed

# Seed Python's global `random` generator so the `shuffle` calls in the later
# cells produce the same ordering when the notebook is run top to bottom on a
# fresh kernel. Re-running individual cells out of order does not reset it.
seed(42)

In [3]:
def _read_json(path):
    """Parse the JSON document stored at `path` and return the resulting object.

    The file is opened inside a `with` block so its handle is closed as soon
    as parsing finishes, and it is decoded as UTF-8 independently of the
    platform's default locale encoding.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


# Records the training examples are generated from.
train_data = _read_json('../data/unlabelled/train_fact_verification_support_data.json')

# Test records come from two separate files and are concatenated into one list.
test_data_common = _read_json('../data/unlabelled/gt_fact_verification_support_data.json')
test_data_only_dbp = _read_json('../data/unlabelled/gt_only_dbp_support_data.json')
test_data = test_data_common + test_data_only_dbp

In [4]:
from transformers import pipeline

# Checkpoint of the token-classification model used to find person names.
# Swap in your own checkpoint here if you have one.
model_checkpoint = "huggingface-course/bert-finetuned-ner"

# Later cells read the `entity_group` and `word` fields of each result this
# pipeline returns.
token_classifier = pipeline(
    task="token-classification",
    model=model_checkpoint,
    aggregation_strategy="simple",
)

In [5]:
# Every distinct person that appears in the 'pair' of any test record.
# The training-data cell uses this set to drop pairs that overlap with the test data.
test_data_persons = {
    person
    for test_record in test_data
    for person in test_record['pair']
}

print(f'Total persons in test facts: {len(test_data_persons)}')

Total persons in test facts: 396


In [15]:
from tqdm import tqdm
from random import shuffle
from collections import defaultdict


def _marriage_questions(first, second, evidence, answer):
    """Build the marriage question for a pair of names, in both name orders.

    Returns a fresh two-element list of ``{'input': ..., 'output': answer}``
    records: the first asks about (first, second), the second about
    (second, first). Each question is followed by `evidence` on a new line.
    """
    return [
        {'input': f'Is {first} married with {second}?\n{evidence}', 'output': answer},
        {'input': f'Is {second} married with {first}?\n{evidence}', 'output': answer},
    ]


# Four views over the generated examples:
train_facts = []                       # every example produced below
train_facts_direct = []                # questions that use the pair's names exactly as given
train_facts_only_negative_others = []  # 'no' questions about two other persons found in the evidence
train_facts_only_positive_others = []  # 'yes' questions that use a name mention found in the evidence

for data in tqdm(train_data):
    person_one, person_two = data['pair']
    # Skip pairs that share a person with the test data.
    if any((person in test_data_persons) for person in data['pair']):
        continue

    # Supports are visited in ascending score order; every support yields
    # examples, not only the highest-scoring one.
    supports = sorted(data['supports'], key=lambda x: x['score'])
    for support in supports:
        other_persons = []
        pair_names = defaultdict(list)

        evidence = support['content'].replace('\n', ' ')
        entities = token_classifier(evidence)

        for entity in entities:
            if entity['entity_group'] != 'PER':
                continue

            # A mention is attributed to a pair member when either string
            # contains the other; person_one is checked first.
            person_name = entity['word']
            if person_one in person_name or person_name in person_one:
                pair_names[person_one].append(person_name)
            elif person_two in person_name or person_name in person_two:
                pair_names[person_two].append(person_name)
            else:
                other_persons.append(person_name)

        person_one_names, person_two_names = pair_names[person_one], pair_names[person_two]

        # Shuffle so that the index-0 / index-1 picks below are random mentions.
        # The three calls stay in this order to keep the random stream unchanged.
        shuffle(person_one_names)
        shuffle(person_two_names)
        shuffle(other_persons)

        # The direct question is labelled 'yes' only when both pair members
        # are mentioned in this evidence, otherwise 'no'.
        both_mentioned = bool(person_one_names and person_two_names)
        direct_answer = 'yes' if both_mentioned else 'no'

        train_facts.extend(_marriage_questions(person_one, person_two, evidence, direct_answer))
        train_facts_direct.extend(_marriage_questions(person_one, person_two, evidence, direct_answer))

        if both_mentioned:
            mention_one, mention_two = person_one_names[0], person_two_names[0]
            # Add a second positive phrased with the mentions as they appear in
            # the evidence, unless that would repeat the direct question.
            if mention_one != person_one or mention_two != person_two:
                train_facts.extend(_marriage_questions(mention_one, mention_two, evidence, 'yes'))
                train_facts_only_positive_others.extend(
                    _marriage_questions(mention_one, mention_two, evidence, 'yes'))

        # Two persons who matched neither pair member form an extra negative.
        if len(other_persons) >= 2:
            train_facts.extend(
                _marriage_questions(other_persons[0], other_persons[1], evidence, 'no'))
            train_facts_only_negative_others.extend(
                _marriage_questions(other_persons[0], other_persons[1], evidence, 'no'))


100%|██████████| 2000/2000 [05:04<00:00,  6.57it/s]


In [16]:
# Target total sizes (positives + negatives) for the exported training files.
dataset_sizes = [500, 1000, 2000, 4000, 8000, 20000]

# Named training-set variants; both "indirect" variants start from the direct
# questions and add one extra family of examples.
datasets = dict(
    all=train_facts,
    direct=train_facts_direct,
    indirect_pos=train_facts_direct + train_facts_only_positive_others,
    indirect_neg=train_facts_direct + train_facts_only_negative_others,
)


In [17]:
# Delete all training files from data/unifiedQA before creating new training files:
# each file name embeds the number of examples actually written, so files from
# an earlier run with different counts are not overwritten by this cell.
for name, dataset in datasets.items():
    pos_facts = [fact for fact in dataset if fact['output'] == 'yes']
    neg_facts = [fact for fact in dataset if fact['output'] == 'no']

    for size in dataset_sizes:
        # Reshuffle before every size so each file gets a fresh random sample.
        shuffle(pos_facts)
        shuffle(neg_facts)
        per_label_size = size // 2

        # Keep the labels balanced: take the same number of each, capped by
        # whichever label has fewer examples available.
        label_selection_size = min(per_label_size, len(pos_facts), len(neg_facts))

        pos_selection = pos_facts[:label_selection_size]
        neg_selection = neg_facts[:label_selection_size]
        selection = pos_selection + neg_selection

        filename = f'../data/unifiedQA/train_{name}_{len(selection)}.json'
        print(f'{filename}: pos_count:{len(pos_selection)} neg_count:{len(neg_selection)} total:{len(selection)}')

        # Write through a context manager so the file is flushed and closed
        # before the next iteration instead of whenever the handle is collected.
        with open(filename, 'w') as output_file:
            json.dump(selection, output_file)

        # The pool ran out for this dataset; larger sizes would only repeat
        # the same capped selection.
        if label_selection_size < per_label_size:
            break

../data/unifiedQA/train_all_500.json: pos_count:250 neg_count:250 total:500
../data/unifiedQA/train_all_1000.json: pos_count:500 neg_count:500 total:1000
../data/unifiedQA/train_all_2000.json: pos_count:1000 neg_count:1000 total:2000
../data/unifiedQA/train_all_4000.json: pos_count:2000 neg_count:2000 total:4000
../data/unifiedQA/train_all_8000.json: pos_count:4000 neg_count:4000 total:8000
../data/unifiedQA/train_all_15144.json: pos_count:7572 neg_count:7572 total:15144
../data/unifiedQA/train_direct_500.json: pos_count:250 neg_count:250 total:500
../data/unifiedQA/train_direct_1000.json: pos_count:500 neg_count:500 total:1000
../data/unifiedQA/train_direct_2000.json: pos_count:1000 neg_count:1000 total:2000
../data/unifiedQA/train_direct_4000.json: pos_count:2000 neg_count:2000 total:4000
../data/unifiedQA/train_direct_5208.json: pos_count:2604 neg_count:2604 total:5208
../data/unifiedQA/train_indirect_pos_500.json: pos_count:250 neg_count:250 total:500
../data/unifiedQA/train_indire