In [3]:
import json

In [83]:
train_data_wiki = json.load(open('../data/unlabelled/train_fact_verification_support_data.json'))

test_data_common = json.load(open('../data/unlabelled/gt_fact_verification_support_data.json'))
test_data_only_dbp = json.load(open('../data/unlabelled/gt_only_dbp_support_data.json'))
test_data = test_data_common + test_data_only_dbp

In [84]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "huggingface-course/bert-finetuned-ner"
token_classifier = pipeline("token-classification", model=model_checkpoint, aggregation_strategy="simple")

In [85]:
test_data_persons = set()
for data in test_data:
    for person in data['pair']:
        test_data_persons.add(person)

print(f'Total persons in test facts: {len(test_data_persons)}')

Total persons in test facts: 396


In [91]:
from tqdm import tqdm
from random import shuffle
from collections import defaultdict

train_facts = []
for data in tqdm(train_data):
    person_one, person_two = data['pair']
    if any((person in test_data_persons) for person in data['pair']):
        continue

    for support in data['supports']:
        other_persons = []
        pair_names = defaultdict(list)

        evidence = support['content'].replace('\n', ' ')
        entities = token_classifier(evidence)

        for entity in entities:
            if entity['entity_group'] != 'PER':
                continue

            person_name = entity['word']
            if person_one in person_name or person_name in person_one:
                pair_names[person_one].append(person_name)
                continue

            if person_two in person_name or person_name in person_two:
                pair_names[person_two].append(person_name)
                continue

            other_persons.append(person_name)

        person_one_names, person_two_names = pair_names[person_one], pair_names[person_two]
        shuffle(person_one_names), shuffle(person_two_names), shuffle(other_persons)
        if all(v for v in pair_names.values()):
            train_facts.append({'input': f'Is {person_one} married with {person_two}?\n{evidence}', 'output': 'yes'})
            train_facts.append({'input': f'Is {person_two} married with {person_one}?\n{evidence}', 'output': 'yes'})

            if person_one_names[0] != person_one or person_two_names[0] != person_two:
                train_facts.append({'input': f'Is {person_one_names[0]} married with {person_two_names[0]}?\n{evidence}', 'output': 'yes'})
                train_facts.append({'input': f'Is {person_two_names[0]} married with {person_one_names[0]}?\n{evidence}', 'output': 'yes'})

        else:
            train_facts.append({'input': f'Is {person_one} married with {person_two}?\n{evidence}', 'output': 'no'})
            train_facts.append({'input': f'Is {person_two} married with {person_one}?\n{evidence}', 'output': 'no'})


        if len(other_persons) >= 2:
            train_facts.append({'input': f'Is {other_persons[0]} married with {other_persons[1]}?\n{evidence}', 'output': 'no'})
            train_facts.append({'input': f'Is {other_persons[1]} married with {other_persons[0]}?\n{evidence}', 'output': 'no'})


100%|██████████| 2000/2000 [05:05<00:00,  6.55it/s]


In [92]:
json.dump(train_facts, open('../data/unifiedQA/train.json', 'w'))

In [93]:
pos_count = len([fact for fact in train_facts if fact['output'] == 'yes'])
neg_count = len([fact for fact in train_facts if fact['output'] == 'no'])
total = len(train_facts)

total, pos_count, neg_count, total - (pos_count + neg_count)

(16548, 7582, 8966, 0)