# Document retrieval environment setup
## Create the conda environment and install dependencies
`conda create -n pyserini python=3.8`
`conda activate pyserini`
`conda install -c conda-forge openjdk=11`
`pip install pyserini`
`pip install torch==1.8.1 torchvision==0.9.1 torchaudio===0.8.1 -f https://download.pytorch.org/whl/torch_stable.html`
`conda install faiss-cpu -c pytorch`
`pip install ipykernel`
`conda install jupyter notebook`

In [2]:
import json
from pyserini.search import FaissSearcher
from pyserini.search.lucene import LuceneSearcher
from tqdm import tqdm
from random import seed, shuffle

# Local paths to the two Pyserini indexes used by this notebook: the dense (Faiss) DPR index
# that is searched, and the Lucene index from which stored passages are read back by docid.
dpr_faiss_index_path = '/data/table-understanding/data/indexes/dindex-wikipedia-dpr_multi-bf-20200127-f403c3.29eb39fe0b00a03c36c0eeae4c24f775'
lucene_index_path = '/data/table-understanding/data/indexes/index-wikipedia-dpr-20210120-d1b9e6.c28f3a56b2dfcef25bf3bf755c264d04'
# Dense retriever over the Faiss index; the second argument names the query encoder checkpoint.
searcher = FaissSearcher(dpr_faiss_index_path, 'facebook/dpr-question_encoder-multiset-base')
# In the cells shown here this searcher is only used to fetch documents by id (see retrieve_docs).
doc_retriever = LuceneSearcher(lucene_index_path)

# Seed the module-level `random` generator so the shuffle in get_document_retrieval_data is repeatable.
seed(41)

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.weight', 'question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_document_retrieval_data(file_path, count=100):
    """Build a shuffled sample of spouse-pair questions from a TSV file.

    Each line is split on tabs; column 1 is read as a person's name and column 3 as
    the spouse's name. Every line is treated as data (there is no header handling).
    Lines that are blank, have fewer than four columns, or lack either name are
    skipped, because no question can be formed from them.

    Args:
        file_path: Path to the tab-separated input file (read as UTF-8).
        count: Maximum number of samples to return.

    Returns:
        A list of at most ``count`` dicts, each with
        ``'pair'``: ``(person_one, person_two)`` and
        ``'questions'``: the marriage question phrased in both directions.
        The order comes from ``random.shuffle`` and therefore depends on the state
        of the module-level random generator at call time.
    """
    NAME = 1
    SPOUSE_NAME = 3

    data = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            # Remove only the line terminator: a bare strip() would also eat leading or
            # trailing tabs and shift the column indexes when an outer field is empty.
            row = line.rstrip('\r\n').split('\t')
            if len(row) <= SPOUSE_NAME:
                continue

            person_one = row[NAME].strip()
            person_two = row[SPOUSE_NAME].strip()
            if not person_one or not person_two:
                continue

            data.append({
                'pair': (person_one, person_two),
                'questions': [f'Is {person_one} married to {person_two}?', f'Is {person_two} married to {person_one}?']
            })

    shuffle(data)

    # Slice from index 0: starting at 1 would discard an arbitrary shuffled sample and
    # return only count - 1 items when the file holds exactly `count` rows.
    return data[:count]

In [4]:
def retrieve_docs(data):
    """Attach supporting passages to every spouse-pair sample.

    For each sample, both questions are run through the module-level ``searcher``
    (top 3 hits each). Every distinct hit is loaded from the module-level
    ``doc_retriever`` and kept only if its text mentions at least one
    whitespace-separated name part of *each* person in the pair (case-sensitive
    substring match).

    Args:
        data: List of dicts with ``'pair'`` and ``'questions'`` keys, as produced by
            ``get_document_retrieval_data``. The dicts are modified in place.

    Returns:
        The same ``data`` list; every dict gains a ``'supports'`` list of
        ``{'content', 'score', 'doc_id'}`` dicts (possibly empty).
    """
    for spouse_data in tqdm(data):
        supports = []
        covered_docs_ids = set()
        # Split once per sample instead of once per hit. split() rather than split(' ')
        # so repeated spaces cannot produce an empty token, which is a substring of
        # every text and would make the filter pass trivially.
        name_parts_per_person = [person.split() for person in spouse_data['pair']]

        for q in spouse_data['questions']:
            for hit in searcher.search(q, k=3):
                doc_id = hit.docid
                if doc_id in covered_docs_ids:
                    continue
                covered_docs_ids.add(doc_id)

                doc = doc_retriever.doc(doc_id)
                if doc is None:
                    # The dense index returned an id the Lucene index does not hold;
                    # there is no text to check, so skip the hit instead of crashing.
                    continue

                content = json.loads(doc.raw())['contents']
                # Generators let all()/any() stop at the first decisive name part.
                if all(any(part in content for part in parts) for parts in name_parts_per_person):
                    supports.append({
                        'content': content,
                        'score': float(hit.score),
                        'doc_id': doc_id
                    })

        spouse_data['supports'] = supports

    return data

In [7]:
# Draw up to 2000 shuffled spouse pairs and attach retrieved passages to each of them.
# NOTE: get_document_retrieval_data shuffles with the module-level `random` generator, so the
# rows selected here and below depend on these calls running in this order after seed(41).
fact_verification_data = get_document_retrieval_data('./fact_verification_data.tsv', 2000)
# The saved progress output of this cell shows roughly 7 hours for 2000 items.
fact_verification_data = retrieve_docs(fact_verification_data)
# Split: the first 100 samples are set aside as the "gt" subset, the rest as the training subset.
fact_verification_gt_data = fact_verification_data[:100]
fact_verification_train_data = fact_verification_data[100:]

# Same pipeline for the DBpedia-only file, using the default sample size (count=100).
only_dbp_data = get_document_retrieval_data('./only_dbp_spouse_data.tsv')
only_dbp_data = retrieve_docs(only_dbp_data)

100%|██████████| 2000/2000 [7:00:02<00:00, 12.60s/it]  


In [8]:
import json

# Persist the three result sets as JSON, one file per (path, payload) pair, in this order.
support_outputs = (
    ('./gt_fact_verification_support_data.json', fact_verification_gt_data),
    ('./train_fact_verification_support_data.json', fact_verification_train_data),
    ('./gt_only_dbp_support_data.json', only_dbp_data),
)

for output_path, payload in support_outputs:
    with open(output_path, 'w') as f:
        json.dump(payload, f)