In [1]:
from pyserini.search import FaissSearcher
from pyserini.search.lucene import LuceneSearcher
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the FAISS index that is handed to FaissSearcher below.
dpr_faiss_index_path = (
    '/data02/amandeep_wikidata/faiss_indices/pyserini/indexes/dindex-wikipedia-dpr_multi-bf-20200127-f403c3.29eb39fe0b00a03c36c0eeae4c24f775'
)

# Location of the Lucene index that is handed to LuceneSearcher below.
lucene_index_path = (
    '/data02/amandeep_wikidata/faiss_indices/pyserini/indexes/index-wikipedia-dpr-20210120-d1b9e6.c28f3a56b2dfcef25bf3bf755c264d04'
)

In [3]:
# Dense retriever: the FAISS index at dpr_faiss_index_path, queried through
# the question encoder named by the second positional argument.
searcher = FaissSearcher(dpr_faiss_index_path, 'facebook/dpr-question_encoder-multiset-base')

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Open the Lucene index at lucene_index_path. Later cells call
# doc_retriever.doc(docid).raw() to fetch the stored JSON for a retrieved docid.
doc_retriever = LuceneSearcher(lucene_index_path)

In [5]:
# Smoke-test the retriever on one hand-written question and print each
# top-ranked passage as: rank, docid, score, passage contents.
hits = searcher.search('WHen did Virat Kohli get married?')

# Slice instead of indexing over a fixed range(0, 10): if the searcher
# returns fewer than ten hits, a fixed range would raise IndexError.
for rank, hit in enumerate(hits[:10], start=1):
    # The Lucene document's raw payload is JSON; its 'contents' field is the
    # passage text that gets printed.
    passage = json.loads(doc_retriever.doc(hit.docid).raw())['contents']
    print(f'{rank:2} {hit.docid:7} {hit.score:.5f} {passage}')

 1 11635285 78.72609 "Virat Kohli"
the second most successful centurion in ODI cricket after Sachin Tendulkar. Kohli started dating Bollywood actress Anushka Sharma in 2013; the couple soon earned the celebrity couple nickname ""Virushka"". Their relationship attracted substantial media attention, with persistent rumours and speculations in the media, as neither of the two publicly talked about it. The couple married on 11 December 2017 in a private ceremony in Florence, Italy. Kohli has admitted that he is superstitious. He used to wear black wristbands as a cricket superstition; earlier, he used to wear the same pair of gloves with which he had ""been scoring"". Apart
 2 14636392 76.21272 "Vikram Chatwal"
June 2005. He was also featured on the June 19, 2006 cover of ""Forbes Asia"". He is also attributed as revitalizing the hotel industry in Manhattan, opening the Time Hotel during a time when few other hotel groups were investing in the area. Chatwal was previously married to Indian

In [12]:
# Load the question records. The context manager closes the file handle
# deterministically (the bare open() call left it to the garbage collector),
# and the explicit encoding keeps non-ASCII labels readable regardless of the
# platform's default locale.
with open('marriages_in_wikidata.json', encoding='utf-8') as questions_file:
    questions = json.load(questions_file)

In [13]:
# Report how many question records were loaded.
print(len(questions))

68159


In [14]:
# Show the first record to inspect the fields each question carries.
print(questions[0])

{'q': 'when did Ranbir Kapoor marry Alia?', 'a': '2022-04-14', 'n1': 'Q1063412', 'n2': 'Q4725343', 'n1_label': 'Ranbir Kapoor', 'n2_label': 'Alia Bhatt', 'url': 'http://en.wikipedia.org/wiki/Ranbir_Kapoor'}


In [15]:
import copy
import pandas as pd

In [16]:
%%time
answers = []
count = 0
o = open('marriages_wikidata.dpr.answers.jl', 'w')
for question in questions:
    hits = searcher.search(question['q'], k=100)
    s = copy.deepcopy(question)
    s['dpr_answers'] = []
    for i in range(len(hits)):
        _d = json.loads(doc_retriever.doc(hits[i].docid).raw())['contents']
        ss = {}
        ss['dpr_answer'] = _d
        ss['dpr_score'] = float(hits[i].score)
        ss['dpr_docid'] = hits[i].docid
        s['dpr_answers'].append(ss)
    o.write(json.dumps(s))
    o.write('\n')
    if count % 10000 == 0:
        print(f'done {count} questions.')
    count += 1

done 0 questions.
done 10000 questions.
done 20000 questions.
done 30000 questions.
done 40000 questions.
done 50000 questions.
done 60000 questions.
CPU times: user 4d 8h 8min 19s, sys: 12h 36min 11s, total: 4d 20h 44min 31s
Wall time: 4d 20h 36min 9s


In [17]:
# Pretty-print the first line (record) of the output file with jq.
!head -1 marriages_wikidata.dpr.answers.jl | jq .

[1;39m{
  [0m[34;1m"q"[0m[1;39m: [0m[0;32m"when did Ranbir Kapoor marry Alia?"[0m[1;39m,
  [0m[34;1m"a"[0m[1;39m: [0m[0;32m"2022-04-14"[0m[1;39m,
  [0m[34;1m"n1"[0m[1;39m: [0m[0;32m"Q1063412"[0m[1;39m,
  [0m[34;1m"n2"[0m[1;39m: [0m[0;32m"Q4725343"[0m[1;39m,
  [0m[34;1m"n1_label"[0m[1;39m: [0m[0;32m"Ranbir Kapoor"[0m[1;39m,
  [0m[34;1m"n2_label"[0m[1;39m: [0m[0;32m"Alia Bhatt"[0m[1;39m,
  [0m[34;1m"url"[0m[1;39m: [0m[0;32m"http://en.wikipedia.org/wiki/Ranbir_Kapoor"[0m[1;39m,
  [0m[34;1m"dpr_answers"[0m[1;39m: [0m[1;39m[
    [1;39m{
      [0m[34;1m"dpr_answer"[0m[1;39m: [0m[0;32m"\"Ranbir Kapoor\"\nlife. Rumours of an affair with Katrina Kaif first emerged during the production of \"\"Ajab Prem Ki Ghazab Kahani\"\" in 2009. In August 2013, a set of paparazzi photographs of Kapoor and Kaif at a beach in Spain were published by \"\"Stardust\"\". Although Kapoor initially declined to speak of the relationship, he admitte

In [25]:
# Pretty-print line 3405 of the output file: head keeps the first 3405 lines,
# tail -1 selects the last of those, and jq formats it.
!head -3405 marriages_wikidata.dpr.answers.jl | tail -1 | jq .

[1;39m{
  [0m[34;1m"q"[0m[1;39m: [0m[0;32m"when did Hitomi Saito marry Jirō?"[0m[1;39m,
  [0m[34;1m"a"[0m[1;39m: [0m[0;32m"2010-08-03"[0m[1;39m,
  [0m[34;1m"n1"[0m[1;39m: [0m[0;32m"Q1067677"[0m[1;39m,
  [0m[34;1m"n2"[0m[1;39m: [0m[0;32m"Q11326398"[0m[1;39m,
  [0m[34;1m"n1_label"[0m[1;39m: [0m[0;32m"Hitomi Saito"[0m[1;39m,
  [0m[34;1m"n2_label"[0m[1;39m: [0m[0;32m"Jirō Hachimitsu"[0m[1;39m,
  [0m[34;1m"url"[0m[1;39m: [0m[0;32m"http://en.wikipedia.org/wiki/Hitomi_Saito"[0m[1;39m,
  [0m[34;1m"dpr_answers"[0m[1;39m: [0m[1;39m[
    [1;39m{
      [0m[34;1m"dpr_answer"[0m[1;39m: [0m[0;32m"\"Hitomi (singer)\"\n\"\"Yume Hakonda Randoseru\"\" and was released as a digital single on February 2, 2017. In November 2017, she also started a new radio program called \"\"Hitomi Radio\"\" on Inter FM. On December 1, 2002 Hitomi married Keisuke Uesugi, a businessman and former member of hip-hop group Gasboys. They divorced in November 