# Matching KILT to FiD Format

- Converting KILT nq bm25 to FiD Data format

In [1]:
# FiD_nq_dpr 
# /data/philhoon-relevance/FiD/open_domain_data/NQ
# /data/philhoon-relevance/FiD/open_domain_data/NQ/train.json
# /data/philhoon-relevance/FiD/open_domain_data/NQ/dev.json
# /data/philhoon-relevance/FiD/open_domain_data/NQ/test.json

# kilt_nq_dpr
# /data/philhoon-relevance/KILT/kilt-dpr-retrieval/nq-train-multikilt.json
# /data/philhoon-relevance/KILT/kilt-dpr-retrieval/nq-dev-multikilt.json

In [1]:
import json
from pprint import pprint
import torch
import numpy as np
import random
from copy import deepcopy
from util import utils

## FiD NQ input format 
    - DPR retrieved data

In [2]:
# fid_nq_train_file = '/data/philhoon-relevance/FiD/open_domain_data/NQ/train.json'
fid_nq_dev_file = '/data/philhoon-relevance/FiD/open_domain_data/NQ/dev.json'

In [3]:
fid_nq_dpr_dev = utils.open_json(fid_nq_dev_file)
# fid_nq_dpr_train = open_json(fid_nq_train_file)

In [4]:
len(fid_nq_dpr_dev)

8757

In [6]:
# fid_nq_dpr_dev[0]

In [6]:
fid_nq_dpr_dev[-1].keys()

dict_keys(['question', 'answers', 'ctxs'])

In [7]:
# pprint(fid_nq_dpr_dev[-1])

In [8]:
# fid_nq_train_file[0].keys()

In [9]:
# pprint(fid_nq_train_file[0])

### Optional keys
- id : optional
- target : optional
  - answer used for model training
  - if not given, the target is randomly sampled from the 'answers' list

### Required keys
- question - question text
- answers - list of answers for evaluation
- ctxs - a list of passages where each item is a dictionary containing
     - title : article title 
     - text : text

# FiD input format

```
{
  'question': 'What element did Marie Curie name after her native land?',
  'answers': ['Polonium', 'Po (chemical element)', 'Po'],
  'ctxs': [
            {
                "title": "Marie Curie",
                "text": "them on visits to Poland. She named the first chemical element that she discovered in 1898 \"polonium\", after her native country. Marie Curie died in 1934, aged 66, at a sanatorium in Sancellemoz (Haute-Savoie), France, of aplastic anemia from exposure to radiation in the course of her scientific research and in the course of her radiological work at field hospitals during World War I. Maria Sk\u0142odowska was born in Warsaw, in Congress Poland in the Russian Empire, on 7 November 1867, the fifth and youngest child of well-known teachers Bronis\u0142awa, \"n\u00e9e\" Boguska, and W\u0142adys\u0142aw Sk\u0142odowski. The elder siblings of Maria"
            },
            {
                "title": "Marie Curie",
                "text": "was present in such minute quantities that they would eventually have to process tons of the ore. In July 1898, Curie and her husband published a joint paper announcing the existence of an element which they named \"polonium\", in honour of her native Poland, which would for another twenty years remain partitioned among three empires (Russian, Austrian, and Prussian). On 26 December 1898, the Curies announced the existence of a second element, which they named \"radium\", from the Latin word for \"ray\". In the course of their research, they also coined the word \"radioactivity\". To prove their discoveries beyond any"
            }
    ]
}

```

## KILT NQ bm25
- KILT NQ BM25 retrieved data (for DPR training with KILT wikipedia data)
- https://github.com/facebookresearch/DPR/issues/186

In [8]:
kilt_nq_dev_file = '/scratch/philhoon-relevance/KILT/kilt-dpr-retrieval/nq-dev-multikilt.json'
# kilt_nq_train_file = '/data/philhoon-relevance/KILT/kilt-dpr-retrieval/nq-train-multikilt.json'
kilt_nq_bm25_dev = utils.open_json(kilt_nq_dev_file)
# kilt_nq_train = utils.open_json(kilt_nq_train_file)

In [13]:
len(kilt_nq_bm25_dev)

2837

In [9]:
kilt_nq_bm25_dev[0].keys()

dict_keys(['question', 'answers', 'positive_ctxs', 'hard_negative_ctxs'])

In [11]:
pprint(kilt_nq_bm25_dev[0])

{'answers': ['the therefore sign',
             'therefore sign',
             'the therefore sign ( ∴ ) is generally used before a logical '
             'consequence , such as the conclusion of a syllogism',
             'a logical consequence , such as the conclusion of a syllogism'],
 'hard_negative_ctxs': [{'psg_id': 8725785,
                         'text': 'difficult, having these pre-assessments done '
                                 'will help you spend more time teaching '
                                 "students what they don't know and just "
                                 'refreshing them on what they do already do '
                                 'know. For example, if you are going to be '
                                 'starting a new unit in math, how to add and '
                                 'subtract. Just by asking the students "What '
                                 'does addition mean?", "What does subtraction '
                                 'mean

In [12]:
print(kilt_nq_bm25_dev[1]['question'])
print(kilt_nq_bm25_dev[1]['answers'])
pprint(kilt_nq_bm25_dev[1]['hard_negative_ctxs'])

who wrote the song photograph by ringo starr
['George Harrison', 'Ringo Starr', 'Richard Starkey']
[{'psg_id': 10056405,
  'text': ', he performed "Photograph" and a cover of Carl Perkins\' "Honey '
          'Don\'t" at the "Concert for George" held in the Royal Albert Hall, '
          'London. Early the following year, he released the album "Ringo '
          'Rama", which contained a song he co-wrote as a tribute to Harrison, '
          '"Never Without You". Also in 2003, he formed Pumkinhead Records '
          'with All-Starr Band member Mark Hudson. The label was not prolific, '
          'but their first signing was Liam Lynch, who',
  'title': 'Ringo Starr'},
 {'psg_id': 27073260,
  'text': "mark the first anniversary of Harrison's death. According to the "
          'Concert for George website: "Ringo Starr caught everyone with a '
          "tear in their eye with a rendition of 'Photograph', a composition "
          'he wrote with George, which seemed to sum up how everyo

#### KILT NQ Adversarial Negative (dpr)
    - only provide adversarial negatives in train

In [13]:
# kilt_nq_train_ad_file = '/data/philhoon-relevance/KILT/kilt-dpr-ad-retrieval/nq-train-adv.json'
# kilt_nq_train_ad = utils.open_json(kilt_nq_train_ad_file)
# pprint(kilt_nq_train[0])
# pprint(kilt_nq_train_ad[0])

In [14]:
len(kilt_nq_bm25_dev)

2837

In [15]:
pprint(kilt_nq_bm25_dev[0].keys())

dict_keys(['question', 'answers', 'positive_ctxs', 'hard_negative_ctxs'])


In [16]:
pprint(kilt_nq_bm25_dev[0])

{'answers': ['the therefore sign',
             'therefore sign',
             'the therefore sign ( ∴ ) is generally used before a logical '
             'consequence , such as the conclusion of a syllogism',
             'a logical consequence , such as the conclusion of a syllogism'],
 'hard_negative_ctxs': [{'psg_id': 8725785,
                         'text': 'difficult, having these pre-assessments done '
                                 'will help you spend more time teaching '
                                 "students what they don't know and just "
                                 'refreshing them on what they do already do '
                                 'know. For example, if you are going to be '
                                 'starting a new unit in math, how to add and '
                                 'subtract. Just by asking the students "What '
                                 'does addition mean?", "What does subtraction '
                                 'mean

In [17]:
# def seed_everything(seed):
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.cuda.manual_seed_all(seed)  # if use multi-GPU
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False
#     np.random.seed(seed)
#     random.seed(seed)

In [18]:
seed = 42
# Apply the seed to Python's global RNG: convert_foramt shuffles the negative
# passages with random.shuffle, so without this the generated files differ
# between kernel runs.
random.seed(seed)

In [19]:
# json_file = kilt_nq_dpr_dev

# cut_off = 0
# cut_off_pos = 0
# instances = []
# sample_size = 5
# position = 1
# total_questions = len(json_file) 
# ids = 1

In [37]:
def _find_positive_ctx(answers, positive_ctxs):
    """
    Return the first positive passage that literally contains one of the answers.

    Answers are tried in order; for each answer the positive passages are scanned
    in order. Matching is a case-sensitive substring test against the passage
    text and title. Returns a dict with 'psg_id', 'title' and 'text' (newlines
    replaced by spaces), or None when no passage matches.
    """
    for answer in answers:
        for pos_sample in positive_ctxs:
            if answer in pos_sample['text'] or answer in pos_sample['title']:
                return {
                    "psg_id" : pos_sample['psg_id'],
                    "title" : pos_sample['title'].replace('\n', ' '),
                    "text" : pos_sample['text'].replace('\n', ' ')
                }
    return None


def convert_foramt(json_file, sample_size:int, position:int, seed=None):
    """
    Convert KILT retrieval format to FiD input format.

    Each output instance looks like
    {
      'id': '1',
      'question': text,
      'answers': answer_lst,
      'ctxs': [
                {"title": ..., "text": ...},
                ...
              ]
    }
    where 'ctxs' holds `sample_size` passages: the first `sample_size` hard
    negatives (shuffled), with the entry at `position` replaced by one positive
    passage. The positive entry additionally carries its 'psg_id'.

    Parameters
    ----------
    json_file : list of dict
        KILT samples with keys 'question', 'answers', 'positive_ctxs' and
        'hard_negative_ctxs'.
    sample_size : int
        Number of passages per output instance (must be >= 1).
    position : int
        1-based location of the positive context in the 'ctxs' list
        (1 <= position <= sample_size).
    seed : int, optional
        If given, negatives are shuffled with a private random.Random(seed) so
        the result is reproducible. If None, the global `random` module state
        is used, as before.

    Returns
    -------
    (instances, cut_off, cut_off_pos, total_questions)
        instances       : list of converted instances, ids are '1', '2', ...
        cut_off         : samples skipped for having fewer than `sample_size`
                          hard negatives or no positive passage
        cut_off_pos     : samples skipped because no positive passage contains
                          any answer string
        total_questions : len(json_file)

    Raises
    ------
    ValueError
        If `position` is outside 1..sample_size. Previously position 0 silently
        wrapped to the last slot and larger values failed with IndexError.
    """
    if sample_size < 1 or not 1 <= position <= sample_size:
        raise ValueError(
            f'position must satisfy 1 <= position <= sample_size, '
            f'got position={position}, sample_size={sample_size}'
        )

    rng = random.Random(seed) if seed is not None else random

    cut_off = 0
    cut_off_pos = 0
    instances = []
    total_questions = len(json_file)
    ids = 1

    for samples in json_file:
        answers = samples['answers']
        question = samples['question']
        hard_negatives = samples['hard_negative_ctxs']
        positives = samples['positive_ctxs']

        if len(hard_negatives) < sample_size or len(positives) < 1:
            cut_off += 1
            continue

        negative_samples = [
            {
                "title" : neg_sample['title'].replace('\n', ' '),
                "text" : neg_sample['text'].replace('\n', ' ')
            }
            for neg_sample in hard_negatives[:sample_size]
        ]

        # 'hard_negative_ctxs' might be sorted by its score, so shuffle them
        rng.shuffle(negative_samples)

        positive_temp = _find_positive_ctx(answers, positives)
        if positive_temp is None:
            cut_off_pos += 1
            # Show the last (answer, positive passage) pair that was tried so the
            # mismatch can be inspected. Nothing was tried when `answers` is empty,
            # which used to crash on an unbound diagnostic variable.
            if answers:
                last_pos = positives[-1]
                pprint({
                    "psg_id" : last_pos['psg_id'],
                    "title" : last_pos['title'].replace('\n', ' '),
                    "text" : last_pos['text'].replace('\n', ' '),
                    'answer' : answers[-1],
                    'question' : question,
                })
            continue

        # replace 1 negative_sample with one positive_sample in designated position
        negative_samples[position-1] = positive_temp
        instances.append({
            'id' : str(ids),
            'question' : question,
            'answers' : answers,
            'ctxs' : negative_samples
        })
        ids += 1

    return instances, cut_off, cut_off_pos, total_questions

In [38]:
pos = 1
sample_size = 5
instances, cut_off, cut_off_pos, total = convert_foramt(kilt_nq_bm25_dev, sample_size, pos)

{'answer': 'September 28 , 2017',
 'psg_id': 9694855,
 'question': "when does the 14th season of grey's anatomy come out",
 'text': 'for the episode "Cold as Ice" as Dr. Herman to present a new '
         'opportunity for Arizona. Production. "Grey\'s Anatomy" was renewed '
         'for a 14th season on February 10, 2017. It premiered on September '
         '28, 2017, with a two-hour premiere. Ellen Pompeo announced that she '
         'would be directing several episodes in the 14th season. On April 28, '
         '2017, veteran writer Krista Vernoff announced that she would return '
         'to the show as a writer after leaving the show after the seventh '
         'season.',
 'title': "Grey's Anatomy (season 14)"}
{'answer': '54 Mbit / s',
 'psg_id': 23820958,
 'question': 'what is the maximum data rate for the 802.11a standard select '
             'one',
 'text': 'IEEE 802.11 IEEE 802.11 is part of the IEEE 802 set of LAN '
         'protocols, and specifies the set of media a

In [24]:
print(len(instances))
print(cut_off)
print(cut_off_pos)
print(total)

2539
0
298
2837


In [None]:
pos = 1
sample_size = 5
# convert_foramt returns four values (instances, cut_off, cut_off_pos, total);
# unpacking into three names raised "ValueError: too many values to unpack".
instances, cut_off, cut_off_pos, total = convert_foramt(kilt_nq_bm25_dev, sample_size, pos)
# Total number of dropped questions, so that total - loss == len(instances).
loss = cut_off + cut_off_pos

In [None]:
print(len(instances))
print(loss)
print(total)
print(total-loss)

In [None]:
# position = 1
# check_ids = 1
# position = 1
# for i in instances:
#     id_ = i['id']
#     # check consecutive id
#     if i['id'] != str(check_ids):
#         print(f'id not matching : should be {check_ids} instead of {id_}')
    
#     check_ids += 1
#     question = i['question']
#     answer_ = i['answers']
#     pos = i['ctx'][position-1]['text']
#     # check whether negative ctx has answers
#     for answer in i['answers']:
#         for neg in i['ctx'][:position-1] + i['ctx'][position:]:
#             neg_context = neg['title'] + ' : ' + neg['text']
#             if answer in neg_context:
#                 print('answer in neg_context')
#                 print(f'Question : {question}')
#                 print(f'Answer : {answer_}')
#                 print(f'neg_passage : {neg_context}')
#                 print(f'pos_passage : {pos}')
        
#                 break

In [None]:
pprint(instances[50])

In [None]:
filename = f'/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_BM25/kilt_bm25_nq_dev_pos{pos}.json'

In [None]:
filename

In [None]:
# utils.save_json(instances, filename)

# Swapping the positive ctx up to position 5
- Based on the previous result, incrementally create new files, each with the positive ctx at a different position

In [None]:
def create_new_instance(instances, previous_pos, new_position):
    """
    Return a deep copy of `instances` in which, for every instance, the passages
    at the 1-based positions `previous_pos` and `new_position` of 'ctxs' have
    traded places. The input list is left untouched.
    """
    src_idx = previous_pos - 1
    dst_idx = new_position - 1
    swapped = deepcopy(instances)
    for record in swapped:
        passages = record['ctxs']
        moved_out = passages[src_idx]
        moved_in = passages[dst_idx]
        passages[src_idx] = moved_in
        passages[dst_idx] = moved_out
    return swapped

In [None]:
def checking_with_previous(instances, instances2, previous_pos, new_position):
    """
    Check that `instances2` equals `instances` except for where the positive sits.

    For every pair of instances the id, question and answers must be equal, the
    passage at 1-based `previous_pos` in the first must equal the passage at
    1-based `new_position` in the second, and the remaining passages must be
    equal and in the same order.

    Prints a short diagnostic and returns False on the first mismatch (including
    a difference in list length, which zip() would otherwise hide by truncating);
    returns True when everything matches.
    """
    if len(instances) != len(instances2):
        print('not matching length')
        return False

    for in1, in2 in zip(instances, instances2):
        if in1['id'] != in2['id']:
            print('not matching id')
            return False
        if in1['question'] != in2['question']:
            print('not mathcing question')
            return False
        if in1['answers'] != in2['answers']:
            # previously reported as a question mismatch
            print('not matching answers')
            return False
        # everything except the positive slot must be identical and in the same order
        if in1['ctxs'][:previous_pos-1] + in1['ctxs'][previous_pos:] != in2['ctxs'][:new_position-1] + in2['ctxs'][new_position:]:
            print('negative not mathcing')
            pprint(in1['ctxs'])
            print('-----')
            pprint(in2['ctxs'])
            return False
        if in1['ctxs'][previous_pos-1] != in2['ctxs'][new_position-1]:
            print('positive not mathcing')
            print(in1['ctxs'][previous_pos-1])
            print(in2['ctxs'][new_position-1])
            return False
    return True
    

In [None]:
# prev_pos = 1
# next_pos = 2
# instance2 = create_new_instance(instances, prev_pos, next_pos)

In [None]:
# checking_with_previous(instances, instance2, prev_pos, next_pos)

In [None]:
# instance2 = deepcopy(instances)

In [None]:
# filename = f'/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_BM25/kilt_dpr_nq_dev_pos{next_pos}.json'

In [None]:
# prev_pos, next_pos = next_pos, next_pos + 1
# print(prev_pos, next_pos)

In [None]:
# Starting from the position-1 instances, move the positive passage one slot to
# the right per step and save a file for positions 2..sample_size. The final
# step only checks that the last file is consistent with the first one.
# `instances` itself is never rebound, so this cell can be re-run safely.
last_ins_testing = deepcopy(instances)  # untouched position-1 reference
current_instances = instances
num_positions = sample_size
for prev_pos in range(1, num_positions + 1):
    next_pos = prev_pos + 1 if prev_pos < num_positions else 1
    print(f'prev_pos : {prev_pos} next_pos : {next_pos}')

    if next_pos == 1:
        # Wrap-around is a sanity check only. It must never fall through to the
        # save branch, which would overwrite the pos1 file with a reordered list.
        if checking_with_previous(current_instances, last_ins_testing, prev_pos, next_pos):
            print(f'checking the first one and the last instance')
        else:
            print('first and last instance do not match')
        continue

    filename = f'/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_BM25/kilt_bm25_nq_dev_pos{next_pos}.json'
    instance2 = create_new_instance(current_instances, prev_pos, next_pos)
    if checking_with_previous(current_instances, instance2, prev_pos, next_pos):
        utils.save_json(instance2, filename)
        print(f'Saving {next_pos} instance')

    # create_new_instance already returns a deep copy, so no extra copy is needed
    current_instances = instance2
        

## ====== The End ======



In [None]:

# Dry run of the swap loop: report the consistency check and the target filename
# for each step without writing anything to disk.
for i in range(1, 6):
    prev_pos = i
    next_pos = 1 if prev_pos >= 5 else prev_pos + 1
    print(f'prev_pos : {prev_pos} next_pos : {next_pos}')
    instance2 = create_new_instance(instances, prev_pos, next_pos)
    is_consistent = checking_with_previous(instances, instance2, prev_pos, next_pos)
    print(is_consistent)
    filename = f'/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_BM25/kilt_dpr_nq_dev_pos{next_pos}.json'
    print(filename)
    print('==========')
    # the next step starts from the list that was just produced
    instances = deepcopy(instance2)
        

In [None]:
previous_pos = 1
new_position = 2

In [None]:
# Swap, in place, the passages at the 1-based positions previous_pos and
# new_position of every instance. Re-running this cell swaps them back, so it
# must be executed exactly once.
for ins in instance2:
    ins['ctxs'][previous_pos-1], ins['ctxs'][new_position-1] = ins['ctxs'][new_position-1], ins['ctxs'][previous_pos-1]

In [None]:
# checking: reuse the shared helper instead of a copy of its loop, and stop the
# run if anything other than the position of the positive passage has changed
assert checking_with_previous(instances, instance2, previous_pos, new_position)
    

In [None]:
filename = '/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_DPR/kilt_dpr_nq_dev_pos2.json'

In [None]:
# save_json is only available through the imported `utils` module
utils.save_json(instance2, filename)

In [None]:
instance3 = deepcopy(instance2)

In [None]:
previous_pos = 2
new_position = 3

In [None]:
# Swap, in place, the passages at the 1-based positions previous_pos and
# new_position of every instance. Re-running this cell swaps them back, so it
# must be executed exactly once.
for ins in instance3:
    ins['ctxs'][previous_pos-1], ins['ctxs'][new_position-1] = ins['ctxs'][new_position-1], ins['ctxs'][previous_pos-1]

In [None]:
# checking: reuse the shared helper instead of a copy of its loop, and stop the
# run if anything other than the position of the positive passage has changed
assert checking_with_previous(instance2, instance3, previous_pos, new_position)
    

In [None]:
filename = '/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_DPR/kilt_dpr_nq_dev_pos3.json'

In [None]:
# save_json is only available through the imported `utils` module
utils.save_json(instance3, filename)

In [None]:
instance4 = deepcopy(instance3)

In [None]:
previous_pos = 3
new_position = 4

In [None]:
# Swap, in place, the passages at the 1-based positions previous_pos and
# new_position of every instance. Re-running this cell swaps them back, so it
# must be executed exactly once.
for ins in instance4:
    ins['ctxs'][previous_pos-1], ins['ctxs'][new_position-1] = ins['ctxs'][new_position-1], ins['ctxs'][previous_pos-1]

In [None]:
# checking: reuse the shared helper instead of a copy of its loop, and stop the
# run if anything other than the position of the positive passage has changed
assert checking_with_previous(instance3, instance4, previous_pos, new_position)
    

In [None]:
filename = '/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_DPR/kilt_dpr_nq_dev_pos4.json'

In [None]:
# save_json is only available through the imported `utils` module
utils.save_json(instance4, filename)

In [None]:
instance5 = deepcopy(instance4)

In [None]:
previous_pos = 4
new_position = 5

In [None]:
# Swap, in place, the passages at the 1-based positions previous_pos and
# new_position of every instance. Re-running this cell swaps them back, so it
# must be executed exactly once.
for ins in instance5:
    ins['ctxs'][previous_pos-1], ins['ctxs'][new_position-1] = ins['ctxs'][new_position-1], ins['ctxs'][previous_pos-1]

In [None]:
# checking: reuse the shared helper instead of a copy of its loop, and stop the
# run if anything other than the position of the positive passage has changed
assert checking_with_previous(instance4, instance5, previous_pos, new_position)
    

In [None]:
filename = '/data/philhoon-relevance/FiD/open_domain_data/NQ_KILT_DPR/kilt_dpr_nq_dev_pos5.json'

In [None]:
# save_json is only available through the imported `utils` module
utils.save_json(instance5, filename)

In [None]:
instance5[0]['ctxs']

In [None]:
previous_pos

In [None]:
# checking instance5 and instances
previous_pos = 5
new_position = 1

In [None]:
instance5[0]['ctxs']

In [None]:
# checking: the last file (positive at previous_pos) must agree with the first
# one (positive at new_position); reuse the shared helper instead of a copy of
# its loop and stop the run on any mismatch
assert checking_with_previous(instance5, instances, previous_pos, new_position)
    

In [None]:
k = [1,2,3]
q = [1,2,3]

In [None]:
k == q

In [None]:
previous_pos = 1
new_position = 2

In [None]:
k[:previous_pos-1] + k[previous_pos:] == q[:new_position-1] + q[new_position:]

In [None]:
q[:new_position-1] + q[new_position:]