In [1]:
import json
import numpy as np
import os
import random
from copy import deepcopy
from pprint import pprint
from tqdm import tqdm

# Root directory for all datasets read and written by this notebook,
# located under the current user's HOME directory.
DATA_DIR = f"{os.getenv('HOME')}/research/data"

In [2]:
# --- Run configuration -------------------------------------------------
# model_no selects which checkpoint directory to read from; beam and
# length_penalty are only used to pick the matching hypothesis file name.
model_no = 2
beam = 10
length_penalty = 2.5
split = 'train_para'

# Checkpoint sub-directory (relative to checkpoint_dir) for each model number.
model_dirs = {
    1: 'umt.comparison.paired.wp=0.15.sa=0.5.ebs=256.lr=0.0001.ws=0.wd=0.0.wb=0.0/17507568',
    2: 'umt.comparison.paired.ebs=256.lr=0.0001.ws=0.wd=0.0.wb=0.03/17410140',
}
# Raise explicitly instead of `assert False`: asserts are stripped under
# `python -O`, which would leave model_dir undefined further down.
if model_no not in model_dirs:
    raise ValueError(f'Bad model_no: {model_no!r} (expected one of {sorted(model_dirs)})')
model_dir = model_dirs[model_no]

checkpoint_dir = os.getenv('HOME') + '/research/XLM/dumped'
subqs_filename = f'hyp.bs={beam}.lp={length_penalty}.es=False.mh-sh.{split}.pred.bleu.sh.txt'
subqs_filepath = os.path.join(checkpoint_dir, model_dir, subqs_filename)

# Map the split name used in the hypothesis file name onto the HotpotQA
# split name: 'valid' becomes 'dev' and the '_para' suffix is dropped.
hotpot_split = split.replace('valid', 'dev').replace('_para', '')
if hotpot_split not in {'train', 'dev', 'test'}:
    raise ValueError(f'Bad split {split!r}: maps to unknown HotpotQA split {hotpot_split!r}')

# One line per example; each line is split into sub-questions later on.
with open(subqs_filepath) as f:
    raw_subqs = f.readlines()
print(f'Read {len(raw_subqs)} Sub-Q pairs')
print(f'model_no={model_no}, beam={beam}, length_penalty={length_penalty}')

Read 17435 Sub-Q pairs
model_no=2, beam=10, length_penalty=2.5


In [3]:
# Each raw line holds the sub-questions of one example, separated by the
# token ' ?'. Split on that separator, discard pieces that are blank after
# trimming, and put a '?' back on the end of every remaining piece.
subqs = [
    [piece.strip() + '?' for piece in line.strip().split(' ?') if piece.strip()]
    for line in raw_subqs
]

# Number of sub-questions recovered for each example.
subq_lens = np.array([len(ex_subqs) for ex_subqs in subqs])
print('Mean # of Sub-Qs:', round(subq_lens.mean(), 3))
subqs[0]

Mean # of Sub-Qs: 1.982


["arthur 's magazine was started when?", 'first for women was started when?']

In [4]:
# Load the question IDs for this split, one per line.
with open(f'{DATA_DIR}/umt/comparison.paired/{split}.qids.txt') as f:
    qids = [line.strip('\n') for line in f]

# QIDs are paired with sub-question lists purely by position. zip() stops
# at the shorter input without complaint, so a line-count mismatch between
# the two files would silently drop examples; check it up front.
assert len(qids) == len(subqs), (
    f'QID / Sub-Q count mismatch: {len(qids)} QIDs vs {len(subqs)} Sub-Q lists')
# A repeated QID would silently overwrite an earlier entry in the dict.
assert len(set(qids)) == len(qids), 'Duplicate QIDs found!'

qid2subqs = dict(zip(qids, subqs))
print(f'Read {len(qids)} QIDs')

Read 17435 QIDs


In [5]:
# Load the HotpotQA examples for the split chosen above.
hotpot_path = f'{DATA_DIR}/hotpot-all/{hotpot_split}.json'
with open(hotpot_path) as f:
    data_hotpot = json.load(f)

num_hotpot_examples = len(data_hotpot['data'])
print(f'Read {num_hotpot_examples} HotpotQA examples')

# Index every example by the id of the first QA in its first paragraph.
qid2examples = {}
for example in data_hotpot['data']:
    first_qa = example['paragraphs'][0]['qas'][0]
    qid2examples[first_qa['id']] = example

# Show the last example visited by the loop as a sanity check.
print(example)

Read 90447 HotpotQA examples
{'paragraphs': [{'context': ['<title> vietnam national cricket team </title> the vietnam national cricket team represents vietnam in international cricket. it will debut in the cricket tournament at the 2017 southeast asian games in kuala lumpur, malaysia.', "<title> pickwick cricket club </title> pickwick cricket club is a barbados cricket club. the club was founded on 23 november 1882, the second oldest cricket club in barbados after wanderers cricket club. the club's home from its foundation until 2005 was kensington oval in bridgetown, the main venue for matches involving the barbados national cricket team and the barbados venue for test cricket involving the west indies cricket team. the ground was built on land on kensington plantation leased by the club for a penny per annum. pickwick club developed kensington oval into the finest ground in barbados and it soon replaced the wanderers ground and the garrison savanna as the venue for inter-colonial mat

In [6]:
# Build a new dataset in which each example keeps its original paragraphs
# but has its QA list replaced by one entry per generated sub-question.
new_data_hotpot = {'data': []}
for qid, subqpair in qid2subqs.items():
    assert len(subqpair) > 0, 'There must be at least 1 Sub-Q!'
    # Deep-copy so the examples held in qid2examples are left untouched.
    example = deepcopy(qid2examples[qid])
    paragraph = example['paragraphs'][0]
    # Each sub-question gets as many (empty) answer slots as the original
    # question had answers.
    num_answers = len(paragraph['qas'][0]['answers'])
    sub_qas = []
    for idx, subq in enumerate(subqpair):
        sub_qas.append({
            'question': subq,
            'answers': [[] for _ in range(num_answers)],
            'id': f'{qid}-{idx}',  # original qid plus the sub-question index
        })
    paragraph['qas'] = sub_qas
    new_data_hotpot['data'].append(example)

print(new_data_hotpot['data'][0])

{'paragraphs': [{'context': ["<title> radio city (indian radio station) </title> radio city is india's first private fm radio station and was started on 3 july 2001. it broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from mumbai (where it was started in 2004), bengaluru (started first in 2001), lucknow and new delhi (since 2003). it plays hindi, english and regional songs. it was launched in hyderabad in march 2006, in chennai on 7 july 2006 and in visakhapatnam october 2007. radio city recently forayed into new media in may 2008 with the launch of a music portal - planetradiocity.com that offers music related news, videos, songs, and other music-related features. the radio station currently plays a mix of hindi and regional music. abraham thomas is the ceo of the company.", "<title> history of albanian football </title> football in albania existed before the albanian football federation (fshf) was created. this was evidenced by the team's registration at the balkan cup tour

In [7]:
# Output directory is keyed by model / beam / length penalty but NOT by
# split, so several splits are written side by side as <split>.json.
save_dir = f'{DATA_DIR}/hotpot-all.umt.comparison.paired.model={model_no}.beam={beam}.lp={float(length_penalty)}'
save_path = f'{save_dir}/{hotpot_split}.json'
print('Saving to', save_dir)
# exist_ok=True: the directory may already exist from saving another split.
# With exist_ok=False the second split could never be written and the
# per-file check below would be unreachable whenever it mattered.
os.makedirs(save_dir, exist_ok=True)
# Refuse to overwrite a file that was already generated for this split.
assert not os.path.isfile(save_path), 'File already exists!'
with open(save_path, 'w') as f:
    json.dump(new_data_hotpot, f, indent=2)

Saving to: /Users/ethanperez/research/data/hotpot-all.umt.comparison.paired.model=2.beam=10.lp=2.5
