In [1]:
import json
import numpy as np
import os
import pandas as pd
import simplejson
import string
from copy import deepcopy
from pprint import pprint
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import re

def normalize_answer(s):
    """Canonicalize an answer string so that two answers can be compared.

    The steps are applied in this order:
      1. lowercase the text,
      2. delete every character found in ``string.punctuation``,
      3. replace the standalone words "a", "an" and "the" with a space,
      4. collapse all whitespace runs to single spaces (this also trims
         leading and trailing whitespace).

    Returns the normalized string.
    """
    # A translation table that maps each ASCII punctuation character to "deleted".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    depunctuated = s.lower().translate(drop_punctuation)
    # Articles are removed after punctuation so that e.g. "the," is caught too.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', depunctuated)
    return ' '.join(without_articles.split())


# Root directory for the datasets this notebook reads and writes.
# os.path.expanduser('~') is used instead of os.getenv('HOME') because getenv
# returns None when HOME is unset, which would silently yield the relative path
# 'None/research/data'. When HOME is set the resulting path is unchanged.
DATA_DIR = '{}/research/data'.format(os.path.expanduser('~'))
# Seed passed to random.seed() before any random sub-question/sub-answer sampling.
seed = 42

In [2]:
### Combine Q, sub-Qs, sub-As
# Index the original HotpotQA examples by question ID: once across both splits
# (qid2example) and once per split (qid2split2example, keyed 'train' / 'dev').
qid2example = {}
qid2split2example = {}
qid2final_answers = None
DATA_DIR = '{}/research/data'.format(os.getenv('HOME'))
for split in ['train', 'dev_distractor']:
    split_key = split.replace('_distractor', '')  # 'dev_distractor' is stored under 'dev'
    file_path = '{}/hotpot-orig/hotpot_{}_v1.json'.format(DATA_DIR, split)
    with open(file_path, 'r') as f:
        data_hotpot = json.load(f)
    splitqid2example = {hotpot_example['_id']: hotpot_example for hotpot_example in data_hotpot}
    qid2split2example[split_key] = splitqid2example
    qid2example.update(splitqid2example)

# Read data and predictions (Q, sub-Qs, sub-As, and context).
# Both are stored in `num_shards` shard files; the shards are merged here into
# one predictions dict (subas) and one SQuAD-style {'data': [...]} object.
data_with_subqs = {'data': []}
subas = {}
num_shards = 500
base_name = 'filter_name=bi-cond-lm.top_knn=10.max_mods_per_nn=4'
subas_path_template = '{}/research/DecompRC/DecompRC/out/hotpot/bert_predict.retrieve_and_edit.split=train-dev.{}.num_shards={}.shard_no={}.nbest_predictions.json'
subqs_path_template = '{}/research/data/hotpot-all/split=train-dev.{}.num_shards={}.shard_no={}.json'
for shard_no in range(num_shards):
    # The same four values fill the placeholders of both path templates.
    shard_format_args = (os.getenv('HOME'), base_name, num_shards, shard_no)
    with open(subas_path_template.format(*shard_format_args)) as f:
        subas.update(json.load(f))
    with open(subqs_path_template.format(*shard_format_args)) as f:
        data_with_subqs['data'].extend(json.load(f)['data'])

# SQuAD-formatted version of the dataset, keyed by 'train' / 'dev'.
data_hotpot_squad = {}
for split in ['train', 'dev_distractor']:
    file_path = '{}/hotpot-squad/hotpot_{}_v1.json'.format(DATA_DIR, split)
    with open(file_path, 'r') as f:
        squad_split = json.load(f)
    data_hotpot_squad[split.replace('_distractor', '')] = squad_split

In [3]:
# Flatten every sub-question and the first listed prediction for it into two
# parallel lists. These are the pools that random sub-Q / sub-A sampling draws from.
all_subqs = []
all_subas = []
for example in data_with_subqs['data']:
    for para in example['paragraphs']:
        subqas = para['qas']
        # The part of the first sub-question's id before the first '-' is used as
        # the parent question ID; predictions are keyed '<qid>-<position>'.
        qid = subqas[0]['id'].split('-')[0]
        for s, subqa in enumerate(subqas):
            all_subqs.append(subqa['question'].strip())
            first_prediction = subas['{}-{}'.format(qid, s)][0]
            all_subas.append(first_prediction['text'].strip())

In [4]:
# Process/combine data
# Flags choosing what goes into the combined string built for each question.
use_q = False            # include the original question text
use_subq = True          # include each sub-question
use_random_subq = False  # swap each sub-question for one sampled from all_subqs
use_suba = True          # include each sub-answer
use_random_suba = False  # swap each sub-answer for one sampled from all_subas

subq_sep = '/'
# Segments are joined as '<segment> / <segment> / ...'.
segment_joiner = ' ' + subq_sep + ' '
random.seed(seed)
num_question_words = []
answer_lens = []
qid2q_subqs_subas = {}
for example in data_with_subqs['data']:
    for para in example['paragraphs']:
        question = para['original_question'].strip() if use_q else ''
        subqas = para['qas']
        qid = subqas[0]['id'].split('-')[0]
        # Collect the pieces and join them once at the end. The previous code
        # prepended ' /' to every piece and then called
        # question.strip(' ' + subq_sep + ' '); str.strip treats its argument as
        # a *set of characters*, so that also removed genuine leading/trailing
        # '/' characters belonging to the sub-question or sub-answer text.
        segments = [question] if question else []
        if use_subq or use_suba:
            for s in range(len(subqas)):
                # NB: Can also extract: probability, logit, no_answer_logit, evidence
                subqid = qid + '-' + str(s)
                pieces = []
                if use_subq:
                    subq = random.sample(all_subqs, 1)[0] if use_random_subq else subqas[s]['question']
                    pieces.append(subq.strip())
                if use_suba:
                    suba = random.sample(all_subas, 1)[0] if use_random_suba else subas[subqid][0]['text']
                    pieces.append(suba.strip())
                segments.append(' '.join(pieces))
        question = segment_joiner.join(segments).strip()
        qid2q_subqs_subas[qid] = question
        # Answer length in characters. The earlier
        # sum([1 for answer in <answer string> if len(answer) > 0]) iterated over
        # the characters of the string and therefore computed exactly this.
        answer_lens.append(len(qid2example[qid]['answer']))
        num_question_words.append(len(question.split()))
num_question_words = np.array(num_question_words)
answer_lens = np.array(answer_lens)
print('# words in Q, sub-Qs, and sub-As: {:.2f}'.format(num_question_words.mean()))
print('answer lengths:', answer_lens.min(), answer_lens.mean(), answer_lens.max())

# words in Q, sub-Qs, and sub-As: 469.97
answer lengths: 1 13.848710297183501 623


In [5]:
# Spot-check: display the combined string stored in qid2q_subqs_subas for one question ID.
qid2q_subqs_subas['5a7a06935542990198eaf050']

"first for women magazine started? 1989 / arthur's magazine started? ready set go! / arthur's magazine magazine started? ready set go! / where was or first for? philadelphia / which university magazine is a started first arthur magazine? first for women / which university magazine is a first arthur's magazine? ready set go! / which university magazine is a first arthur magazine? ready set go! / which university magazine is was started first magazine? arthur's magazine / when was arthur's magazine magazine started? 1989 / when was first for women law magazine started? 19th century / when was the first for women law magazine started? 1844 / when was the magazine magazine started? 1989 / which university magazine is a started first arthur magazine? first for women / which university magazine is a first arthur's magazine? ready set go! / which university magazine is a first arthur magazine? ready set go! / which university magazine is a free started first arthur magazine? first for women /

In [6]:
def _find_longest_answer_span(answer_text, context):
    """Find the longest contiguous word span of `answer_text` that occurs in `context`.

    Spans are tried from longest to shortest and, within one length, from left
    to right; the first span that is a substring of `context` wins. Spans that
    consist solely of an article ('a', 'an', 'the') are skipped.

    Returns a tuple (span_text, span_start, f1):
      span_text  -- the matched words joined by single spaces, or None if no match
      span_start -- character offset of the first occurrence in `context`, or None
      f1         -- F1 of the span against the full answer with precision fixed
                    at 1 and recall = matched words / answer words; 0. if no match
    """
    # NOTE(review): matching is plain substring containment, so a span can match
    # in the middle of a longer word of `context` -- confirm this is acceptable.
    answer_words = answer_text.split()
    for span_len in range(len(answer_words), 0, -1):
        for word_start in range(len(answer_words) - span_len + 1):
            span_text = ' '.join(answer_words[word_start: word_start + span_len])
            if span_text in {'a', 'an', 'the'}:
                continue  # Exclude articles as SQuAD evaluation does
            if span_text in context:
                answer_prec = 1.
                answer_recall = span_len / float(len(answer_words))
                f1 = 2 * (answer_prec * answer_recall) / (answer_prec + answer_recall)
                return span_text, context.index(span_text), f1
    return None, None, 0.


answer_f1s = []
# Work on a deep copy so data_hotpot_squad keeps the data as loaded from disk.
data_hotpot_squad_new = deepcopy(data_hotpot_squad)
for split in data_hotpot_squad_new.keys():
    print('split:', split)
    for example in tqdm(data_hotpot_squad_new[split]['data']):
        if '_id' not in example['paragraphs'][0]:
            continue # SQuAD example: No modification necessary
        # Treat input (Qs, sub-Qs, and sub-As) all as one paragraph
        example['paragraphs'] = example['paragraphs'][:1]
        paragraph = example['paragraphs'][0]
        qid = paragraph['_id']
        if qid in qid2q_subqs_subas:
            # 'yes no ' is prepended to the context -- presumably so that yes/no
            # answers can also be located as spans.
            paragraph['context'] = 'yes no ' + qid2q_subqs_subas[qid].strip().lower()
        # NOTE(review): when qid has no combined string, the paragraph keeps its
        # existing context and the answer is searched in that instead -- confirm
        # this fallback is intended.

        # Find answer text in Qs, sub-Qs, and sub-As
        answer_text = qid2split2example[split][qid]['answer'].lower().strip()
        answer_text_slice, answer_start, answer_f1 = _find_longest_answer_span(answer_text, paragraph['context'])
        answer_f1s.append(answer_f1)
        assert len(paragraph['qas']) == 1
        qa = paragraph['qas'][0]
        qa['answers'] = [{'text': answer_text_slice, 'answer_start': answer_start}] if answer_start is not None else []
        qa['is_impossible'] = answer_start is None
        if not use_q:
            # The original question is withheld; a bare '?' is stored in its place.
            qa['question'] = '?'
pprint(data_hotpot_squad_new['dev']['data'][0]['paragraphs'])
answer_f1s = np.array(answer_f1s)
# Mean F1 / exact-match rate of the best span found for each answer.
print('Max F1:', answer_f1s.mean())
print('Max EM:', (answer_f1s == 1.).mean())  # 71044: 1 EM, 26808: 0 EM

  4%|▍         | 3839/90889 [00:00<00:02, 38388.57it/s]

split: train


100%|██████████| 90889/90889 [00:02<00:00, 41107.91it/s]
100%|██████████| 7440/7440 [00:00<00:00, 39288.97it/s]


split: dev
[{'_id': '5a8b57f25542995d1e6f1371',
  'context': 'yes no derrickson the dorsland trekkers? conrad brooks / what '
             'derrickson the dorsland trekkers? conrad brooks / derrickson and '
             'were the dorsland trekkers? no / ed wood were the dorsland '
             'trekkers? no / what is the derrickson and ed sergewa? filmmaker, '
             'actor, writer, producer, and director / what is derrickson and '
             'ed of seham sergewa? filmmaker, actor, writer, producer, and '
             'director / what is the were scott derrickson sergewa? american '
             'actor / what is the were scott derrickson seham sergewa? '
             'american actor / derrickson and ed evlija chelebija? no / scott '
             'derrickson evlija chelebija? conrad brooks / and ed wood evlija '
             'chelebija? sinister / ed wood of evlija chelebija? tyler bates / '
             'what was the were scott derrickson of ed policy? american '
             '

In [7]:
# Spot-check: pretty-print the paragraphs of the first entry in the rewritten train split.
pprint(data_hotpot_squad_new['train']['data'][0]['paragraphs'])

[{'_id': '5a7a06935542990198eaf050',
  'context': "yes no first for women magazine started? 1989 / arthur's "
             "magazine started? ready set go! / arthur's magazine magazine "
             'started? ready set go! / where was or first for? philadelphia / '
             'which university magazine is a started first arthur magazine? '
             "first for women / which university magazine is a first arthur's "
             'magazine? ready set go! / which university magazine is a first '
             'arthur magazine? ready set go! / which university magazine is '
             "was started first magazine? arthur's magazine / when was "
             "arthur's magazine magazine started? 1989 / when was first for "
             'women law magazine started? 19th century / when was the first '
             'for women law magazine started? 1844 / when was the magazine '
             'magazine started? 1989 / which university magazine is a started '
             'first arthur magaz

In [8]:
# Spot-check: pretty-print the paragraphs of the last entry in the rewritten dev split.
# NOTE(review): judging by the printed output this entry has no '_id' and so was left unmodified -- confirm.
pprint(data_hotpot_squad_new['dev']['data'][-1]['paragraphs'])

[{'context': 'Philosophers in antiquity used the concept of force in the study '
             'of stationary and moving objects and simple machines, but '
             'thinkers such as Aristotle and Archimedes retained fundamental '
             'errors in understanding force. In part this was due to an '
             'incomplete understanding of the sometimes non-obvious force of '
             'friction, and a consequently inadequate view of the nature of '
             'natural motion. A fundamental error was the belief that a force '
             'is required to maintain motion, even at a constant velocity. '
             'Most of the previous misunderstandings about motion and force '
             'were eventually corrected by Galileo Galilei and Sir Isaac '
             'Newton. With his mathematical insight, Sir Isaac Newton '
             'formulated laws of motion that were not improved-on for nearly '
             'three hundred years. By the early 20th century, Einstein '
 

           'is_impossible': False,
           'question': 'What is the magnitude of force divided by when '
                       'external force is added?'},
          {'answers': [],
           'id': '5ad266f6d7d075001a4291fe',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 71, 'text': 'forces'}],
           'question': 'In an open system of particles, there are no internal '
                       'what?'},
          {'answers': [],
           'id': '5ad266f6d7d075001a4291ff',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 71, 'text': 'forces'}],
           'question': 'What are balance in an open system of particles?'},
          {'answers': [],
           'id': '5ad266f6d7d075001a429200',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 455, 'text': 'acceleration'}],
           'question': 'If an internal force acts on the system, the center of '
                       'ma

          {'answers': [],
           'id': '5ad26839d7d075001a42925a',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 0, 'text': 'Forces'}],
           'question': 'What acts in no particular direction?'},
          {'answers': [],
           'id': '5ad26839d7d075001a42925b',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 0, 'text': 'Forces'}],
           'question': 'What has sizes depending on how weak the push or pull '
                       'is?'},
          {'answers': [],
           'id': '5ad26839d7d075001a42925c',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 195, 'text': 'forces'}],
           'question': 'What are classified as "vintage quantities"?'},
          {'answers': [],
           'id': '5ad26839d7d075001a42925d',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 195, 'text': 'forces'}],
           'question': 'What follows

           'is_impossible': True,
           'plausible_answers': [{'answer_start': 395, 'text': 'mass'}],
           'question': 'Newton realized that gravitational deceleration was '
                       'proportional to what?'}]},
 {'context': 'In this equation, a dimensional constant  is used to describe '
             'the relative strength of gravity. This constant has come to be '
             "known as Newton's Universal Gravitation Constant, though its "
             "value was unknown in Newton's lifetime. Not until 1798 was Henry "
             'Cavendish able to make the first measurement of  using a torsion '
             'balance; this was widely reported in the press as a measurement '
             'of the mass of the Earth since knowing  could allow one to solve '
             "for the Earth's mass given the above equation. Newton, however, "
             'realized that since all celestial bodies followed the same laws '
             'of motion, his law of gravity had

                       {'answer_start': 298,
                        'text': 'the shortest space-time path between two '
                                'space-time events.'}],
           'id': '57378b141c456719005744a0',
           'is_impossible': False,
           'question': 'What space-time path is seen as a curved line in '
                       'space?'},
          {'answers': [{'answer_start': 1117, 'text': 'gravitational force'},
                       {'answer_start': 1117, 'text': 'gravitational force'},
                       {'answer_start': 1117, 'text': 'gravitational force'},
                       {'answer_start': 1117, 'text': 'gravitational force'}],
           'id': '57378b141c456719005744a1',
           'is_impossible': False,
           'question': "What is the derivative of an object's changing "
                       'momentum called?'},
          {'answers': [{'answer_start': 498, 'text': 'global'},
                       {'answer_start': 496, 'text': 'a glob

                                  'text': 'stress tensor'}],
           'question': 'What does not cause strain in structures?'},
          {'answers': [],
           'id': '5ad2877cd7d075001a429913',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 132,
                                  'text': 'pressure terms'}],
           'question': 'What is associated with abnormal forces?'},
          {'answers': [],
           'id': '5ad2877cd7d075001a429914',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 262, 'text': 'shear terms'}],
           'question': 'What is associated with horizontal forces?'},
          {'answers': [],
           'id': '5ad2877cd7d075001a429915',
           'is_impossible': True,
           'plausible_answers': [{'answer_start': 322,
                                  'text': 'cross-sectional area'}],
           'question': 'Another term for on-diagonal elements is what?'}]},
 {'context': 'Torque 

In [9]:
# Build the output directory suffix from the flags that shaped the inputs:
# '-q' if the question was included, then '-randsubqs' or '-subqs', then
# '-randsubas' or '-subas', and always a trailing '-nop'.
postfix_parts = []
if use_q:
    postfix_parts.append('-q')
if use_random_subq:
    postfix_parts.append('-randsubqs')
elif use_subq:
    postfix_parts.append('-subqs')
if use_random_suba:
    postfix_parts.append('-randsubas')
elif use_suba:
    postfix_parts.append('-subas')
postfix_parts.append('-nop')
dir_postfix = ''.join(postfix_parts)

# Only write when a suffix exists, so the output never lands in the unsuffixed
# 'hotpot-squad' directory.
if dir_postfix:
    save_dir = '{}/hotpot-squad{}'.format(DATA_DIR, dir_postfix)
    for split in ['dev_distractor', 'train']:
        print('save_dir:', save_dir, 'split:', split)
        os.makedirs(save_dir, exist_ok=True)
        out_path = os.path.join(save_dir, 'hotpot_{}_v1.json'.format(split))
        with open(out_path, 'w') as f:
            # In-memory splits are keyed 'train' / 'dev'; file names keep the full split name.
            json.dump(data_hotpot_squad_new[split.replace('_distractor', '')], f, indent=2)

save_dir: /Users/ethanperez/research/data/hotpot-squad-subqs-subas-nop split: dev_distractor
save_dir: /Users/ethanperez/research/data/hotpot-squad-subqs-subas-nop split: train
