In [7]:
import json
import re

In [None]:
# The file is read as JSON Lines: every non-blank line is parsed as one JSON record.
file_path = 'v1.0-simplified_nq-dev-all.jsonl'
# JSON text is UTF-8; being explicit avoids decode errors (or silently garbled
# text) on platforms whose default locale encoding is something else.
with open(file_path, 'r', encoding='utf-8') as json_file:
    json_list = list(json_file)

# Skip blank lines (e.g. a trailing newline at the end of the file), which
# would otherwise make json.loads raise a JSONDecodeError.
data = [json.loads(json_str) for json_str in json_list if json_str.strip()]

In [3]:
def get_nq_tokens(simplified_nq_example):
    """Return the list of tokens of a simplified NQ example.

    The document text is split on single spaces only (not on arbitrary
    whitespace), so consecutive spaces yield empty-string tokens.

    Args:
        simplified_nq_example: Mapping that contains a `document_text` string.

    Returns:
        List of strings obtained from `document_text.split(" ")`.

    Raises:
        ValueError: If the example has no `document_text` field.
    """
    if "document_text" not in simplified_nq_example:
        # Note the trailing space: the two literals are concatenated verbatim.
        raise ValueError("`get_nq_tokens` should be called on a simplified NQ "
                         "example that contains the `document_text` field.")

    return simplified_nq_example["document_text"].split(" ")

In [4]:
def simplify_nq_example(nq_example):
    """Build the simplified form of a raw NQ example without modifying it.

    The token list is flattened into one space-joined `document_text` string
    and the `start_byte` / `end_byte` fields are dropped from every long
    answer candidate, long answer and short answer span.

    Args:
        nq_example: Mapping with the keys `question_text`, `example_id`,
            `document_url`, `document_tokens` (list of dicts with a `token`
            string), `long_answer_candidates` (list of span dicts) and
            `annotations` (list of dicts with `long_answer` and
            `short_answers`).

    Returns:
        A new dict with the keys `question_text`, `example_id`,
        `document_url`, `document_text`, `long_answer_candidates` and
        `annotations`. Spans and annotations are copies, so the input example
        is left untouched.

    Raises:
        ValueError: If splitting `document_text` on spaces does not give back
            exactly as many tokens as `document_tokens` contains.
    """
    byte_offset_keys = ("start_byte", "end_byte")

    def _clean_token(token):
        # A space inside a token would break the one-token-per-split(" ")
        # invariant checked below, so it is replaced by an underscore.
        return token["token"].replace(" ", "_")

    def _remove_html_byte_offsets(span):
        # Return a stripped copy instead of deleting keys from the caller's dict.
        return {
            key: value for key, value in span.items()
            if key not in byte_offset_keys
        }

    def _clean_annotation(annotation):
        # Shallow copy keeps every other annotation field as it is.
        cleaned = dict(annotation)
        cleaned["long_answer"] = _remove_html_byte_offsets(
            annotation["long_answer"])
        cleaned["short_answers"] = [
            _remove_html_byte_offsets(sa) for sa in annotation["short_answers"]
        ]
        return cleaned

    text = " ".join(_clean_token(t) for t in nq_example["document_tokens"])

    simplified_nq_example = {
        "question_text": nq_example["question_text"],
        "example_id": nq_example["example_id"],
        "document_url": nq_example["document_url"],
        "document_text": text,
        "long_answer_candidates": [
            _remove_html_byte_offsets(c)
            for c in nq_example["long_answer_candidates"]
        ],
        "annotations": [_clean_annotation(a) for a in nq_example["annotations"]],
    }

    # Sanity check: the flattened text must tokenize back to the same number
    # of tokens, otherwise the token offsets of the spans would be wrong.
    if len(get_nq_tokens(simplified_nq_example)) != len(
            nq_example["document_tokens"]):
        raise ValueError("Incorrect number of tokens.")

    return simplified_nq_example

In [5]:
def make_human_readable_example(example, annotation_index=0):
    """Extract the question, long answer and short answers as plain text.

    Args:
        example: Simplified NQ example with `document_text`, `question_text`
            and `annotations` (each holding `long_answer` and `short_answers`
            spans expressed as token offsets).
        annotation_index: Which entry of `example['annotations']` to read.
            Defaults to the first one.

    Returns:
        A `(question, long_answer, short_answers)` tuple of strings, where the
        short answers are joined with single spaces. Returns
        `(None, None, None)` when the chosen annotation has no long answer
        (`candidate_index == -1`) or an empty `short_answers` list.
    """
    # The span offsets index the single-space split of the document, the same
    # tokenization `get_nq_tokens` uses. A bare `split()` also breaks on any
    # other whitespace inside a token (e.g. a non-breaking space) and collapses
    # repeated spaces, which shifts every later offset.
    tokens = example['document_text'].split(' ')

    question = example['question_text']
    annotation = example['annotations'][annotation_index]
    long_answer_info = annotation['long_answer']
    short_answer_info = annotation['short_answers']
    if long_answer_info['candidate_index'] == -1:
        return None, None, None

    short_answers = [
        ' '.join(tokens[ans['start_token']:ans['end_token']])
        for ans in short_answer_info
    ]
    if not short_answers:
        return None, None, None

    long_answer = tokens[long_answer_info['start_token']:long_answer_info['end_token']]
    return question, ' '.join(long_answer), ' '.join(short_answers)

In [8]:
# Take one raw record and convert it to the simplified NQ format.
ind = 0
example = data[ind]
simple_example = simplify_nq_example(example)

# Show the question and its answers as plain text; the three parts are
# emitted by a single print call and separated by blank lines.
q, l, s = make_human_readable_example(simple_example)
print(
    f'Question: {q}\n',
    f'Long answer: {l}\n',
    f'Short answer: {s}',
    sep='\n',
)

Question: what do the 3 dots mean in math

Long answer: <P> In logical argument and mathematical proof , the therefore sign ( ∴ ) is generally used before a logical consequence , such as the conclusion of a syllogism . The symbol consists of three dots placed in an upright triangle and is read therefore . It is encoded at U + 2234 ∴ therefore ( HTML & # 8756 ; &there4 ; ) . For common use in Microsoft Office hold the ALT key and type `` 8756 '' . While it is not generally used in formal writing , it is used in mathematics and shorthand . It is complementary to U + 2235 ∵ because ( HTML & # 8757 ; ) . </P>

Short answer: the therefore sign ( ∴ ) is generally used before a logical consequence , such as the conclusion of a syllogism


- **question_text:** question
- **document_text:** whole wiki page
- **annotations:** list of 5 annotations including long answer info & short answers info
- **short_answers:** list of dicts with start and end tokens of actual short answers (can be empty)
- **long_answer:** dict with start and end tokens of the actual long answer (the tokens and `candidate_index` are -1 when there is no long answer)
- **long_answer_candidates:** list of paragraphs from the wiki page with start and end tokens