In [None]:
import os

# Step up to the parent directory exactly once per kernel session. The
# `runonce` flag survives re-execution of this cell, so running it again
# does not climb another level.
try:
    runonce
except NameError:
    os.chdir('..')
    runonce = True
os.getcwd()

'/home/stud/liao/shen/flamingo-gnn'

# Convert OBQA data to our format.

In [None]:
import pickle
import shutil
from pathlib import Path

import srsly
from tqdm import tqdm

# Input and output locations keyed by split name. All paths are relative to
# the current working directory.
original_statement_paths = {
    split: f'data/obqa/statement_origin/{split}.statement.jsonl'
    for split in ('train', 'dev', 'test')
}
processed_statement_paths = {
    split: f'data/obqa/statement/{split}.statement.jsonl'
    for split in ('train', 'dev', 'test')
}
# One pickle file per split holding the original subgraphs.
original_subgraph_paths = {
    split: f'data/obqa/graph/{split}.graph.adj.pk'
    for split in ('train', 'dev', 'test')
}
# One output directory per split for the converted subgraph files.
processed_subgraph_dir = {
    split: f'data/obqa/adj/{split}/'
    for split in ('train', 'dev', 'test')
}

## 1. Convert Statement

The original statement format:

In [None]:
# Pretty-print the first record of the original train statement file with jq.
original_train_statement_path = original_statement_paths['train']
!head -n 1 $original_train_statement_path | jq

[1;39m{
  [0m[34;1m"id"[0m[1;39m: [0m[0;32m"7-980"[0m[1;39m,
  [0m[34;1m"question"[0m[1;39m: [0m[1;39m{
    [0m[34;1m"stem"[0m[1;39m: [0m[0;32m"The sun is responsible for"[0m[1;39m,
    [0m[34;1m"choices"[0m[1;39m: [0m[1;39m[
      [1;39m{
        [0m[34;1m"text"[0m[1;39m: [0m[0;32m"puppies learning new tricks"[0m[1;39m,
        [0m[34;1m"label"[0m[1;39m: [0m[0;32m"A"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"text"[0m[1;39m: [0m[0;32m"children growing up and getting old"[0m[1;39m,
        [0m[34;1m"label"[0m[1;39m: [0m[0;32m"B"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"text"[0m[1;39m: [0m[0;32m"flowers wilting in a vase"[0m[1;39m,
        [0m[34;1m"label"[0m[1;39m: [0m[0;32m"C"[0m[1;39m
      [1;39m}[0m[1;39m,
      [1;39m{
        [0m[34;1m"text"[0m[1;39m: [0m[0;32m"plants sprouting, blooming and wilting"[0m[1;39m,
        [0m[34;1m"label"[

The desired format of statement:
```json
{
    "id": "statement_id",
    "question": "question",
    "answers": ["choice1", "choice2"],
    "context": "",
    "label": 1  # the correct answer index
}
```

In [None]:
# Convert the original statements of each split into the flat record layout
# (id, question, answers, context, label) and write them out as JSONL.
for split in ['train', 'dev']:
    # srsly.write_jsonl does not create missing parent directories, so make
    # sure the output folder exists (the subgraph step does the same).
    Path(processed_statement_paths[split]).parent.mkdir(parents=True, exist_ok=True)
    original_statements = srsly.read_jsonl(original_statement_paths[split])
    processed_statements = []
    for statement in tqdm(original_statements):
        processed_statement = {
            'id': statement['id'],
            'question': statement['question']['stem'],
            'answers': [choice['text'] for choice in statement['question']['choices']],
            # The desired format carries a context field; it is empty here.
            'context': '',
            # Turn the letter answer key into a 0-based index into `answers`.
            'label': 'ABCDE'.index(statement['answerKey'])
        }
        processed_statements.append(processed_statement)
    srsly.write_jsonl(processed_statement_paths[split], processed_statements)

4957it [00:00, 90679.45it/s]
500it [00:00, 94076.44it/s]


In [None]:
# Pretty-print the first converted train statement to inspect the new layout.
!head -n 1 data/obqa/statement/train.statement.jsonl | jq

[1;39m{
  [0m[34;1m"id"[0m[1;39m: [0m[0;32m"7-980"[0m[1;39m,
  [0m[34;1m"question"[0m[1;39m: [0m[0;32m"The sun is responsible for"[0m[1;39m,
  [0m[34;1m"answers"[0m[1;39m: [0m[1;39m[
    [0;32m"puppies learning new tricks"[0m[1;39m,
    [0;32m"children growing up and getting old"[0m[1;39m,
    [0;32m"flowers wilting in a vase"[0m[1;39m,
    [0;32m"plants sprouting, blooming and wilting"[0m[1;39m
  [1;39m][0m[1;39m,
  [0m[34;1m"label"[0m[1;39m: [0m[0;39m3[0m[1;39m
[1;39m}[0m


## 2. Convert Subgraph

The desired format is a tuple `(adj, nodes, qmask, amask)`. Each subgraph is stored in a separate file named `[statement id].pkl`.

In [None]:
# Stride used to pick one subgraph per statement out of the original pickle.
# NOTE(review): presumably the pickle holds one subgraph per
# (statement, choice) pair and the entry of the first choice is kept -
# confirm against the upstream preprocessing.
num_choices = 4
for split in ['train', 'dev']:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(original_subgraph_paths[split], 'rb') as src:
        subgraphs = pickle.load(src)
    # Materialise the statements so their count can be compared below.
    statements = list(srsly.read_jsonl(processed_statement_paths[split]))
    selected_subgraphs = subgraphs[::num_choices]
    # zip() silently stops at the shorter input; a length mismatch would leave
    # statements without a file or pair them with the wrong subgraph.
    if len(selected_subgraphs) != len(statements):
        raise ValueError(
            f'{split}: {len(selected_subgraphs)} subgraphs selected '
            f'but {len(statements)} statements found'
        )
    output_dir = Path(processed_subgraph_dir[split])
    output_dir.mkdir(parents=True, exist_ok=True)
    for subgraph, statement in tqdm(zip(selected_subgraphs, statements), total=len(statements)):
        # Keep only the four fields of the target tuple, in this order.
        subgraph = (subgraph['adj'], subgraph['concepts'], subgraph['qmask'], subgraph['amask'])
        with open(output_dir / (statement['id'] + '.pkl'), 'wb') as dst:
            pickle.dump(subgraph, dst)

  subgraphs = pickle.load(f)
4957it [00:00, 10616.69it/s]
500it [00:00, 8478.03it/s]


## 3. Create Sanity Check Dataset

I choose the first 192 statements as the sanity check dataset. The format is the same as the statement dataset.

In [None]:
# Size of the sanity-check subset.
n_sanity = 192
# Output locations of the sanity-check statements and subgraph files.
sanity_subgraph_path = 'data/obqa_sanity/adj/train/'
sanity_statement_path = 'data/obqa_sanity/statement/train.statement.jsonl'

In [None]:
# Build the sanity-check dataset: the first `n_sanity` processed train
# statements plus hard links to their subgraph pickles.
sanity_subgraph_dir = Path(sanity_subgraph_path)
# Neither os.link nor srsly.write_jsonl creates missing directories.
sanity_subgraph_dir.mkdir(parents=True, exist_ok=True)
Path(sanity_statement_path).parent.mkdir(parents=True, exist_ok=True)

processed_train_statements = iter(srsly.read_jsonl(processed_statement_paths['train']))
sanity_statements = []
# zip() with range() first stops after n_sanity records without consuming an
# extra one, and ends early instead of raising if the split is shorter.
for _, statement in zip(range(n_sanity), processed_train_statements):
    sanity_statements.append(statement)
    file_name = statement['id'] + '.pkl'
    source = os.path.join(processed_subgraph_dir['train'], file_name)
    target = sanity_subgraph_dir / file_name
    # os.link refuses to overwrite; remove a link left by an earlier run so
    # that this cell can be executed again.
    if os.path.lexists(target):
        os.remove(target)
    os.link(source, target)
srsly.write_jsonl(sanity_statement_path, sanity_statements)
# Original (misspelled) name, kept in case other cells still refer to it.
sainty_statements = sanity_statements

## 4. Create OBQA Pretrain dataset

In the OBQA data, there is a fact associated with each OBQA statement. We pretrain the model by asking the model to output the fact given the subgraph.

The format of the statement:
```json
{
  "id": "train-5ee1ace4",
  "context": "The award was originally sculpted by George Stanley from a design sketch by Cedric Gibbons. AMPAS first presented it in 1929 at a private dinner hosted by Douglas Fairbanks in the Hollywood Roosevelt Hotel. The Academy Awards ceremony was first broadcast on radio in 1930 and televised for the first time in 1953. It is the oldest worldwide entertainment awards ceremony and is now seen live worldwide. Its equivalents – the Emmy Awards for television, the Tony Awards for theater, and the Grammy Awards for music – are modeled after the Academy Awards.",
  "question": "",
  "answers": []
}
```

In [None]:
# Input (fact-annotated) and output statement files for the pretraining
# dataset, keyed by split name.
original_fact_statement_paths = {
    split: f'data/obqa/statement_origin/{split}-fact.statement.jsonl'
    for split in ('train', 'dev')
}
processed_pretrain_statement_paths = {
    split: f'data/obqa_pretrain/statement/{split}.statement.jsonl'
    for split in ('train', 'dev')
}

In [None]:
# Build the pretraining statements: each record keeps the id and question
# stem, has no answers, and stores the statement's `fact1` as its context.
for split in ['train', 'dev']:
    # srsly.write_jsonl does not create missing parent directories, so make
    # sure the output folder exists before writing.
    Path(processed_pretrain_statement_paths[split]).parent.mkdir(parents=True, exist_ok=True)
    original_statements = srsly.read_jsonl(original_fact_statement_paths[split])
    processed_statements = []
    for statement in tqdm(original_statements):
        processed_statement = {
            'id': statement['id'],
            'question': statement['question']['stem'],
            'answers': [],
            'context': statement['fact1'],
        }
        processed_statements.append(processed_statement)
    srsly.write_jsonl(processed_pretrain_statement_paths[split], processed_statements)