In [1]:
import sys; sys.path.append('../../')
import wandb
import pathlib
import os
import json
import numpy as np
import random
import tempfile
from itertools import chain
from collections import defaultdict
random.seed(42)

In [2]:
# Local directory used to cache artifacts downloaded from wandb.
# `pathlib.Path` is used instead of `pathlib.PosixPath` because the latter
# cannot be instantiated on non-POSIX systems; on POSIX the result is the same.
WANDB_CACHE = str(pathlib.Path('~/.wandb_cache').expanduser())
# Version tag of the raw `aaac_raw` artifact; update it to use a different version.
VERSION     = "v0"

In [3]:
# Directory inside the wandb cache that holds the selected version of the raw artifact.
FILE_PATH = os.path.join(WANDB_CACHE,"aaac_raw:%s" % VERSION)
def grab_raw_data(path):
    """Download the raw `aaac_raw` artifact into `path` unless it is already there.

    :param path: local directory that should contain the artifact files; when
        this directory already exists the download is skipped entirely.
    :returns: None
    """
    # Honour the `path` argument for both the existence check and the download
    # root, so the function works for any target directory and not only for
    # the module-level default.
    if os.path.isdir(path):
        return
    with wandb.init() as run:
        artifact = run.use_artifact(
            'aaac/aaac_model_runs/aaac_raw:%s' % VERSION,
            type='dataset'
        )
        artifact.download(root=path)
# Make sure the raw artifact directory is present locally before reading from it.
grab_raw_data(FILE_PATH) 
# Path of the JSON-lines corpus file inside the artifact directory.
DATA_JSON=os.path.join(FILE_PATH,"aaac.jsonl")

The cell above downloads the raw `aaac` corpus (version `v0`) from wandb into a local cache directory, skipping the download when that directory already exists. It requires your `WANDB_API_KEY` to be available in the environment, which can be set with `export WANDB_API_KEY=...`.

In [4]:
# Parse the raw corpus: one JSON object per non-empty line of DATA_JSON.
LINES = []
with open(DATA_JSON, encoding="utf-8") as my_data:
    for raw_line in my_data:
        raw_line = raw_line.strip()
        # A blank (e.g. trailing) line is not valid JSON; skip it instead of crashing.
        if not raw_line:
            continue
        LINES.append(json.loads(raw_line))

In [5]:
# Number of instances loaded from the raw corpus file.
len(LINES)

10000

In [6]:
# Inspect the `reason_statements` field of the second loaded instance.
LINES[1]["reason_statements"]

[{'text': 'If, and only if, Keith is a expert of FC Vaduz and Keith is a member of FC Spartak Trnava, then Keith is a critic of FK Jablonec',
  'starts_at': 0,
  'ref_reco': 1},
 {'text': 'Keith is not a expert of PSV Eindhoven or Keith is a critic of OGC Nice',
  'starts_at': 350,
  'ref_reco': 6},
 {'text': 'if it is not the case that Keith is a expert of FC Vaduz and Keith is a member of FC Spartak Trnava, then Keith is not a critic of OGC Nice',
  'starts_at': 430,
  'ref_reco': 7},
 {'text': 'if Keith is a friend of RC Celta de Vigo, then Keith is a expert of PSV Eindhoven',
  'starts_at': 575,
  'ref_reco': 4},
 {'text': 'if it is not the case that Keith is a expert of FC Vaduz and Keith is a member of FC Spartak Trnava, then Keith is not a critic of OGC Nice',
  'starts_at': 662,
  'ref_reco': 7}]

In [7]:
# List the field names available on the first loaded instance.
list(LINES[0].keys())

['argument_source',
 'argdown_reconstruction',
 'reason_statements',
 'conclusion_statements',
 'explicit_premises',
 'explicit_premises_formalized',
 'implicit_premises',
 'implicit_premises_formalized',
 'conclusion',
 'conclusion_formalized',
 'intermediary_conclusions_formalized',
 'intermediary_conclusions',
 'id',
 'predicate_placeholders',
 'entity_placeholders',
 'steps',
 'n_premises',
 'base_scheme_groups',
 'scheme_variants',
 'domain_id',
 'domain_type',
 'plcd_subs',
 'argdown_index_map',
 'presentation_parameters']

In [8]:
# Fractions of the corpus assigned to train and dev; test receives the remainder.
TRAIN_FRACTION = 0.7
EVAL_FRACTION  = 0.15
SPLIT_SEED     = 42

train_amount = int(len(LINES)*TRAIN_FRACTION)
eval_amount  = int(len(LINES)*EVAL_FRACTION)

# Shuffle a copy with a dedicated, seeded generator. This leaves LINES in its
# loaded order (so earlier cell outputs stay valid) and makes the split
# independent of how much of the global RNG state has been consumed, which
# means re-running this cell always yields the same split.
shuffled_lines = list(LINES)
random.Random(SPLIT_SEED).shuffle(shuffled_lines)

train_instances = shuffled_lines[:train_amount]
dev_instances   = shuffled_lines[train_amount:train_amount+eval_amount]
test_instances  = shuffled_lines[train_amount+eval_amount:]

# The three splits must partition the corpus exactly.
assert len(train_instances) + len(dev_instances) + len(test_instances) == len(LINES)

As a starting point, I took a random 70/15/15 train/dev/test split. I am not sure how much sense a purely random split makes given the structure of the corpus.

In [9]:
### arbitrary limitation on input size for now (the transformer is limited here):
## instances whose argdown has at least this many whitespace tokens are skipped
MAX_ARGDOWN_TOKENS = 280


def _arg_src_argdown_record(record_id, arg_source, argdown):
    """Build the `arg_source` -> `argdown` generation record for the model."""
    return {
        "id": record_id,
        "question": {"stem": arg_source},  #<-- input field
        "output": argdown,  ##<-- left in newlines, not sure what the tokenizer will do here
        "prefix": "gen:",  ##<-- model specific field, indicates the model mode
    }


def _argdown_concl_record(record_id, argdown, conclusion_statements):
    """Build the `argdown` -> conclusion record.

    Returns None when no conclusion statement's text occurs verbatim in `argdown`.
    """
    conclusion_pointers = [
        conclusion["ref_reco"]
        for conclusion in conclusion_statements
        if conclusion["text"] in argdown
    ]
    if not conclusion_pointers:
        return None
    return {
        "id": record_id,
        "question": {"stem": argdown},  ##<-- input
        "output": ','.join(str(v) for v in sorted(conclusion_pointers)),  ##<-- output
        "prefix": "answer:",  ##<-- important, indicates that we will measure accuracy
    }


def _write_split(sname, split, file_out):
    """Write the model records of one split to `file_out`, one JSON object per line."""
    # The context manager closes the file even if an instance raises.
    with open(file_out, 'w') as write_file:
        for k, instance in enumerate(split):
            argdown = instance["argdown_reconstruction"]
            if len(argdown.split()) >= MAX_ARGDOWN_TOKENS:
                continue

            records = (
                _arg_src_argdown_record(
                    "%s_%d_%s" % (sname, k, "arg_src_argdown"),
                    instance["argument_source"],
                    argdown,
                ),
                _argdown_concl_record(
                    "%s_%d_%s" % (sname, k, "argdown_concl"),
                    argdown,
                    instance["conclusion_statements"],
                ),
            )
            for record in records:
                if record is not None:
                    write_file.write(json.dumps(record))
                    write_file.write("\n")


with tempfile.TemporaryDirectory() as tempdir:
    for sname, split in [
        ("train", train_instances),
        ("dev", dev_instances),
        ("test", test_instances)
    ]:
        _write_split(sname, split, os.path.join(tempdir, sname+".jsonl"))

    ### write to wandb
    ## The run is opened inside the temporary directory's scope, so it is
    ## finished (also on error) before the directory is removed.
    with wandb.init(entity="aaac",project="dataset_versions",name="dataset_upload") as run:
        artifact = wandb.Artifact("aaac_multi_angle",type='dataset')
        artifact.add_dir(tempdir)
        run.log_artifact(artifact)

[34m[1mwandb[0m: Currently logged in as: [33myakazimir[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.22 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[34m[1mwandb[0m: Adding directory to artifact (/var/folders/7n/4mvmm_g56gv48s7g5lfzfs880000gp/T/tmpxjtnrdbu)... Done. 0.1s


VBox(children=(Label(value=' 0.00MB of 15.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=4.9752029368…