In [1]:
import sys; sys.path.append('../../')
import wandb
import pathlib
import os
import json
import numpy as np
import random
import tempfile
from itertools import chain
from collections import defaultdict
random.seed(42)

In [2]:
# Local directory in which downloaded wandb artifacts are cached.
# `pathlib.Path` resolves to the host's concrete path class, so this cell also
# runs on non-POSIX systems (where `PosixPath` cannot be instantiated).
WANDB_CACHE = str(pathlib.Path('~/.wandb_cache').expanduser())
VERSION     ="v3" ##<- update if you want to use a different version

In [3]:
# Local directory the raw artifact is downloaded into: <WANDB_CACHE>/aaac_raw:<VERSION>
FILE_PATH = os.path.join(WANDB_CACHE,"aaac_raw:%s" % VERSION)
def grab_raw_data(path):
    """Download the raw `aaac` dataset artifact into `path` if it is not already there.

    :param path: target directory for the artifact. If it already exists as a
        directory it is treated as a populated cache and nothing is downloaded.
    :returns: None
    """
    # An existing directory means an earlier run already fetched the data.
    if os.path.isdir(path):
        return
    # Use the `path` argument (not the module-level FILE_PATH) for the download
    # root, so the check above and the download always refer to the same place.
    with wandb.init() as run:
        artifact = run.use_artifact(
            'aaac/aaac_model_runs/aaac_raw:%s' % VERSION,
            type='dataset'
        )
        artifact.download(root=path)
# Fetch the artifact (skipped when FILE_PATH already exists), then point at the corpus file inside it.
grab_raw_data(FILE_PATH) 
DATA_JSON=os.path.join(FILE_PATH,"aaac.jsonl")

This grabs the raw `aaac` corpus (the version set in `VERSION`, currently `v3`) from wandb and places it into a local wandb cache directory (`~/.wandb_cache`). It requires your `WANDB_API_KEY` to be available in the environment, which can be set with `export WANDB_API_KEY=....`

In [4]:
# Parse the JSON-lines corpus: one JSON object per non-empty line.
LINES = []
# JSON text is UTF-8; be explicit so the result does not depend on the locale.
with open(DATA_JSON, encoding="utf-8") as my_data:
    for line in my_data:
        line = line.strip()
        # Tolerate blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise make json.loads raise on an empty string.
        if not line:
            continue
        LINES.append(json.loads(line))

In [5]:
# Sanity check: number of records loaded from the corpus file.
len(LINES)

10000

In [6]:
# Peek at the reason statements of the second record.
LINES[1]["reason_statements"]

[{'text': 'Every ingredient of Shower Gel is an ingredient of Pure Elegance or an ingredient of EYE COLOUR DUO EC26',
  'starts_at': 0,
  'ref_reco': 8},
 {'text': 'to be an ingredient of Vitamin E Body Wash or an ingredient of EYEBROW PENCIL EB02 is necessary for being an ingredient of Shower Gel',
  'starts_at': 257,
  'ref_reco': 6},
 {'text': 'each thing that is not an ingredient of Matifying Veil is neither an ingredient of Vitamin E Body Wash nor an ingredient of EYEBROW PENCIL EB02',
  'starts_at': 396,
  'ref_reco': 4},
 {'text': 'not being an ingredient of Tinker Bell Tattoo is sufficient for not being an ingredient of Matifying Veil',
  'starts_at': 545,
  'ref_reco': 1},
 {'text': 'being an ingredient of Tinker Bell Tattoo is sufficient for not being an ingredient of EYE COLOUR DUO EC26',
  'starts_at': 656,
  'ref_reco': 2},
 {'text': 'each thing that is not an ingredient of Matifying Veil is neither an ingredient of Vitamin E Body Wash nor an ingredient of EYEBROW PENCIL E

In [7]:
# List the fields available on the first record.
list(LINES[0].keys())

['argument_source',
 'argdown_reconstruction',
 'reason_statements',
 'conclusion_statements',
 'explicit_premises',
 'explicit_premises_formalized',
 'implicit_premises',
 'implicit_premises_formalized',
 'conclusion',
 'conclusion_formalized',
 'intermediary_conclusions_formalized',
 'intermediary_conclusions',
 'id',
 'predicate_placeholders',
 'entity_placeholders',
 'steps',
 'n_premises',
 'base_scheme_groups',
 'scheme_variants',
 'domain_id',
 'domain_type',
 'plcd_subs',
 'argdown_index_map',
 'presentation_parameters']

In [10]:
# how reason and conclusion statements are rendered for the model
def format_statements_list(statements: list) -> str:
    """Flatten a list of statement dicts into a single string.

    Each statement is rendered as "<text> {ref: (<ref_reco>)}" and the rendered
    statements are joined with " | ". An empty list yields the string "None".
    """
    # guard clause: nothing to render
    if len(statements) == 0:
        return "None"

    rendered = []
    for entry in statements:
        rendered.append("%s {ref: (%s)}" % (entry['text'], entry['ref_reco']))
    return " | ".join(rendered)

In [13]:
# Example: render the conclusion statements of the second record.
format_statements_list(LINES[1]["conclusion_statements"])

'being an ingredient of Pure Elegance is necessary for being an ingredient of Shower Gel {ref: (9)}'

In [14]:
# Modes, expressed through record keys: each one maps a list of input fields
# ('from') to a single target field ('to').
_MODE_SPECS = (
    (('argument_source',), 'argdown_reconstruction'),
    (('argument_source', 'reason_statements'), 'argdown_reconstruction'),
    (('argument_source', 'conclusion_statements'), 'argdown_reconstruction'),
    (('reason_statements', 'conclusion_statements'), 'argdown_reconstruction'),
    (('argument_source',), 'reason_statements'),
    (('argument_source', 'argdown_reconstruction'), 'reason_statements'),
    (('argument_source', 'conclusion_statements'), 'reason_statements'),
    (('argument_source',), 'conclusion_statements'),
    (('argument_source', 'argdown_reconstruction'), 'conclusion_statements'),
    (('argument_source', 'reason_statements'), 'conclusion_statements'),
)
modes = [{'from': list(sources), 'to': target} for sources, target in _MODE_SPECS]
len(modes)

10

In [16]:
# Random split: 70% train, 15% dev, the remainder (about 15%) test.
train_amount = int(len(LINES)*0.7)
eval_amount  = int(len(LINES)*0.15)
# Shuffle a copy with a dedicated, seeded generator so that
#  * LINES keeps its original order (earlier cells index into it), and
#  * re-running this cell always yields the same split, independent of how much
#    of the global `random` state other code has consumed in the meantime.
shuffled_lines = list(LINES)
random.Random(42).shuffle(shuffled_lines)
train_instances = shuffled_lines[:train_amount]
dev_instances   = shuffled_lines[train_amount:train_amount+eval_amount]
test_instances  = shuffled_lines[train_amount+eval_amount:]
# every record must land in exactly one split
assert len(train_instances) + len(dev_instances) + len(test_instances) == len(LINES)

I just took a random 70/15/15 train/dev/test split to start with. I am not sure how much sense a purely random split makes given the structure of the corpus.

In [18]:
##open wandb again
# Builds one jsonl file per split (every instance rendered under every mode in
# `modes`) and uploads the resulting directory as a single wandb dataset artifact.

### arbitrary limitation on input and output size for now (in whitespace tokens)
## transformer is limited here
MAX_WORDS = 280
STATEMENT_KEYS = ("reason_statements", "conclusion_statements")


def _render_field(instance, key):
    """Return `instance[key]`, flattening statement lists with `format_statements_list`."""
    value = instance[key]
    if key in STATEMENT_KEYS:
        value = format_statements_list(value)
    return value


def _build_item(instance, mode, item_id):
    """Build the json record for `instance` under `mode`.

    Returns None when the input or the output has MAX_WORDS or more
    whitespace-separated tokens, in which case the item is skipped.
    """
    # construct input: " <key>: <value>" per source field, then the target key as prompt
    question = ""
    for key_from in mode['from']:
        question = question + " %s: %s" % (key_from, _render_field(instance, key_from))
    question = (question + " " + mode['to'] + ":").strip()
    if len(question.split()) >= MAX_WORDS:
        return None

    # construct output
    output = _render_field(instance, mode['to']).strip()
    if len(output.split()) >= MAX_WORDS:
        return None

    # put input and output together
    ### json line format and schema for Kyle's model
    return {
        "id": item_id,
        "question": {"stem": question},  #<-- input field
        "output": output,                ##<-- left in newlines, tokenizer will ignore them
        "prefix": "gen:",                ##<-- model specific field, indicates the model mode
    }


# The run is the *inner* context manager: it is finished before the temporary
# directory is removed, and it is finished even when an error occurs.
with tempfile.TemporaryDirectory() as tempdir, \
        wandb.init(entity="aaac",project="dataset_versions",name="dataset_upload") as run:
    for sname,split in [
        ("train",train_instances),
        ("dev",dev_instances),
        ("test",test_instances)
    ]:
        ### outputfile
        file_out = os.path.join(tempdir,sname+".jsonl")
        # `with` guarantees the file is flushed and closed before it is uploaded
        with open(file_out,'w') as write_file:
            for k,instance in enumerate(split):
                ### iterate over all modes
                for mode in modes:
                    mname = "+".join(mode['from']) +'>'+mode['to']
                    new_item = _build_item(instance, mode, "%s_%d_%s" % (sname,k,mname))
                    if new_item is None:
                        continue
                    write_file.write(json.dumps(new_item))
                    write_file.write("\n")

    ### write to wandb
    artifact = wandb.Artifact("aaac_multi_angle",type='dataset')
    artifact.add_dir(tempdir)
    run.log_artifact(artifact)

[34m[1mwandb[0m: wandb version 0.10.29 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[34m[1mwandb[0m: Adding directory to artifact (/var/folders/ql/h_s52yl51x70ynttgg820gz80000gp/T/tmp1cjnzzaa)... Done. 0.2s


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…