In [1]:
import sys; sys.path.append('../../')
import wandb
import pathlib
import os
import re
import ast
import json
import numpy as np
import random
import tempfile
from itertools import chain
from collections import defaultdict
random.seed(42)

In [2]:
# Local directory used as a download cache for wandb artifacts.
# pathlib.Path (rather than PosixPath) picks the right flavour for the host OS,
# so the notebook is not limited to POSIX systems.
WANDB_CACHE = str(pathlib.Path('~/.wandb_cache').expanduser())
# Version tag of the `aaac_raw` artifact to use.
VERSION     ="v2" ##<- update if you want to use a different version

In [3]:
# Cache sub-directory for the raw artifact, named after the artifact and its version.
FILE_PATH = os.path.join(WANDB_CACHE, "aaac_raw:" + VERSION)
def grab_raw_data(path):
    """Download the raw `aaac` artifact into `path` unless it is already there.

    The artifact `aaac/dataset_versions/aaac_raw:<VERSION>` is fetched through
    the wandb public API (`wandb.Api()`), so no wandb run is started.

    Args:
        path: target directory for the download. If this directory already
            exists, the function does nothing. Its contents are not checked.

    Returns:
        None
    """
    # Honour the `path` argument instead of silently using the global FILE_PATH.
    if os.path.isdir(path):
        return
    api = wandb.Api()
    artifact = api.artifact(
        'aaac/dataset_versions/aaac_raw:%s' % VERSION,
        type='raw_data'
    )
    artifact.download(root=path)
# Make sure the raw artifact is in the local cache, then point at its JSONL file.
grab_raw_data(FILE_PATH)
DATA_JSON = os.path.join(FILE_PATH, "aaac.jsonl")

This grabs the raw `aaac` corpus (the version set in `VERSION` above, currently `v2`) from wandb and places it into a local wandb cache. It requires that your `WANDB_API_KEY` is available in the environment, which can be set with `export WANDB_API_KEY=....`

In [4]:
# Parse the JSONL corpus into a list with one decoded JSON object per line.
# The encoding is given explicitly so the result does not depend on the
# platform default, and blank lines (e.g. a trailing newline at the end of
# the file) are skipped instead of crashing json.loads.
with open(DATA_JSON, encoding="utf-8") as my_data:
    LINES = [json.loads(line) for line in my_data if line.strip()]

In [5]:
len(LINES)

10000

In [6]:
LINES[1]["reason_statements"]

[{'text': 'Lisa occasionally purchases Camay soap or Lisa regularly consumes Bentley Organic soap',
  'starts_at': 160,
  'ref_reco': 1},
 {'text': 'if Lisa occasionally purchases Camay soap, then Lisa occasionally purchases Infusium shampoo',
  'starts_at': 252,
  'ref_reco': 2},
 {'text': 'if Lisa regularly consumes Bentley Organic soap, then Lisa occasionally purchases Infusium shampoo',
  'starts_at': 494,
  'ref_reco': 3},
 {'text': 'Lisa occasionally purchases Protex soap',
  'starts_at': 607,
  'ref_reco': 5}]

In [7]:
list(LINES[0].keys())

['argument_source',
 'argdown_reconstruction',
 'reason_statements',
 'conclusion_statements',
 'premises',
 'premises_formalized',
 'conclusion',
 'conclusion_formalized',
 'intermediary_conclusions_formalized',
 'intermediary_conclusions',
 'distractors',
 'id',
 'predicate_placeholders',
 'entity_placeholders',
 'steps',
 'n_premises',
 'n_distractors',
 'base_scheme_groups',
 'scheme_variants',
 'domain_id',
 'domain_type',
 'plcd_subs',
 'argdown_index_map',
 'presentation_parameters']

In [15]:
# defines how to present reason and conclusion statements to the model
def format_statements_list(statements: list) -> str:
    """Render a list of statement dicts as a single ' | '-separated string.

    Each statement contributes "<text> (ref: (<ref_reco>))". An empty list is
    rendered as the literal string "None".
    """
    if len(statements) == 0:
        return "None"
    return " | ".join(
        "%s (ref: (%s))" % (entry['text'], entry['ref_reco'])
        for entry in statements
    )

In [16]:
format_statements_list(LINES[1]["conclusion_statements"])

'Lisa occasionally purchases Infusium shampoo and Lisa occasionally purchases Protex soap (ref: (6)) | Lisa occasionally purchases Infusium shampoo (ref: (4))'

In [17]:
# defines how to present argdown-snippet to the model
def format_argdown(argdown: str) -> str:
    """Replace inline metadata blocks in an argdown snippet with plain text.

    Every `{... uses: [...]}` block is rewritten as
    "(<variant, ...>) from (<i>) (<j>) ..."; the "(<variant, ...>) " part is
    only present when the block has a `variant` key.
    """
    # matches yaml metadata inline blocks in inference patterns
    pattern = r"({.*uses: \[[\s\d,]*\]})"
    for yaml_block in re.findall(pattern, argdown):
        # Quote the two known keys so the block parses as a Python dict literal.
        as_literal = yaml_block.replace('uses:','"uses":').replace('variant:','"variant":')
        meta = ast.literal_eval(as_literal)
        variant_part = "(%s) " % ", ".join(meta['variant']) if "variant" in meta else ""
        used_part = " ".join("(%d)" % idx for idx in meta['uses'])
        argdown = argdown.replace(yaml_block, variant_part + "from " + used_part)
    return argdown

In [18]:
print(format_argdown(LINES[0]["argdown_reconstruction"]))

(1) If something is not a street longer than Jefferson Street, then it is not a street crossing State Street.
--
with instantiation (transposition) from (1)
--
(2) If River Street is a street crossing State Street, then River Street is a street longer than Jefferson Street.
(3) If River Street is a street longer than Jefferson Street, then River Street is a street crossing Summit Avenue.
--
with chain rule from (2) (3)
--
(4) If River Street is a street crossing State Street, then River Street is a street crossing Summit Avenue.
(5) River Street is a street crossing State Street or River Street is a street longer than Front Street.
(6) If River Street is a street longer than Front Street, then River Street is a street crossing Summit Avenue.
--
with case analysis from (4) (5) (6)
--
(7) River Street is a street crossing Summit Avenue.


In [19]:
# define modes in terms of keys
# Each mode maps the instance fields given to the model ('from') to the single
# field it has to generate ('to').
modes = [
    {'from': list(sources), 'to': target}
    for sources, target in [
        # target: argdown_reconstruction
        (('argument_source',), 'argdown_reconstruction'),
        (('argument_source', 'reason_statements'), 'argdown_reconstruction'),
        (('argument_source', 'conclusion_statements'), 'argdown_reconstruction'),
        (('reason_statements', 'conclusion_statements'), 'argdown_reconstruction'),
        (('argument_source', 'reason_statements', 'conclusion_statements'), 'argdown_reconstruction'),
        # target: reason_statements
        (('argument_source',), 'reason_statements'),
        (('argument_source', 'argdown_reconstruction'), 'reason_statements'),
        (('argument_source', 'conclusion_statements'), 'reason_statements'),
        # target: conclusion_statements
        (('argument_source',), 'conclusion_statements'),
        (('argument_source', 'argdown_reconstruction'), 'conclusion_statements'),
        (('argument_source', 'reason_statements'), 'conclusion_statements'),
    ]
]
len(modes)

11

In [20]:
# 70% train / 15% dev / remaining ~15% test.
train_amount = int(len(LINES)*0.7)
eval_amount  = int(len(LINES)*0.15)

# Shuffle a copy with a dedicated, seeded RNG: LINES keeps its original order
# (so cells that index into it stay valid) and re-running this cell always
# produces the same split, independent of the global `random` state.
shuffled_lines = list(LINES)
random.Random(42).shuffle(shuffled_lines)

train_instances = shuffled_lines[:train_amount]
dev_instances   = shuffled_lines[train_amount:train_amount+eval_amount]
test_instances  = shuffled_lines[train_amount+eval_amount:]

# Upper bound, in whitespace-separated words, used to filter generated items.
max_words = 280

I just took a random 70/15/15 train/dev/test split to start with. I am not sure how much sense this makes given the corpus.

In [21]:
def _render_field(instance, key):
    """Return instance[key] formatted the way it is presented to the model.

    Statement lists go through format_statements_list, the argdown
    reconstruction through format_argdown; any other field is returned as is.
    """
    value = instance[key]
    if key in ("reason_statements", "conclusion_statements"):
        return format_statements_list(value)
    if key == "argdown_reconstruction":
        return format_argdown(value)
    return value


##open wandb again
run = wandb.init(entity="aaac",project="dataset_versions",name="dataset_upload")

with tempfile.TemporaryDirectory() as tempdir:
    for sname, split in [
        ("train", train_instances),
        ("dev", dev_instances),
        ("test", test_instances)
    ]:
        ### outputfile: one <split>.jsonl per split
        file_out = os.path.join(tempdir, sname + ".jsonl")
        # `with` guarantees the file is closed even if formatting an item fails
        with open(file_out, 'w') as write_file:
            for k, instance in enumerate(split):
                ### iterate over all modes
                for mode in modes:
                    mname = "+".join(mode['from']) + '>' + mode['to']

                    # construct input: "<key>: <value>" for every source field.
                    # The target key is not appended here; it goes into the
                    # separate "prefix" field below.
                    question = " ".join(
                        "%s: %s" % (key_from, _render_field(instance, key_from))
                        for key_from in mode['from']
                    ).strip()

                    # construct output
                    output = _render_field(instance, mode['to']).strip()

                    ### arbitrary limitation on input size for now
                    ## transformer is limited here
                    if len(question.split()) > max_words or len(output.split()) > max_words:
                        continue

                    # put input and output together
                    ### json line format and schema for Kyle's model
                    new_item = {
                        "id": "%s_%d_%s" % (sname, k, mname),
                        "question": {"stem": question},  #<-- input field
                        "output": output,  ##<-- left in newlines, tokenizer will ignore them
                        "prefix": mode['to'] + ":",  ##<-- model specific field, indicates the model mode | "answer:" -> using the HACK by Dennis
                    }
                    write_file.write(json.dumps(new_item))
                    write_file.write("\n")

    ### write to wandb
    artifact = wandb.Artifact("aaac_multi_angle",type='dataset',metadata={
        # same project path the raw artifact is downloaded from
        "source": ('aaac/dataset_versions/aaac_raw:%s' % VERSION),
        "max_length": max_words,
        "modes": str(modes)
    })
    artifact.add_dir(tempdir)
    run.log_artifact(artifact)
    # NOTE(review): finish() is kept inside the `with` block, presumably so the
    # upload completes before the temporary directory is removed -- confirm.
    run.finish()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33maaac[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.29 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[34m[1mwandb[0m: Adding directory to artifact (/var/folders/ql/h_s52yl51x70ynttgg820gz80000gp/T/tmp4uceqtpc)... Done. 0.2s


VBox(children=(Label(value=' 0.00MB of 124.81MB uploaded (0.00MB deduped)\r'), FloatProgress(value=8.458579657…