In [18]:
import sys; sys.path.append('../../')
import wandb
import pathlib
import os
import re
import ast
import json
import numpy as np
import random
import tempfile
from itertools import chain
from collections import defaultdict
random.seed(42)

In [19]:
# Local directory where downloaded wandb artifacts are cached.
# `pathlib.Path` is used instead of `pathlib.PosixPath` so the notebook also runs on
# non-POSIX platforms; on POSIX systems the resulting string is the same.
WANDB_CACHE = str(pathlib.Path('~/.wandb_cache').expanduser())
VERSION     ="v3" ##<- update if you want to use a different version

In [28]:
FILE_PATH = os.path.join(WANDB_CACHE,"aaac_raw:%s" % VERSION)
def grab_raw_data(path):
    """Download the raw `aaac` artifact for `VERSION` from wandb into `path`.

    The artifact is fetched on every call: an earlier `os.path.isdir` guard was
    removed because it prevented updating / re-downloading the data.

    :param path: target directory passed to `artifact.download(root=...)`
    :returns: whatever `artifact.download` returns (NOTE(review): presumably the
        local directory holding the artifact files -- confirm against the wandb docs)
    """
    api = wandb.Api()
    artifact = api.artifact(
        'aaac/dataset_versions/aaac_raw:%s' % VERSION,
        type='raw_data'
    )
    # honour the `path` argument instead of silently using the global FILE_PATH
    return artifact.download(root=path)
grab_raw_data(FILE_PATH) 
DATA_JSON=os.path.join(FILE_PATH,"aaac.jsonl")

This grabs the raw `aaac` corpus (the version set in `VERSION` above, currently `v3`) from wandb and places it into the local wandb cache directory. It requires your `WANDB_API_KEY` to be available in the environment, which can be set with `export WANDB_API_KEY=....`

In [29]:
# Parse the downloaded JSON-lines file into a list with one parsed record per line.
LINES = []
with open(DATA_JSON) as my_data:
    for line in my_data:
        line = line.strip()
        # skip blank lines (e.g. a trailing newline at EOF); json.loads("") would raise
        if not line:
            continue
        LINES.append(json.loads(line))

In [30]:
len(LINES)

10000

In [31]:
LINES[1]["reason_statements"]

[{'text': 'to be a classmate of Elvis or a workmate of Solomon is necessary for being a son of Rich',
  'starts_at': 134,
  'ref_reco': 5},
 {'text': 'if someone is not a cousin of Johnny, then they are not a classmate of Elvis',
  'starts_at': 303,
  'ref_reco': 2},
 {'text': 'being an uncle of Ivan is sufficient for not being a classmate of Elvis',
  'starts_at': 501,
  'ref_reco': 3}]

In [32]:
list(LINES[0].keys())

['argument_source',
 'argdown_reconstruction',
 'reason_statements',
 'conclusion_statements',
 'premises',
 'premises_formalized',
 'conclusion',
 'conclusion_formalized',
 'intermediary_conclusions_formalized',
 'intermediary_conclusions',
 'distractors',
 'id',
 'predicate_placeholders',
 'entity_placeholders',
 'steps',
 'n_premises',
 'n_distractors',
 'base_scheme_groups',
 'scheme_variants',
 'domain_id',
 'domain_type',
 'plcd_subs',
 'argdown_index_map',
 'presentation_parameters']

In [33]:
# defines how to present reason and conclusion statements to the model
def format_statements_list(statements: list) -> str:
    """Render reason / conclusion statements as a single line of text.

    Each statement is a dict with (at least) the keys 'text' and 'ref_reco'.
    The statements are rendered as "<text> (ref: (<ref_reco>))" and joined
    with " | ". An empty list is rendered as the string "None".
    """
    if len(statements) == 0:
        return "None"
    rendered = []
    for entry in statements:
        rendered.append("%s (ref: (%s))" % (entry['text'], entry['ref_reco']))
    return " | ".join(rendered)

In [35]:
format_statements_list(LINES[1]["conclusion_statements"])

'A son of Rich works with Solomon (ref: (6)) | being a son of Rich is sufficient for not being a classmate of Elvis (ref: (4))'

In [36]:
print(LINES[0]["argdown_reconstruction"])

(1) If, and only if, someone is allergic to cheese, then they are allergic to joghurt.
--
with generalized biconditional elimination {uses: [1]}
--
(2) If someone is allergic to cheese, then they are allergic to joghurt.
(3) If someone is a sufferer of allergy to cheese, then they are a sufferer of allergy to egg or a sufferer of allergy to ginger.
(4) If someone is allergic to egg, then they are allergic to cinnamon.
(5) If someone is allergic to ginger, then they are allergic to cinnamon.
--
with generalized dilemma {uses: [3,4,5]}
--
(6) If someone is allergic to cheese, then they are allergic to cinnamon.
--
with generalized adjunction {uses: [2,6]}
--
(7) If someone is a sufferer of allergy to cheese, then they are a sufferer of allergy to joghurt and a sufferer of allergy to cinnamon.


In [39]:
# defines how to present argdown-snippet to the model
def format_argdown(argdown: str) -> str:
    """Replace inline `{... uses: [...]}` metadata blocks with plain text.

    Every block matched by the pattern is parsed as a Python dict literal
    (after quoting the `uses` and `variant` keys) and substituted by
    "from (i) (j) ...", preceded by "(<variants>) " when a `variant` entry
    is present. Text without such blocks is returned unchanged.
    """
    # matches yaml metadata inline blocks in inference patterns
    pattern = r"({.*uses: \[[\s\d,]*\]})"
    # the matches are collected once, on the unmodified input
    for yaml_block in re.findall(pattern, argdown):
        # quote the bare keys so the block becomes a valid dict literal
        literal = yaml_block.replace('uses:', '"uses":').replace('variant:', '"variant":')
        meta = ast.literal_eval(literal)
        prefix = ("(%s) " % ", ".join(meta['variant'])) if "variant" in meta else ""
        premises = " ".join("(%d)" % ref for ref in meta['uses'])
        argdown = argdown.replace(yaml_block, prefix + "from " + premises)
    return argdown

In [43]:
print(format_argdown(LINES[1]["argdown_reconstruction"]))

(1) If someone is a son of Rich, then they are not a cousin of Johnny or an uncle of Ivan.
(2) If someone is not a cousin of Johnny, then they are not a classmate of Elvis.
(3) If someone is an uncle of Ivan, then they are not a classmate of Elvis.
--
with generalized dilemma (negation variant) from (1) (2) (3)
--
(4) If someone is a son of Rich, then they are not a classmate of Elvis.
(5) If someone is a son of Rich, then they are a classmate of Elvis or a workmate of Solomon.
--
with generalized disjunctive syllogism from (4) (5)
--
(6) If someone is a son of Rich, then they work with Solomon.


In [44]:
# Define the generation modes in terms of instance keys.
# Each mode maps a list of input fields ('from') to a single target field ('to');
# one example per mode is generated for every instance further below.
modes = [
    # target: argdown reconstruction
    {'from':['argument_source'],'to':'argdown_reconstruction'},
    {'from':['argument_source','reason_statements'],'to':'argdown_reconstruction'},
    {'from':['argument_source','conclusion_statements'],'to':'argdown_reconstruction'},
    {'from':['reason_statements','conclusion_statements'],'to':'argdown_reconstruction'},
    {'from':['argument_source','reason_statements','conclusion_statements'],'to':'argdown_reconstruction'},
    # target: reason statements
    {'from':['argument_source'],'to':'reason_statements'},
    {'from':['argument_source','argdown_reconstruction'],'to':'reason_statements'},
    {'from':['argument_source','conclusion_statements'],'to':'reason_statements'},
    # target: conclusion statements
    {'from':['argument_source'],'to':'conclusion_statements'},
    {'from':['argument_source','argdown_reconstruction'],'to':'conclusion_statements'},
    {'from':['argument_source','reason_statements'],'to':'conclusion_statements'},
]
len(modes)

11

In [45]:
# Split sizes: 70% train, 15% dev, the remainder (~15%) test.
train_amount = int(len(LINES)*0.7)
eval_amount  = int(len(LINES)*0.15)
# Shuffle a copy with a dedicated, seeded RNG instead of mutating LINES in place:
# re-running this cell (or earlier cells that index into LINES) then always gives
# the same result, independent of the state of the global `random` module.
shuffled_lines = list(LINES)
random.Random(42).shuffle(shuffled_lines)
train_instances = shuffled_lines[:train_amount]
dev_instances   = shuffled_lines[train_amount:train_amount+eval_amount]
test_instances  = shuffled_lines[train_amount+eval_amount:]
# upper bound, in whitespace-separated words, applied to both model input and output
max_words = 750

As a starting point, this is a purely random 70/15/15 train/dev/test split. It is not clear yet whether a random split is appropriate for this corpus.

In [46]:
##open wandb again
def _render_field(instance, key):
    """Return the text presented to the model for field `key` of `instance`.

    Statement lists and argdown snippets are passed through their formatter;
    every other field is returned as stored.
    """
    value = instance[key]
    if key in ["reason_statements","conclusion_statements"]:
        return format_statements_list(value)
    if key in ["argdown_reconstruction"]:
        return format_argdown(value)
    return value


run = wandb.init(entity="aaac",project="dataset_versions",name="dataset_upload")

with tempfile.TemporaryDirectory() as tempdir:
    for sname,split in [
        ("train",train_instances),
        ("dev",dev_instances),
        ("test",test_instances)
    ]:
        ### outputfile (one jsonl file per split)
        file_out = os.path.join(tempdir,sname+".jsonl")
        # `with` guarantees the file is closed even if formatting an instance fails
        with open(file_out,'w') as write_file:
            for k,instance in enumerate(split):

                ### iterate over all modes
                for mode in modes:
                    mname = "+".join(mode['from']) +'>'+mode['to']

                    # construct input: "<key>: <value>" pairs separated by single spaces
                    question = " ".join(
                        "%s: %s" % (key_from, _render_field(instance, key_from))
                        for key_from in mode['from']
                    ).strip()

                    # construct output
                    output = _render_field(instance, mode['to']).strip()

                    ### arbitrary limitation on input size for now
                    ## transformer is limited here
                    if len(question.split()) <= max_words and len(output.split()) <= max_words:
                        # put input and output together
                        ### json line format and schema for Kyle's model
                        new_item = {
                            "id": "%s_%d_%s" % (sname,k,mname),
                            "question": {"stem": question}, #<-- input field
                            "output": output, ##<-- left in newlines, tokenizer will ignore them
                            ##<-- model specific field, indicates the model mode | "answer:" -> using the HACK by Dennis
                            "prefix": mode['to'] +":",
                        }
                        write_file.write(json.dumps(new_item))
                        write_file.write("\n")

    ### write to wandb
    # NOTE(review): the raw data is downloaded from 'aaac/dataset_versions/aaac_raw',
    # but "source" below names the project 'aaac_model_runs' -- confirm which is intended.
    artifact = wandb.Artifact("aaac_multi_angle",type='dataset',metadata={
        "source": ('aaac/aaac_model_runs/aaac_raw:%s' % VERSION),
        "max_length": max_words,
        "modes": str(modes)
    })
    artifact.add_dir(tempdir)
    run.log_artifact(artifact)
    # finish the run while the temporary directory still exists
    # (presumably so the upload completes before the files are deleted -- TODO confirm)
    run.finish()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33maaac[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


[34m[1mwandb[0m: Adding directory to artifact (/var/folders/ql/h_s52yl51x70ynttgg820gz80000gp/T/tmpt1ed8e86)... Done. 0.3s


VBox(children=(Label(value=' 0.00MB of 179.98MB uploaded (0.00MB deduped)\r'), FloatProgress(value=5.855139276…