In [1]:
import collections, itertools, json, re, operator
from functools import reduce
from pathlib import Path
import pandas as pd

ls = lambda p, g="*.*": print("\n".join(map(str, p.glob(g))))

data_dir = Path("../data/scone")
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 100)

In [2]:
ls(data_dir)

../data/scone/scene-test.tsv
../data/scone/alchemy-test.tsv
../data/scone/alchemy-train.tsv
../data/scone/scene-dev.tsv
../data/scone/README.md
../data/scone/alchemy-train-orig.tsv
../data/scone/scene-train-orig.tsv
../data/scone/tangrams-test.tsv
../data/scone/scene-train.tsv
../data/scone/tangrams-train-orig.tsv
../data/scone/tangrams-train.tsv
../data/scone/alchemy-dev.tsv
../data/scone/tangrams-dev.tsv


In [3]:
parsed = []
with (data_dir / "alchemy-dev.tsv").open('r') as f:
    for line in f:
        idx, *parts = line.strip().split('\t')
        states       = [{int(v.split(':',1)[0]):v.split(':')[1].strip() for v in p.split(' ')} for i,p in enumerate(parts) if i%2==0]
        instructions = [p for i,p in enumerate(parts) if i%2==1]
        instructions.insert(0,"INIT")
        seq = 0
        for state, instruction in zip(states,instructions):
            d = state
            d["instruction"]=instruction
            d['id']=idx
            d['seq']=seq
            parsed.append(d)
            seq+=1
df = pd.DataFrame(parsed)
df.head(100)

Unnamed: 0,1,2,3,4,5,6,7,instruction,id,seq
0,_,g,p,o,g,r,y,INIT,dev-1830,0
1,_,g,p,_,g,r,y,throw out the orange chemical,dev-1830,1
2,_,_,p,_,g,r,yg,"then, add the leftmost beaker of green chemical to the yellow chemical",dev-1830,2
3,_,_,p,_,g,r,bb,mix it,dev-1830,3
4,_,_,p,_,_,r,bbg,"then, add the remaining green chemical to it",dev-1830,4
5,_,_,p,_,_,r,bbb,mix that too,dev-1830,5
6,y,_,y,y,p,g,ooo,INIT,dev-1831,0
7,y,_,y,y,p,_,ooo,throw out green beaker,dev-1831,1
8,y,_,y,y,_,_,ooo,throw out purple one,dev-1831,2
9,y,_,_,yy,_,_,ooo,pour third beaker into fourth one,dev-1831,3


In [4]:
lst = ['a','b']
lst.insert(0,'c')
lst

['c', 'a', 'b']

In [5]:
beakers = reduce(operator.or_, [set(df[i].unique()) for i in range(1,8)], set())
colors = list({c for b in beakers for c in b if '_' not in c})

In [6]:
len(beakers)

114

In [7]:
colors

['o', 'p', 'y', 'b', 'g', 'r']

In [8]:
desc = {"p": "Purple", "o": "Orange", "y": "Yellow", "b": "Blue", "g": "Green", "r":"Red"}

In [9]:
ingredients = {"I" + (c).upper() : desc[c] for c in colors}
tools = {"T" + (c + c).upper() : desc[c] for c in colors}
time_lengths = {"L" + (c + c + c).upper() : desc[c] for c in colors}
ingredients

{'IO': 'Orange',
 'IP': 'Purple',
 'IY': 'Yellow',
 'IB': 'Blue',
 'IG': 'Green',
 'IR': 'Red'}

In [10]:
tools

{'TOO': 'Orange',
 'TPP': 'Purple',
 'TYY': 'Yellow',
 'TBB': 'Blue',
 'TGG': 'Green',
 'TRR': 'Red'}

In [12]:
time_lengths

{'LOOO': 'Orange',
 'LPPP': 'Purple',
 'LYYY': 'Yellow',
 'LBBB': 'Blue',
 'LGGG': 'Green',
 'LRRR': 'Red'}