In [1]:
import json, random, re, collections, itertools, base64, hashlib
from datetime import datetime, timedelta
from pathlib import Path
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from operator import itemgetter as at
from IPython.core.display import display, HTML
from ipywidgets import interact
display_html = lambda x: display(HTML(x))
annot_path = Path("../annotations")
data_path = Path("../data")
preprocessed_path = Path("../preprocessed")

In [2]:
!rm ../preprocessed/*

zsh:1: no matches found: ../preprocessed/*


# Read meta data

In [3]:
ingredients = []
# Read ingredients map
with (data_path/ "ingredients_map.json").open('r') as f:
    ingredients.extend([(k,v) for v,k in json.load(f).items()])
# Read implicit ingredients
with (data_path/ "implicit_ingredients.json").open('r') as f:
    ingredients.extend(json.load(f).items())
# Read tools
with (data_path/ "tools.json").open('r') as f:
    ingredients.extend(json.load(f).items())
# Read time_lengths
IMMEDIATE = "LIMMEDIATE"
ingredients.append((IMMEDIATE, "Ends Immediately"))
with (data_path/ "time_lengths.json").open('r') as f:
    ingredients.extend(json.load(f).items())    
ingredients [-10:]

[('TSKILLET', 'Skillet'),
 ('TSTIR', 'Stirring spoon'),
 ('TSPATULA', 'Spatula'),
 ('TWOK', 'Wok'),
 ('TOTHER', 'Other'),
 ('LIMMEDIATE', 'Ends Immediately'),
 ('LTIME', 'For X minutes'),
 ('LCOLOR', 'Until color change'),
 ('LTEXTURE', 'Until texture change'),
 ('LTEMPTRATURE', 'Until cool/boil')]

In [4]:
with (data_path/ "resources.json").open('r') as f:
    resources=json.load(f)
    resources = [(res["id"], res_category["name"] + "/" + res["name"]) for res_category in resources for res in res_category["children"]]
resources

[('A1', 'Idle/Unused'),
 ('A2', 'Idle/Set-Aside'),
 ('A0', 'Idle/Discarded'),
 ('A5', 'Idle/Serve'),
 ('W1', 'Countertop/Combine/Mix'),
 ('W2', 'Countertop/Roll/Fold'),
 ('W3', 'Countertop/Massage/Rub/Knead'),
 ('W4', 'Countertop/Peel/Squeeze'),
 ('W5', 'Countertop/Coat/Sprinkle'),
 ('C1', 'Countertop/Cut/chop'),
 ('GW1', 'Stove/Low'),
 ('GW2', 'Stove/Medium-Low'),
 ('GW3', 'Stove/Medium'),
 ('GW4', 'Stove/Medium-High'),
 ('GW5', 'Stove/High'),
 ('BL1', 'Oven/Low'),
 ('BL2', 'Oven/Medium'),
 ('BL3', 'Oven/High'),
 ('R1', 'Regrigerator/Marinade'),
 ('R2', 'Regrigerator/Chill'),
 ('R3', 'Regrigerator/Freeze'),
 ('S1', 'Sink/Wash'),
 ('S2', 'Sink/Drain'),
 ('O1', 'Other/Grind'),
 ('O2', 'Other/Blend')]

In [5]:
print (f"# of resources: {len(resources)}")
print (f"# of ingredients: {len(ingredients)}")
print (f"Vector size {len(ingredients)*len(resources)}")

# of resources: 25
# of ingredients: 755
Vector size 18875


In [6]:
idx2label=list(itertools.product(map(at(0), resources), map(at(0), ingredients)))
label2idx={r:i for i,r in enumerate(idx2label)}

## Save mappings

In [7]:
with (preprocessed_path/"resources.json").open('w') as f:
    json.dump(resources, f)
with (preprocessed_path/"ingredients.json").open('w') as f:
    json.dump(ingredients, f)
with (preprocessed_path/"labels.json").open('w') as f:
    json.dump(idx2label, f)

In [8]:
def ing2type(ing_id):
    return {
        "I": "Ingredient",
        "M": "Unlisted ingredient",
        "L": "Time duration",
        "T": "Tool",
        
    }[ing_id[0]]

# Read annotations

In [9]:
def handle_instruction_label(lst):
    events = list(map(at("start", "end", "action", "resource") ,lst))
    ret = collections.defaultdict(list)
    for start,end,action, resource in events:
        start = (datetime.strptime(start, "%Y-%m-%dT00:00:00") - datetime(2020,1,1)).days
        end = (datetime.strptime(end, "%Y-%m-%dT00:00:00") - datetime(2020,1,1)).days
        for i in range(start, end):
            ret[i].append((resource, action))
    # Add "LIMMEDIATE" if no time duration specified
    for lst in ret.values():
        has_time_duration_map = collections.defaultdict(bool)
        for res, action in lst:
            has_time_duration_map[res]|=ing2type(action)=="Time duration"
        for res, has_time_duration in has_time_duration_map.items():
            if not has_time_duration:
                lst.append((res, IMMEDIATE))
            lst.sort()
        
    return dict(ret)


annotations = dict()
instructions = dict()
for p in annot_path.iterdir():
    annotaion_id = p.name.split('.',1)[0]
    with p.open('r') as f:
        annotation = json.load(f)
    if int(annotation["status"])<=0:
        continue
    instructions[annotaion_id] = annotation["instructions"]
    annotations[annotaion_id]=list(map(handle_instruction_label,annotation["labels"]))
with (data_path/"annotaions.json").open('w') as f:
    json.dump(annotations, f)
annotations

{'103308': [{0: [('A1', 'I10_d4pRP'),
    ('A1', 'I7oUHHY41'),
    ('A1', 'IN0CxTNVh'),
    ('A1', 'INNbMITPe'),
    ('A1', 'IW2FkWnJk'),
    ('A1', 'Itg64O_Uz'),
    ('A1', 'IukXkN8DV'),
    ('A1', 'Iynxt1P5R'),
    ('A1', 'LIMMEDIATE')]},
  {1: [('A1', 'IN0CxTNVh'),
    ('A1', 'INNbMITPe'),
    ('A1', 'IW2FkWnJk'),
    ('A1', 'Itg64O_Uz'),
    ('A1', 'IukXkN8DV'),
    ('A1', 'Iynxt1P5R'),
    ('A1', 'LIMMEDIATE'),
    ('GW4', 'I10_d4pRP'),
    ('GW4', 'I7oUHHY41'),
    ('GW4', 'LIMMEDIATE'),
    ('GW4', 'TSKILLET')],
   0: [('A1', 'I10_d4pRP'),
    ('A1', 'I7oUHHY41'),
    ('A1', 'IN0CxTNVh'),
    ('A1', 'INNbMITPe'),
    ('A1', 'IW2FkWnJk'),
    ('A1', 'Itg64O_Uz'),
    ('A1', 'IukXkN8DV'),
    ('A1', 'Iynxt1P5R'),
    ('A1', 'LIMMEDIATE')]},
  {1: [('A1', 'IN0CxTNVh'),
    ('A1', 'INNbMITPe'),
    ('A1', 'IW2FkWnJk'),
    ('A1', 'Itg64O_Uz'),
    ('A1', 'LIMMEDIATE'),
    ('GW4', 'I10_d4pRP'),
    ('GW4', 'I7oUHHY41'),
    ('GW4', 'LIMMEDIATE'),
    ('GW4', 'TSKILLET'),
    ('W1', 

# Vectorize annotations

In [10]:
def vectorize_instruction_annotation(instruction_annotation):
    ret = np.zeros((len(instruction_annotation), len(idx2label)))
    for i,tuples in instruction_annotation.items():
        ret[i, list(map(label2idx.get,tuples))]=1
    return ret.astype(bool)

vectorize_instruction_annotation({0: [('S1', 'Ieg3R-oQ_'), ('S1', 'LIMMEDIATE')], 1: [('S2', 'Ieg3R-oQ_'), ('S2', 'LIMMEDIATE')]})

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [11]:
indices_of_noningredients = [label2idx[x] for x in itertools.product(map(at(0), resources), [i for i,d in ingredients if ing2type(i) != "Ingredient"])]
for recipe_id, instruction_annotations in tqdm(annotations.items(), total=len(annotations)):
    with (annot_path/ f"{recipe_id}.json").open('r') as r:
        annotation = json.load(r)
    annotation_data = [vectorize_instruction_annotation(instruction_annotation) for instruction_annotation in instruction_annotations]
    annotation_data = {"{r}_{i}".format(r=recipe_id, i=i):v  for i, v in enumerate(annotation_data)}
    annotation_starts = [0] + [len(a) for a in annotation_data.values()]
    annotation_timeranges = list(zip(annotation_starts[:-1], annotation_starts[1:]))

    instructions = [{"text": instruction, "start": annotation_timeranges[0], "end": annotation_timeranges[1]}
     for instruction,annotation_timeranges in zip(annotation["instructions"], annotation_timeranges)]
    indices_of_ingredients = [label2idx[x] for x in itertools.product(map(at(0), resources), annotation["normalized_ingredients"])]
    meta_data = {"id":recipe_id, "instructions": instructions, "indices_of_interest": indices_of_ingredients+indices_of_noningredients}
    with (preprocessed_path/f"{recipe_id}.json").open('w') as f:
        json.dump(meta_data,f, indent=4)
    np.savez_compressed(str(preprocessed_path/ "{r}.npz".format(r=recipe_id) ), **annotation_data)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


