In [1]:
import json
import os

import numpy as np
import pandas

In [2]:
# Location of the recipe-mpr 500QA JSON file. The machine-specific absolute
# path is kept as the default so existing runs behave exactly as before; set
# the RECIPE_MPR_500QA_PATH environment variable to load the file from
# somewhere else without editing the notebook.
base_data_path = os.environ.get(
    "RECIPE_MPR_500QA_PATH",
    "/Users/david/Documents/Research2324/Sanner/data/500QA.json",
)

In [3]:
# Load in recipe-mpr 500QA
# The encoding is passed explicitly so that reading the file does not depend
# on the platform's default locale encoding.
with open(base_data_path, encoding="utf-8") as base_data_file:
    base_data = json.load(base_data_file)

## 1000 items - get all unique recipes first

In [6]:
# FS examples (presumably "few-shot" -- confirm). Any option whose description
# appears in this list is left out when all_items is built.
fs_exs = [
    "Salsa made with pineapple, mangoes, onions, black beans, corn, strawberries, jalapeno, and tomatoes",
    "Sweet chinese red bean bun",
    "Chocolate cupcakes with blue vanilla custard filling hidden inside the cupcake",
    "Thai curry with shrimp and coconut milk that can be dipped with bread",
    "Breakfast quiche with onions, swiss cheese, and bacon",
]

In [7]:
# Gather every distinct option description across all query sets, keeping
# first-seen order and leaving out anything listed in fs_exs.
excluded_items = set(fs_exs)
# dict.fromkeys de-duplicates while preserving insertion order, so each option
# costs a hash lookup instead of a scan over the list built so far.
all_items = list(dict.fromkeys(
    option
    for query_set in base_data
    for option in query_set['options'].values()
    if option not in excluded_items
))

In [8]:
# Number of distinct option descriptions collected in all_items (entries of
# fs_exs are not counted because they were skipped while building the list).
len(all_items)

1827

In [10]:
# Randomly select 1000 items
# Seeding the global NumPy RNG right before the draw makes the sample
# reproducible regardless of what ran earlier in the kernel.
np.random.seed(42)
n_items_1000 = 1000  # single source of truth for the sample size
# Positions into all_items, drawn without replacement. A dedicated name is
# used (rather than `keep_indices`) so that re-running this cell cannot
# overwrite the 100-item sample indices that the holdout split relies on.
keep_indices_1000 = list(np.random.choice(range(len(all_items)), size=(n_items_1000,), replace=False))
# Re-key the sampled descriptions 0..n-1 in draw order.
items_1000_dict = {
    item_id: {'description': all_items[item_idx]}
    for item_id, item_idx in enumerate(keep_indices_1000)
}
items_1000_dict

{0: {'description': 'Zucchini stir-fried with salad oil, water, salt, and sugar'},
 1: {'description': 'Chinese style ribs made from sparerib racks, fresh ginger, and honey'},
 2: {'description': 'Regular dinner rolls with honey'},
 3: {'description': 'Roasted almonds'},
 4: {'description': 'A Mexican take of traditional Filipino Adobo chicken with noodles'},
 5: {'description': 'Caramel apple jam made with granny smith and braeburn apples'},
 6: {'description': 'Swedish meatballs - beef and pork meatballs with savoury mushroom gravy'},
 7: {'description': 'Indian vegan tikka masala curry that tastes like chicken'},
 8: {'description': 'Iced strawberry smoothie with ginger, lime, and honey added'},
 9: {'description': 'Sauteed kale with onion, olive oil, and sea salt'},
 10: {'description': 'Dinner waffles with deli turkey'},
 11: {'description': 'Not Difficult At All: Oi-sobagi (Stuffed Cucumber Kimchi) recipe'},
 12: {'description': 'Quinoa cake with fruits and nuts that is diary-fre

In [11]:
# Save 1000 random items to file
save_file_1000_items = "data/recipe_1000_sample.json"
# Create the output directory if it is missing so a fresh checkout does not
# fail with FileNotFoundError.
os.makedirs(os.path.dirname(save_file_1000_items), exist_ok=True)
# Note: json.dump writes the integer ids as string keys ("0", "1", ...).
with open(save_file_1000_items, "w", encoding="utf-8") as save_file:
    json.dump(items_1000_dict, save_file, indent=4)

## 100 items

In [6]:
# Randomly select 100 items
# Each sampled entry of base_data is a query set; only the description of its
# answer option (options[answer]) is kept, keyed 0..99 in draw order.
np.random.seed(42)
keep_indices = list(np.random.choice(range(len(base_data)), size=(100,), replace=False))
sampled_queries = (base_data[query_idx] for query_idx in keep_indices)
items_100_dict = {
    item_id: {'description': query['options'][query['answer']]}
    for item_id, query in enumerate(sampled_queries)
}
items_100_dict

{0: {'description': 'Soup made with cabbage, carrots, potatoes, and onions'},
 1: {'description': '2-ingredient yogurt from scratch'},
 2: {'description': 'High protein skillet with rice and vegetables'},
 3: {'description': 'Chinese salad made of eggplant and zucchini, flavoured with soy sauce'},
 4: {'description': 'Large portion fries with gravy and bacon'},
 5: {'description': 'Beef stew made with beef, carrots and potatoes'},
 6: {'description': 'Grilled salmon seasoned with salt and pepper'},
 7: {'description': 'Poke of tofu, mushroom and tomatoes'},
 8: {'description': 'Queso made from melted cheese and chili peppers'},
 9: {'description': 'Low fat baked breakfast pie made with sausage (pork) and apples'},
 10: {'description': 'Chicken breasts stuffed with bacon and mozzarella'},
 11: {'description': 'Chopped French duck confit preserved by duck oil'},
 12: {'description': 'Spaghetti using year round egg catsup noodles, made with sauce consisting of ground beef meat, bacon (por

In [25]:
# Save 100 random items to file
save_file_100_items = "data/recipe_100_sample.json"
# Create the output directory if it is missing so a fresh checkout does not
# fail with FileNotFoundError.
os.makedirs(os.path.dirname(save_file_100_items), exist_ok=True)
# Note: json.dump writes the integer ids as string keys ("0", "1", ...).
with open(save_file_100_items, "w", encoding="utf-8") as save_file:
    json.dump(items_100_dict, save_file, indent=4)

### 25 item non-overlapping set

In [35]:
# Index of every query set in base_data.
full_idxs = list(range(len(base_data)))
# NOTE: keep_indices must still hold the 100-item draw when this cell runs.
# Converting to a set of plain ints gives constant-time membership tests and
# keeps the lookup independent of the NumPy integer type returned by
# np.random.choice.
sampled_idxs = {int(idx) for idx in keep_indices}
# Query sets that were not drawn for the 100-item sample, in ascending order.
holdout_idxs = [idx for idx in full_idxs if idx not in sampled_idxs]

In [36]:
# Random 25 item sample from holdouts
np.random.seed(42)
# The drawn values are positions within holdout_idxs (an index of an index),
# not positions within base_data; they must be mapped through holdout_idxs.
holdout_positions = range(len(holdout_idxs))
fs25_temp_indices = list(np.random.choice(holdout_positions, size=(25,), replace=False))

In [37]:
# Inspect the 25 sampled positions; each one indexes into holdout_idxs, not
# directly into base_data.
fs25_temp_indices

[209,
 280,
 33,
 210,
 93,
 84,
 329,
 94,
 266,
 126,
 9,
 361,
 56,
 72,
 132,
 42,
 278,
 376,
 231,
 385,
 77,
 15,
 391,
 271,
 0]

In [40]:
# Map each sampled holdout position back to its query set in base_data, then
# keep the description of that query's answer option, keyed 0..24 in draw order.
holdout_queries = (base_data[holdout_idxs[position]] for position in fs25_temp_indices)
fs25_items = {
    item_id: {'description': query['options'][query['answer']]}
    for item_id, query in enumerate(holdout_queries)
}

In [41]:
# Inspect the 25 holdout items (id -> {'description': ...}).
fs25_items

{0: {'description': 'Mini corndogs'},
 1: {'description': 'Flourless chocolate cake with butter'},
 2: {'description': 'Creamy broccoli dish with mayonnaise and cheddar cheese'},
 3: {'description': 'Chocolate cupcakes with blue vanilla custard filling hidden inside the cupcake'},
 4: {'description': 'Loaded potato salad made with mayonnaise, buttermilk, cheddar and crumbled bacon'},
 5: {'description': 'Chowder soup made from corn, bacon, cornstarch and heavy cream'},
 6: {'description': 'Salmon cooked in the oven, seasoned with garlic, onion, cumin, dill, and lime'},
 7: {'description': 'McDonalds-style fries made from russet potatoes'},
 8: {'description': 'Protein box made from eggs, fruit, and yogurt'},
 9: {'description': 'Sweet chinese red bean bun'},
 10: {'description': 'Garlic butter strip steaks cooked with onions'},
 11: {'description': 'Breakfast quiche with onions, swiss cheese, and bacon'},
 12: {'description': '3-ingredient pizza with mozzarella topping'},
 13: {'descri

In [42]:
# Save 25 random holdout items to file
save_file_fs_items = "data/FS_recipe_25_sample.json"
# Create the output directory if it is missing so a fresh checkout does not
# fail with FileNotFoundError.
os.makedirs(os.path.dirname(save_file_fs_items), exist_ok=True)
# Note: json.dump writes the integer ids as string keys ("0", "1", ...).
with open(save_file_fs_items, "w", encoding="utf-8") as save_file:
    json.dump(fs25_items, save_file, indent=4)

### 16 item subset

In [43]:
# The 16-item subset is simply ids 0..15 of the 100-item sample; the entries
# are the same dict objects as in items_100_dict, not copies.
items_16_dict = {item_id: items_100_dict[item_id] for item_id in range(16)}

In [44]:
# Inspect the 16-item subset (ids 0..15 taken from items_100_dict).
items_16_dict

{0: {'description': 'Soup made with cabbage, carrots, potatoes, and onions'},
 1: {'description': '2-ingredient yogurt from scratch'},
 2: {'description': 'High protein skillet with rice and vegetables'},
 3: {'description': 'Chinese salad made of eggplant and zucchini, flavoured with soy sauce'},
 4: {'description': 'Large portion fries with gravy and bacon'},
 5: {'description': 'Beef stew made with beef, carrots and potatoes'},
 6: {'description': 'Grilled salmon seasoned with salt and pepper'},
 7: {'description': 'Poke of tofu, mushroom and tomatoes'},
 8: {'description': 'Queso made from melted cheese and chili peppers'},
 9: {'description': 'Low fat baked breakfast pie made with sausage (pork) and apples'},
 10: {'description': 'Chicken breasts stuffed with bacon and mozzarella'},
 11: {'description': 'Chopped French duck confit preserved by duck oil'},
 12: {'description': 'Spaghetti using year round egg catsup noodles, made with sauce consisting of ground beef meat, bacon (por

In [45]:
# Save 16 subset of 100 random items to file
save_file_16_items = "data/recipe_16_sample.json"
# Create the output directory if it is missing so a fresh checkout does not
# fail with FileNotFoundError.
os.makedirs(os.path.dirname(save_file_16_items), exist_ok=True)
# Note: json.dump writes the integer ids as string keys ("0", "1", ...).
with open(save_file_16_items, "w", encoding="utf-8") as save_file:
    json.dump(items_16_dict, save_file, indent=4)

In [8]:
# Create name to id map
# Inverts items_100_dict: description text -> integer id.
inv_map = {}
for item_id, item in items_100_dict.items():
    # If two ids ever shared a description, the later id would overwrite the
    # earlier one here.
    inv_map[item['description']] = item_id

In [9]:
# Save map to file
save_file_recipe_map = "data/name_maps/recipe_100_map.json"
# The target sits in a nested directory; create it (and any missing parents)
# so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(save_file_recipe_map), exist_ok=True)
with open(save_file_recipe_map, "w", encoding="utf-8") as save_file:
    json.dump(inv_map, save_file, indent=4)

### Testing for parsing

In [12]:
import re

# Sample inputs that list recipes using three different separators: a literal
# backslash-n followed by a newline, a bare newline, and a line of four
# asterisks between newlines.
dataset = [
            "Soup made with cabbage, carrots, potatoes, and onions \\n\n2-ingredient yogurt from scratch \\n\nHigh protein skillet with rice and vegetables \\n\nChinese salad made of eggplant and zucchini, flavoured with soy sauce \\n\nLarge portion fries with gravy and bacon \\n\nBeef stew made with beef, carrots and potatoes \\n\nPoke of tofu, mushroom and tomatoes \\n\nQueso made from melted cheese and chili peppers \\n\nLow fat baked breakfast pie made with sausage (pork) and apples \\n\nChicken breasts stuffed with bacon and mozzarella",
            "Beef stew made with beef, carrots and potatoes\nGrilled salmon seasoned with salt and pepper\nChicken breasts stuffed with bacon and mozzarella\nChopped French duck confit preserved by duck oil\nSpaghetti using year round egg catsup noodles, made with sauce consisting of ground beef meat, bacon (pork) meat, cheddar cheese, parmesan, and tomatoes.\nLow fat baked breakfast pie made with sausage (pork) and apples\nHigh protein skillet with rice and vegetables\nSoup made with ground turkey that is low fat, high carb\nChinese salad made of eggplant and zucchini, flavoured with soy sauce\nPoke of tofu, mushroom and tomatoes",
            "Chicken breasts stuffed with bacon and mozzarella\n****\nBeef stew made with beef, carrots and potatoes\n****\nHigh protein skillet with rice and vegetables\n****\nLow fat baked breakfast pie made with sausage (pork) and apples\n****\nChopped French duck confit preserved by duck oil\n****\nSpaghetti using year round egg catsup noodles, made with sauce consisting of ground beef meat, bacon (pork) meat, cheddar cheese, parmesan, and tomatoes.\n****\nSoup made with ground turkey that is low fat, high carb\n****\nChinese salad made of eggplant and zucchini, flavoured with soy sauce\n****\nPoke of tofu, mushroom and tomatoes\n****\nQueso made from melted cheese and chili peppers"
]

# One pattern covering all three separators that occur in `dataset`:
#   \n\*{4}\n  -> newline, '****', newline
#   \n         -> bare newline
#   \\n\n      -> literal backslash + 'n', followed by a newline
RECIPE_SEPARATOR = re.compile(r'\n\*{4}\n|\n|\\n\n')


def split_recipes(text):
    """Split one block of listed recipes into individual descriptions.

    Args:
        text: String of recipe descriptions joined by any of the separators
            matched by RECIPE_SEPARATOR.

    Returns:
        The descriptions in their original order, with surrounding whitespace
        removed and empty fragments dropped. Stripping matters because the
        backslash-n separator style leaves a trailing space on each fragment,
        which would otherwise stop it from comparing equal to the plain
        description text.
    """
    fragments = (fragment.strip() for fragment in RECIPE_SEPARATOR.split(text))
    return [fragment for fragment in fragments if fragment]


for data in dataset:
    recipes = split_recipes(data)

    # Printing the list of strings
    for recipe in recipes:
        print(recipe)

Soup made with cabbage, carrots, potatoes, and onions 
2-ingredient yogurt from scratch 
High protein skillet with rice and vegetables 
Chinese salad made of eggplant and zucchini, flavoured with soy sauce 
Large portion fries with gravy and bacon 
Beef stew made with beef, carrots and potatoes 
Poke of tofu, mushroom and tomatoes 
Queso made from melted cheese and chili peppers 
Low fat baked breakfast pie made with sausage (pork) and apples 
Chicken breasts stuffed with bacon and mozzarella
Beef stew made with beef, carrots and potatoes
Grilled salmon seasoned with salt and pepper
Chicken breasts stuffed with bacon and mozzarella
Chopped French duck confit preserved by duck oil
Spaghetti using year round egg catsup noodles, made with sauce consisting of ground beef meat, bacon (pork) meat, cheddar cheese, parmesan, and tomatoes.
Low fat baked breakfast pie made with sausage (pork) and apples
High protein skillet with rice and vegetables
Soup made with ground turkey that is low fat, h