In [None]:
import pandas as pd
import os
import cooklang
import re

In [None]:
# Load the tab-separated example dataset and display its first rows.
example_tsv_path = "data/bronze/cooklang_dataset_with_method.tsv"
example_df = pd.read_csv(example_tsv_path, sep="\t")
example_df.head()

In [None]:
def get_method(recipe: str):
    """Build a single method string from the text fragments of a recipe.

    The recipe is parsed with ``cooklang.parseRecipe``; every step item that
    carries a ``'value'`` entry contributes that value, unless the value
    starts with ``"-"``.  The first collected fragment is discarded, the
    rest are joined with ``"."``, and surrounding whitespace and periods are
    trimmed from the result.

    Args:
        recipe: Recipe text to hand to the parser.

    Returns:
        The joined method text; an empty string when fewer than two
        fragments were collected.
    """
    parsed_steps = cooklang.parseRecipe(recipe)["steps"]
    fragments = [
        item["value"]
        for step in parsed_steps
        for item in step
        if "value" in item and not item["value"].startswith("-")
    ]
    # NOTE(review): the first fragment is always dropped -- presumably it is
    # the recipe title rather than an instruction; confirm against the inputs.
    joined = ".".join(fragments[1:])
    return joined.strip().strip(".")

def _format_quantity(quantity) -> str:
    """Render a parsed quantity without insignificant trailing zeros.

    Only zeros that trail a decimal fraction are removed ("2.000" -> "2",
    "1.500" -> "1.5").  Integer-like text such as "10" is left alone; a plain
    ``rstrip('0')`` would turn it into "1".  A trailing period is removed as
    well.  ``None`` becomes an empty string and other non-string values are
    converted with ``str`` first.
    """
    text = "" if quantity is None else str(quantity)
    # The lazy ``\d*?`` keeps the significant fractional digits; ``0+\Z``
    # consumes only the zeros that end the string right after them.
    text = re.sub(r"(\.\d*?)0+\Z", r"\1", text)
    return text.rstrip(".")


def parse_ingredients(recipe: str):
    """Return the ingredients of a Cooklang recipe as one comma-separated string.

    Every ``"--"`` in the recipe is replaced by a space before the text is
    passed to ``cooklang.parseRecipe``.  Each parsed ingredient is rendered as
    ``"<quantity> <units> <name>"`` (stripped of surrounding whitespace) and
    the entries are joined with ``","``.

    Args:
        recipe: Cooklang recipe text.

    Returns:
        The formatted ingredients joined by commas; an empty string when the
        parser reports no ingredients.
    """
    cooklang_recipe = cooklang.parseRecipe(recipe.replace("--", " "))

    formatted = []
    for ingredient in cooklang_recipe["ingredients"]:
        quantity = _format_quantity(ingredient["quantity"])
        # Single quotes inside the f-string keep this valid before Python 3.12.
        entry = f"{quantity} {ingredient['units']} {ingredient['name']}".strip()
        # Kept from the original: drops any ".000" still embedded in the entry
        # (e.g. the first half of a quantity such as "1.000-2.000").
        formatted.append(entry.replace(".000", ""))
    return ",".join(formatted)


# Collect every ".source" file under data/bronze/recipes, subdirectories
# included.  os.walk yields entries in arbitrary filesystem order, so the paths
# are sorted to make the row order of the exported dataset reproducible.
source_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk('data/bronze/recipes')
    for file in files
    if file.endswith('.source')
)

data = []
for file_path in source_files:
    # Drop only the trailing ".source"; str.replace would also rewrite any
    # ".source" that happens to occur earlier in the path.
    base_path = file_path[: -len(".source")]

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as f:
        recipe_text = f.read()
    method = get_method(recipe_text)
    if method == "":
        # Nothing usable was extracted; skip before reading the ".cook" file.
        continue

    # The Cooklang version sits next to the source file with a ".cook" extension.
    with open(base_path + ".cook", 'r', encoding='utf-8') as f:
        cooklang_text = f.read()
    ingredients = parse_ingredients(cooklang_text)

    data.append(
        {
            "source" : file_path,
            "name" : os.path.basename(base_path),
            "metadata" : None,
            "method": method,
            "ingredients": ingredients,
            # strip() already removes surrounding newlines.
            "cooklang" : cooklang_text.strip(),
        }
    )

# Naming the columns keeps the header row even when no recipe was collected.
df = pd.DataFrame(
    data,
    columns=["source", "name", "metadata", "method", "ingredients", "cooklang"],
)
df.to_csv('data/bronze/recipe_dataset_with_method.tsv', sep='\t', index=False)

In [None]:
# Collect every ".source" file under data/bronze/bonmot, subdirectories
# included.  os.walk yields entries in arbitrary filesystem order, so the paths
# are sorted to make the row order of the exported dataset reproducible.
source_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk("data/bronze/bonmot")
    for file in files
    if file.endswith(".source")
)

data = []
for file_path in source_files:
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as f:
        cook_text = f.read()
    # Discard the lines that start with ">>" before any further processing.
    cook_text = "\n".join(
        [line for line in cook_text.split("\n") if not line.startswith(">>")]
    )

    # Derive the text for method extraction: remove the "@" markers and
    # every "{...}" group, then reuse get_method on the result.
    method_text = cook_text.replace("@", "")
    method_text = re.sub(r"\{[^}]*\}", "", method_text)
    method = get_method(method_text)
    if method == "":
        # Nothing usable was extracted; skip before parsing the ingredients.
        continue

    ingredients = parse_ingredients(cook_text)
    data.append(
        {
            "source": file_path,
            # Drop only the trailing ".source"; str.replace would also rewrite
            # any ".source" that happens to occur earlier in the file name.
            "name": os.path.basename(file_path)[: -len(".source")],
            "metadata": None,
            "method": method,
            "ingredients": ingredients,
            # strip() already removes surrounding newlines.
            "cooklang": cook_text.strip(),
        }
    )

# Naming the columns keeps the "ingredients" lookup below valid (and the
# header row present) even when no recipe was collected.
df = pd.DataFrame(
    data,
    columns=["source", "name", "metadata", "method", "ingredients", "cooklang"],
)
# Keep only the recipes for which at least one ingredient was parsed.
df = df[df["ingredients"] != ""].reset_index(drop=True)
df.to_csv("data/bronze/bonmot_dataset_with_method.tsv", sep="\t", index=False)