In [None]:
from programs import (
    get_program,
    CookLangFormatter,
    CookLangFormatterNoSteps,
    CookLangFormatterNoStepsNoIngredients,
)
import os
import glob
import dspy
import pandas as pd
import cooklang
from tqdm import tqdm

from dotenv import load_dotenv

In [None]:
# Read the local .env file so the settings below are available in the
# process environment.
load_dotenv()

# Connection settings for the LiteLLM deployment (None when unset).
LITELLM_URL = os.environ.get("LITELLM_URL")
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY")
# Connection settings for the Ollama deployment (None when unset).
OLLAMA_URL = os.environ.get("OLLAMA_URL")

In [None]:
def load_raw_data():
    """Load the three bronze TSV datasets and stack them into one DataFrame.

    The cooklang dataset gets an ``ingredients`` column derived from its
    ``cooklang`` column; the recipe and bonmot datasets are appended as read.
    Missing ``ingredients`` values in the combined frame are replaced with
    the text "No ingredients.".

    Returns:
        pandas.DataFrame: the concatenated rows with a fresh 0..n-1 index.
    """

    def parse_ingredients(recipe: str):
        """Render a recipe's parsed ingredients as one comma-separated string."""
        # "--" is swapped for a space before parsing.
        parsed = cooklang.parseRecipe(recipe.replace("--", " "))

        rendered = []
        for item in parsed["ingredients"]:
            text = f"{item['quantity']} {item['units']} {item['name']}"
            # Trim outer whitespace, then remove ".000" fragments.
            rendered.append(text.strip().replace(".000", ""))
        return ",".join(rendered)

    cooklang_df = pd.read_csv(
        "data/bronze/cooklang_dataset_with_method.tsv", sep="\t"
    )
    cooklang_df["ingredients"] = cooklang_df["cooklang"].apply(parse_ingredients)

    recipe_df = pd.read_csv("data/bronze/recipe_dataset_with_method.tsv", sep="\t")
    bonmot_df = pd.read_csv("data/bronze/bonmot_dataset_with_method.tsv", sep="\t")

    combined = pd.concat([cooklang_df, recipe_df, bonmot_df], ignore_index=True)
    combined["ingredients"] = combined["ingredients"].fillna("No ingredients.")
    return combined

In [None]:
def get_json_files(directory):
    """Return the paths of all ``*.json`` files directly inside ``directory``.

    Args:
        directory: folder to search; a trailing separator is optional.

    Returns:
        list[str]: matching paths as produced by ``glob.glob`` (no ordering
        is imposed); empty when nothing matches.
    """
    # The empty middle component guarantees a trailing separator on the
    # directory before the wildcard is appended.
    return glob.glob(os.path.join(directory, "", "*.json"))


def get_cooklang(row, program):
    """Run ``program`` on one data row and return the generated cooklang text.

    Args:
        row: mapping-like record providing ``"ingredients"`` and ``"method"``.
        program: a ``CookLangFormatter``, ``CookLangFormatterNoSteps`` or
            ``CookLangFormatterNoStepsNoIngredients`` instance.

    Returns:
        The ``cooklang`` attribute of the program's prediction, or the string
        "Failed to generate" if anything goes wrong -- including an
        unsupported program type, whose ValueError is caught here as well.
    """
    ings = row["ingredients"]
    method = row["method"]
    try:
        # The first two program types take the ingredients as an extra input;
        # the last one only receives the recipe text.
        if isinstance(program, (CookLangFormatter, CookLangFormatterNoSteps)):
            call_kwargs = {"recipe_text": method, "ingredients": ings}
        elif isinstance(program, CookLangFormatterNoStepsNoIngredients):
            call_kwargs = {"recipe_text": method}
        else:
            raise ValueError("Invalid program type")

        generated = program(**call_kwargs).cooklang
        print(f"Generated: {generated}")
        return generated
    except Exception as e:
        # Best effort: report the problem and hand back a sentinel string.
        print(f"Error: {e}")
        return "Failed to generate"


def get_lm(model):
    """Build the ``dspy.LM`` client for a model name.

    Args:
        model: model identifier. Names containing "gpt-" use the LiteLLM
            settings; otherwise names containing "llama" use the Ollama
            settings with an "ollama/" prefix.

    Returns:
        dspy.LM: client with caching disabled and ``max_tokens`` of 2000.

    Raises:
        ValueError: if the name matches neither family.
    """
    shared_options = {"max_tokens": 2_000, "cache": False}

    # "gpt-" is tested first, so a name matching both patterns goes to LiteLLM.
    if "gpt-" in model:
        return dspy.LM(
            model,
            api_base=LITELLM_URL,
            api_key=LITELLM_API_KEY,
            temperature=0,
            **shared_options,
        )

    if "llama" in model:
        return dspy.LM(
            model=f"ollama/{model}", base_url=OLLAMA_URL, **shared_options
        )

    raise ValueError("Invalid model name")


# Batch inference: for every saved program JSON in `directory_path`, rebuild
# the program, run it over all raw recipes and write one predictions TSV per
# program into `output_dir`.
directory_path = "data/programs"
output_dir = "data/silver"
num_threads = 15  # worker threads handed to dspy.Evaluate
failed_marker = "Failed to generate"  # stored when no cooklang was produced

json_files = get_json_files(directory_path)

# The raw data does not depend on the program, so it is loaded (and its
# cooklang parsed) once; each iteration works on its own copy.
raw_df = load_raw_data()

# Make sure the destination exists before the first to_csv call.
os.makedirs(output_dir, exist_ok=True)

for program_path in tqdm(json_files):
    # basename copes with either path separator, unlike splitting on "/".
    program_name = os.path.basename(program_path)

    # The file name is split on "_": the first piece is used as the optimizer
    # and the second as the model (presumably "<optimizer>_<model>_..." --
    # confirm against the files in data/programs).
    name_parts = program_name.split("_")
    optimizer = name_parts[0]
    model = name_parts[1]
    with_cooklang = "with_cooklang" in program_name
    # True when the program DOES take ingredients as an input, i.e. the file
    # name lacks the "without_ings" marker.
    with_ings = "without_ings" not in program_name

    program = get_program(
        optimizer, model, cooklang_spec=with_cooklang, ingredients=with_ings
    )
    df = raw_df.copy()
    lm = get_lm(model)
    dspy.settings.configure(lm=lm)

    # One example per row; ingredients are only attached (and declared as an
    # input) for programs that use them.
    examples = []
    for method, ingredients in zip(df["method"], df["ingredients"]):
        fields = {"recipe_text": method}
        if with_ings:
            fields["ingredients"] = ingredients
        examples.append(dspy.Example(**fields).with_inputs(*fields))

    # The metric always passes: Evaluate is only used here to run the program
    # over the examples in parallel and collect its outputs.
    evaluate = dspy.Evaluate(
        devset=examples,
        metric=lambda example, prediction: True,
        num_threads=num_threads,
        display_progress=True,
        return_outputs=True,
    )
    _, outputs = evaluate(program)
    # The second element of each output is turned into a dict (presumably the
    # prediction -- confirm against the installed dspy version).
    answers = [dict(output[1]) for output in outputs]
    eval_df = pd.DataFrame(answers)

    # If no output carried a "cooklang" field the column is absent; fall back
    # to an all-missing column instead of raising KeyError so the remaining
    # programs still run.
    predictions = eval_df.get("cooklang")
    if predictions is None:
        predictions = pd.Series(index=df.index, dtype="object")
    df["predict_cooklang"] = predictions

    df["predict_cooklang"] = df["predict_cooklang"].fillna(failed_marker)
    df.to_csv(f"{output_dir}/{program_name}_predictions.tsv", sep="\t", index=False)