In [None]:
from programs import (
    get_program,
    CookLangFormatter,
    CookLangFormatterNoSteps,
    CookLangFormatterNoStepsNoIngredients,
)
import os
import glob
import dspy
import pandas as pd
import cooklang
from tqdm import tqdm

from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
load_dotenv()

# Azure OpenAI connection settings and deployment names (None when unset).
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_DEPLOYMENT_4o_mini = os.environ.get("AZURE_OPENAI_DEPLOYMENT_4o_mini")
AZURE_OPENAI_DEPLOYMENT_4o = os.environ.get("AZURE_OPENAI_DEPLOYMENT_4o")
AZURE_OPENAI_VERSION = os.environ.get("AZURE_OPENAI_VERSION")

# Ollama endpoint setting (None when unset).
OLLAMA_URL = os.environ.get("OLLAMA_URL")

In [4]:
def load_raw_data(path="data/bronze/cooklang_dataset_with_method.tsv"):
    """Load the recipe dataset and add a flat ``ingredients`` column.

    The tab-separated file at ``path`` is read with pandas and every value of
    its ``cooklang`` column is parsed into a comma-separated ingredient list.

    Args:
        path: Location of the TSV file. It must contain a ``cooklang`` column.
            Defaults to the bronze dataset used by this notebook.

    Returns:
        The loaded DataFrame with an extra ``ingredients`` column. Rows whose
        recipe yields no ingredients (or has no recipe text at all) hold the
        placeholder ``"No ingredients."``.
    """

    def parse_ingredients(recipe):
        # A missing cell arrives as NaN (a float), which has no .replace();
        # report it as missing so the placeholder is applied below.
        if not isinstance(recipe, str):
            return None

        # "--" is blanked out before parsing — presumably so the parser does
        # not treat the rest of the line as a comment; TODO confirm.
        cooklang_recipe = cooklang.parseRecipe(recipe.replace("--", " "))
        formatted = [
            f"{i['quantity']} {i['units']} {i['name']}".strip().replace(".000", "")
            for i in cooklang_recipe["ingredients"]
        ]
        # ",".join([]) is "" rather than a missing value, so fillna() would
        # never fire on it; return None to mark "nothing found" explicitly.
        return ",".join(formatted) or None

    df = pd.read_csv(path, sep="\t")
    df["ingredients"] = df["cooklang"].apply(parse_ingredients)
    df["ingredients"] = df["ingredients"].fillna("No ingredients.")
    return df

In [None]:
def get_json_files(directory):
    """Return the paths of the ``.json`` files located directly in ``directory``.

    Args:
        directory: Folder to search, with or without a trailing separator.

    Returns:
        A list of matching paths as produced by ``glob.glob`` (order is not
        guaranteed); empty when nothing matches.
    """
    # os.path.join inserts the separator only when it is missing, so no
    # explicit trailing-slash normalisation is required.
    pattern = os.path.join(directory, "*.json")
    return glob.glob(pattern)


def get_cooklang(row, program):
    """Run ``program`` on one dataset row and return the generated Cooklang text.

    Args:
        row: Mapping-like row providing the ``"ingredients"`` and ``"method"``
            fields.
        program: A ``CookLangFormatter``, ``CookLangFormatterNoSteps`` or
            ``CookLangFormatterNoStepsNoIngredients`` instance.

    Returns:
        The ``cooklang`` attribute of the program's prediction, or the string
        ``"Failed to generate"`` when the program call itself fails.

    Raises:
        ValueError: If ``program`` is none of the supported types.
    """
    ings = row["ingredients"]
    method = row["method"]

    # Pick the call arguments before entering the try block: an unsupported
    # program is a configuration error and must surface, not be reported as
    # a per-row generation failure by the broad handler below.
    if isinstance(program, (CookLangFormatter, CookLangFormatterNoSteps)):
        call_kwargs = {"recipe_text": method, "ingredients": ings}
    elif isinstance(program, CookLangFormatterNoStepsNoIngredients):
        call_kwargs = {"recipe_text": method}
    else:
        raise ValueError("Invalid program type")

    try:
        return program(**call_kwargs).cooklang
    except Exception as e:
        # Best effort: a failure on one row must not abort the whole run.
        print(f"Error: {e}")
        return "Failed to generate"


def get_lm(model):
    """Create the dspy language-model client matching ``model``.

    Args:
        model: ``"gpt-4o"`` or ``"gpt-4o-mini"`` for the Azure OpenAI
            deployments, or any name containing ``"llama"`` for Ollama. For
            Ollama only the part before the first ``"+"`` is used as the
            model name.

    Returns:
        A ``dspy.AzureOpenAI`` or ``dspy.OllamaLocal`` client limited to
        2000 output tokens.

    Raises:
        ValueError: If ``model`` matches none of the supported names.
    """
    if model == "gpt-4o":
        deployment = AZURE_OPENAI_DEPLOYMENT_4o
    elif model == "gpt-4o-mini":
        deployment = AZURE_OPENAI_DEPLOYMENT_4o_mini
    elif "llama" in model:
        # Locally served model: drop everything from the first "+" onwards.
        ollama_name = model.split("+")[0]
        return dspy.OllamaLocal(
            base_url=OLLAMA_URL, model=ollama_name, max_tokens=2_000
        )
    else:
        raise ValueError("Invalid model name")

    # Both Azure models share every setting except the deployment id.
    return dspy.AzureOpenAI(
        api_base=AZURE_OPENAI_ENDPOINT,
        api_version=AZURE_OPENAI_VERSION,
        deployment_id=deployment,
        api_key=AZURE_OPENAI_KEY,
        max_tokens=2_000,
    )


# Run every saved program found in data/programs over the raw dataset and
# write its predictions to data/silver as a TSV file.
directory_path = "data/programs"
json_files = get_json_files(directory_path)
for program_path in tqdm(json_files):
    # basename() handles whichever separator the platform's glob returns;
    # splitting on "/" alone fails for Windows-style paths.
    program_name = os.path.basename(program_path)

    # The first two "_"-separated parts of the file name are taken as the
    # optimizer and the model name.
    name_parts = program_name.split("_")
    optimizer = name_parts[0]
    model = name_parts[1]

    with_cooklang = "with_cooklang" in program_name
    # NOTE: despite its name this flag is True when the file name does NOT
    # contain "without_ings", i.e. when ingredients should be passed.
    without_ings = "without_ings" not in program_name
    program = get_program(
        optimizer, model, cooklang_spec=with_cooklang, ingredients=without_ings
    )
    df = load_raw_data()
    lm = get_lm(model)

    dspy.settings.configure(lm=lm)

    df["predict_cooklang"] = df.apply(
        lambda row: get_cooklang(row, program), axis=1
    )
    # program_name still carries its ".json" suffix, so the output file is
    # named "<name>.json_predictions.tsv".
    df.to_csv(f"data/silver/{program_name}_predictions.tsv", sep="\t", index=False)
