In [33]:
%reload_ext autoreload
%autoreload 2

import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import dspy
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from src.html import clean_html
from src.utils import html_str2md
from src.models import Recipe
from Levenshtein import distance

In [34]:
# Load variables from a local .env file (if any) before reading the LiteLLM settings.
load_dotenv()
MODEL = "gpt-4o-mini"
LITELLM_URL = os.environ.get("LITELLM_URL")
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY")

In [35]:
# dspy setup: one shared LM (temperature 0) pointed at the configured LiteLLM endpoint.
lm_options = {
    "api_base": LITELLM_URL,
    "api_key": LITELLM_API_KEY,
    "temperature": 0,
    "max_tokens": 8192,
}
lm = dspy.LM(MODEL, **lm_options)
dspy.settings.configure(lm=lm, async_max_workers=8)

In [None]:
def process_html_file(file_path: Path) -> dict:
    """Read one HTML file and describe it as a record for the dataset frame.

    Args:
        file_path: Location of the HTML file; its parent directory name is stored
            as ``method``.

    Returns:
        A dict with ``file_path``, ``method``, ``file_name`` and ``raw_html`` (the
        prettified markup). If anything goes wrong the error is printed and ``None``
        is returned instead, so one bad file does not abort the whole load.
    """
    try:
        with open(file_path, encoding="utf-8") as handle:
            parsed = BeautifulSoup(handle, "html.parser")
        record = dict(
            file_path=file_path,
            method=file_path.parent.name,
            file_name=file_path.name,
            raw_html=parsed.prettify(),
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller filter out the None.
        print(f"Error processing {file_path}: {e}")
        return None
    return record


def load_html_dataset(dataset_path: str) -> pd.DataFrame:
    """Load every ``*.html`` file under ``dataset_path`` into a DataFrame.

    Files are parsed concurrently with ``process_html_file``; files that fail to
    load (``None`` results) are dropped.

    Args:
        dataset_path: Root directory that is searched recursively for HTML files.

    Returns:
        A frame with columns ``file_path``, ``method``, ``file_name``, ``raw_html``,
        ``cleaned_html`` (``clean_html`` applied to the raw markup) and ``markdown``
        (``html_str2md`` applied to the cleaned markup). The frame is empty, but
        still has all columns, when no file could be loaded.
    """
    root = Path(dataset_path)
    # Sort so the row order is reproducible regardless of filesystem listing order.
    html_files = sorted(root.rglob("*.html"))
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_html_file, html_files))
    records = [record for record in results if record is not None]
    # Explicit columns keep the frame well-formed when `records` is empty; without
    # them the "raw_html" lookup below would raise KeyError on an empty dataset.
    df = pd.DataFrame(
        records, columns=["file_path", "method", "file_name", "raw_html"]
    )
    df["cleaned_html"] = df["raw_html"].apply(clean_html)
    df["markdown"] = df["cleaned_html"].apply(html_str2md)
    return df


# Build the working frame (raw HTML, cleaned HTML and markdown per file) from the dataset directory.
dataset_path = "src/data/generated/dummy"
df = load_html_dataset(dataset_path)

In [37]:
# dspy signature for recipe extraction: one text passage in, one structured `Recipe` out.
# NOTE(review): the docstring and the `desc=` strings are presumably forwarded to the LLM
# as prompt text by dspy, so treat them as runtime strings — confirm before rewording.
# NOTE(review): the docstring says "all recipes" while the output field holds a single
# `Recipe` — confirm which is intended.
class RecipeSearchSignature(dspy.Signature):
    """Extract all recipes with ingredients and instructions from a text passage"""

    passage: str = dspy.InputField(desc="a text passage")
    recipe: Recipe = dspy.OutputField(desc="Response with extracted recipe")


class RecipeExtractor(dspy.Module):
    """dspy module that extracts a recipe from a passage and echoes the row metadata.

    The metadata arguments are not sent to the predictor; they are copied onto the
    result so each output can be traced back to its source file and input variant.
    """

    def __init__(self) -> None:
        # Initialise dspy.Module's own state before attaching sub-modules.
        super().__init__()
        self.extract = dspy.Predict(RecipeSearchSignature)

    def forward(
        self, passage: str, file_path: str, method: str, file_name: str, input_type: str
    ) -> dspy.Example:
        """Run the extraction on ``passage``.

        Args:
            passage: Text handed to the predictor.
            file_path: Source file of the passage (passed through unchanged).
            method: Method label of the source file (passed through unchanged).
            file_name: Name of the source file (passed through unchanged).
            input_type: Label for the passage variant (passed through unchanged).

        Returns:
            A ``dspy.Example`` carrying the four metadata values plus ``response``,
            the ``recipe`` field of the predictor output.
        """
        response = self.extract(passage=passage)
        return dspy.Example(
            file_path=file_path,
            method=method,
            file_name=file_name,
            input_type=input_type,
            response=response.recipe,
        )


# Every file contributes two examples: its markdown rendering and its cleaned HTML.
# Each pair is (input_type label, DataFrame column holding the passage text).
passage_columns = (("markdown", "markdown"), ("html", "cleaned_html"))
input_keys = ("passage", "file_path", "method", "file_name", "input_type")

batch = []
for _, row in df.iterrows():
    for input_type, column in passage_columns:
        example = dspy.Example(
            passage=row[column],
            file_path=row["file_path"],
            method=row["method"],
            input_type=input_type,
            file_name=row["file_name"],
        ).with_inputs(*input_keys)
        batch.append(example)

extractor = RecipeExtractor()
# Always-true placeholder metric: the evaluation run is only used to collect outputs.
metric = lambda x, y: True  # noqa: E731

In [None]:
# Run the extractor over the whole batch on 20 threads and keep what the evaluator returns
# (the per-example outputs are read from `output[1]` further down).
evaluator = dspy.Evaluate(
    devset=batch,
    metric=metric,
    num_threads=20,
    display_progress=True,
    return_outputs=True,
)
output = evaluator(extractor)

In [39]:
# Flatten the evaluation outputs: element [1] of each entry is the extractor's result,
# whose `response` model is converted to a plain dict.
rows = []
for entry in output[1]:
    record = dict(entry[1])
    record["response"] = record["response"].model_dump()
    rows.append(record)
r = pd.DataFrame(rows)

# The reference recipe for a file is the extraction from the "true" method's HTML input.
is_reference = (r["method"] == "true") & (r["input_type"] == "html")
true_responses = r[is_reference].set_index("file_name")["response"].to_dict()
r["response_true"] = r["file_name"].map(true_responses)

In [50]:
import json

# Scratch helper: wrap plain instruction strings in the {"description": ...} shape that
# recipe instructions use, and print them as a JSON array.
arr = [
    "Press tofu for 15 minutes to remove excess water",
    "Cut into 1-inch cubes",
    "Toss with cornstarch, salt, and pepper",
    "Heat sesame oil in a large skillet over medium-high heat",
    "Cook tofu 3-4 minutes per side until golden",
    "Remove and set aside",
    "Whisk all sauce ingredients in a bowl",
    "Set aside until needed",
    "Add oil to the same pan",
    "Stir-fry garlic and ginger until fragrant",
    "Add vegetables in order of cooking time",
    "Cook until crisp-tender",
    "Return tofu to pan",
    "Pour sauce over",
    "Simmer until thickened",
    "Sprinkle with sesame seeds",
]
instruction_steps = [{"description": text} for text in arr]
print(json.dumps(instruction_steps))

[{"description": "Press tofu for 15 minutes to remove excess water"}, {"description": "Cut into 1-inch cubes"}, {"description": "Toss with cornstarch, salt, and pepper"}, {"description": "Heat sesame oil in a large skillet over medium-high heat"}, {"description": "Cook tofu 3-4 minutes per side until golden"}, {"description": "Remove and set aside"}, {"description": "Whisk all sauce ingredients in a bowl"}, {"description": "Set aside until needed"}, {"description": "Add oil to the same pan"}, {"description": "Stir-fry garlic and ginger until fragrant"}, {"description": "Add vegetables in order of cooking time"}, {"description": "Cook until crisp-tender"}, {"description": "Return tofu to pan"}, {"description": "Pour sauce over"}, {"description": "Simmer until thickened"}, {"description": "Sprinkle with sesame seeds"}]


In [None]:
def eval_recipes(true: dict, pred: dict, distance_fn=None) -> dict:
    """Score a predicted recipe dict against its reference recipe dict.

    Args:
        true: Reference recipe with ``title``, ``ingredients`` (dicts with ``amount``,
            ``unit`` and ``item``) and ``instructions`` (dicts with ``description``).
        pred: Predicted recipe in the same shape.
        distance_fn: Optional ``(str, str) -> number`` string distance. When omitted,
            the module-level Levenshtein ``distance`` is used.

    Returns:
        A flat dict with ``title_match`` / ``title_distance``, ``ingredients_match`` /
        ``ingredients_distance`` and ``instructions_match`` / ``instructions_distance``.
        ``*_match`` is exact equality. List distances are the mean per-position
        distance, where an item missing on either side is compared with ``""`` so
        that dropped or extra items are penalised instead of ignored.
    """
    from itertools import zip_longest
    from statistics import mean

    if distance_fn is None:
        # Looked up lazily so the default stays the notebook's Levenshtein import.
        distance_fn = distance

    def format_ingredient(ingredient: dict) -> str:
        # Render "amount unit item"; missing parts show up as "None" and are blanked.
        # NOTE: this also removes a literal "None" occurring inside the item text.
        return f"{str(ingredient['amount'])} {ingredient['unit']} {ingredient['item']}".replace(
            "None", ""
        ).strip()

    def compare_lists(true_list: list[str], pred_list: list[str]) -> tuple[bool, float]:
        # Exact match requires the same items in the same order.
        is_match = true_list == pred_list
        if not true_list and not pred_list:
            return is_match, 0
        # zip_longest pads the shorter list with "", so an empty or truncated
        # prediction no longer scores as if it were perfect.
        avg_distance = mean(
            distance_fn(t, p) for t, p in zip_longest(true_list, pred_list, fillvalue="")
        )
        return is_match, avg_distance

    # Compare titles
    scores = {
        "title_match": true["title"] == pred["title"],
        "title_distance": distance_fn(true["title"], pred["title"]),
    }

    # Compare ingredients
    true_ingredients = [format_ingredient(i) for i in true["ingredients"]]
    pred_ingredients = [format_ingredient(i) for i in pred["ingredients"]]
    ingredients_match, ingredients_distance = compare_lists(
        true_ingredients, pred_ingredients
    )
    scores.update(
        {
            "ingredients_match": ingredients_match,
            "ingredients_distance": ingredients_distance,
        }
    )

    # Compare instructions
    true_instructions = [i["description"] for i in true["instructions"]]
    pred_instructions = [i["description"] for i in pred["instructions"]]
    instructions_match, instructions_distance = compare_lists(
        true_instructions, pred_instructions
    )
    scores.update(
        {
            "instructions_match": instructions_match,
            "instructions_distance": instructions_distance,
        }
    )

    return scores


# Score each extraction against its reference recipe, then expand the score dicts into
# one column per metric.
r["scores"] = [
    eval_recipes(reference, extracted)
    for reference, extracted in zip(r["response_true"], r["response"])
]
r = pd.concat([r.drop(columns=["scores"]), pd.json_normalize(r["scores"])], axis=1)

# Mean of every metric per (method, input_type) pair.
score_columns = [
    "title_match",
    "title_distance",
    "ingredients_match",
    "ingredients_distance",
    "instructions_match",
    "instructions_distance",
]
r[["input_type", "method", *score_columns]].groupby(["method", "input_type"]).mean()