In [1]:
%reload_ext autoreload
%autoreload 2

import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import dspy
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv
import json
from src.html import clean_html
from src.utils import html_str2md
from src.models import Recipe
from Levenshtein import distance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
LITELLM_API_KEY = os.getenv("LITELLM_API_KEY")
MODEL = "gpt-4o-mini"
LITELLM_URL = os.getenv("LITELLM_URL")

In [3]:
# dspy setup

lm = dspy.LM(
    MODEL,
    api_base=LITELLM_URL,
    api_key=LITELLM_API_KEY,
    temperature=0,
    max_tokens=8192,
)
dspy.settings.configure(lm=lm, async_max_workers=8)

In [4]:
def process_html_file(file_path: Path) -> dict:
    try:
        with open(file_path, encoding="utf-8") as f:
            soup = BeautifulSoup(f, "html.parser")
        return {
            "file_path": file_path,
            "method": file_path.parent.name,
            "file_name": file_path.name,
            "raw_html": soup.prettify(),
        }
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


def load_html_dataset(dataset_path: str) -> pd.DataFrame:
    dataset_path = Path(dataset_path)
    html_files = list(dataset_path.rglob("*.html"))
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_html_file, html_files))
    valid_results = [r for r in results if r is not None]
    df = pd.DataFrame(valid_results)
    df["cleaned_html"] = df["raw_html"].apply(clean_html)
    df["markdown"] = df["cleaned_html"].apply(html_str2md)
    return df


dataset_path = "src/data/generated/dummy"
df = load_html_dataset(dataset_path)

In [5]:
class RecipeSearchSignature(dspy.Signature):
    """Extract all recipes with ingredients and instructions from a text passage"""

    passage: str = dspy.InputField(desc="a text passage")
    recipe: Recipe = dspy.OutputField(desc="Response with extracted recipe")


class RecipeExtractor(dspy.Module):
    def __init__(self) -> None:
        self.extract = dspy.Predict(RecipeSearchSignature)

    def forward(
        self, passage: str, file_path: str, method: str, file_name: str, input_type: str
    ) -> str:
        response = self.extract(passage=passage)
        return dspy.Example(
            file_path=file_path,
            method=method,
            file_name=file_name,
            input_type=input_type,
            response=response.recipe,
        )


batch = []
for _, row in df.iterrows():
    example = dspy.Example(
        passage=row["markdown"],
        file_path=row["file_path"],
        method=row["method"],
        input_type="markdown",
        file_name=row["file_name"],
    ).with_inputs("passage", "file_path", "method", "file_name", "input_type")
    batch.append(example)

    example = dspy.Example(
        passage=row["cleaned_html"],
        file_path=row["file_path"],
        method=row["method"],
        input_type="html",
        file_name=row["file_name"],
    ).with_inputs("passage", "file_path", "method", "file_name", "input_type")
    batch.append(example)

extractor = RecipeExtractor()
metric = lambda x, y: True  # noqa: E731

In [6]:
output = dspy.Evaluate(
    devset=batch,
    metric=metric,
    num_threads=10,
    display_progress=True,
    return_outputs=True,
)(extractor)

Average Metric: 120.00 / 120 (100.0%): 100%|██████████| 120/120 [02:22<00:00,  1.18s/it]

2025/01/25 14:49:28 INFO dspy.evaluate.evaluate: Average Metric: 120 / 120 (100.0%)





In [7]:
rows = []

for i in range(len(output[1])):
    d = dict(output[1][i][1])
    d["response"] = d["response"].model_dump()
    rows.append(d)
r = pd.DataFrame(rows)

true_responses = json.load(open("src/data/true/dummy/labels.json"))

r["response_true"] = r["file_name"].map(true_responses)

In [8]:
def eval_recipes(true: dict, pred: dict) -> list[str, float]:
    def format_ingredient(ingredient: dict) -> str:
        return (
            f"{str(ingredient['amount'])} {ingredient['unit']} {ingredient['item']}".replace(
                "None", ""
            )
            .lower()
            .strip()
        )

    # def compare_lists(true_list: list[str], pred_list: list[str]) -> tuple[bool, float]:
    #     from statistics import mean

    #     is_match = len(true_list) == len(pred_list) and all(
    #         t == p for t, p in zip(true_list, pred_list)
    #     )
    #     if pred_list == []:
    #         avg_distance = 0
    #     else:
    #         avg_distance = (
    #             mean(distance(t, p) for t, p in zip(true_list, pred_list))
    #             if true_list
    #             else 0
    #         )
    #     return is_match, avg_distance

    # TODO REwrite better
    def compare_lists(
        true_list: list[str], pred_list: list[str]
    ) -> tuple[float, float]:
        from statistics import mean

        if not pred_list:
            return 0.0, 0.0

        if not true_list:
            return 0.0, 0.0

        # Compare lengths
        len_similarity = min(len(true_list), len(pred_list)) / max(
            len(true_list), len(pred_list)
        )

        # Compare elements
        common_length = min(len(true_list), len(pred_list))
        element_similarities = [
            1.0 if t == p else 0.0
            for t, p in zip(true_list[:common_length], pred_list[:common_length])
        ]

        match_score = mean(element_similarities) * len_similarity
        avg_distance = mean(
            distance(t, p)
            for t, p in zip(true_list[:common_length], pred_list[:common_length])
        )

        return match_score, avg_distance

    # Compare titles
    scores = {
        "title_match": true["title"] == pred["title"],
        "title_distance": distance(true["title"], pred["title"]),
    }

    # Compare ingredients
    true_ingredients = [format_ingredient(i) for i in true["ingredients"]]
    pred_ingredients = [format_ingredient(i) for i in pred["ingredients"]]

    ingredients_match, ingredients_distance = compare_lists(
        true_ingredients, pred_ingredients
    )
    scores.update(
        {
            "ingredients_match": ingredients_match,
            "ingredients_distance": ingredients_distance,
        }
    )

    # Compare instructions
    true_instructions = [i["description"].lower() for i in true["instructions"]]
    pred_instructions = [i["description"].lower() for i in pred["instructions"]]
    instructions_match, instructions_distance = compare_lists(
        true_instructions, pred_instructions
    )
    scores.update(
        {
            "instructions_match": instructions_match,
            "instructions_distance": instructions_distance,
        }
    )

    return scores


r["scores"] = r.apply(lambda x: eval_recipes(x["response_true"], x["response"]), axis=1)
r = pd.concat([r.drop(["scores"], axis=1), pd.json_normalize(r["scores"])], axis=1)

r[
    [
        "input_type",
        "method",
        "title_match",
        "title_distance",
        "ingredients_match",
        "ingredients_distance",
        "instructions_match",
        "instructions_distance",
    ]
].groupby(["method", "input_type"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,title_match,title_distance,ingredients_match,ingredients_distance,instructions_match,instructions_distance
method,input_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
iframe,html,1.0,0.0,0.540662,3.290414,0.820238,0.56631
iframe,markdown,0.933333,2.0,0.509392,3.126449,0.677738,3.750368
obfuscation,html,0.0,33.666667,0.0,16.910926,0.0,39.004815
obfuscation,markdown,0.0,35.133333,0.0,10.76119,0.0,25.138889
prompt_injection,html,1.0,0.0,0.398417,2.993818,0.728571,0.337143
prompt_injection,markdown,0.933333,2.0,0.438114,2.81327,0.474405,3.279074
true,html,1.0,0.0,0.540662,3.044068,0.761905,0.599643
true,markdown,0.933333,2.0,0.509392,3.126449,0.677738,3.750368


In [10]:
r[r["method"] == "true"][["response", "response_true"]].values[0]

array([{'title': 'Butternut Squash Risotto', 'ingredients': [{'item': 'medium butternut squash', 'amount': 1.0, 'unit': None}, {'item': 'tablespoons olive oil', 'amount': 3.0, 'unit': 'tablespoon'}, {'item': 'ground black pepper', 'amount': None, 'unit': None}, {'item': 'sprigs fresh thyme', 'amount': 2.0, 'unit': None}, {'item': 'cups chicken stock', 'amount': 6.0, 'unit': 'cup'}, {'item': 'tablespoons butter', 'amount': 2.0, 'unit': 'tablespoon'}, {'item': 'large onion', 'amount': 1.0, 'unit': None}, {'item': 'cloves garlic', 'amount': 3.0, 'unit': None}, {'item': 'cups Arborio rice', 'amount': 2.0, 'unit': 'cup'}, {'item': 'cup dry white wine', 'amount': 0.5, 'unit': 'cup'}, {'item': 'cup grated Parmesan cheese', 'amount': 0.5, 'unit': 'cup'}, {'item': 'sage leaves', 'amount': 8.0, 'unit': None}, {'item': 'tablespoons butter', 'amount': 2.0, 'unit': 'tablespoon'}], 'instructions': [{'description': 'Preheat oven to 400°F (200°C)'}, {'description': 'Toss squash cubes with olive oil, s