In [None]:
%reload_ext autoreload
%autoreload 2

import json
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import dspy
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

from src.models import Recipe
from dspy.teleprompt import MIPROv2

In [None]:
# Read the LiteLLM proxy credentials from a local .env file so that no secret is
# hard-coded in the notebook. Both values are None when the variables are unset.
load_dotenv()
API_KEY = os.getenv("LITELLM_API_KEY")
API_BASE = os.getenv("LITELLM_URL")

# Default language model used by every DSPy call that is not wrapped in a
# `dspy.context(...)` override.
# NOTE(review): temperature=1 / max_tokens=8192 are presumably dictated by the
# reasoning model's constraints — confirm before changing them.
MODEL = "o1-mini"
lm = dspy.LM(
    MODEL,
    api_base=API_BASE,
    api_key=API_KEY,
    temperature=1,
    max_tokens=8192,
)

# Secondary, deterministic (temperature=0) model handle.
# NOTE(review): `aoi_lm` is not referenced by any other cell visible here.
model_name = "azure/gpt-4o-mini"
aoi_lm = dspy.LM(
    model_name, api_base=API_BASE, api_key=API_KEY, max_tokens=8192, temperature=0
)

# Configure DSPy exactly once. The notebook previously called `dspy.configure(lm=lm)`
# and then immediately re-configured with the same LM plus `async_max_workers`;
# the second call superseded the first, so only that one is kept.
dspy.settings.configure(lm=lm, async_max_workers=8)

In [None]:
def process_html_file(file_path: Path) -> dict:
    """Parse one HTML file into a flat record.

    Args:
        file_path: Location of the HTML file to read (decoded as UTF-8).

    Returns:
        A dict with the keys ``file_path``, ``method`` (name of the parent
        directory), ``file_name`` and ``raw_html`` (the prettified markup).
        When anything goes wrong the error is printed and ``None`` is
        returned instead, so one bad file does not abort a batch load.
    """
    record = None
    try:
        with file_path.open(encoding="utf-8") as handle:
            parsed = BeautifulSoup(handle, "html.parser")
        record = {
            "file_path": file_path,
            "method": file_path.parent.name,
            "file_name": file_path.name,
            "raw_html": parsed.prettify(),
        }
    except Exception as e:
        # Deliberately best-effort: report the failure and fall through to None.
        print(f"Error processing {file_path}: {e}")
        record = None
    return record


def load_html_dataset(dataset_path: str) -> pd.DataFrame:
    """Load every ``*.html`` file below ``dataset_path`` into a DataFrame.

    Files are discovered recursively and parsed concurrently with
    ``process_html_file``; files that fail to parse (``None`` results) are
    dropped.

    Args:
        dataset_path: Root directory to search (string or path-like).

    Returns:
        One row per successfully parsed file. Rows follow the sorted order of
        the file paths so repeated runs give the same frame. If nothing was
        loaded, an empty frame that still has the expected columns is returned,
        so downstream ``df["raw_html"]`` lookups do not fail with ``KeyError``.
    """
    # Columns produced by process_html_file; only used to shape the empty result.
    expected_columns = ["file_path", "method", "file_name", "raw_html"]

    # rglob order depends on the filesystem; sort for reproducible row order.
    html_files = sorted(Path(dataset_path).rglob("*.html"))

    with ThreadPoolExecutor() as executor:
        records = [
            record
            for record in executor.map(process_html_file, html_files)
            if record is not None
        ]

    if not records:
        return pd.DataFrame(columns=expected_columns)
    return pd.DataFrame(records)


# Load every *.html page found under the dummy dataset folder (the path is
# relative to the notebook's working directory) and preview the first rows.
dataset_path = "src/data/true/dummy"
df = load_html_dataset(dataset_path)
df.head()

In [None]:
# DSPy signature for the "defender" step: given a page (`passage`) and a description
# of the data to protect (`target_schema`), the LM is asked to return rewritten HTML.
# NOTE(review): DSPy is understood to pass the class docstring to the LM as the task
# instructions, so the docstring below is effectively runtime prompt text. Its typos
# ("count into account", "dynamicly", "deffence", "should it into account") are left
# untouched here on purpose; fix them deliberately and re-evaluate when revising the prompt.
class ContentDefenseSignature(dspy.Signature):
    """Modifies HTML content to be resistant to scraping while preserving the visual appearance and accessibility for users.
    It should count into account that the content is dynamicly rendered via browser and all deffence mechanisms should it into account.
    This defines the interface for transforming HTML content in ways that make automated extraction more difficult
    while ensuring the content remains fully accessible to human users through standard web browsers.
    Try to validate if HTML is valid and if not, try to fix it.
    Don't add hash string to the html if you need compute them.
    """

    # Input: the source content to protect (callers in this notebook pass raw HTML).
    passage: str = dspy.InputField(desc="The original text passage to be protected")
    # Input: schema/data to protect; the commented-out driver code in this cell
    # passes `Recipe.model_json_schema()` here.
    target_schema: object = dspy.InputField(
        desc="This schema should be found at the site and needs to be protected from scraping. "
        "Keys are field names and values are the data to protect"
    )

    # Output: the protected HTML document as a single string.
    html: str = dspy.OutputField(
        desc="Valid HTML string with anti-scraping protections applied"
    )


class Deffender(dspy.Module):
    """Applies ``ContentDefenseSignature`` through ``dspy.ChainOfThought``."""

    def __init__(self) -> None:
        # Let dspy.Module set up its own state before sub-modules are attached.
        super().__init__()
        self.deffend = dspy.ChainOfThought(ContentDefenseSignature)

    def forward(self, passage: str, target_schema: object) -> "dspy.Prediction":
        """Run the defender on one page.

        Args:
            passage: Content to protect.
            target_schema: Schema/data that must be protected from scraping.

        Returns:
            The prediction object produced by the underlying predictor (not a
            plain string): callers read ``.html`` and, because of the
            chain-of-thought wrapper, ``.reasoning`` from it.
        """
        return self.deffend(passage=passage, target_schema=target_schema)


class DeffenderR1(dspy.Module):
    """Applies ``ContentDefenseSignature`` through a plain ``dspy.Predict``.

    NOTE(review): the name suggests this variant is meant for R1-style models
    that reason on their own, hence no chain-of-thought wrapper — confirm.
    """

    def __init__(self) -> None:
        # Let dspy.Module set up its own state before sub-modules are attached.
        super().__init__()
        self.deffend = dspy.Predict(ContentDefenseSignature)

    def forward(self, passage: str, target_schema: object) -> "dspy.Prediction":
        """Run the defender on one page.

        Args:
            passage: Content to protect.
            target_schema: Schema/data that must be protected from scraping.

        Returns:
            The prediction object produced by the underlying predictor (not a
            plain string); the protected markup is its ``html`` field.
        """
        return self.deffend(passage=passage, target_schema=target_schema)


# DSPy signature for the repair step: takes possibly invalid HTML plus the
# defender's reasoning and asks the LM for valid HTML.
# NOTE(review): the docstring and `desc` strings are understood to be sent to the
# LM as prompt text, so they are kept verbatim (including the "deffence" spelling
# and the trailing space in the output description).
class HtmlToBeFixed(dspy.Signature):
    """Fix HTML content to be valid and would render without any issues for users. Keep the content as close to the original as possible. Keep deffence mechanisms in place."""

    # Input: the HTML that needs repairing.
    passage: str = dspy.InputField(desc="Wrong html string")
    # Input: explanation of the defence that was applied, so the repair can keep it.
    deffence_reasoning: object = dspy.InputField(desc="Deffence method")

    # Output: the repaired HTML document.
    html: str = dspy.OutputField(desc="Valid HTML string ")


class Validator(dspy.Module):
    """Repairs defended HTML by running ``HtmlToBeFixed`` through ``dspy.Predict``."""

    def __init__(self) -> None:
        # Let dspy.Module set up its own state before sub-modules are attached.
        super().__init__()
        self.repair = dspy.Predict(HtmlToBeFixed)

    def forward(self, passage: str, deffence_reasoning: object) -> str:
        """Return a repaired version of ``passage``.

        Args:
            passage: HTML that may be invalid.
            deffence_reasoning: Description of the defence applied to the
                page, passed along so the repair can keep it in place.

        Returns:
            The ``html`` field of the prediction, i.e. the repaired markup.
        """
        prediction = self.repair(
            passage=passage, deffence_reasoning=deffence_reasoning
        )
        return prediction.html


# dff = Deffender()
# validator = Validator()
# for i in range(0, len(df)):
#     html = dff(passage=df['raw_html'].values[i], target_schema=Recipe.model_json_schema())
#     valid_html = validator(passage=html.html, deffence_reasoning=html.reasoning)
#     with open(f"src/data/generated/dummy/llm_o1_raw/{df['file_name'].values[i]}", "w") as f:
#         f.write(valid_html)

In [None]:
# Display the loaded dataset (one row per successfully parsed HTML file).
df

In [None]:
# Build one DSPy example per loaded page. Only `passage` is set and marked as an
# input field; the examples carry no label.
trainset = [
    dspy.Example(passage=page["raw_html"]).with_inputs("passage")
    for _, page in df.iterrows()
]

In [None]:
# Technique identifiers the metric looks for in a prediction.
KNOWN_TECHNIQUES = (
    "prompt_injection",
    "random_elements",
    "iframe",
    "obfuscation",
    "htmlAppend",
    "shadowRootOpen",
    "shadowRootClose",
    "singlePromptInject",
    "responseObjNaN",
    "respButter",
    "prompt_injection2",
    "prompt_injection_all3",
    "prompt_injection_title",
    "prompt_injection_ingredients",
    "prompt_injection_instructions",
)


def metric_2(true: dspy.Example, pred: dspy.Example, trace: object = None) -> float:
    """Score a prediction by the fraction of known techniques it names.

    The predicted ``techniques`` value is split into identifier tokens
    (letters, digits and underscores) and each known technique is matched
    against those tokens *exactly*. A plain substring test would over-count:
    ``"prompt_injection_title"`` would also be credited as
    ``"prompt_injection"``.

    Args:
        true: Gold example; unused, kept for the metric call signature.
        pred: Prediction whose ``techniques`` attribute is a string, or a
            list/tuple/set of technique names.
        trace: Unused; kept for the metric call signature.

    Returns:
        A value in ``[0, 1]``: matched techniques divided by the number of
        known techniques. ``0.0`` when ``techniques`` is missing or ``None``.
    """
    raw = getattr(pred, "techniques", None)
    if raw is None:
        return 0.0

    if isinstance(raw, str):
        text = raw
    elif isinstance(raw, (list, tuple, set, frozenset)):
        text = " ".join(str(item) for item in raw)
    else:
        text = str(raw)

    # Turn every non-identifier character (quotes, commas, brackets, ...) into a
    # separator, then split into whole technique names.
    named = set(
        "".join(ch if (ch.isalnum() or ch == "_") else " " for ch in text).split()
    )

    matched = sum(1 for technique in KNOWN_TECHNIQUES if technique in named)
    return matched / len(KNOWN_TECHNIQUES)


# Prompt optimizer that scores candidates with metric_2.
# NOTE(review): no `teleprompter.compile(...)` call is visible in this part of the
# notebook, so the optimizer may be unused here — confirm.
teleprompter = MIPROv2(
    metric=metric_2,
    auto="medium",  # Can choose between light, medium, and heavy optimization runs
)

In [None]:
# NOTE(review): presumably enabled so that request parameters a given
# provider/model does not support are dropped instead of raising — confirm
# against the LiteLLM documentation. This is a process-wide setting.
import litellm

litellm.drop_params = True

In [None]:
# DSPy signature asking the LM to pick defence techniques for one page.
# The docstring and `desc` strings are prompt text. Two prompt defects are fixed:
# the technique list was missing its closing bracket, and the `passage`
# description ("Wrong html string") had been copied from the HTML-repair
# signature although this input is the page to protect.
class TechniquesForDeffence(dspy.Signature):
    """Give me list of techniques that you want to use to deffend the content against web scraping but keep it accessible for users without any UX issues.
    Select from this list of techniques that are implemented in the module:
    ['prompt_injection', 'random_elements', 'iframe', 'obfuscation', 'htmlAppend', 'shadowRootOpen', 'shadowRootClose', 'singlePromptInject', 'responseObjNaN', 'respButter', 'prompt_injection2', 'prompt_injection_all3', 'prompt_injection_title', 'prompt_injection_ingredients', 'prompt_injection_instructions']
    """

    # Input: the page whose content should be defended.
    passage: str = dspy.InputField(
        desc="HTML of the page that should be protected against web scraping"
    )

    # Output: the selected technique names, as free text.
    techniques: str = dspy.OutputField(
        desc="List of techniques that you want to use to deffend the content against web scraping."
    )
    # Output: the model's justification for the selection.
    reasoning: str = dspy.OutputField(desc="Reasoning for the techniques")


class DeffenceContractor(dspy.Module):
    """Selects defence techniques for a page via ``TechniquesForDeffence``."""

    def __init__(self) -> None:
        # Let dspy.Module set up its own state before sub-modules are attached.
        super().__init__()
        self.find = dspy.Predict(TechniquesForDeffence)

    def forward(self, passage: str) -> "dspy.Prediction":
        """Ask the LM which techniques to apply to ``passage``.

        Args:
            passage: HTML of the page to defend.

        Returns:
            The prediction object (not a plain string); callers read its
            ``techniques`` and ``reasoning`` fields.
        """
        return self.find(passage=passage)


# Ask the `ollama_chat/deepseek-r1:7b` model (local endpoint on port 11434) which
# defence techniques to use for every loaded page; one JSON file is written per page.
out_dir = Path("src/data/generated/dummy/llm_r1_7b_raw")
# Create the output folder up front: otherwise a missing directory only surfaces
# as a FileNotFoundError after the first (slow) model call has completed.
out_dir.mkdir(parents=True, exist_ok=True)

with dspy.context(
    lm=dspy.LM(
        "ollama_chat/deepseek-r1:7b",
        api_base="http://localhost:11434",
        api_key="",
        temperature=1,
        max_tokens=8192,
    )
):
    dff = DeffenceContractor()
    for file_name, raw_html in zip(df["file_name"], df["raw_html"]):
        # `html` holds the prediction (techniques + reasoning), not markup; the
        # name is kept because it is a notebook-level variable.
        html = dff(passage=raw_html)
        with open(out_dir / f"{file_name}.json", "w", encoding="utf-8") as f:
            json.dump({"techniques": html.techniques, "reasoning": html.reasoning}, f)

In [None]:
# Same run with the `ollama_chat/deepseek-r1:1.5b` model (local endpoint on port
# 11434); one JSON file is written per page. No `max_tokens` is set for this model.
out_dir = Path("src/data/generated/dummy/llm_r1_1b_raw")
# Create the output folder up front: otherwise a missing directory only surfaces
# as a FileNotFoundError after the first (slow) model call has completed.
out_dir.mkdir(parents=True, exist_ok=True)

with dspy.context(
    lm=dspy.LM(
        "ollama_chat/deepseek-r1:1.5b",
        api_base="http://localhost:11434",
        api_key="",
        temperature=0.6,
    )
):
    dff = DeffenceContractor()
    for file_name, raw_html in zip(df["file_name"], df["raw_html"]):
        # `html` holds the prediction (techniques + reasoning), not markup; the
        # name is kept because the next cell reads `html.reasoning`.
        html = dff(passage=raw_html)
        with open(out_dir / f"{file_name}.json", "w", encoding="utf-8") as f:
            json.dump({"techniques": html.techniques, "reasoning": html.reasoning}, f)

In [13]:
# Show the reasoning of the last prediction left in `html` by the preceding loop
# (i.e. for the final page processed); this cell depends on that leftover state.
# NOTE(review): the recorded output talks about the recipe itself rather than
# about defence techniques.
html.reasoning

'1. The recipe combines marinade and herb sauce, creating a layered flavor profile.\n2. Using fresh parsley and basil from both sources enhances the texture and taste.\n3. Roasting garlic adds crispy edges to the stuffed dish, making it visually appealing.\n4. Slicing chicken against the grain ensures even distribution of flavors.'

In [None]:
# Same run with the `ollama_chat/deepseek-r1:32b` model (local endpoint on port
# 11434); one JSON file is written per page.
out_dir = Path("src/data/generated/dummy/llm_r1_32b_raw")
# Create the output folder up front: otherwise a missing directory only surfaces
# as a FileNotFoundError after the first (slow) model call has completed.
out_dir.mkdir(parents=True, exist_ok=True)

with dspy.context(
    lm=dspy.LM(
        "ollama_chat/deepseek-r1:32b",
        api_base="http://localhost:11434",
        api_key="",
        temperature=0.6,
        max_tokens=8192,
    )
):
    dff = DeffenceContractor()
    for file_name, raw_html in zip(df["file_name"], df["raw_html"]):
        # `html` holds the prediction (techniques + reasoning), not markup; the
        # name is kept because it is a notebook-level variable.
        html = dff(passage=raw_html)
        with open(out_dir / f"{file_name}.json", "w", encoding="utf-8") as f:
            json.dump({"techniques": html.techniques, "reasoning": html.reasoning}, f)