In [33]:
import os
from urllib.parse import urlencode

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

In [34]:
# Host and port of the server that renders the pages; "http://" is prepended
# when request URLs are built.
# NOTE(review): 0.0.0.0 is normally a bind address; connecting to it is
# platform-dependent — consider "127.0.0.1:8000" or "localhost:8000". Confirm.
SERVER_URL = "0.0.0.0:8000"
# Directory whose sub-folders contain the source files to scrape.
TRUE_PATH = "src/data/true"
# Root directory for scraped output, laid out as <folder>/<method>/<file>.
GENERATED_PATH = "src/data/generated"
# Variants requested for every file. "true" is requested without a `method`
# query parameter; every other entry is sent as `method=<name>`.
METHODS = ["prompt_injection", "random_elements", "iframe", "obfuscation", "true"]

In [35]:
# Sub-folders of TRUE_PATH; each one is scraped in turn. Sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the processing order differ between runs and machines.
folders = sorted(
    f for f in os.listdir(TRUE_PATH) if os.path.isdir(os.path.join(TRUE_PATH, f))
)

# Create the output root if it is missing. exist_ok=True replaces the
# check-then-create pattern and keeps the cell safe to re-run.
os.makedirs(GENERATED_PATH, exist_ok=True)

In [46]:
async def save_content(content: str, save_folder_path: str) -> None:
    """Parse ``content`` as HTML and write the re-serialized document to disk.

    Args:
        content: HTML markup to save.
        save_folder_path: Path of the output *file* (despite the name). It is
            opened in write mode, so any existing content is overwritten.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent characters in the page and would raise or produce
    # locale-dependent files.
    with open(save_folder_path, "w", encoding="utf-8") as f:
        f.write(str(soup))


async def scrape_page(
    page: object, file_path: str, method: str, save_folder_path: str
) -> None:
    """Load one page variant from the server and save its rendered HTML.

    Args:
        page: Browser page used for navigation; it must provide awaitable
            ``goto`` and ``content`` methods.
        file_path: Value sent as the ``file_path`` query parameter.
        method: Variant to request. ``"true"`` requests the page without a
            ``method`` parameter; any other value is sent as ``method``.
        save_folder_path: Path of the file the HTML is written to.

    Errors raised while navigating or saving are printed and swallowed so that
    one failing page does not abort the caller's loop.
    """
    params = {"file_path": file_path}
    if method != "true":
        params["method"] = method
    # Percent-encode the values so names containing spaces, "&", "#", "?" or
    # "+" cannot corrupt the query string. "/" is left readable; it is legal
    # inside a query component.
    query = urlencode(params, safe="/")
    url = f"http://{SERVER_URL}?{query}"

    try:
        await page.goto(url, timeout=60000, wait_until="networkidle")
        content = await page.content()
        await save_content(content, save_folder_path)
    except Exception as e:
        print(f"Error scraping {url}: {e}")


async def main() -> None:
    """Scrape every file under each folder in ``folders`` for every method.

    For each file found directly inside ``TRUE_PATH/<folder>`` and each entry
    of ``METHODS``, the page is fetched through a single headless Chromium
    page and written to ``GENERATED_PATH/<folder>/<method>/<file>``. Output
    directories are created on demand.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            for folder in folders:
                source_dir = os.path.join(TRUE_PATH, folder)
                # Sorted for a reproducible processing order; os.listdir()
                # makes no ordering guarantee.
                files = sorted(
                    f
                    for f in os.listdir(source_dir)
                    if os.path.isfile(os.path.join(source_dir, f))
                )
                for file in files:
                    file_path = os.path.join(folder, file)
                    for method in METHODS:
                        output_dir = os.path.join(GENERATED_PATH, folder, method)
                        os.makedirs(output_dir, exist_ok=True)
                        output_file = os.path.join(output_dir, file)
                        await scrape_page(page, file_path, method, output_file)
        finally:
            # Close the browser even if listing or directory creation raises.
            await browser.close()
    print("Scraping completed.")


# Top-level await: this runs as a notebook cell, where the kernel is expected
# to execute awaitables directly (hence the noqa for the await-outside-async
# lint rule). In a plain script this would need asyncio.run(main()).
await main()  # noqa: PLE1142

Successfully scraped: http://0.0.0.0:8000?file_path=dummy/recipe_1.html&method=prompt_injection
Generated src/data/generated/dummy/prompt_injection/recipe_1.html
Successfully scraped: http://0.0.0.0:8000?file_path=dummy/recipe_1.html&method=random_elements
Generated src/data/generated/dummy/random_elements/recipe_1.html
Successfully scraped: http://0.0.0.0:8000?file_path=dummy/recipe_1.html&method=iframe
Generated src/data/generated/dummy/iframe/recipe_1.html
Successfully scraped: http://0.0.0.0:8000?file_path=dummy/recipe_1.html&method=obfuscation
Generated src/data/generated/dummy/obfuscation/recipe_1.html
Successfully scraped: http://0.0.0.0:8000?file_path=dummy/recipe_1.html
Generated src/data/generated/dummy/true/recipe_1.html
Successfully scraped: http://0.0.0.0:8000?file_path=dummy/recipe_5.html&method=prompt_injection
Generated src/data/generated/dummy/prompt_injection/recipe_5.html
Successfully scraped: http://0.0.0.0:8000?file_path=dummy/recipe_5.html&method=random_elements
G