In [None]:
import os
import fitz  # PyMuPDF
import re
from collections import defaultdict


def load_pdf(path):
    """Open the document stored at ``path`` with PyMuPDF and return it.

    Args:
        path: Location of the file, passed unchanged to ``fitz.open``.

    Returns:
        Whatever ``fitz.open(path)`` returns (the opened document handle).
    """
    document = fitz.open(path)
    return document


def get_most_likely_title(page):
    """Pick the text span on ``page`` that most looks like a recipe title.

    Every span in ``page.get_text("dict")["blocks"]`` is examined. A span is
    a candidate when its stripped text is non-empty, shorter than 50
    characters, contains no digit, mentions no measurement unit, is not one
    of the known section labels, and does not end with a full stop. Among
    the candidates the one with the largest font size wins; on equal sizes
    the span encountered first is kept.

    Args:
        page: Object whose ``get_text("dict")`` returns a mapping with a
            ``"blocks"`` list (blocks without a ``"lines"`` key are skipped).

    Returns:
        The stripped text of the winning span, or ``None`` when no span
        qualifies.
    """
    unit_word = re.compile(r"\b(grams|ml|cup|tablespoon|teaspoon|oz)\b")
    section_label = re.compile(
        r"(?i)^(ingredients|method|directions|instructions|the cookery)$"
    )

    def looks_like_title(text):
        # Guard clauses, cheapest checks first.
        if not text or len(text) >= 50:
            return False
        if any(ch.isdigit() for ch in text):
            return False
        lowered = text.lower()
        if unit_word.search(lowered) or section_label.match(lowered):
            return False
        return not text.endswith(".")

    candidates = []
    for block in page.get_text("dict")["blocks"]:
        if "lines" not in block:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                stripped = span["text"].strip()
                font_size = span["size"]
                if looks_like_title(stripped):
                    candidates.append((stripped, font_size))

    if not candidates:
        return None
    # max() returns the first maximal element, so ties resolve to the
    # earliest span, exactly like a stable descending sort would.
    best_text, _ = max(candidates, key=lambda pair: pair[1])
    return best_text


def detect_headings(doc):
    """Collect the first page on which each distinct recipe title appears.

    Each page of ``doc`` is passed to ``get_most_likely_title``. A title is
    recorded only the first time it is seen, so continuation pages that
    repeat a heading do not start a new recipe.

    Args:
        doc: Iterable of pages, in page order.

    Returns:
        A list of ``(title, page_index)`` tuples in page order, where
        ``page_index`` is the 0-based position of the page in ``doc``.
    """
    headings = []
    # Track seen titles in a set so the duplicate check is O(1) per page
    # instead of rebuilding a list of every earlier title each time.
    seen_titles = set()
    for page_index, page in enumerate(doc):
        title = get_most_likely_title(page)
        if title and title not in seen_titles:
            seen_titles.add(title)
            headings.append((title, page_index))
    return headings


def split_recipes(doc, headings, out_dir):
    """Write one PDF per recipe into ``out_dir``.

    A recipe spans from its heading's page up to (but not including) the
    page of the next heading; the last recipe runs to the end of ``doc``.

    Args:
        doc: Source document; must support ``len()`` and be accepted by
            ``insert_pdf`` of a document created with ``fitz.open()``.
        headings: List of ``(title, start_page)`` tuples in page order.
        out_dir: Directory for the output files; created if missing.

    Returns:
        A summary string naming the number of recipes and ``out_dir``.
    """
    os.makedirs(out_dir, exist_ok=True)
    for i, (title, start_page) in enumerate(headings):
        end_page = headings[i + 1][1] if i + 1 < len(headings) else len(doc)

        # File name: keep alphanumerics, spaces and underscores, turn every
        # other character into "_", then replace the remaining spaces.
        safe_title = (
            "".join(c if c.isalnum() or c in (" ", "_") else "_" for c in title)
            .strip()
            .replace(" ", "_")
        )
        out_path = os.path.join(out_dir, f"{safe_title}.pdf")

        new_doc = fitz.open()
        try:
            # to_page is inclusive, hence end_page - 1.
            new_doc.insert_pdf(doc, from_page=start_page, to_page=end_page - 1)
            new_doc.save(out_path)
        finally:
            # Release the per-recipe document even if insert/save fails;
            # previously one open document leaked per recipe.
            new_doc.close()
    return f"Split {len(headings)} recipes to: {out_dir}"


def generate_toc(headings, out_path, pdf_dir="SplitRecipes"):
    """Write a Markdown table of contents that links to the split PDFs.

    The link target for each title is built with the same sanitisation
    that ``split_recipes`` uses for its output file names (non-alphanumeric
    characters other than space/underscore become ``_``, then spaces become
    ``_``), so links stay valid for titles containing punctuation.

    Args:
        headings: List of ``(title, page)`` tuples; only the title is used.
        out_path: Path of the Markdown file to write (UTF-8).
        pdf_dir: Directory prefix used in the link targets. Defaults to
            ``"SplitRecipes"``.

    Returns:
        A confirmation string naming ``out_path``.
    """
    with open(out_path, "w", encoding="utf-8") as f:
        f.write("## Table of Contents\n\n")
        for i, (title, _) in enumerate(headings, 1):
            # Must mirror the file naming in split_recipes.
            safe_title = (
                "".join(c if c.isalnum() or c in (" ", "_") else "_" for c in title)
                .strip()
                .replace(" ", "_")
            )
            f.write(f"{i}. [{title}]({pdf_dir}/{safe_title}.pdf)\n")
    return f"TOC written to: {out_path}"


def build_ingredient_index(doc, headings):
    """Map every notable word in a recipe's text to the recipes using it.

    The text of all pages belonging to a recipe (from its start page up to
    the next heading's page, or the end of ``doc``) is concatenated and
    scanned for runs of two or more ASCII letters. Each word is lowercased;
    words of two letters or fewer and a small set of unit/filler words are
    dropped.

    Args:
        doc: Indexable sequence of pages supporting ``len()``; each page
            provides ``get_text("text")``.
        headings: List of ``(title, start_page)`` tuples in page order.

    Returns:
        A ``defaultdict(set)`` mapping lowercase word -> set of titles.
    """
    ignored_words = {
        "cup",
        "cups",
        "tsp",
        "tbsp",
        "grams",
        "ml",
        "oz",
        "and",
        "with",
        "for",
        "the",
    }
    word_pattern = re.compile(r"\b[a-zA-Z][a-zA-Z]+\b")

    index = defaultdict(set)
    for position, (title, first_page) in enumerate(headings):
        has_successor = position + 1 < len(headings)
        stop_page = headings[position + 1][1] if has_successor else len(doc)

        recipe_text = "".join(
            doc[page_no].get_text("text") for page_no in range(first_page, stop_page)
        )

        for token in word_pattern.findall(recipe_text):
            lowered = token.lower()
            if lowered in ignored_words or len(lowered) <= 2:
                continue
            index[lowered].add(title)
    return index


def save_index(index, out_path):
    """Write the ingredient index to ``out_path`` as a Markdown bullet list.

    Ingredients appear in sorted order, one bullet each, followed by the
    sorted, comma-separated titles that reference them.

    Args:
        index: Mapping of ingredient -> iterable of recipe titles.
        out_path: Destination file, written as UTF-8.

    Returns:
        A confirmation string naming ``out_path``.
    """
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write("## Ingredient Index\n\n")
        handle.writelines(
            f"- **{name}** → {', '.join(sorted(index[name]))}\n"
            for name in sorted(index)
        )
    return f"Ingredient index saved to: {out_path}"


def export_to_html(doc, headings, index, html_dir):
    """Export the cookbook as a small set of static HTML pages.

    Three kinds of files are written into ``html_dir`` (created if missing):

    * ``index.html`` - a list of links, one per recipe;
    * one ``<sanitised title>.html`` per recipe, holding the plain text of
      each of its pages inside ``<pre>`` blocks;
    * ``ingredients.html`` - every ingredient with the recipes that use it.

    All text taken from the document (titles, page text, ingredient names)
    is HTML-escaped before being written. Recipe file names keep only
    alphanumerics and underscores, the same rule ``split_recipes`` applies
    to its PDF names, so characters such as ``/``, ``?`` or ``#`` in a
    title can neither break the output path nor the generated link.

    Args:
        doc: Indexable sequence of pages supporting ``len()``; each page
            provides ``get_text("text")``.
        headings: List of ``(title, start_page)`` tuples in page order.
        index: Mapping of ingredient -> iterable of recipe titles.
        html_dir: Output directory.

    Returns:
        A confirmation string naming ``html_dir``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def page_filename(title):
        # Keep alphanumerics, spaces and underscores; everything else
        # becomes "_", then spaces become "_".
        safe = (
            "".join(c if c.isalnum() or c in (" ", "_") else "_" for c in title)
            .strip()
            .replace(" ", "_")
        )
        return f"{safe}.html"

    # Files are written as UTF-8, so declare it; otherwise browsers may
    # guess a legacy encoding and garble non-ASCII recipe text.
    charset_tag = '<meta charset="utf-8">\n'

    os.makedirs(html_dir, exist_ok=True)

    # TOC page
    toc_path = os.path.join(html_dir, "index.html")
    with open(toc_path, "w", encoding="utf-8") as f:
        f.write(charset_tag)
        f.write("<h1>Recipe Index</h1>\n<ul>\n")
        for title, _ in headings:
            filename = page_filename(title)
            f.write(f'<li><a href="{filename}">{escape(title)}</a></li>\n')
        f.write("</ul>\n")

    # Recipe pages
    for i, (title, start_page) in enumerate(headings):
        end_page = headings[i + 1][1] if i + 1 < len(headings) else len(doc)
        out_path = os.path.join(html_dir, page_filename(title))

        with open(out_path, "w", encoding="utf-8") as f:
            f.write(charset_tag)
            f.write(f"<h1>{escape(title)}</h1>\n")
            for p in range(start_page, end_page):
                page_text = escape(doc[p].get_text("text"))
                f.write("<pre>\n" + page_text + "\n</pre>\n")

    # Ingredient index page
    index_path = os.path.join(html_dir, "ingredients.html")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(charset_tag)
        f.write("<h1>Ingredient Index</h1>\n<ul>\n")
        for ingredient in sorted(index):
            # Sort the titles so the output is deterministic even when the
            # index values are sets.
            refs = ", ".join(escape(ref) for ref in sorted(index[ingredient]))
            f.write(f"<li><strong>{escape(ingredient)}</strong>: {refs}</li>\n")
        f.write("</ul>\n")

    return f"HTML cookbook created at: {html_dir}"