In [12]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urldefrag
from collections import deque
from pathlib import Path

In [13]:
# Root URL of the documentation site to crawl. Only links that start with this
# prefix are followed, and the prefix is stripped when deriving file names.
BASE = "https://docs.manim.community/en/stable/"
# Directory (relative to the current working directory) where fetched pages are written.
OUTPUT_DIR = "page_content"

In [14]:
def link_to_file_name(link, base=None):
    """Convert a documentation URL into a flat file name.

    The ``base`` prefix is removed and every remaining ``/`` is replaced by
    ``.``, so ``<base>installation/uv.html`` becomes ``installation.uv.html``.

    Args:
        link: Absolute URL of a page located under ``base``.
        base: URL prefix to strip. Defaults to the module-level ``BASE``.

    Returns:
        The flattened file name. This is an empty string when ``link`` equals
        ``base`` (the site root), which callers treat as "nothing to save".

    Raises:
        ValueError: If ``link`` does not start with ``base``.
    """
    if base is None:
        base = BASE
    # Derive the offset from the prefix itself instead of a hard-coded length,
    # so the function keeps working when the base URL changes.
    if not link.startswith(base):
        raise ValueError(f"{link!r} is not located under {base!r}")
    return link[len(base):].replace('/', '.')

In [15]:
def is_html_doc(link: str) -> bool:
    """Decide whether ``link`` should be crawled as a documentation page.

    A link qualifies only when all of the following hold:
      * it starts with ``BASE``;
      * it does not end with one of the listed asset/data file extensions;
      * it does not contain any of the excluded substrings (source listings,
        fragment markers, index and search pages).
    """
    skipped_suffixes = (".png", ".jpg", ".svg", ".gif", ".pdf", ".json", ".js", ".css")
    skipped_fragments = ("/_sources/", "#", "genindex", "search", "py-modindex")

    inside_docs = link.startswith(BASE)
    is_asset = link.endswith(skipped_suffixes)
    is_excluded = any(fragment in link for fragment in skipped_fragments)
    return inside_docs and not is_asset and not is_excluded

In [16]:
# Breadth-first crawl starting at BASE: fetch each page, store its raw bytes
# under OUTPUT_DIR, then enqueue every internal documentation link it contains.
session = requests.Session()
visited = set()       # URLs that have already been requested
enqueued = {BASE}     # URLs ever placed on the queue, so each is queued at most once
queue = deque([BASE])

# open() does not create missing directories, so make sure the target exists
# before the first page is written.
output_dir = Path(OUTPUT_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

while queue:
    url = queue.popleft()

    # Defensive guard; `enqueued` already prevents duplicates from being queued.
    if url in visited:
        continue

    visited.add(url)
    print(f"[visited] {url}")

    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Network errors and HTTP error statuses skip this page only.
        print(f"[skip] {url} — {exc}")
        continue

    # Add content
    file_name = link_to_file_name(url)
    if file_name:
        with open(output_dir / file_name, "wb") as f:
            # Put url at the front
            f.write((url + "\n").encode('utf-8'))
            f.write(response.content)
    else:
        # The root URL maps to an empty file name, so there is nothing to save.
        print("[skip] root")

    # Travel through internal links
    for a in BeautifulSoup(response.text, "html.parser").find_all("a", href=True):
        # Resolve relative hrefs against the current page and drop any #fragment.
        link, _ = urldefrag(urljoin(url, a["href"]))
        if link not in enqueued and is_html_doc(link):
            enqueued.add(link)
            queue.append(link)

[visited] https://docs.manim.community/en/stable/
[skip] root
[visited] https://docs.manim.community/en/stable/examples.html
[visited] https://docs.manim.community/en/stable/installation.html
[visited] https://docs.manim.community/en/stable/installation/uv.html
[visited] https://docs.manim.community/en/stable/installation/conda.html
[visited] https://docs.manim.community/en/stable/installation/docker.html
[visited] https://docs.manim.community/en/stable/installation/jupyter.html
[visited] https://docs.manim.community/en/stable/tutorials_guides.html
[visited] https://docs.manim.community/en/stable/tutorials/index.html
[visited] https://docs.manim.community/en/stable/tutorials/quickstart.html
[visited] https://docs.manim.community/en/stable/tutorials/output_and_config.html
[visited] https://docs.manim.community/en/stable/tutorials/building_blocks.html
[visited] https://docs.manim.community/en/stable/guides/index.html
[visited] https://docs.manim.community/en/stable/guides/configuration.h