<a href="https://colab.research.google.com/github/wyattowalsh/sitedumper/blob/main/SiteDumper_(Crawl4AI).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
#@title **Enhanced Crawl4AI Colab Notebook**
# This single-cell notebook script installs and sets up Crawl4AI with an advanced BFS crawler.
# It includes enhanced sitemap discovery (with fallback), robust link extraction, improved logging & progress,
# output cleaning for downstream LLM ingestion, conditional file splitting with automatic local download,
# optional Google Drive integration, reverse crawling options, subdomain allowance, and JS-heavy page support.

import sys, os, asyncio, json, re, collections, math, fnmatch, traceback, xml.etree.ElementTree as ET

# Install/update packages and Playwright, plus extras (loguru, rich, nest_asyncio, tenacity, pydantic, aiohttp)
try:
    !{sys.executable} -m pip install --upgrade --quiet "crawl4ai[all]"
    !{sys.executable} -m playwright install
    !{sys.executable} -m pip install --upgrade --quiet nest_asyncio loguru rich tenacity pydantic google-colab aiohttp
    print("✅ Installation step complete!")
except Exception as e:
    print("❌ Installation failed:", e)
    raise e

import nest_asyncio
nest_asyncio.apply()

# Optional: Mount Google Drive (set enable_google_drive=True to mount)
enable_google_drive = False  #@param {type:"boolean"}
drive_folder = "Crawl4AI_Results"  #@param {type:"string"}
if enable_google_drive:
    from google.colab import drive
    drive.mount('/content/drive')
    output_dir = os.path.join("/content/drive/My Drive", drive_folder)
    os.makedirs(output_dir, exist_ok=True)
    print("✅ Google Drive mounted. Output directory:", output_dir)
else:
    output_dir = os.getcwd()  # save locally in current directory

# New: Option to automatically download output files to the local machine (only applicable in Colab)
auto_download_files = True  #@param {type:"boolean"}

# =============================================================================
#                IMPORTS FROM CRAWL4AI & PYTHON LIBS
# =============================================================================
from loguru import logger
from rich.console import Console
from rich.progress import Progress, BarColumn, MofNCompleteColumn, TimeElapsedColumn
from pydantic import BaseModel, Field
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

from crawl4ai import (
    AsyncWebCrawler,
    CacheMode,
    BrowserConfig,
    CrawlerRunConfig
)
try:
    from crawl4ai.async_configs import (
        MemoryAdaptiveDispatcher,
        SemaphoreDispatcher,
        RateLimiter,
        CrawlerMonitor,
        DisplayMode
    )
except ImportError:
    MemoryAdaptiveDispatcher = SemaphoreDispatcher = RateLimiter = CrawlerMonitor = DisplayMode = None

from crawl4ai.extraction_strategy import (
    NoExtractionStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
    LLMExtractionStrategy,
    CosineStrategy
)
from crawl4ai.chunking_strategy import (
    RegexChunking,
    SlidingWindowChunking,
    OverlappingWindowChunking
)
from crawl4ai.content_filter_strategy import (
    LLMContentFilter,
    PruningContentFilter
)
from crawl4ai.markdown_generation_strategy import (
    DefaultMarkdownGenerator
)

console = Console()
logger.remove()  # Remove default logger
logger.add(sys.stdout, format="<green>{time:HH:mm:ss}</green> | <level>{message}</level>", level="INFO")
print("✅ Crawl4AI modules imported and logging set up!\n")

# =============================================================================
#                   COLAB FORM PARAMETERS
# =============================================================================
# 1) Basic Settings
url_to_crawl = "https://crawl4ai.com"  #@param {type:"string"}
use_headless = True  #@param {type:"boolean"}
verbose_browser = False  #@param {type:"boolean"}
browser_type_choice = "chromium"  #@param ["chromium","firefox","webkit"]

# Cache Mode Options
cache_mode_choice = "BYPASS"  #@param ["ENABLED", "DISABLED", "READ_ONLY", "WRITE_ONLY", "BYPASS"]

# Magic (Anti-Bot) Mode
magic_mode = True  #@param {type:"boolean"}

# 2) Extraction Strategy Settings
extraction_strategy_choice = "NoExtractionStrategy"  #@param ["NoExtractionStrategy","JsonCssExtractionStrategy","JsonXPathExtractionStrategy","LLMExtractionStrategy","CosineStrategy"]
llm_instruction_text = "Please summarize the page content."  #@param {type:"string"}
llm_provider = "openai/gpt-3.5-turbo"  #@param {type:"string"}
llm_api_token = ""  #@param {type:"string"}
llm_input_format = "markdown"  #@param ["markdown","html","fit_markdown"]

cosine_semantic_filter = "Key points"  #@param {type:"string"}
cosine_word_threshold = 10  #@param {type:"integer"}
cosine_sim_threshold = 0.3  #@param {type:"number"}
cosine_top_k = 3  #@param {type:"integer"}

jsoncss_base_selector = ".article"  #@param {type:"string"}
jsonxpath_base_selector = "//div[@class='article']"  #@param {type:"string"}

# 3) Additional Options (Screenshots, PDF, Lazy Loading, etc.)
want_pdf = False  #@param {type:"boolean"}
want_screenshot = False  #@param {type:"boolean"}
storage_state_path = ""  #@param {type:"string"}
# Use scan_full_page to support JS-heavy pages (e.g., infinite scroll)
scan_full_page = True  #@param {type:"boolean"}
scroll_delay = 0.3  #@param {type:"number"}
wait_for_images = False  #@param {type:"boolean"}
base_word_count_threshold = 5  #@param {type:"integer"}

# 4) Concurrency & Dispatcher Settings
dispatcher_type = "None"  #@param ["None","MemoryAdaptiveDispatcher","SemaphoreDispatcher"] {allow-input: true}
max_session_permit = 10  #@param {type:"integer"}
semaphore_count = 5  #@param {type:"integer"}
enable_rate_limit = False  #@param {type:"boolean"}
rate_base_delay_min = 1.0  #@param {type:"number"}
rate_base_delay_max = 2.0  #@param {type:"number"}
rate_max_retries = 2  #@param {type:"integer"}
display_monitor_mode = "NONE"  #@param ["NONE","SIMPLE","DETAILED"]

# 5) BFS / Multi-Page Settings
enable_bfs = True  #@param {type:"boolean"}
max_depth = 7  #@param {type:"integer"}
max_pages = 2500  #@param {type:"integer"}
# Option to limit crawling strictly to the domain or allow subdomains.
limit_to_domain = True  #@param {type:"boolean"}
allow_subdomains = True  #@param {type:"boolean"}  # If True, subdomains (e.g. docs.crawl4ai.com, crawl4ai.com/mkdocs) are allowed.
include_patterns_csv = ""  #@param {type:"string"}
exclude_patterns_csv = ""  #@param {type:"string"}
enable_sitemap_parsing = True  #@param {type:"boolean"}

# 6) Proxy Settings
enable_proxy = False  #@param {type:"boolean"}
proxy_url = ""  #@param {type:"string"}
proxy_username = ""  #@param {type:"string"}
proxy_password = ""  #@param {type:"string"}

# 7) Hooks & Session ID
session_id_value = ""  #@param {type:"string"}
enable_hooks = False  #@param {type:"boolean"}
hook_selector = ".content"  #@param {type:"string"}

# 8) Output Options & File Splitting
output_json = True  #@param {type:"boolean"}
output_md   = True  #@param {type:"boolean"}
output_html = True  #@param {type:"boolean"}
output_txt  = True  #@param {type:"boolean"}
max_output_size_mb = 10  #@param {type:"integer"}
output_base_filename = "crawl4ai_output"  #@param {type:"string"}

# 9) Execution
execute_crawl = True  #@param {type:"boolean"}

# 10) Reverse/Backwards Crawling Options:
# If disable_reverse_crawling is True, only URLs that start with the original seed URL will be followed.
disable_reverse_crawling = False  #@param {type:"boolean"}

# -----------------------------------------------------------------------------
#       HOOK FUNCTIONS (optional)
# -----------------------------------------------------------------------------
async def custom_before_goto(page, context, url, **kwargs):
    logger.info(f"[HOOK] Before navigating to {url}")
    return page

async def custom_after_goto(page, context, url, response, **kwargs):
    logger.info(f"[HOOK] After loading {url}")
    if hook_selector.strip():
        try:
            await page.wait_for_selector(hook_selector.strip(), timeout=3000)
            logger.info(f"[HOOK] Selector {hook_selector.strip()} found.")
        except Exception as e:
            logger.warning(f"[HOOK] Selector {hook_selector.strip()} not found: {e}")
    return page

# =============================================================================
#       UTILITY FUNCTIONS
# =============================================================================
import urllib.parse
def get_domain(url: str) -> str:
    try:
        parsed = urllib.parse.urlparse(url)
        return parsed.netloc.lower()
    except:
        return ""

def same_domain(url: str, domain: str, allow_subdomains: bool=False) -> bool:
    d = get_domain(url)
    if allow_subdomains:
        return d.endswith(domain)
    else:
        return d == domain

def remove_fragment(url: str) -> str:
    parsed = urllib.parse.urlparse(url)
    return urllib.parse.urlunparse((parsed.scheme, parsed.netloc, parsed.path, parsed.params, parsed.query, ""))

def match_patterns(url: str, patterns: list) -> bool:
    for pat in patterns:
        pat = pat.strip()
        if pat and fnmatch.fnmatch(url, pat):
            return True
    return False

# Enhanced sitemap parser with fallback URL
@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10), retry=retry_if_exception_type(Exception))
async def fetch_text(url: str) -> str:
    import aiohttp
    async with aiohttp.ClientSession() as sess:
        async with sess.get(url, timeout=15) as resp:
            return await resp.text()

async def parse_sitemap(url: str) -> list:
    results = []
    try:
        text = await fetch_text(url)
        root = ET.fromstring(text)
        ns = ""
        if root.tag.startswith("{"):
            ns = root.tag.split("}")[0].strip("{")
        if "sitemapindex" in root.tag.lower():
            for sitemap in root.findall(f".//{{{ns}}}sitemap"):
                loc = sitemap.find(f".//{{{ns}}}loc")
                if loc is not None and loc.text:
                    sub_url = loc.text.strip()
                    results.extend(await parse_sitemap(sub_url))
        elif "urlset" in root.tag.lower():
            for url_tag in root.findall(f".//{{{ns}}}url"):
                loc = url_tag.find(f".//{{{ns}}}loc")
                if loc is not None and loc.text:
                    results.append(loc.text.strip())
    except Exception as e:
        logger.warning(f"Failed to parse sitemap at {url}: {e}")
    return results

async def enhanced_sitemap_parser(domain: str) -> list:
    # Try default sitemap at /sitemap.xml, then fallback to /mkdocs/sitemap.xml
    default = domain.rstrip("/") + "/sitemap.xml"
    fallback = domain.rstrip("/") + "/mkdocs/sitemap.xml"
    links = await parse_sitemap(default)
    if not links:
        logger.info(f"Default sitemap not found. Trying fallback sitemap: {fallback}")
        links = await parse_sitemap(fallback)
    return links

# =============================================================================
#       BFS Crawler Implementation (No Redundant Route Tree)
# =============================================================================
class BFSCollector:
    def __init__(self, start_url: str, max_pages: int):
        self.start_url = remove_fragment(start_url)
        self.max_pages = max_pages
        self.visited = set([self.start_url])
        self.contents = {}  # url -> page text

    def is_visited(self, url: str) -> bool:
        return url in self.visited

    def mark_visited(self, url: str):
        self.visited.add(url)

    def set_result(self, url: str, success: bool, status_code: int, error: str = "", text: str = ""):
        if text:
            self.contents[url] = text

    def get_metadata(self, config_summary: dict) -> dict:
        return {
            "start_url": self.start_url,
            "total_visited": len(self.visited),
            "visited": list(self.visited),
            "config_summary": config_summary
        }

async def gather_links(result, domain: str, limit_to_dom: bool, includes: list, excludes: list, allow_subdomains: bool) -> list:
    if not result.success or not result.links:
        return []
    combined = result.links.get("internal", []) + result.links.get("external", [])
    out = []
    for linfo in combined:
        href = linfo.get("href", "").strip()
        if not href:
            continue
        href = remove_fragment(href)
        if limit_to_dom and not same_domain(href, domain, allow_subdomains):
            continue
        if includes and not match_patterns(href, includes):
            continue
        if excludes and match_patterns(href, excludes):
            continue
        out.append(href)
    return list(set(out))

@retry(stop=stop_after_attempt(2), wait=wait_exponential(min=1, max=5), retry=retry_if_exception_type(Exception))
async def single_page_crawl(url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig, session_id: str = None):
    return await crawler.arun(url, config=config, session_id=session_id)

async def do_bfs(start_url: str, domain: str, collector: BFSCollector,
                 crawler: AsyncWebCrawler, run_config: CrawlerRunConfig,
                 includes: list, excludes: list, limit_to_dom: bool, allow_subdomains: bool,
                 max_depth: int, max_pages: int, session_id: str, disable_reverse_crawling: bool):
    queue = collections.deque()
    queue.append((start_url, 0))
    with Progress(
        "[progress.description]{task.description}",
        BarColumn(),
        MofNCompleteColumn(),
        TimeElapsedColumn(),
        console=console
    ) as progress:
        task_id = progress.add_task("BFS Crawling", total=max_pages)
        crawled_count = 0
        while queue and crawled_count < max_pages:
            cur_url, depth = queue.popleft()
            progress.update(task_id, description=f"[cyan]Crawling:[/] {cur_url} (depth={depth})")
            crawled_count += 1
            # Dynamically update total based on number of visited URLs.
            progress.update(task_id, total=max(max_pages, len(collector.visited)))
            try:
                result = await single_page_crawl(cur_url, crawler, run_config, session_id=session_id)
                if not result.success:
                    logger.warning(f"Page crawl failed: {result.error_message}")
                    collector.set_result(cur_url, False, result.status_code, error=result.error_message)
                    continue
                text = result.markdown or ""
                # Clean output for downstream ingestion: remove stray tokens, fix formatting.
                text = re.sub(r"<https?://[^>]+>", lambda m: m.group(0).replace("<", "").replace(">", ""), text)
                collector.set_result(cur_url, True, result.status_code, text=text)
                if depth < max_depth and 200 <= result.status_code < 300:
                    new_links = await gather_links(result, domain, limit_to_dom, includes, excludes, allow_subdomains)
                    logger.info(f"Found {len(new_links)} new links from {cur_url}")
                    for child_url in new_links:
                        # Respect reverse crawling option: if disabled, only add if link starts with the seed URL.
                        if disable_reverse_crawling and not child_url.startswith(url_to_crawl):
                            continue
                        if not collector.is_visited(child_url):
                            collector.mark_visited(child_url)
                            queue.append((child_url, depth + 1))
            except Exception as e:
                logger.warning(f"Exception at {cur_url}: {e}")
                collector.set_result(cur_url, False, 0, error=str(e))
        progress.stop()

# =============================================================================
#       OUTPUT FORMATTING & FILE SPLITTING (with conditional part suffix)
# =============================================================================
def chunk_large_text(text: str, max_bytes: int) -> list:
    data = text.encode("utf-8")
    if len(data) <= max_bytes:
        return [text]
    parts = []
    start = 0
    while start < len(data):
        end = min(start + max_bytes, len(data))
        chunk = data[start:end].decode("utf-8", "ignore")
        parts.append(chunk)
        start = end
    return parts

def build_output_string(collector: BFSCollector, meta: dict, fmt: str) -> str:
    if fmt.lower() == "json":
        out_dict = {"metadata": meta, "results": []}
        for url, content in collector.contents.items():
            out_dict["results"].append({"url": url, "content": content})
        return json.dumps(out_dict, indent=2)
    elif fmt.lower() == "md":
        lines = ["# Crawl4AI Results", "\n## Metadata", "```json", json.dumps(meta, indent=2), "```"]
        for url, content in collector.contents.items():
            lines.append(f"\n## {url}\n```text\n{content}\n```")
        return "\n".join(lines)
    elif fmt.lower() == "html":
        lines = ["<html><head><meta charset='utf-8'><title>Crawl4AI Results</title></head><body>",
                 f"<h1>Metadata</h1><pre>{json.dumps(meta, indent=2)}</pre>"]
        for url, content in collector.contents.items():
            lines.append(f"<h2>{url}</h2><pre>{content}</pre>")
        lines.append("</body></html>")
        return "\n".join(lines)
    elif fmt.lower() == "txt":
        lines = ["=== Crawl4AI Results ===", "METADATA:", json.dumps(meta, indent=2)]
        for url, content in collector.contents.items():
            lines.append(f"\n--- {url} ---\n{content}")
        return "\n".join(lines)
    else:
        return ""

def write_output_files(collector: BFSCollector, meta: dict, base_filename: str, formats: list, max_size_mb: int) -> list:
    max_bytes = max_size_mb * 1024 * 1024
    output_paths = []
    for fmt in formats:
        if not fmt:
            continue
        logger.info(f"Writing output in {fmt} format...")
        big_str = build_output_string(collector, meta, fmt)
        parts = chunk_large_text(big_str, max_bytes)
        ext = f".{fmt.lower()}"
        if len(parts) == 1:
            filename = f"{base_filename}_{fmt.lower()}{ext}"
            filepath = os.path.join(output_dir, filename)
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(parts[0])
            logger.info(f"Wrote {filepath} ({len(parts[0].encode('utf-8'))} bytes)")
            output_paths.append(filepath)
        else:
            for i, chunk in enumerate(parts):
                partno = i + 1
                filename = f"{base_filename}_{fmt}_part{partno:03d}{ext}"
                filepath = os.path.join(output_dir, filename)
                with open(filepath, "w", encoding="utf-8") as f:
                    f.write(chunk)
                logger.info(f"Wrote {filepath} ({len(chunk.encode('utf-8'))} bytes)")
                output_paths.append(filepath)
    return output_paths

# =============================================================================
#       MAIN LOGIC: RUN THE CRAWL
# =============================================================================
async def run_colab_crawl():
    logger.info("========== BUILDING CONFIGS ==========")
    # 1) BrowserConfig
    bcfg = BrowserConfig(
        browser_type=browser_type_choice,
        headless=use_headless,
        verbose=verbose_browser
    )
    if enable_proxy and proxy_url.strip():
        pconf = {"server": proxy_url.strip()}
        if proxy_username.strip() and proxy_password.strip():
            pconf.update({"username": proxy_username.strip(), "password": proxy_password.strip()})
        bcfg.proxy_config = pconf
        logger.info(f"Using proxy: {pconf}")
    if storage_state_path.strip():
        bcfg.storage_state = storage_state_path.strip()
        logger.info(f"Using storage state from: {storage_state_path}")

    # 2) Translate cache mode
    cache_map = {
        "ENABLED": CacheMode.ENABLED,
        "DISABLED": CacheMode.DISABLED,
        "READ_ONLY": CacheMode.READ_ONLY,
        "WRITE_ONLY": CacheMode.WRITE_ONLY,
        "BYPASS": CacheMode.BYPASS
    }
    chosen_cache = cache_map.get(cache_mode_choice, CacheMode.BYPASS)

    # 3) Build run config
    run_config = CrawlerRunConfig(
        cache_mode=chosen_cache,
        magic=magic_mode,
        pdf=want_pdf,
        screenshot=want_screenshot,
        word_count_threshold=base_word_count_threshold,
        scan_full_page=scan_full_page,  # supports JS-heavy pages (e.g., infinite scroll)
        scroll_delay=scroll_delay,
        wait_for_images=wait_for_images
    )

    # 4) Set extraction strategy
    strat = None
    if extraction_strategy_choice == "NoExtractionStrategy":
        strat = NoExtractionStrategy()
        logger.info("Using NoExtractionStrategy.")
    elif extraction_strategy_choice == "JsonCssExtractionStrategy":
        schema = {"name": "JsonCssSample", "baseSelector": jsoncss_base_selector, "fields": [{"name": "title", "selector": "h2", "type": "text"}]}
        strat = JsonCssExtractionStrategy(schema=schema)
        logger.info(f"Using JsonCssExtractionStrategy with baseSelector={jsoncss_base_selector}")
    elif extraction_strategy_choice == "JsonXPathExtractionStrategy":
        schema = {"name": "JsonXPathSample", "baseSelector": jsonxpath_base_selector, "fields": [{"name": "title", "selector": ".//h2/text()", "type": "text"}]}
        from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
        strat = JsonXPathExtractionStrategy(schema=schema)
        logger.info(f"Using JsonXPathExtractionStrategy with baseSelector={jsonxpath_base_selector}")
    elif extraction_strategy_choice == "LLMExtractionStrategy":
        atoken = llm_api_token.strip() if llm_api_token.strip() else None
        strat = LLMExtractionStrategy(provider=llm_provider, api_token=atoken, instruction=llm_instruction_text, input_format=llm_input_format)
        logger.info(f"Using LLMExtractionStrategy: provider={llm_provider}, instruction={llm_instruction_text}")
    elif extraction_strategy_choice == "CosineStrategy":
        strat = CosineStrategy(semantic_filter=cosine_semantic_filter, word_count_threshold=cosine_word_threshold, sim_threshold=cosine_sim_threshold, top_k=cosine_top_k)
        logger.info(f"Using CosineStrategy: filter={cosine_semantic_filter}, top_k={cosine_top_k}")
    run_config.extraction_strategy = strat

    # 5) Concurrency / Dispatcher setup
    dispatcher = None
    if dispatcher_type != "None" and (MemoryAdaptiveDispatcher or SemaphoreDispatcher):
        rlim = None
        if enable_rate_limit and RateLimiter:
            rlim = RateLimiter(base_delay=(rate_base_delay_min, rate_base_delay_max), max_delay=30.0, max_retries=rate_max_retries)
        mon = None
        if CrawlerMonitor and display_monitor_mode != "NONE":
            dmode = DisplayMode.DETAILED if display_monitor_mode == "DETAILED" else DisplayMode.SIMPLE
            mon = CrawlerMonitor(display_mode=dmode)
        if dispatcher_type == "MemoryAdaptiveDispatcher" and MemoryAdaptiveDispatcher:
            dispatcher = MemoryAdaptiveDispatcher(memory_threshold_percent=70.0, max_session_permit=max_session_permit, rate_limiter=rlim, monitor=mon)
            logger.info(f"Using MemoryAdaptiveDispatcher with max_session_permit={max_session_permit}")
        elif dispatcher_type == "SemaphoreDispatcher" and SemaphoreDispatcher:
            dispatcher = SemaphoreDispatcher(semaphore_count=semaphore_count, rate_limiter=rlim, monitor=mon)
            logger.info(f"Using SemaphoreDispatcher with concurrency={semaphore_count}")

    # 6) Process BFS options and prepare include/exclude lists.
    inc_list = [x.strip() for x in include_patterns_csv.split(",") if x.strip()]
    exc_list = [x.strip() for x in exclude_patterns_csv.split(",") if x.strip()]
    domain = get_domain(url_to_crawl)

    # Prepare config summary for metadata.
    config_summary = {
        "start_url": url_to_crawl,
        "enable_bfs": enable_bfs,
        "max_depth": max_depth,
        "max_pages": max_pages,
        "limit_to_domain": limit_to_domain,
        "allow_subdomains": allow_subdomains,
        "disable_reverse_crawling": disable_reverse_crawling,
        "include_patterns": inc_list,
        "exclude_patterns": exc_list,
        "enable_sitemap": enable_sitemap_parsing,
        "cache_mode": cache_mode_choice,
        "magic_mode": magic_mode,
        "pdf": want_pdf,
        "screenshot": want_screenshot,
        "browser_type": browser_type_choice,
        "headless": use_headless
    }

    chosen_formats = []
    if output_json:
        chosen_formats.append("json")
    if output_md:
        chosen_formats.append("md")
    if output_html:
        chosen_formats.append("html")
    if output_txt:
        chosen_formats.append("txt")
    if not chosen_formats:
        logger.warning("No output formats selected! Defaulting to JSON and MD.")
        chosen_formats = ["json", "md"]

    # =============================================================================
    #       RUN THE CRAWL
    # =============================================================================
    try:
        async with AsyncWebCrawler(config=bcfg) as crawler:
            if enable_hooks:
                crawler.crawler_strategy.set_hook("before_goto", custom_before_goto)
                crawler.crawler_strategy.set_hook("after_goto", custom_after_goto)
            if enable_bfs:
                collector = BFSCollector(url_to_crawl, max_pages)
                # Enhanced sitemap parsing – inject discovered sitemap URLs into the crawl.
                if enable_sitemap_parsing:
                    sitemap_links = await enhanced_sitemap_parser(url_to_crawl)
                    if sitemap_links:
                        logger.info(f"Sitemap discovered {len(sitemap_links)} links.")
                        for link in sitemap_links:
                            if limit_to_domain and not same_domain(link, domain, allow_subdomains):
                                continue
                            if inc_list and not match_patterns(link, inc_list):
                                continue
                            if exc_list and match_patterns(link, exc_list):
                                continue
                            if disable_reverse_crawling and not link.startswith(url_to_crawl):
                                continue
                            if not collector.is_visited(link):
                                collector.mark_visited(link)
                                # Inject sitemap URL into BFS queue.
                                await do_bfs(link, domain, collector, crawler, run_config, inc_list, exc_list, limit_to_domain, allow_subdomains, max_depth, max_pages, session_id_value.strip() or None, disable_reverse_crawling)
                # Run BFS starting from the seed URL.
                await do_bfs(
                    start_url=collector.start_url,
                    domain=domain,
                    collector=collector,
                    crawler=crawler,
                    run_config=run_config,
                    includes=inc_list,
                    excludes=exc_list,
                    limit_to_dom=limit_to_domain,
                    allow_subdomains=allow_subdomains,
                    max_depth=max_depth,
                    max_pages=max_pages,
                    session_id=session_id_value.strip() or None,
                    disable_reverse_crawling=disable_reverse_crawling
                )
                meta = collector.get_metadata(config_summary)
                output_file_paths = write_output_files(collector, meta, output_base_filename, chosen_formats, max_output_size_mb)
            else:
                # Single page crawl mode
                logger.info("Running in single-page crawl mode.")
                collector = BFSCollector(url_to_crawl, 1)
                try:
                    result = await crawler.arun(url_to_crawl, run_config, session_id=session_id_value.strip() or None)
                    if not result.success:
                        logger.error(f"Crawl failed: {result.error_message}")
                        collector.set_result(url_to_crawl, False, result.status_code, error=result.error_message)
                    else:
                        text = result.markdown or ""
                        text = re.sub(r"<https?://[^>]+>", lambda m: m.group(0).replace("<", "").replace(">", ""), text)
                        collector.set_result(url_to_crawl, True, result.status_code, text=text)
                except Exception as e:
                    logger.error(f"Single-page crawl exception: {e}")
                    collector.set_result(url_to_crawl, False, 0, error=str(e))
                meta = collector.get_metadata(config_summary)
                output_file_paths = write_output_files(collector, meta, output_base_filename, chosen_formats, max_output_size_mb)
    except Exception as ex:
        logger.error(f"Top-level crawl exception: {ex}")
        traceback.print_exc()
        output_file_paths = []

    # New: Automatically download the output files to local machine if running in Colab.
    try:
        from google.colab import files as gfiles
        if auto_download_files and output_file_paths:
            logger.info("Initiating automatic download of output files...")
            for path in output_file_paths:
                logger.info(f"Downloading {path}...")
                gfiles.download(path)
    except ImportError:
        logger.info("google.colab not available. Skipping automatic file download.")

if execute_crawl:
    asyncio.run(run_colab_crawl())
else:
    logger.info("Set 'execute_crawl=True' above and re-run to start crawling.")


╔══════════════════════════════════════════════════════╗
║ Host system is missing dependencies to run browsers. ║
║ Missing libraries:                                   ║
║     libgtk-4.so.1                                    ║
║     libgraphene-1.0.so.0                             ║
║     libwoff2dec.so.1.0.2                             ║
║     libgstgl-1.0.so.0                                ║
║     libgstcodecparsers-1.0.so.0                      ║
║     libavif.so.13                                    ║
║     libharfbuzz-icu.so.0                             ║
║     libenchant-2.so.2                                ║
║     libsecret-1.so.0                                 ║
║     libhyphen.so.0                                   ║
║     libmanette-0.2.so.0                              ║
╚══════════════════════════════════════════════════════╝
    at validateDependenciesLinux (/usr/local/lib/python3.11/dist-packages/playwright/driver/package/lib/server/registry/dependencies.js:216:9)
  

Output()

20:18:51 | Found 36 new links from https://docs.crawl4ai.com/
20:18:59 | Found 35 new links from https://docs.crawl4ai.com/core/fit-markdown/
20:19:09 | Found 35 new links from https://docs.crawl4ai.com/core/crawler-result/
20:19:15 | Found 35 new links from https://docs.crawl4ai.com/advanced/ssl-certificate/
20:19:26 | Found 35 new links from https://docs.crawl4ai.com/core/browser-crawler-config/
20:19:37 | Found 35 new links from https://docs.crawl4ai.com/advanced/advanced-features/
20:19:46 | Found 35 new links from https://docs.crawl4ai.com/core/page-interaction/
20:19:58 | Found 35 new links from https://docs.crawl4ai.com/core/quickstart/
20:20:02 | Found 35 new links from https://docs.crawl4ai.com/advanced/crawl-dispatcher/
20:20:08 | Found 35 new links from https://docs.crawl4ai.com/advanced/file-downloading/
20:20:22 | Found 35 new links from https://docs.crawl4ai.com/extraction/no-llm-strategies/
20:20:29 | Found 35 new links from https://docs.crawl4ai.com/advanced/hooks-auth/

Output()

20:24:09 | Found 36 new links from https://docs.crawl4ai.com/blog/articles/dockerize_hooks/


Output()

20:24:16 | Found 36 new links from https://docs.crawl4ai.com/blog/releases/v0.4.3b1/


Output()

20:24:22 | Found 38 new links from https://crawl4ai.com
20:24:28 | Found 37 new links from https://crawl4ai.com/


20:25:37 | Writing output in json format...
20:25:37 | Wrote /content/crawl4ai_output_json.json (563061 bytes)
20:25:37 | Writing output in md format...
20:25:37 | Wrote /content/crawl4ai_output_md.md (544481 bytes)
20:25:37 | Writing output in html format...
20:25:37 | Wrote /content/crawl4ai_output_html.html (544787 bytes)
20:25:37 | Writing output in txt format...
20:25:37 | Wrote /content/crawl4ai_output_txt.txt (543940 bytes)
20:25:37 | Initiating automatic download of output files...
20:25:37 | Downloading /content/crawl4ai_output_json.json...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

20:25:37 | Downloading /content/crawl4ai_output_md.md...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

20:25:37 | Downloading /content/crawl4ai_output_html.html...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

20:25:37 | Downloading /content/crawl4ai_output_txt.txt...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>