<a href="https://colab.research.google.com/github/wyattowalsh/sitedumper/blob/main/SiteDumper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


In [17]:
"""
SiteDumper Notebook - Advanced Intelligent Website Content Harvester
Version: 2025.2.7
Last Updated: 2025-02-07 12:00:00 UTC
Author: wyattowalsh
"""
from datetime import datetime, timezone
#@title 📅 Current Time and User {display-mode: "form"}
USER_LOGIN = "w4w" #@param {type:"string"}
CURRENT_UTC = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") # Automatically set current UTC time


!apt-get update && apt-get install -y \
        libgtk-4-1 libgraphene-1.0-0 libwoff1 \
        libgstreamer-gl1.0-0 libgstreamer-plugins-base1.0-0 \
        libavif13 libharfbuzz-icu0 libenchant-2-2 \
        libsecret-1-0 libhyphen0 libmanette-0.2-0

%pip install -q \
    playwright \
    camoufox[geoip] \
    docling \
    docling-core \
    pydantic[email] \
    rich \
    loguru \
    nest-asyncio \
    uvloop \
    tqdm \
    beautifulsoup4 \
    lxml \
    python-docx \
    xmltodict \
    requests \
    pandas \
    aiohttp \
    asyncio-throttle \
    pyppeteer \
    aiofiles \
    python-magic \
    fastapi \
    uvicorn \
    colorama \
    yaspin \
    halo \
    psutil \
    xmltodict

!playwright install chromium
!python3 -m camoufox fetch

#@title 📚 Import Required Libraries {display-mode: "code"}
import os
import psutil
import sys
import glob
import fnmatch
from pathlib import Path
import asyncio
import json
import gzip
import random
from datetime import datetime, timezone, timedelta
from typing import Dict, List, Optional, Set, Union, Any, Generator, Literal
from urllib.parse import urlparse, urljoin
from functools import partial, wraps
from collections import deque, Counter, defaultdict
import xml.etree.ElementTree as ET
import requests
import xmltodict
from urllib.robotparser import RobotFileParser
import pandas as pd
import aiohttp
from io import BytesIO, StringIO
import hashlib
import time
import signal
import statistics
import re
from bs4 import BeautifulSoup
import aiofiles
import magic
from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
import uvicorn
from asyncio_throttle import Throttler
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
from camoufox import AsyncCamoufox, AsyncNewBrowser
from pydantic import BaseModel, EmailStr, HttpUrl, validator, Field
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn, TimeRemainingColumn, MofNCompleteColumn
from rich.panel import Panel
from rich.style import Style
from rich.text import Text
from rich.layout import Layout
from rich.live import Live
from rich.table import Table
from rich.console import Group
from rich.align import Align
from loguru import logger
import nest_asyncio
import uvloop
from yaspin import yaspin
from halo import Halo
from colorama import init, Fore, Back, Style as ColoramaStyle
from docling_core.types.doc import DoclingDocument, ProvenanceItem, DocItemLabel, BoundingBox, CoordOrigin


# Initialize colorama for cross-platform colored output
init(autoreset=True)

#@title 🔧 Configuration Class Definition {display-mode: "code"}

# Configuration class using Pydantic
class Config(BaseModel):
    """Enhanced configuration with validation and descriptions."""
    # Basic settings
    user_login: str = Field(..., description="User login name.")
    current_utc: str = Field(..., description="Current UTC time.")
    output_dir: str = Field("sitedumper_output", description="Base directory for all outputs.")

    # Website settings
    website_url: HttpUrl = Field(..., description="URL of the website to crawl.")
    max_pages: int = Field(1000, ge=1, description="Maximum number of pages to crawl.")
    max_depth: int = Field(3, ge=0, description="Maximum crawl depth.")
    stay_on_domain: bool = Field(True, description="Restrict crawling to the initial domain.")

    # Compliance settings
    obey_robots_txt: bool = Field(True, description="Follow rules in robots.txt.")
    process_sitemaps: bool = Field(True, description="Process sitemaps to discover URLs.")
    recursive_sitemap: bool = Field(True, description="Recursively process nested sitemaps.")
    sitemap_only: bool = Field(False, description="Only crawl URLs found in sitemaps.")

    # Path patterns
    include_patterns: List[str] = Field(default=[], description="URL patterns to include.")
    exclude_patterns: List[str] = Field(default=[], description="URL patterns to exclude.")

    # Browser settings
    stealth_mode: Literal["Basic", "Moderate", "Maximum"] = Field("Maximum", description="Level of browser stealth.")
    browser_profile: Literal["random", "mobile", "desktop"] = Field("random", description="Browser profile type.")
    use_proxy: bool = Field(False, description="Use a proxy server for requests.")
    proxy_url: Optional[str] = Field(None, description="Proxy server URL (if use_proxy is True).")

    # Performance settings
    request_delay_min: float = Field(1.0, ge=0.0, description="Minimum delay between requests.")
    request_delay_max: float = Field(3.0, ge=0.0, description="Maximum delay between requests.")
    max_concurrent: int = Field(5, ge=1, description="Maximum concurrent requests.")
    request_timeout: int = Field(30, ge=1, description="Request timeout in seconds.")
    retry_attempts: int = Field(3, ge=1, description="Number of retry attempts for failed requests.")

    # Output settings
    export_formats: List[str] = Field(default=["markdown", "json", "text", "html"], description="Formats to export the scraped content.")
    compress_output: bool = Field(True, description="Compress output files using gzip.")
    structure_output: bool = Field(True, description="Organize output files in directories mirroring site structure.")
    maintain_hierarchy: bool = Field(True, description="Preserve URL hierarchy in output file paths.")
    clean_output: bool = Field(True, description="Remove unnecessary elements from HTML.")

    # Export options
    export_metadata: bool = Field(True, description="Include metadata in output files.")
    export_stats: bool = Field(True, description="Export crawler statistics.")
    export_sitemap: bool = Field(True, description="Export discovered sitemap URLs.")
    include_timestamps: bool = Field(True, description="Include timestamps in output data.")
    include_checksums: bool = Field(True, description="Include checksums for content verification.")
    generate_report: bool = Field(True, description="Generate a comprehensive crawl report.")

    # API settings
    enable_api: bool = Field(False, description="Enable the REST API for monitoring and control.")
    api_port: int = Field(8000, ge=1024, le=65535, description="Port for the API server.")

    @validator('request_delay_max')
    def validate_delay_max(cls, v, values):
        """Ensure max delay is not less than min delay."""
        if 'request_delay_min' in values and v < values['request_delay_min']:
            raise ValueError('request_delay_max must be greater than or equal to request_delay_min')
        return v

class CrawlerStats:
    """Enhanced statistics tracking with real-time visualization using rich."""
    def __init__(self):
        self.start_time = datetime.now(timezone.utc)
        self.pages_processed = 0
        self.bytes_downloaded = 0
        self.download_times = []
        self.processing_times = []
        self.errors = Counter()
        self.status_codes = Counter()
        self.content_types = Counter()
        self.urls_processed = set()
        self.skipped_urls = {}
        self.current_memory_usage = 0
        self.peak_memory_usage = 0
        self.last_update = time.time()
        self.update_interval = 1.0  # Update stats every second

        # Real-time visualization setup
        self.progress_table = Table(
            title="Crawler Statistics",
            show_header=True,
            header_style="bold magenta",
            border_style="cyan",
            box=None
        )
        self.progress_table.add_column("Metric", style="cyan")
        self.progress_table.add_column("Value", justify="right", style="green")

    def update_memory_usage(self):
        """Update memory usage statistics."""
        process = psutil.Process(os.getpid())
        self.current_memory_usage = process.memory_info().rss / 1024 / 1024  # MB
        self.peak_memory_usage = max(self.peak_memory_usage, self.current_memory_usage)

    def get_progress_display(self) -> Panel:
        """Generate rich progress display."""
        self.update_memory_usage()

        # Update progress table
        self.progress_table.clear()
        stats = [
            ("Pages Processed", f"{self.pages_processed:,}"),
            ("Data Downloaded", f"{self.bytes_downloaded / 1024 / 1024:.2f} MB"),
            ("Average Speed", f"{self.get_average_speed():.2f} pages/s"),
            ("Current Memory", f"{self.current_memory_usage:.1f} MB"),
            ("Peak Memory", f"{self.peak_memory_usage:.1f} MB"),
            ("Success Rate", f"{self.get_success_rate():.1f}%"),
            ("Active URLs", f"{len(self.urls_processed):,}"),
            ("Error Count", f"{sum(self.errors.values()):,}")
        ]

        for metric, value in stats:
            self.progress_table.add_row(metric, value)

        return Panel(
            Align.center(self.progress_table),
            title="[bold cyan]SiteDumper Status[/bold cyan]",
            subtitle=f"[dim]Running since: {self.start_time.strftime('%Y-%m-%d %H:%M:%S UTC')}[/dim]",
            border_style="cyan",
            padding=(1, 2)
        )
    def get_average_speed(self) -> float:
        """Calculate average processing speed."""
        duration = (datetime.now(timezone.utc) - self.start_time).total_seconds()
        return self.pages_processed / duration if duration > 0 and self.pages_processed > 0 else 0.0

    def get_success_rate(self) -> float:
        """Calculate success rate."""
        total = self.pages_processed + sum(self.errors.values())
        return (self.pages_processed / total * 100) if total > 0 else 0.0

    def add_page(self, url: str, size: int, content_type: str, status_code: int):
        """Record processed page statistics."""
        self.pages_processed += 1
        self.bytes_downloaded += size
        self.status_codes[status_code] += 1
        self.content_types[content_type] += 1
        self.urls_processed.add(url)

        # Update display if needed
        current_time = time.time()
        if current_time - self.last_update >= self.update_interval:
            self.last_update = current_time
            console.print(self.get_progress_display())


    def add_error(self, url: str, error: str):
        """Record error statistics."""
        self.errors[error] += 1
        self.skipped_urls[url] = error
        console.print(f"[red]Error processing {url}: {error}[/red]")

    def add_skipped(self, url: str, reason: str):
        """Record skipped URL statistics."""
        self.skipped_urls[url] = reason
        console.print(f"[yellow]Skipped {url}: {reason}[/yellow]")

    def add_download_time(self, time: float):
        """Record download time."""
        self.download_times.append(time)

    def add_processing_time(self, time: float):
        """Record processing time."""
        self.processing_times.append(time)

    def finish(self):
        """Complete statistics tracking and generate final report."""
        self.end_time = datetime.now(timezone.utc)
        self.generate_final_report()

    def get_summary(self) -> dict:
        """Get current statistics summary."""
        return {
            "pages_processed": self.pages_processed,
            "bytes_downloaded": self.bytes_downloaded,
            "average_speed": self.get_average_speed(),
            "success_rate": self.get_success_rate(),
            "errors": dict(self.errors),
            "status_codes": dict(self.status_codes),
            "content_types": dict(self.content_types),
            "memory_usage": {
                "current": self.current_memory_usage,
                "peak": self.peak_memory_usage
            }
        }

    def generate_final_report(self):
        """Generate and display the final crawler report."""
        duration = self.end_time - self.start_time

        summary_table = Table(
            title="Crawler Summary Report",
            show_header=True,
            header_style="bold magenta",
            border_style="cyan"
        )

        summary_table.add_column("Metric", style="cyan")
        summary_table.add_column("Value", justify="right", style="green")

        summary_stats = [
            ("Duration", str(duration)),
            ("Total Pages", f"{self.pages_processed:,}"),
            ("Total Data", f"{self.bytes_downloaded / 1024 / 1024:.2f} MB"),
            ("Average Speed", f"{self.get_average_speed():.2f} pages/s"),
            ("Success Rate", f"{self.get_success_rate():.1f}%"),
            ("Peak Memory", f"{self.peak_memory_usage:.1f} MB"),
            ("Unique URLs", f"{len(self.urls_processed):,}"),
            ("Error Count", f"{sum(self.errors.values()):,}"),
            ("Status Codes", ", ".join(f"{k}: {v}" for k, v in self.status_codes.most_common())),
            ("Content Types", ", ".join(f"{k}: {v}" for k, v in self.content_types.most_common(3)))
        ]

        for metric, value in summary_stats:
            summary_table.add_row(metric, value)

        console.print("\n")
        console.print(Panel(
            summary_table,
            title="[bold cyan]Crawl Complete![/bold cyan]",
            border_style="cyan",
            padding=(1, 2)
        ))

class URLQueue:
    """Manages URLs to be crawled, prioritizing by depth."""
    def __init__(self, max_depth: int):
        self.queue = deque()
        self.processed_urls = set()
        self.max_depth = max_depth
        self.lock = asyncio.Lock()  # Ensure thread safety

    async def add_url(self, url: str, depth: int = 0, referrer: Optional[str] = None):
        """Add a URL to the queue if it hasn't been processed and is within max depth."""
        async with self.lock:
            if url not in self.processed_urls and depth <= self.max_depth:
                self.queue.append({"url": url, "depth": depth, "referrer": referrer})
                self.processed_urls.add(url)

    async def get_next(self) -> Optional[dict]:
        """Retrieve the next URL from the queue, prioritizing by depth."""
        async with self.lock:
            if self.queue:
                return self.queue.popleft()  # FIFO for breadth-first
            return None

    def get_stats(self) -> dict:
        """Return statistics about the queue."""
        return {
            "queued": len(self.queue),
            "processed": len(self.processed_urls),
            "max_depth": self.max_depth
        }

    def is_empty(self) -> bool:
        """Check if the queue is empty."""
        return len(self.queue) == 0

class ContentProcessor:
    """Advanced content processor with docling integration."""
    def __init__(self, clean_output: bool = True):
        self.clean_output = clean_output
        self.mime_detector = magic.Magic(mime=True)
        self.content_stats = Counter()

    async def process_content(self, url: str, content: str, response_headers: dict) -> dict:
        """Process content with advanced features and error handling."""
        try:
            # Detect content type
            content_type = response_headers.get('content-type',
                self.mime_detector.from_buffer(content.encode()))
            self.content_stats[content_type] += 1

            # Create docling document
            doc = DoclingDocument()

            # Create soup object with error handling
            try:
                soup = BeautifulSoup(content, 'lxml')
            except Exception as e:
                logger.warning(f"Failed to parse HTML with lxml, falling back to html.parser: {e}")
                soup = BeautifulSoup(content, 'html.parser')

            # Extract and process content
            title = self._extract_title(soup, url)
            meta_tags = self._extract_meta_tags(soup)
            links = self._extract_links(soup, url)
            images = self._extract_images(soup, url)

            # Process main content
            text = self._extract_clean_text(soup)

            # Create text item with provenance
            text_item = doc.add_text(
                text=text,
                prov=ProvenanceItem(
                    page_no=1,  # Single page for web content
                    charspan=(0, len(text)),
                    bbox=BoundingBox(  # Placeholder bbox
                        x1=0, y1=0, x2=100, y2=100,
                        origin=CoordOrigin.TOPLEFT
                    )
                ),
                label=DocItemLabel.PARAGRAPH
            )

            return {
                'url': url,
                'title': title,
                'meta_tags': meta_tags,
                'text': text,
                'links': links,
                'images': images,
                'content_type': content_type,
                'doc_model': doc,  # Include docling document model
                'checksums': self._generate_checksums(content),
                'stats': {
                    'text_length': len(text),
                    'link_count': len(links),
                    'image_count': len(images),
                    'processing_timestamp': datetime.now(timezone.utc).isoformat()
                }
            }

        except Exception as e:
            logger.error(f"Error processing content for {url}: {e}")
            raise

    def _extract_clean_text(self, soup: BeautifulSoup) -> str:
        """Extract and clean text content with contextual awareness."""
        # Remove unwanted elements
        for unwanted in ['script', 'style', 'noscript', 'iframe', 'head']:
            for elem in soup.find_all(unwanted):
                elem.decompose()

        # Extract text with structure preservation
        paragraphs = []  # Initialize as an empty list to store paragraphs

        for elem in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']):
            text = elem.get_text(strip=True)
            if text:
                if elem.name.startswith('h'):
                    # Preserve heading structure
                    level = int(elem.name[1])
                    paragraphs.append(f"{'#' * level} {text}")
                elif elem.name == 'li':
                    # Preserve list items
                    paragraphs.append(f"- {text}")
                else:
                    paragraphs.append(text)

        return "\n\n".join(paragraphs)

    def _extract_title(self, soup: BeautifulSoup, url: str) -> str:
        """Extract page title with enhanced fallbacks."""
        title = None

        # Try different title sources in priority order
        if soup.title:
            title = soup.title.string
        if not title and soup.find('meta', property='og:title'):
            title = soup.find('meta', property='og:title')['content']
        if not title and soup.find('h1'):
            title = soup.find('h1').get_text(strip=True)
        if not title and soup.find('meta', {'name': 'twitter:title'}):
            title = soup.find('meta', {'name': 'twitter:title'})['content']

        # Clean and normalize title
        if title:
            title = ' '.join(title.split())

        return title or urlparse(url).path.split('/')[-1] or url

    def _extract_meta_tags(self, soup: BeautifulSoup) -> dict:
        """Extract comprehensive metadata."""
        meta_tags = {}

        # Standard meta tags
        for meta in soup.find_all('meta'):
            name = meta.get('name', meta.get('property', ''))
            content = meta.get('content', '')
            if name and content:
                meta_tags[name] = content

        # OpenGraph tags
        for meta in soup.find_all('meta', property=re.compile('^og:')):
            meta_tags[meta['property']] = meta.get('content', '')

        # Twitter cards
        for meta in soup.find_all('meta', name=re.compile('^twitter:')):
            meta_tags[meta['name']] = meta.get('content', '')

        # Schema.org metadata
        for script in soup.find_all('script', type='application/ld+json'):
            try:
                data = json.loads(script.string)
                if isinstance(data, dict):
                    meta_tags['schema_org'] = data
            except (json.JSONDecodeError, AttributeError):
                continue

        return meta_tags

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> Dict[str, List[dict]]:
        """Extract and categorize links with enhanced metadata."""
        links = defaultdict(list)

        for a in soup.find_all('a', href=True):
            href = urljoin(base_url, a['href'])
            text = a.get_text(strip=True)
            title = a.get('title', '')
            rel = a.get('rel',)

            link_data = {
                'url': href,
                'text': text,
                'title': title,
                'rel': rel,
                'class': a.get('class',),
                'id': a.get('id', ''),
                'aria_label': a.get('aria-label', '')
            }

            # Categorize link
            if href.startswith('mailto:'):
                links['email'].append(link_data)
            elif href.startswith('tel:'):
                links['phone'].append(link_data)
            elif urlparse(href).netloc == urlparse(base_url).netloc:
                links['internal'].append(link_data)
            else:
                links['external'].append(link_data)

        return dict(links)

    def _extract_images(self, soup: BeautifulSoup, base_url: str) -> List[dict]:
        """Extract comprehensive image information."""
        images = []  # Initialize images list to avoid syntax errors

        for img in soup.find_all('img'):
            src = img.get('src', '')
            if src:
                image_url = urljoin(base_url, src)

                # Extract all available image metadata
                image_data = {
                    'url': image_url,
                    'alt': img.get('alt', ''),
                    'title': img.get('title', ''),
                    'width': img.get('width', ''),
                    'height': img.get('height', ''),
                    'class': img.get('class', []),
                    'id': img.get('id', ''),
                    'loading': img.get('loading', ''),
                    'srcset': img.get('srcset', ''),
                    'sizes': img.get('sizes', ''),
                    'figure_caption': None
                }

                # Look for associated figure caption
                figure_parent = img.find_parent('figure')
                if figure_parent:
                    figcaption = figure_parent.find('figcaption')
                    if figcaption:
                        image_data['figure_caption'] = figcaption.get_text(strip=True)

                images.append(image_data)

        return images


    def _generate_checksums(self, content: str) -> dict:
        """Generate multiple checksums for content verification."""
        content_bytes = content.encode('utf-8')
        return {
            'md5': hashlib.md5(content_bytes).hexdigest(),
            'sha1': hashlib.sha1(content_bytes).hexdigest(),
            'sha256': hashlib.sha256(content_bytes).hexdigest()
        }

class PathFilter:
    """Enhanced path filtering with pattern matching."""
    def __init__(self, include_patterns: List[str], exclude_patterns: List[str]):
        self.include_patterns = [p.strip() for p in include_patterns if p.strip()]
        self.exclude_patterns = [p.strip() for p in exclude_patterns if p.strip()]

    def should_process(self, url: str) -> bool:
        """Determine if URL should be processed based on patterns."""
        path = urlparse(url).path

        # Check exclude patterns first
        for pattern in self.exclude_patterns:
            if fnmatch.fnmatch(path, pattern):
                return False

        # If include patterns exist, path must match at least one
        if self.include_patterns:
            return any(fnmatch.fnmatch(path, pattern) for pattern in self.include_patterns)

        return True


class RobotsProcessor:
    """Enhanced robots.txt processor with caching and retry logic."""
    def __init__(self, base_url: str):
        self.base_url = base_url
        self.robots_url = urljoin(base_url, "/robots.txt")
        self.parser = RobotFileParser(self.robots_url)
        self.cache = {}
        self.cache_time = 3600  # Cache for 1 hour
        self.last_fetch = 0
        self.max_retries = 3
        self.initialized = False

    async def initialize(self):
        """Initialize robots.txt parser with retry logic."""
        if self.initialized:
            return

        async with aiohttp.ClientSession() as session:
            for attempt in range(self.max_retries):
                try:
                    async with session.get(self.robots_url) as response:
                        if response.status == 200:
                            content = await response.text()
                            self.parser.parse(content.splitlines())
                            self.last_fetch = time.time()
                            self.initialized = True
                            break
                        elif response.status == 404:
                            # No robots.txt, allow everything
                            self.initialized = True
                            break
                except Exception as e:
                    if attempt == self.max_retries - 1:
                        logger.warning(f"Failed to fetch robots.txt: {e}")
                        self.initialized = True  # Proceed without robots.txt
                    await asyncio.sleep(2 ** attempt)  # Exponential backoff

    def can_fetch(self, url: str) -> bool:
        """Check if URL can be fetched according to robots.txt."""
        if not self.initialized:
            return True  # Allow if not initialized

        # Check cache first
        cache_key = url
        cached_result = self.cache.get(cache_key)
        if cached_result and time.time() - cached_result['time'] < self.cache_time:
            return cached_result['allowed']

        # Check robots.txt
        allowed = True  # Default to allowed
        if self.initialized and self.parser.mtime():
            allowed = self.parser.can_fetch("*", url)

        # Update cache
        self.cache[cache_key] = {
            'time': time.time(),
            'allowed': allowed
        }

        return allowed

class SitemapProcessor:
    """Enhanced sitemap processor with recursive support."""
    def __init__(self, recursive: bool = True):
        self.recursive = recursive
        self.processed_sitemaps = set()
        self.max_retries = 3

    async def process_sitemap(self, sitemap_url: str) -> Set[str]:
        """Process sitemap with support for index sitemaps."""
        urls = set()

        if sitemap_url in self.processed_sitemaps:
            return urls

        self.processed_sitemaps.add(sitemap_url)

        async with aiohttp.ClientSession() as session:
            for attempt in range(self.max_retries):
                try:
                    async with session.get(sitemap_url) as response:
                        if response.status!= 200:
                            break

                        content = await response.text()

                        # Try parsing as XML
                        try:
                            sitemap_dict = xmltodict.parse(content)
                        except Exception:
                            # Handle non-XML sitemaps (e.g., plain text)
                            urls.update(
                                url.strip() for url in content.splitlines()
                                if url.strip().startswith('http')
                            )
                            break

                        # Process sitemap index
                        if 'sitemapindex' in sitemap_dict:
                            if self.recursive:
                                sitemaps = sitemap_dict['sitemapindex']['sitemap']
                                if isinstance(sitemaps, dict):
                                    sitemaps = [sitemaps]

                                for sitemap in sitemaps:
                                    loc = sitemap.get('loc')
                                    if loc:
                                        sub_urls = await self.process_sitemap(loc)
                                        urls.update(sub_urls)

                        # Process urlset
                        elif 'urlset' in sitemap_dict:
                            url_entries = sitemap_dict['urlset']['url']
                            if isinstance(url_entries, dict):
                                url_entries = [url_entries]

                            for entry in url_entries:
                                loc = entry.get('loc')
                                if loc:
                                    urls.add(loc)

                        break

                except Exception as e:
                    if attempt == self.max_retries - 1:
                        logger.warning(f"Failed to process sitemap {sitemap_url}: {e}")
                    await asyncio.sleep(2 ** attempt)

        return urls

class RateLimiter:
    """Advanced rate limiter with dynamic adjustment."""
    def __init__(self, min_delay: float, max_delay: float, max_concurrent: int):
        self.min_delay = min_delay
        self.max_delay = max_delay
        self.current_delay = min_delay
        self.last_request_time = defaultdict(float)
        self.throttler = Throttler(rate_limit=max_concurrent)
        self.response_times = deque(maxlen=100)
        self.errors = deque(maxlen=100)

    async def acquire(self, domain: str):
        """Acquire permission to make a request with dynamic rate limiting."""
        async with self.throttler:
            # Calculate time since last request to this domain
            elapsed = time.time() - self.last_request_time[domain]
            if elapsed < self.current_delay:
                await asyncio.sleep(self.current_delay - elapsed)

            self.last_request_time[domain] = time.time()

    def update_metrics(self, response_time: float, success: bool):
        """Update metrics to adjust rate limiting."""
        self.response_times.append(response_time)
        self.errors.append(not success)

        # Calculate error rate
        error_rate = sum(self.errors) / len(self.errors) if self.errors else 0

        # Adjust delay based on metrics
        if error_rate > 0.1:  # More than 10% errors
            self.current_delay = min(self.current_delay * 1.5, self.max_delay)
        elif error_rate < 0.05 and len(self.response_times) > 0 and statistics.mean(self.response_times) < 1.0:
            self.current_delay = max(self.current_delay * 0.8, self.min_delay)

class SiteDumper:
    """Advanced web crawler with docling integration and real-time monitoring."""
    def __init__(self, config: Config):
        self.config = config
        self.stats = CrawlerStats()
        self.path_filter = PathFilter(config.include_patterns, config.exclude_patterns)
        self.content_processor = ContentProcessor(config.clean_output)
        self.url_queue = URLQueue(config.max_depth)
        self.rate_limiter = RateLimiter(
            min_delay=config.request_delay_min,
            max_delay=config.request_delay_max,
            max_concurrent=config.max_concurrent
        )

        # Initialize components
        self.robots = None if not config.obey_robots_txt else RobotsProcessor(str(config.website_url))
        self.sitemap = SitemapProcessor(config.recursive_sitemap)

        # Setup playwright
        self.browser = None
        self.browser_context = None

        # Setup output directories
        self.output_dir = Path(config.output_dir)
        self.setup_directories()

        # Configure logging
        self.setup_logging()

        # Initialize docling document collection
        self.documents = []  # Initialize as an empty list to store processed documents


        # Initialize API if enabled
        self.api = None
        if config.enable_api:
            self.setup_api()

    def setup_directories(self):
        """Create necessary directory structure."""
        directories = [
            self.output_dir,
            *[self.output_dir / fmt for fmt in self.config.export_formats],
            self.output_dir / "logs",
            self.output_dir / "stats",
            self.output_dir / "media",
            self.output_dir / "reports",
            self.output_dir / "docling_models"  # For docling document models
        ]

        for directory in directories:
            directory.mkdir(parents=True, exist_ok=True)

    def setup_logging(self):
        """Configure advanced logging."""
        log_format = (
            "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | "
            "<level>{level: <8}</level> | "
            "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | "
            "<white>{message}</white>"
        )

        logger.remove()
        logger.add(
            self.output_dir / "logs/debug.log",
            format=log_format,
            level="DEBUG",
            rotation="100 MB",
            compression="zip"
        )
        logger.add(
            self.output_dir / "logs/error.log",
            format=log_format,
            level="ERROR",
            rotation="100 MB",
            compression="zip"
        )

    def setup_api(self):
        """Initialize FastAPI instance with enhanced endpoints."""
        self.api = FastAPI(
            title="SiteDumper API",
            description="Real-time monitoring and control API for SiteDumper",
            version="2025.2.7"
        )

        @self.api.get("/status")
        async def get_status():
            """Get the current status of the crawler."""
            return {
                "status": "running",
                "stats": self.stats.get_summary(),
                "queue": self.url_queue.get_stats(),
                "memory": {
                    "current": self.stats.current_memory_usage,
                    "peak": self.stats.peak_memory_usage
                },
                "rate_limiting": {
                    "current_delay": self.rate_limiter.current_delay,
                    "error_rate": sum(self.rate_limiter.errors) / len(self.rate_limiter.errors) if self.rate_limiter.errors else 0
                }
            }

        @self.api.get("/stats/documents")
        async def get_document_stats():
            """Get statistics about the processed documents."""
            return {
                "total_documents": len(self.documents),
                "average_length": statistics.mean([len(doc.text) for doc in self.documents]) if self.documents else 0,
                "content_types": Counter(doc['content_type'] for doc in self.documents if 'content_type' in doc)
            }


        @self.api.post("/pause")
        async def pause_crawler():
            """Pause the crawler (not implemented yet)."""
            raise HTTPException(status_code=501, detail="Not Implemented")

        @self.api.post("/resume")
        async def resume_crawler():
            """Resume the crawler (not implemented yet)."""
            raise HTTPException(status_code=501, detail="Not Implemented")


    async def initialize_browser(self):
        """Initialize and configure browser instance."""
        playwright = await async_playwright().start()

        browser_args = {
            "headless": True,
            "proxy": {"server": self.config.proxy_url} if self.config.use_proxy else None
        }

        self.browser = await playwright.chromium.launch(**browser_args)

        context_args = {
            "viewport": {"width": 1920, "height": 1080},
            "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # Default, can be overridden by profile
        }

        if self.config.browser_profile == "mobile":
            context_args["user_agent"] = "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1"
        elif self.config.browser_profile == "desktop":
            context_args["user_agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

        self.browser_context = await self.browser.new_context(**context_args)

    async def process_page(self, url: str, depth: int) -> bool:
        """Process a single page with docling integration."""
        domain = urlparse(url).netloc

        async with self.rate_limiter.throttler:  # Use throttler directly
            try:
                await self.rate_limiter.acquire(domain)
                start_time = time.time()

                page = await self.browser_context.new_page()

                try:
                    # Configure page
                    await page.set_extra_http_headers({"Accept-Language": "en-US,en;q=0.9"})

                    # Navigate with retry logic
                    for attempt in range(self.config.retry_attempts):
                        try:
                            response = await page.goto(
                                url,
                                wait_until="networkidle",
                                timeout=self.config.request_timeout * 1000
                            )

                            if response and response.ok:
                                break

                        except PlaywrightTimeoutError:
                            logger.warning(f"Timeout on attempt {attempt+1} for {url}")
                            if attempt == self.config.retry_attempts - 1:
                                raise
                            await asyncio.sleep(2 ** attempt)  # Exponential backoff
                        except Exception as e:
                            if attempt == self.config.retry_attempts - 1:
                                raise
                            await asyncio.sleep(2 ** attempt)


                    if not response or not response.ok:
                        self.stats.add_error(url, f"HTTP {response.status if response else 'No response'}")
                        return False

                    # Get page content
                    content = await page.content()

                    # Process content with docling integration
                    processed_content = await self.content_processor.process_content(
                        url,
                        content,
                        dict(response.headers)
                    )

                    # Save content and document model
                    await self.save_content(processed_content)
                    self.documents.append(processed_content)  # Append the entire dictionary

                    # Update statistics
                    process_time = time.time() - start_time
                    self.stats.add_page(
                        url=url,
                        size=len(content),
                        content_type=response.headers.get("content-type", "unknown"),
                        status_code=response.status
                    )
                    self.stats.add_processing_time(process_time)

                    # Update rate limiter metrics
                    self.stats.add_download_time(process_time)  # Use process_time as a proxy for download time
                    self.rate_limiter.update_metrics(process_time, True)

                    # Process new URLs
                    if not self.config.sitemap_only:
                        for link_type, links in processed_content["links"].items():
                            if link_type in ["internal", "external"]:
                                for link in links:
                                    link_url = link["url"]
                                    if self.should_crawl_url(link_url):
                                        await self.url_queue.add_url(
                                            link_url,
                                            depth + 1,
                                            referrer=url
                                        )

                    return True

                finally:
                    await page.close()

            except Exception as e:
                self.stats.add_error(url, str(e))
                self.rate_limiter.update_metrics(time.time() - start_time, False)
                logger.exception(f"Error processing {url}")
                return False

    def should_crawl_url(self, url: str) -> bool:
        """Determine if a URL should be crawled based on configuration."""
        # Check if URL is already processed
        if url in self.url_queue.processed_urls:
            return False

        # Check robots.txt if enabled
        if self.robots and not self.robots.can_fetch(url):
            self.stats.add_skipped(url, "Disallowed by robots.txt")
            return False

        # Check domain if stay_on_domain is enabled
        if self.config.stay_on_domain:
            base_domain = urlparse(str(self.config.website_url)).netloc
            url_domain = urlparse(url).netloc
            if base_domain!= url_domain:
                self.stats.add_skipped(url, "Outside of base domain")
                return False

        # Check path filters
        if not self.path_filter.should_process(url):
            self.stats.add_skipped(url, "Filtered by include/exclude patterns")
            return False

        return True

    async def save_content(self, processed_content: dict):
        """Save processed content to appropriate formats."""
        url = processed_content['url']
        parsed_url = urlparse(url)
        path = parsed_url.path.strip('/')
        if not path:
          path = "index"

        # Ensure the path is safe and doesn't contain invalid characters
        path = re.sub(r'[\\/*?:"<>|]', "_", path)

        if self.config.maintain_hierarchy:
            # Create directory structure based on URL hierarchy
            base_path = self.output_dir / parsed_url.netloc
            full_path = base_path / path
        else:
            full_path = self.output_dir / path

        # Handle index files
        if full_path.name == "" or full_path.name.startswith("index."):
            if self.config.maintain_hierarchy:
                full_path = full_path.parent / "index"
            else:
                full_path = full_path / "index"

        for fmt in self.config.export_formats:
            try:
                if fmt == "markdown":
                    filepath = full_path.with_suffix(".md")
                    filepath.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
                        await f.write(self.convert_to_markdown(processed_content))

                elif fmt == "json":
                    filepath = full_path.with_suffix(".json")
                    filepath.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
                         await f.write(json.dumps(processed_content, indent=2, ensure_ascii=False))

                elif fmt == "text":
                    filepath = full_path.with_suffix(".txt")
                    filepath.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
                        await f.write(processed_content['text'])

                elif fmt == "html":
                    filepath = full_path.with_suffix(".html")
                    filepath.parent.mkdir(parents=True, exist_ok=True)
                    async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
                        await f.write(processed_content.get('raw_html', ''))  # Save raw HTML if available

                if self.config.compress_output:
                    # Compress the file
                    compressed_filepath = filepath.with_suffix(filepath.suffix + ".gz")
                    async with aiofiles.open(filepath, 'rb') as f_in:
                        async with aiofiles.open(compressed_filepath, 'wb') as f_out:
                            await f_out.write(gzip.compress(await f_in.read()))
                    # Optionally remove the uncompressed file
                    os.remove(filepath)


            except Exception as e:
                logger.error(f"Error saving content in {fmt} format for {url}: {e}")
                self.stats.add_error(url, f"Failed to save in {fmt} format")

    def convert_to_markdown(self, processed_content: dict) -> str:
        """Convert processed content to Markdown format."""
        md = ""
        if self.config.export_metadata:
            md += f"# {processed_content.get('title', 'No Title')}\n\n"
            md += f"**URL:** {processed_content['url']}\n\n"
            if 'meta_tags' in processed_content:
                md += "## Metadata\n\n"
                for key, value in processed_content['meta_tags'].items():
                    md += f"* **{key}:** {value}\n"
                md += "\n"

        md += "## Content\n\n"
        md += processed_content['text'] + "\n\n"

        if 'links' in processed_content and self.config.export_metadata:
            md += "## Links\n\n"
            for link_type, links in processed_content['links'].items():
                md += f"### {link_type.capitalize()} Links\n\n"
                for link in links:
                    md += f"* [{link.get('text', link['url'])}]({link['url']})\n"
            md += "\n"

        if 'images' in processed_content and self.config.export_metadata:
            md += "## Images\n\n"
            for image in processed_content['images']:
                md += f"!({image['url']})  \n"
                if image['alt']:
                    md+= f"*{image['alt']}*  \n"
                md += "\n"

        return md

    async def run(self):
        """Execute the crawler with enhanced monitoring."""
        try:
            # Initialize browser
            await self.initialize_browser()

            # Start API if enabled in a separate task
            if self.config.enable_api:
                asyncio.create_task(uvicorn.run(self.api, host="0.0.0.0", port=self.config.api_port))

            # Initialize robots.txt if needed
            if self.robots:
                await self.robots.initialize()

            # Process sitemap if enabled
            if self.config.process_sitemaps:
                sitemap_url = urljoin(str(self.config.website_url), "/sitemap.xml")
                sitemap_urls = await self.sitemap.process_sitemap(sitemap_url)
                for url in sitemap_urls:
                    await self.url_queue.add_url(url)

            # Add seed URL if not using sitemap only
            if not self.config.sitemap_only:
                await self.url_queue.add_url(str(self.config.website_url))

            # Main crawling loop with enhanced progress visualization
            with create_progress_bar() as progress:
                task_id = progress.add_task(
                    "[cyan]Crawling...",
                    total=self.config.max_pages
                )

                while self.stats.pages_processed < self.config.max_pages and not self.url_queue.is_empty():
                    url_data = await self.url_queue.get_next()
                    if not url_data:
                        break

                    success = await self.process_page(url_data["url"], url_data["depth"])
                    if success:
                        progress.update(
                            task_id,
                            advance=1,
                            refresh=True,
                            memory=f"{self.stats.current_memory_usage:.1f} MB"  # Show current memory
                        )

                        # Generate interim report every 100 pages
                        if self.stats.pages_processed % 100 == 0:
                            await self.generate_interim_report()

        except KeyboardInterrupt:
            logger.info("Crawler stopped by user")
        except Exception as e:
            logger.exception("Fatal crawler error")
            raise
        finally:
            # Cleanup and generate final report
            if self.browser:
                await self.browser.close()  # Properly close the browser
            self.stats.finish()
            await self.generate_final_report()

            if self.config.enable_api:
                logger.info(f"API server running on port {self.config.api_port}")


    async def generate_interim_report(self):
        """Generate interim progress report."""
        report_path = self.output_dir / "reports" / f"interim_report_{self.stats.pages_processed}.json"
        report = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "stats": self.stats.get_summary(),
            "queue_status": self.url_queue.get_stats(),
            "memory_usage": {
                "current": self.stats.curreent_memory_usage,
                "peak": self.stats.peak_memory_usage
            },
            "document_stats": {
                "total_documents": len(self.documents),
                "content_types": Counter(doc['content_type'] for doc in self.documents if 'content_type' in doc)

            }
        }

        async with aiofiles.open(report_path, 'w') as f:
            await f.write(json.dumps(report, indent=2))

    async def generate_final_report(self):
        """Generate comprehensive final report."""
        report_path = self.output_dir / "reports" / "final_report.json"
        report = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "configuration": self.config.dict(),
            "stats": self.stats.get_summary(),
            "document_stats": {
                "total_documents": len(self.documents),
                "content_types": Counter(doc['content_type'] for doc in self.documents if 'content_type' in doc),
                "average_length": statistics.mean([len(doc['text']) for doc in self.documents if 'text' in doc]) if self.documents else 0
            },
            "error_summary": dict(self.stats.errors),
            "performance_metrics": {
                "average_processing_time": statistics.mean(self.stats.processing_times) if self.stats.processing_times else 0,
                "peak_memory_usage": self.stats.peak_memory_usage,
                "total_downloaded": self.stats.bytes_downloaded
            }
        }

        async with aiofiles.open(report_path, 'w') as f:
            await f.write(json.dumps(report, indent=2))

    def create_progress_bar():
        """Creates and returns a rich Progress instance."""
        return Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            MofNCompleteColumn(),
            TimeRemainingColumn(),
            TimeElapsedColumn(),
            TextColumn("• Mem: {task.fields[memory]}"),  # Include memory usage
            console=console,
            transient=True,
            refresh_per_second=10,
        )


#@title 🎯 Target Website Configuration {display-mode: "form"}
website_url = "https://markmap.js.org/docs" #@param {type:"string"}
max_pages = 1000 #@param {type:"integer"}
max_depth = 3 #@param {type:"slider", min:1, max:10, step:1}
stay_on_domain = True #@param {type:"boolean"}

#@title 🤖 Compliance Settings {display-mode: "form"}
obey_robots_txt = False #@param {type:"boolean"}
process_sitemaps = True #@param {type:"boolean"}
recursive_sitemap = True #@param {type:"boolean"}
sitemap_only = False #@param {type:"boolean"}

#@title 🔍 Path Filtering {display-mode: "form"}
#@markdown Enter patterns to include/exclude (comma-separated)
include_patterns = "" #@param {type:"string"}
exclude_patterns = "" #@param {type:"string"}

#@title 🌐 Browser Configuration {display-mode: "form"}
stealth_mode = "Maximum" #@param ["Basic", "Moderate", "Maximum"]
browser_profile = "random" #@param ["random", "mobile", "desktop"]
use_proxy = False #@param {type:"boolean"}
proxy_url = "" #@param {type:"string"}

#@title ⚡ Performance Settings {display-mode: "form"}
#@markdown ### Request Timing
request_delay_min = 1.0 #@param {type:"slider", min:0.5, max:5.0, step:0.5}
request_delay_max = 3.0 #@param {type:"slider", min:1.0, max:10.0, step:0.5}
request_timeout = 30 #@param {type:"slider", min:5, max:120, step:5}
max_concurrent = 5 #@param {type:"slider", min:1, max:10, step:1}
retry_attempts = 3 #@param {type:"slider", min:1, max:5, step:1}

#@title 💾 Output Configuration {display-mode: "form"}
#@markdown ### Export Format Selection
export_markdown = True #@param {type:"boolean"}
export_json = True #@param {type:"boolean"}
export_text = True #@param {type:"boolean"}
export_html = True #@param {type:"boolean"}
output_dir = "markmap docs site" #@param {type:"string"}
compress_output = True #@param {type:"boolean"}
structure_output = True #@param {type:"boolean"}
maintain_hierarchy = True #@param {type:"boolean"}
clean_output = True #@param {type:"boolean"}

#@title 📊 Export Options {display-mode: "form"}
export_metadata = True #@param {type:"boolean"}
export_stats = True #@param {type:"boolean"}
export_sitemap = True #@param {type:"boolean"}
include_timestamps = True #@param {type:"boolean"}
include_checksums = True #@param {type:"boolean"}
generate_report = True #@param {type:"boolean"}

#@title 🔧 Advanced Settings {display-mode: "form"}
enable_api = False #@param {type:"boolean"}
api_port = 8000 #@param {type:"integer"}


def create_config_from_forms() -> Config:
    """Create Config object from form inputs."""
    # Initialize export formats list properly
    export_formats = []

    # Append selected formats based on form input
    if export_markdown:
        export_formats.append("markdown")
    if export_json:
        export_formats.append("json")
    if export_text:
        export_formats.append("text")
    if export_html:
        export_formats.append("html")

    # Process include and exclude patterns
    include_patterns_list = [p.strip() for p in include_patterns.split(",") if p.strip()]
    exclude_patterns_list = [p.strip() for p in exclude_patterns.split(",") if p.strip()]

    # Return configuration object
    return Config(
        user_login=USER_LOGIN,
        current_utc=CURRENT_UTC,
        website_url=website_url,
        max_pages=max_pages,
        max_depth=max_depth,
        stay_on_domain=stay_on_domain,
        obey_robots_txt=obey_robots_txt,
        process_sitemaps=process_sitemaps,
        recursive_sitemap=recursive_sitemap,
        sitemap_only=sitemap_only,
        include_patterns=include_patterns_list,
        exclude_patterns=exclude_patterns_list,
        stealth_mode=stealth_mode,
        browser_profile=browser_profile,
        use_proxy=use_proxy,
        proxy_url=proxy_url,
        request_delay_min=request_delay_min,
        request_delay_max=request_delay_max,
        request_timeout=request_timeout,
        max_concurrent=max_concurrent,
        retry_attempts=retry_attempts,
        export_formats=export_formats,
        output_dir=output_dir,
        compress_output=compress_output,
        structure_output=structure_output,
        maintain_hierarchy=maintain_hierarchy,
        clean_output=clean_output,
        export_metadata=export_metadata,
        export_stats=export_stats,
        export_sitemap=export_sitemap,
        include_timestamps=include_timestamps,
        include_checksums=include_checksums,
        generate_report=generate_report,
        enable_api=enable_api,
        api_port=api_port
    )


# Create configuration from form inputs and display summary
config = create_config_from_forms()

console = Console(
    color_system="truecolor",
    width=100,
    style=Style(color="cyan", bgcolor=None),
    highlight=True,
    record=True,
    markup=True
)

console.print(Panel(
    "\n".join([
        f"[cyan]Configuration Summary[/cyan]",
        f"Target URL: {config.website_url}",
        f"Max Pages: {config.max_pages}",
        f"Max Depth: {config.max_depth}",
        f"Output Directory: {config.output_dir}",
        f"Export Formats: {', '.join(config.export_formats)}",
        f"Browser Profile: {config.browser_profile}",
        f"Stealth Mode: {config.stealth_mode}"
    ]),
    title="SiteDumper Configuration",
    border_style="cyan",
    padding=(1, 2)
))

async def main():
    """Main execution flow."""
    try:
        dumper = SiteDumper(config)

        # Display startup banner
        console.print(Panel(
            f"[cyan]SiteDumper v2025.2.7[/cyan]\n" +
            f"User: {config.user_login}\n" +
            f"Started at: {config.current_utc}\n" +
            f"Target: {config.website_url}\n" +
            f"Output: {config.output_dir}",
            title="Crawler Status",
            border_style="cyan",
            padding=(1, 2)
        ))

        await dumper.run()

    except Exception as e:
        console.print(f"[red]Fatal Error:[/red] {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    nest_asyncio.apply()
    uvloop.install()
    asyncio.run(main())

ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-1' coro=<main() done, defined at <ipython-input-15-e74b16532cf8>:1387> exception=SystemExit(1)>
Traceback (most recent call last):
  File "<ipython-input-15-e74b16532cf8>", line 1404, in main
    await dumper.run()
          ^^^^^^^^^^
AttributeError: 'SiteDumper' object has no attribute 'run'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-e74b16532cf8>", line 1413, in <cell line: 0>
    asyncio.run(main())
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/nest_asyncio.py", line 92, in run_until_complete
 

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading

<ipython-input-17-ba99aa5a667e>:137: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  @validator('request_delay_max')


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-5' coro=<Connection.run() done, defined at /usr/local/lib/python3.11/dist-packages/playwright/_impl/_connection.py:272> exception=NotImplementedError()>
Traceback (most recent call last):
  File "/usr/lib/python3.11/asyncio/tasks.py", line 277, in __step
    result = coro.send(None)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/playwright/_impl/_connection.py", line 279, in run
    await self._transport.connect()
  File "/usr/local/lib/python3.11/dist-packages/playwright/_impl/_transport.py", line 133, in connect
    raise exc
  File "/usr/local/lib/python3.11/dist-packages/playwright/_impl/_transport.py", line 120, in connect
    self._proc = await asyncio.create_subprocess_exec(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/subprocess.py", line 223, in create_subprocess_exec
    transport, protocol = await loop.subprocess_exec(

<ipython-input-17-ba99aa5a667e>:1209: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  "configuration": self.config.dict(),


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "<ipython-input-17-ba99aa5a667e>", line 1121, in run
    await self.initialize_browser()
  File "<ipython-input-17-ba99aa5a667e>", line 878, in initialize_browser
    playwright = await async_playwright().start()
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/playwright/async_api/_context_manager.py", line 51, in start
    return await self.__aenter__()
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/playwright/async_api/_context_manager.py", line 46, in __aenter__
    playwright = AsyncPlaywright(next(iter(done)).result())
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/asyncio/futures.py", line 203, in result
    raise self._exception.with_traceback(self._exception_tb)
  File "/usr/local/lib/python3.11/dist-packages/playwright/_impl/_transport.py", line 120, in connect
    self._proc = await asyncio.create_subprocess_ex

TypeError: object of type 'NoneType' has no len()

In [None]:
 import shutil
import os

for root, dirs, files in os.walk("", topdown=False):
    for dir in dirs:
        dir_path = os.path.join(root, dir)
        try:
            shutil.rmtree(dir_path)  # Use shutil.rmtree to remove non-empty dirs
            print(f"Deleted directory: {dir_path}")
        except OSError as e:
            print(f"Error deleting directory {dir_path}: {e}")