In [2]:
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy

async def main():
    # Configure a 2-level deep crawl
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            include_external=False
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://www.wikipedia.org", config=config)

        print(f"Crawled {len(results)} pages in total")

        # Access individual results
        for result in results[:3]:  # Show first 3 results
            print(f"URL: {result.url}")
            print(f"Depth: {result.metadata.get('depth', 0)}")


In [4]:
print("Starting scraper...")
try:
    # Try to get the running event loop
    loop = asyncio.get_running_loop()
    import nest_asyncio
    print("Yes, utilizing nest_asyncio")
    nest_asyncio.apply()
    asyncio.run(main())
except exception as e:
    print(e)

Starting scraper...
Yes, utilizing nest_asyncio


  for el in reversed(list(root.iterdescendants())):


Invalid URL: wikimedia:Spenden, error: Missing scheme or netloc


NameError: name 'exception' is not defined

pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe closed by peer or os.write(pipe, data) raised exception.
pipe clo

In [7]:
from collections import deque
from urllib.parse import urljoin, urlparse, urldefrag

In [10]:
async def parallel_crawl():
    visited = set()
    queued = set()  # Track URLs already in queue
    queue = deque([("https://www.wikipedia.org", 0)])
    queued.add("https://www.wikipedia.org")
    
    results = []
    max_workers = 10
    lock = asyncio.Lock()
    active_tasks = [0]  # Use list to make it mutable in closure
    
    config = CrawlerRunConfig(
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=False,
        page_timeout=20000,  # 20 second timeout
    )
    
    def is_valid_url(url):
        """Filter out unwanted URLs"""
        url_lower = url.lower()
        
        # Skip Wikipedia special pages
        skip_patterns = [
            '/w/index.php',  # MediaWiki script
            'special:', 'خاص:',
            'user:', 'usuario:',
            'talk:', 'discusión:',
            'file:', 'archivo:',
            'help:', 'ayuda:',
            'category:', 'categoría:',
            'template:', 'plantilla:',
            'wikipedia:',
            'action=edit',
            'action=history',
            'login', 'signup',
            'createaccount',
            'preferences',
            'watchlist',
            'donate'
        ]
        
        for pattern in skip_patterns:
            if pattern in url_lower:
                return False
        
        return True
    
    async def worker(worker_id, crawler):
        while True:
            url = None
            depth = 0
            
            # Try to get work from queue
            async with lock:
                if queue:
                    url, depth = queue.popleft()
                    active_tasks[0] += 1
                elif active_tasks[0] == 0:
                    # Queue empty and no active tasks - done
                    break
            
            if url is None:
                # Queue empty but other workers active - wait
                await asyncio.sleep(0.1)
                continue
            
            # Check if already visited (race condition protection)
            async with lock:
                if url in visited:
                    active_tasks[0] -= 1
                    continue
                visited.add(url)
            
            print(f"Worker {worker_id} crawling: {url} (depth={depth})")
            
            try:
                # Crawl the page
                result = await crawler.arun(url, config=config)
                
                if result and len(result) > 0 and result[0].success:
                    async with lock:
                        results.append(result[0])
                    
                    # Extract links for next depth
                    if depth < 2 and result[0].links:
                        new_links = []
                        
                        for link_data in result[0].links.get('internal', [])[:50]:
                            link = link_data.get('href', '')
                            
                            if link and is_valid_url(link):
                                async with lock:
                                    if link not in visited and link not in queued:
                                        new_links.append(link)
                                        queued.add(link)
                                        queue.append((link, depth + 1))
                        
                        if new_links:
                            print(f"Worker {worker_id} added {len(new_links)} links at depth {depth + 1}")
                
            except Exception as e:
                print(f"Worker {worker_id} error on {url}: {str(e)[:100]}")
            
            finally:
                async with lock:
                    active_tasks[0] -= 1
        
        print(f"Worker {worker_id} finished")
    
    async with AsyncWebCrawler(headless=True) as crawler:
        # Create worker tasks
        workers = [
            asyncio.create_task(worker(i, crawler)) 
            for i in range(max_workers)
        ]
        
        # Wait for all workers to complete
        await asyncio.gather(*workers)
    
    print(f"\nCrawled {len(results)} pages successfully")
    
    # Show stats
    depth_count = {}
    for result in results:
        depth = result.metadata.get('depth', 0)
        depth_count[depth] = depth_count.get(depth, 0) + 1
    
    print("\nPages per depth:")
    for depth in sorted(depth_count.keys()):
        print(f"  Depth {depth}: {depth_count[depth]} pages")
    
    return results

In [11]:
if __name__ == "__main__":
    print("Starting scraper...")
    try:
        results = asyncio.run(parallel_crawl())
        print(f"\nFirst 3 results:")
        for i, result in enumerate(results[:3], 1):
            print(f"{i}. {result.url}")
    except Exception as e:
        print(f"Error: {e}")

Starting scraper...


Worker 0 crawling: https://www.wikipedia.org (depth=0)
Worker 0 added 50 links at depth 1
Worker 0 crawling: https://en.wikipedia.org/ (depth=1)
Worker 1 crawling: https://ja.wikipedia.org/ (depth=1)
Worker 2 crawling: https://ru.wikipedia.org/ (depth=1)
Worker 3 crawling: https://de.wikipedia.org/ (depth=1)
Worker 4 crawling: https://fr.wikipedia.org/ (depth=1)
Worker 5 crawling: https://es.wikipedia.org/ (depth=1)
Worker 6 crawling: https://zh.wikipedia.org/ (depth=1)
Worker 7 crawling: https://it.wikipedia.org/ (depth=1)
Worker 8 crawling: https://pl.wikipedia.org/ (depth=1)
Worker 9 crawling: https://pt.wikipedia.org/ (depth=1)
Worker 1 added 38 links at depth 2
Worker 1 crawling: https://ar.wikipedia.org/ (depth=1)
Worker 0 added 39 links at depth 2
Worker 0 crawling: https://fa.wikipedia.org/ (depth=1)
Worker 2 added 50 links at depth 2
Worker 2 crawling: https://arz.wikipedia.org/ (depth=1)
Worker 3 added 40 links at depth 2
Worker 3 crawling: https://nl.wikipedia.org/ (depth=1)