# crawler

> Crawler with callbacks

In [None]:
#| default_exp crawler

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export 
from pw.core import  *
from pw.helper import *
from operator import attrgetter
import inspect
from fastcore.all import *
import asyncio

In [None]:
#| export 
#| hide
class Callback:
    """Base class for callbacks; `order` is the key callbacks are sorted by before their hooks run."""
    order = 0

async def run_cbs(cbs, method_nm, crawler=None, *args, **kwargs):
    """Invoke the hook named `method_nm` on every callback in `cbs`, lowest `order` first.

    Callbacks that lack the hook (or whose attribute is falsy) are skipped.
    Each hook is called as `hook(crawler, *args, **kwargs)`. When the call
    yields an awaitable it is awaited before the next callback is touched, so
    hooks run strictly one after another.
    """
    for cb in sorted(cbs, key=attrgetter('order')):
        method = getattr(cb, method_nm, None)
        if not method: continue
        result = method(crawler, *args, **kwargs)
        # Inspecting the result rather than the function means async hooks hidden
        # behind lambdas or other sync wrappers are awaited too, instead of
        # leaving an un-awaited coroutine behind.
        if inspect.isawaitable(result): await result

class with_cbs:
    """Decorator factory that brackets a method call with `before_`/`after_`/`cleanup_` callbacks.

    `with_cbs('visit')` wraps a method so that calling it runs, in order,
    `o.callback('before_visit')`, the method itself and
    `o.callback('after_visit')`; `o.callback('cleanup_visit')` always runs
    last. The wrapper returns whatever the wrapped method returned (or `None`
    if the call was cancelled before the method finished).

    If a module-level exception class named `Cancel<Nm>Exception` (for example
    `CancelVisitException`) exists, raising it anywhere in that sequence is
    swallowed; every other exception propagates unchanged.

    NOTE(review): the wrapper is synchronous and does not await `o.callback`,
    so it fits objects whose `callback` is a plain function -- confirm before
    using it with an async `callback`.
    """
    def __init__(self, nm): self.nm = nm

    def __call__(self, f):
        from functools import wraps  # local import keeps this block self-contained

        @wraps(f)
        def _f(o, *args, **kwargs):
            res = None
            try:
                o.callback(f'before_{self.nm}')
                print(self.nm)
                res = f(o, *args, **kwargs)
                o.callback(f'after_{self.nm}')
            # `.get(..., ())` matches nothing when no Cancel exception class is
            # defined; a bare `globals()[...]` lookup would raise KeyError here
            # and hide the exception that was actually raised.
            except globals().get(f'Cancel{self.nm.title()}Exception', ()): pass
            finally: o.callback(f'cleanup_{self.nm}')
            return res
        return _f

# Web Crawler with Callback System

This crawler implements a flexible web scraping system with callback hooks for extensibility, inspired by fastai's [callback system](https://docs.fast.ai/callback.core.html).<br>
![image](flow.png)
## Architecture
The crawler exposes two main callback hooks, `before_visit` and `after_visit`; each callback's `order` attribute controls the order in which callbacks run.

## Key Features
1. **Parallel Processing**: 
   - Configurable number of pages (`np`) for concurrent processing
   - Efficient browser resource management

2. **URL Management**:
   - Input: List of URLs to visit (`to_visit`)
   - Tracks progress through callback-accessible collections:
     - `visited`: Already processed URLs
     - `unvisited`: Pending URLs
     - `visit_window`: Current batch of URLs (at most `np`)

3. **Callback System**:
   - Extensible through custom callbacks
   - Ordered execution (`order`)
   - Full access to crawler state

In [None]:
#| export
class Crawl():
    """Crawl a collection of URLs in batches of up to `np` pages, with callback hooks.

    Attributes set by `__init__`:
      - `np`: number of pages requested from `setup_browser`, and the maximum
        number of URLs visited per batch.
      - `visited` / `unvisited`: sets of URLs already processed / still pending.
        Callbacks may add to `unvisited` to extend the crawl.
      - `cbs`: the callbacks, dispatched through `run_cbs`.

    `run` additionally sets `pages`, `brow` and `ctx` from the object yielded by
    `setup_browser`, and `visit_window`, the list of URLs in the current batch.
    """
    def __init__(self, np: int = 1, to_visit: Optional[List[str]] = None, cbs=None): # type: ignore
        self.np = np
        self.visited = set()
        # `to_visit` defaults to None, which `set()` cannot consume; start empty instead.
        self.unvisited = set(to_visit or ())
        self.cbs = L(cbs)

    async def one_visit(self, idx):
        """Load URL `idx` of the current batch in page `idx`, firing the visit hooks around it."""
        page = self.pages[idx]

        await self.callback('before_visit', idx)

        await page.goto(self.visit_window[idx])
        await page.wait()

        await self.callback('after_visit', idx)

    async def run(self, stealth: bool = False, **kwargs):
        """Visit pending URLs batch by batch until none are left to visit.

        `stealth` and any extra keyword arguments are forwarded to `setup_browser`.
        Nothing is visited when the yielded browser object's `is_valid` is falsy.
        """
        async with setup_browser(n=self.np, stealth = stealth, **kwargs) as obj:
            if not obj.is_valid: return
            self.pages, self.brow, self.ctx = obj.pages, obj.brow, obj.ctx

            while self.unvisited:
                # Next batch: pending URLs that have not been visited yet, capped at `np`.
                self.visit_window = list(self.unvisited - self.visited)[:self.np]
                if not self.visit_window: break

                # Page i handles URL i of the batch; the whole batch runs concurrently.
                await asyncio.gather(*(self.one_visit(i) for i in range(len(self.visit_window))))

                # Move the finished batch from pending to done.
                batch = set(self.visit_window)
                self.unvisited.difference_update(batch)
                self.visited.update(batch)

    def __getattr__(self, name):
        """Expose `self.before_<x>` / `self.after_<x>` as shortcuts for `self.callback('<name>', ...)`."""
        if name.startswith(('before_', 'after_')): return partial(self.callback, name)
        raise AttributeError(name)

    async def callback(self, method_nm, *args, **kwargs ):
        """Run hook `method_nm` on every registered callback, passing this crawler first."""
        await run_cbs(self.cbs, method_nm, self, *args, **kwargs)

In [None]:
#| hide
class CB(Callback):
    """Demo callback that prints a line when it is created and whenever one of its visit hooks fires."""
    def __init__(self):
        print(f"__init__ " )

    def before_visit(self, crawler, idx):
        print(f"before_visit  {idx=}" )

    def after_visit(self, crawler, idx):
        # Label the line with this hook's own name so before/after output can be told apart.
        print(f"after_visit  {idx=}" )

# Two URLs with np=1, so each batch holds a single URL; `headless=False` is forwarded to `setup_browser`.
C = Crawl(1, ['https://solveit.fast.ai/', 'https://fastcore.fast.ai/'], [CB()])
await C.run(headless=False)

__init__ 
before_visit  idx=0
before_visit  idx=0
before_visit  idx=0
before_visit  idx=0


In [None]:
#| hide
# Same two URLs with np=2, so both fit into one batch and are visited concurrently (page idx 0 and 1).
C = Crawl(2, ['https://solveit.fast.ai/', 'https://fastcore.fast.ai/'], [CB()])
await C.run(headless=False)

__init__ 
before_visit  idx=0
before_visit  idx=1
before_visit  idx=0
before_visit  idx=1


# __callback__ 

## Extract text for a given `xpath`

In [None]:
class GetTextCB(Callback):
    """After visiting the fastcore home page, check the text of its "Welcome to fastcore" span."""

    async def after_visit(self, crawler, idx):
        page = crawler.pages[idx]
        # Only the fastcore landing page is inspected; every other page is ignored.
        if page.url != 'https://fastcore.fast.ai/': return
        matches = await page.find_ele('//span[contains(text(), "Welcome to fastcore")]')
        if not matches: return
        assert await matches[0].get_text() == "Welcome to fastcore"

# Run the text check against both URLs; only the fastcore page triggers the assertion.
C = Crawl(2, ['https://solveit.fast.ai/', 'https://fastcore.fast.ai/'], [GetTextCB()])
await C.run(headless=False)

## Traverse all webpages within the same domain

In [None]:
#| export
class TraveseSameDomainCB(Callback):
    """
    Callback that queues the same-domain links found on each visited page.
    """
    def __init__(self, url):
        self.order = 1
        self.base_domain = domain(url)

    async def after_visit(self, crawler, idx):
        page = crawler.pages[idx]
        current = page.url
        # Pages whose final URL is outside the base domain are not mined for links.
        if domain(current) != self.base_domain: return

        found = await find_all_links(page)
        if not found: return

        # Keep links on the base domain that do not point back at the current resource ...
        same_site = [href for href in found
                     if self.base_domain == domain(href) and not is_same_resource(current, href)]
        # ... drop any containing an entry of IGNORE_EXT, and de-duplicate.
        fresh = {href for href in same_site if not any(ext in href for ext in IGNORE_EXT)}
        fresh.difference_update(crawler.visited)

        crawler.unvisited.update(fresh)

In [None]:
# Crawl outward from a single seed URL, then check that only same-domain URLs were visited
# and that nothing is left pending.
url = 'https://solveit.fast.ai/'
C = Crawl(5, [url], [TraveseSameDomainCB(url)])
await C.run(headless=True)
assert all([domain(i)==domain(url) for i in C.visited])
assert len(C.unvisited) == 0

### Crawl a URL and save it as Markdown

In [None]:
#| export
class ToMDCB(Callback):
    """
    Callback that saves each visited page as Markdown under `base_dir`.

    Every page gets its own directory, named by `url2fn(url)`, containing a
    single `index.md` with the text returned by the page's `h2md()` method.
    """
    def __init__(self, base_dir="PW"):
        self.order = 2
        self.base_dir = base_dir

    async def after_visit(self, crawler, idx):
        url = crawler.pages[idx].url
        P = Path(f"{self.base_dir}/{url2fn(url)}")
        P.mkdir(exist_ok=True, parents=True)
        fn = P/'index.md'
        print(f"writing to {fn=}")
        md_str = await crawler.pages[idx].h2md()
        # Pin the encoding: the platform default may be unable to encode non-ASCII page text.
        fn.write_text(md_str, encoding="utf-8")

In [None]:
# Combine both callbacks: link discovery has order 1 and the Markdown writer order 2,
# so on each page the links are queued before the page is written out.
url = 'https://solveit.fast.ai/'
C = Crawl(3, [url], [TraveseSameDomainCB(url), ToMDCB()])
await C.run(headless=False)

writing to fn=Path('PW/solveit_fast_ai/index.md')
writing to fn=Path('PW/solveit_fast_ai_privacy/index.md')
writing to fn=Path('PW/solveit_fast_ai_course_info/index.md')
writing to fn=Path('PW/solveit_fast_ai_terms/index.md')
writing to fn=Path('PW/solveit_fast_ai_learn_more/index.md')


In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()