# scrap

> Playwright helpers for launching a browser by short name and collecting the links found on a page.

In [None]:
#| default_exp scrap

In [9]:
#| export
import asyncio
from playwright.async_api import async_playwright, Page, Playwright
import traceback
from scraper.core import *

In [10]:
#| export

# When True, `get_brow` launches browsers with a visible window (headless off).
DEBUG = True

# Short browser name -> accessor that picks the matching attribute off the
# Playwright object it is given.
BROWSERS = dict(
    ch=lambda pw: pw.chromium,
    ff=lambda pw: pw.firefox,
    wk=lambda pw: pw.webkit,
)

async def get_brow(pw: Playwright, brow_n: str):
    """
    Launch and return the browser selected by the short name `brow_n`.

    Supported short names (the keys of `BROWSERS`):
    ch -> chromium
    ff -> firefox
    wk -> webkit

    The browser is launched with a visible window while the module-level
    `DEBUG` flag is truthy, and headless otherwise.

    Raises `ValueError` when `brow_n` is not one of the supported names.
    """
    browser_func = BROWSERS.get(brow_n)
    if browser_func is None:
        raise ValueError(f"Unknown browser: {brow_n}")
    # Headless mode is the opposite of DEBUG; `not DEBUG` avoids comparing
    # against the literal `False`.
    return await browser_func(pw).launch(headless=not DEBUG)

In [12]:
async with async_playwright() as pw:
    async def _test(bn):
        browser = await get_brow(pw, bn)
        assert browser is not None
        print(f"Launched {bn}")
        await browser.close()

    await asyncio.gather(
        _test("ch"),
        _test("ff"),
        _test("wk"),
    )

Launched ch
Launched wk
Launched ff


In [14]:
#| export 
async def get_href(page: Page):
    """
    Return the `href` values of the `<a>` tags on `page` that pass `valid_href`.

    Every `a` element on the page is queried, its `href` attribute is read,
    and only the values for which `valid_href` returns a truthy result are
    kept (presumably this drops the hrefs listed in `ignore_href`; see
    `scraper.core` to confirm).

    On any failure the page URL and the traceback are printed and the
    original exception is re-raised.
    """
    try:
        anchors = await page.query_selector_all('a')
        links = [await tag.get_attribute('href') for tag in anchors]
        return [link for link in links if valid_href(link)]
    except Exception:
        # `page.url` is a plain string property, not a coroutine. Awaiting it
        # raised a TypeError inside this handler, which hid the real error and
        # skipped both the traceback print and the re-raise.
        print(f"failed for {page.url}")
        traceback.print_exc()
        raise

In [15]:
async with async_playwright() as pw:
    brow = await get_brow(pw, "ch")
    page = await brow.new_page()
    await page.goto('https://nbdev.fast.ai/') 
    hrefs = await get_href(page)
    await page.close(); await brow.close()

assert len(hrefs) != 0, "Expected href to contain links, but it is empty."
print(f"{hrefs=}")

hrefs=['https://nbdev.fast.ai/', 'https://nbdev.fast.ai/getting_started.html', 'https://nbdev.fast.ai/tutorials/tutorial.html', 'https://nbdev.fast.ai/blog/', 'https://nbdev.fast.ai/#', 'https://github.com/fastai/nbdev/issues', 'https://forums.fast.ai/', 'https://nbdev.fast.ai/getting_started.html#faq', 'https://github.com/fastai/nbdev', 'https://twitter.com/fastdotai', 'https://nbdev.fast.ai/getting_started.html', 'https://nbdev.fast.ai/getting_started.html', 'https://github.com/fastai/nbdev/issues/new']


In [16]:
# Show what `hydrate_links` returns for each collected href on this domain.
domain = "nbdev.fast.ai"
for i in hrefs:
    hydrated = hydrate_links(domain, i)
    print(f"{i=} -> ", hydrated)

i='https://nbdev.fast.ai/' ->  https://nbdev.fast.ai
i='https://nbdev.fast.ai/getting_started.html' ->  https://nbdev.fast.ai/getting_started.html
i='https://nbdev.fast.ai/tutorials/tutorial.html' ->  https://nbdev.fast.ai/tutorials/tutorial.html
i='https://nbdev.fast.ai/blog/' ->  https://nbdev.fast.ai/blog
i='https://nbdev.fast.ai/#' ->  https://nbdev.fast.ai
i='https://github.com/fastai/nbdev/issues' ->  None
i='https://forums.fast.ai/' ->  None
i='https://nbdev.fast.ai/getting_started.html#faq' ->  https://nbdev.fast.ai/getting_started.html#faq
i='https://github.com/fastai/nbdev' ->  None
i='https://twitter.com/fastdotai' ->  None
i='https://nbdev.fast.ai/getting_started.html' ->  https://nbdev.fast.ai/getting_started.html
i='https://nbdev.fast.ai/getting_started.html' ->  https://nbdev.fast.ai/getting_started.html
i='https://github.com/fastai/nbdev/issues/new' ->  None


In [17]:
#| hide
import nbdev

# Export the cells marked `#| export` into the package module.
nbdev.nbdev_export()