# playwright_

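> Async Playwright helpers for launching a browser, collecting links from a page, and crawling a single domain while saving its HTML pages and linked documents.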

In [None]:
#| default_exp playwright_

In [None]:
#| export
import asyncio
from playwright.async_api import async_playwright, Page, Playwright, Browser
import traceback
import re
from fastcore.all import *
from urllib.parse import urlparse, urlencode, quote_plus, unquote
from scraper.core import *
import requests
from collections import deque

In [None]:
#| export

DEBUG = True  # when truthy, browsers are launched with a visible window (headless off)

# Short browser name -> accessor returning the matching browser type of a Playwright object.
BROWSERS = {
    "ch": lambda pw: pw.chromium,
    "ff": lambda pw: pw.firefox,
    "wk": lambda pw: pw.webkit,
}

async def get_brow(pw: Playwright, brow_n: str) -> Browser:
    """
    Launch and return the browser selected by the short name `brow_n`.

    Supported names (the keys of `BROWSERS`):
    ch -> chromium
    ff -> firefox
    wk -> webkit

    The browser is launched headless unless `DEBUG` is truthy.

    Raises:
        ValueError: If `brow_n` is not a key of `BROWSERS`.
    """
    browser_func = BROWSERS.get(brow_n)
    if browser_func is None:
        raise ValueError(f"Unknown browser: {brow_n}")
    return await browser_func(pw).launch(headless=not DEBUG)

In [None]:
async with async_playwright() as pw:
    async def _test(bn):
        browser = await get_brow(pw, bn)
        assert browser is not None
        print(f"Launched {bn}")
        await browser.close()

    await asyncio.gather(
        _test("ch"),
        _test("ff"),
        _test("wk"),
    )

Launched wk
Launched ch
Launched ff


In [None]:
#| export 
async def get_href(page:Page):
    """
    Collect the `href` attribute of every `<a>` tag on `page` and return
    only the values accepted by `valid_href`.

    Args:
        page (Page): The Playwright page to scan for anchor tags.
    Returns:
        list: The href values for which `valid_href` returned a truthy result.
    Raises:
        Exception: Any error raised while scanning the page is reported
            (page URL and traceback are printed) and then re-raised unchanged.
    """
    try:
        anchors = await page.query_selector_all('a')
        links = [await tag.get_attribute('href') for tag in anchors]
        return [link for link in links if valid_href(link)]
    except Exception:
        # `page.url` is a plain property (a str), not an awaitable; awaiting it
        # raised a TypeError here that masked the original error.
        print(f"failed for {page.url}")
        traceback.print_exc()
        raise

In [None]:
# Smoke test: open the nbdev home page in Chromium and collect its links.
async with async_playwright() as pw:
    brow = await get_brow(pw, "ch")
    page = await brow.new_page()
    await page.goto('https://nbdev.fast.ai/') 
    hrefs = await get_href(page)
    await page.close(); await brow.close()

# `hrefs` stays in the notebook namespace and is reused by the next cell.
assert len(hrefs) != 0, "Expected href to contain links, but it is empty."
print(f"{hrefs=}")

hrefs=['https://nbdev.fast.ai/', 'https://nbdev.fast.ai/getting_started.html', 'https://nbdev.fast.ai/tutorials/tutorial.html', 'https://nbdev.fast.ai/blog/', 'https://nbdev.fast.ai/#', 'https://github.com/fastai/nbdev/issues', 'https://forums.fast.ai/', 'https://nbdev.fast.ai/getting_started.html#faq', 'https://github.com/fastai/nbdev', 'https://twitter.com/fastdotai', 'https://nbdev.fast.ai/getting_started.html', 'https://nbdev.fast.ai/getting_started.html', 'https://github.com/fastai/nbdev/issues/new']


In [None]:
# Show what `hydrate_links` returns for each collected href on the nbdev domain.
for i in hrefs:
    print(f"{i=} -> ", hydrate_links("nbdev.fast.ai", i) )

i='https://nbdev.fast.ai/' ->  https://nbdev.fast.ai
i='https://nbdev.fast.ai/getting_started.html' ->  https://nbdev.fast.ai/getting_started.html
i='https://nbdev.fast.ai/tutorials/tutorial.html' ->  https://nbdev.fast.ai/tutorials/tutorial.html
i='https://nbdev.fast.ai/blog/' ->  https://nbdev.fast.ai/blog
i='https://nbdev.fast.ai/#' ->  None
i='https://github.com/fastai/nbdev/issues' ->  None
i='https://forums.fast.ai/' ->  None
i='https://nbdev.fast.ai/getting_started.html#faq' ->  None
i='https://github.com/fastai/nbdev' ->  None
i='https://twitter.com/fastdotai' ->  None
i='https://nbdev.fast.ai/getting_started.html' ->  https://nbdev.fast.ai/getting_started.html
i='https://nbdev.fast.ai/getting_started.html' ->  https://nbdev.fast.ai/getting_started.html
i='https://github.com/fastai/nbdev/issues/new' ->  None


In [None]:
#| hide
# Scratch values for the manual tests in this notebook.
url = 'https://nbdev.fast.ai/'
fn = "text"
# Host part of `url` (the network location returned by `urlparse`).
local_domain = urlparse(url).netloc
local_domain

'nbdev.fast.ai'

In [None]:
#| hide
# Create ../<fn>/<local_domain> (including missing parents) if it does not exist yet.
domain_dir = Path(f"../{fn}/{local_domain}")
print(domain_dir)
domain_dir.mkdir(exist_ok=True, parents=True)


../text/nbdev.fast.ai


In [None]:
#| export
def log_err(func, url, err):
    """Print the name of the failing function, the URL it was handling, and the error."""
    print(f"Error in {func=}\n{url=}\n{err}")

def download_file(url:str, fn:Path='.', timeout:float=30):
    """
    Download `url` and save the response body to `fn`.

    Args:
        url (str): The URL from which to download the file.
        fn (Path): Destination file path. If it is an existing directory
            (including the default '.'), the file is saved inside it under
            the last path segment of `url` (or 'download' if there is none).
        timeout (float): Seconds passed to `requests.get` as its timeout,
            so an unresponsive server cannot block the caller forever.

    No exception is raised to the caller: request and file errors are logged
    with a traceback, and a non-200 response is logged and nothing is written.
    """
    try:
        dest = Path(fn)
        if dest.is_dir():
            # A directory cannot be opened for writing; derive a file name from the URL.
            dest = dest / (Path(urlparse(url).path).name or 'download')
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            # Previously dropped silently; report it so missing files can be traced.
            log_err('download_file', url, f"unexpected HTTP status {resp.status_code}")
            return
        with open(dest, 'wb') as f:
            f.write(resp.content)
    except Exception as e:
        log_err('download_file', url, e)
        traceback.print_exc()
# NOTE(review): this cleanup sits in an `#| export` cell, so it presumably also runs
# whenever the exported module is imported; consider moving it to a non-exported cell.
Path('Test/dummy.pdf').unlink(missing_ok=True)

In [None]:
#| hide
# Scratch directory that receives the files downloaded by the tests below.
dir_n = Path('Test/')
dir_n.mkdir(parents=True, exist_ok=True)

In [None]:
# Network test: fetch a small public PDF and check that a non-empty file was written.
download_file('https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf', 
              Path('Test/dummy.pdf'))
assert Path('Test/dummy.pdf').exists(), "File not found"
assert Path('Test/dummy.pdf').stat().st_size > 0, "File is empty"

In [None]:
#| export
async def download_html(brow:Browser, url:str, fn:Path):
    """
    Open `url` in a new page of `brow`, write the page's HTML to `fn`
    (UTF-8), and return the links found on the page via `get_href`.

    Any exception is logged with a traceback and an empty list is returned
    instead. The page is always closed before returning.
    """
    tab = await brow.new_page()
    try:
        await tab.goto(url)
        # The file is opened before the content is fetched, as in the original flow.
        with open(fn, 'w', encoding='utf-8') as out:
            out.write(await tab.content())
        found = await get_href(tab)
    except Exception as err:
        log_err('download_html', url, err)
        traceback.print_exc()
        found = []
    finally:
        await tab.close()
    return found

In [None]:
# Smoke test: saving the page at `url` should also yield at least one link.
async with async_playwright() as pw:
    brow = await get_brow(pw, "ch")
    assert len(await download_html(brow, url, Path('./Text/'))) > 0 
    await brow.close()

In [None]:
#| export
import traceback

async def crawl(url: str, dir_n: Path, brow_typ: str = "ch"):
    """
    Asynchronously crawls and downloads HTML and specific file types within a given domain.

    Starting from `url`, each pending URL is mapped to a file name with
    `get_fn_from_url`. Names ending in `html` are saved with `download_html`
    and the links found there are queued after passing through
    `hydrate_links`; names with a known document extension are fetched with
    `download_file`; anything else is reported and skipped. The most recently
    queued URL is processed first.

    Args:
        url (str): The starting URL for crawling.
        dir_n (Path): The directory path where downloaded files will be saved;
            a sub-directory named after the URL's host is created inside it.
        brow_typ (str): The browser type (default is "ch").
    """
    local_domain = urlparse(url).netloc
    queue = deque([url])
    seen = set()
    # Extensions fetched as-is with `download_file`; built once instead of per URL.
    file_exts = frozenset(['pdf', 'doc', 'docx', 'odt', 'xls', 'xlsx', 'ppt', 'pptx', 'txt', 'csv'])

    # Create directory for the domain
    dir_n = dir_n / local_domain
    dir_n.mkdir(parents=True, exist_ok=True)

    async with async_playwright() as pw:
        brow = await get_brow(pw, brow_typ)
        try:
            while queue:
                cur_url = queue.pop()
                if not cur_url or cur_url in seen:
                    continue
                # Mark before processing so a URL that fails is not retried
                # every time another page links to it.
                seen.add(cur_url)
                try:
                    fn = get_fn_from_url(cur_url)
                    f_ext = fn.split('.')[-1]  # Get file extension

                    if f_ext == 'html':
                        links = await download_html(brow, cur_url, dir_n / fn)
                        for link in links:
                            target = hydrate_links(local_domain, link)
                            # Skip empty results and URLs that were already handled.
                            if target and target not in seen:
                                queue.append(target)
                    elif f_ext in file_exts:
                        download_file(cur_url, dir_n / fn)
                    else:
                        print(f"Cannot process url={cur_url!r}")
                except Exception:
                    print(f"Error processing url={cur_url!r}:")
                    traceback.print_exc()
        finally:
            # Close the browser even if something unexpected escapes the loop.
            await brow.close()


In [None]:
# End-to-end run: crawl the fastcore docs site and save the results under Test/.
await crawl('https://fastcore.fast.ai/', Path('Test/'))

In [None]:
#| hide
# The crawl above should have written at least one file for the domain.
assert len(Path('Test/fastcore.fast.ai').ls()) != 0
# Remove the scratch directory (IPython shell magic).
%rm -rf Test

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev; nbdev.nbdev_export()