# In [None]:
#!/usr/bin/env python
import asyncio
import pyppeteer as ptr
from typing import Optional
import pandas as pd
from glob import glob


async def download_html(browser: ptr.browser.Browser, url: str, selector: str) -> Optional[str]:
    """Load *url* in a new page and return the text of the matched element.

    A fresh page is opened on ``browser``, navigated to ``url`` (waiting for
    the ``load`` event), and queried with the CSS ``selector``. When an element
    matches, its ``textContent`` is printed and returned.

    Args:
        browser: A running pyppeteer browser used to open the page.
        url: Address of the document to load.
        selector: CSS selector identifying the element whose text is wanted.

    Returns:
        The element's ``textContent``, or ``None`` when nothing matches
        ``selector``.
    """
    page = await browser.newPage()
    try:
        await page.goto(url, waitUntil="load")
        content = await page.querySelector(selector)
        if content is None:
            return None

        html = await page.evaluate('(element) => element.textContent', content)
        print(html)
        return html
    finally:
        # Always release the page: otherwise every call leaves a tab open in
        # the shared browser, even when navigation or the query raises.
        await page.close()

async def get_training_data_from_folder(folder_path: str) -> None:
    """Process every CSV file found (recursively) under *folder_path*.

    A single headless browser is launched and shared across all files. The
    list of discovered CSV paths is printed, then each file is handed to
    ``get_training_data`` in turn.

    Args:
        folder_path: Directory that is searched recursively for ``*.csv``
            files.
    """
    browser = await ptr.launch(headless=True)
    try:
        files = glob(folder_path + '/**/*.csv', recursive=True)
        print(files)

        for file in files:
            await get_training_data(browser, file)
    finally:
        # Shut the browser down even if a file fails part-way through, so a
        # headless browser process is not left running after an error.
        await browser.close()

async def get_training_data(browser: ptr.browser.Browser, file_path: str) -> None:
    """Download the selected content for each row of the CSV at *file_path*.

    Column names are stripped of surrounding whitespace, after which every row
    must provide a ``link`` and a ``selector`` value; both are passed to
    ``download_html`` together with ``browser``.

    Args:
        browser: A running pyppeteer browser shared by all downloads.
        file_path: Path of the CSV file to read.
    """
    table = pd.read_csv(file_path)
    # Header cells may carry stray spaces (e.g. "link, selector").
    table.columns = [name.strip() for name in table.columns]

    if not isinstance(table, pd.DataFrame):
        return

    for _, record in table.iterrows():
        link = record["link"]
        css_selector = record["selector"]
        await download_html(browser, link, css_selector)

await (
    get_training_data_from_folder("../resources/sources")
)


# Output: ['../resources/sources/depression.csv', '../resources/sources/migraine.csv', '../resources/sources/tetanus.csv']
