In [None]:
#!/usr/bin/env python
import asyncio
import pyppeteer as ptr
from typing import Optional
import pandas as pd
from glob import glob
import nltk
from nltk.corpus import stopwords

async def download_html(browser: ptr.browser.Browser, url: str, selector: str) -> Optional[str]:
    """Return the text content of the first element matching *selector* at *url*.

    A new page is opened in *browser*, navigated to *url* (waiting for the
    ``load`` event), and queried with *selector*.

    Args:
        browser: The pyppeteer browser used to open the page.
        url: Address of the page to load.
        selector: Selector passed to ``page.querySelector``.

    Returns:
        The matched element's ``textContent``, or an empty string when no
        element matches the selector.
    """
    page = await browser.newPage()
    try:
        await page.goto(url, waitUntil="load")
        element = await page.querySelector(selector)
        if not element:
            return ''
        return await page.evaluate('(element) => element.textContent', element)
    finally:
        # Always release the page; otherwise every call leaves a tab open in
        # the shared browser, including when navigation raises.
        await page.close()

async def get_training_data_from_folder(folder_path: str) -> None:
    """Process every CSV file found under *folder_path*, recursively.

    A single headless browser is launched and shared across all files; each
    file is handed to ``get_training_data`` in turn.

    Args:
        folder_path: Directory searched with the ``/**/*.csv`` glob pattern.
    """
    browser = await ptr.launch(headless=True)
    try:
        for csv_path in glob(folder_path + '/**/*.csv', recursive=True):
            await get_training_data(browser, csv_path)
    finally:
        # Close the browser even when a file fails, so the headless browser
        # process is not left running after an error.
        await browser.close()

async def get_training_data(browser: ptr.browser.Browser, file_path: str) -> None:
    """Download and clean the text for every row of the CSV at *file_path*.

    Column names are stripped of surrounding whitespace before use. For each
    row, the ``link`` and ``selector`` columns are passed to ``download_html``
    and the resulting text is tokenized and handed to ``clean_up_words``.

    Args:
        browser: The pyppeteer browser forwarded to ``download_html``.
        file_path: Path of the CSV file to read.
    """
    frame = pd.read_csv(file_path)
    frame = frame.rename(columns=lambda name: name.strip())

    if not isinstance(frame, pd.DataFrame):
        return

    for _, record in frame.iterrows():
        page_text = await download_html(browser, record["link"], record["selector"])
        tagged = await tokenize(page_text)
        await clean_up_words(tagged)
            
async def tokenize(data: str):
    """Tokenize *data* with ``nltk.word_tokenize`` and tag it with ``nltk.pos_tag``.

    Returns:
        The output of ``nltk.pos_tag`` for the tokens of *data*.
    """
    return nltk.pos_tag(nltk.word_tokenize(data))

            
async def clean_up_words(words: list[tuple[str, str]]):
    """Drop English stop words and punctuation from tagged tokens.

    Args:
        words: ``(token, tag)`` pairs, as produced by ``tokenize``.

    Returns:
        The pairs whose token is not an English stop word (compared
        case-insensitively) and whose token differs from its tag. The same
        list is also printed.
    """
    # Build the lookup once: calling stopwords.words() per token re-reads the
    # list each time, and a set gives constant-time membership tests.
    stop_words = set(stopwords.words("english"))

    cleaned = [
        pair
        for pair in words
        # The NLTK stop-word list is lowercase, so lowercase the token to
        # catch capitalized forms such as "The".
        if pair[0].lower() not in stop_words
        # A pair whose token equals its tag is treated as punctuation.
        and pair[0] != pair[1]
    ]

    # Kept so the result stays visible in notebook output.
    print(cleaned)
    return cleaned

# Entry point: process every CSV under ../resources/sources.
# NOTE(review): a module-level `await` only works where the host supports
# top-level await (e.g. an IPython/Jupyter cell). Run as a plain script, this
# would need to go through asyncio.run(...) instead — confirm intended usage.
await (
    get_training_data_from_folder("../resources/sources")
)
