In [9]:
#!/usr/bin/env python
import asyncio
import pyppeteer as ptr
from typing import Optional
import pandas as pd
from glob import glob
import nltk
from nltk.corpus import stopwords
import re
from typing import Callable

## Scraper.
async def download_html(browser: ptr.browser.Browser, url: str, selector: str) -> Optional[str]:
    """Open *url* in a new page and return the text of the first *selector* match.

    Args:
        browser: A running pyppeteer browser used to open the page.
        url: Address to navigate to; navigation waits for the "load" event.
        selector: Selector passed to ``page.querySelector``.

    Returns:
        The ``textContent`` of the matched element, or ``''`` when nothing
        matches the selector.
    """
    page = await browser.newPage()
    try:
        await page.goto(url, waitUntil="load")
        content = await page.querySelector(selector)
        if not content:
            return ''
        return await page.evaluate('(element) => element.textContent', content)
    finally:
        # Without this every scraped URL leaves a tab open for the lifetime
        # of the browser, even when navigation or evaluation raises.
        await page.close()

async def get_training_data_from_folder(folder_path: str) -> None:
    """Scrape every CSV found under *folder_path* and print the collected words.

    The folder is searched recursively for ``*.csv`` files. Each file is
    handed to ``get_training_data`` and the results are printed as a dict
    keyed by file path.

    Args:
        folder_path: Root directory to search for CSV files.
    """
    browser = await ptr.launch(headless=True)
    try:
        files = glob(folder_path + '/**/*.csv', recursive=True)

        words = dict()

        for file in files:
            words[file] = await get_training_data(browser, file)

        print(words)
    finally:
        # Always shut the headless browser down, even when a file fails,
        # so the browser process is not left running.
        await browser.close()

async def get_training_data(browser: ptr.browser.Browser, file_path: str) -> list[tuple[str, str]]:
    """Scrape every row of the CSV at *file_path* and return the cleaned tokens.

    The CSV is read with pandas and its column names are stripped of
    surrounding whitespace. Each row's ``link`` and ``selector`` columns are
    passed to ``download_html``; the returned text is tokenized and filtered
    with ``clean_up_words``.

    Args:
        browser: A running pyppeteer browser shared across all rows.
        file_path: Path of the CSV file to read.

    Returns:
        The cleaned ``(token, tag)`` pairs of all rows, in row order.
    """
    df = pd.read_csv(file_path)
    # Headers such as "link, selector" would otherwise keep their padding.
    df.rename(columns=lambda x: x.strip(), inplace=True)

    words = []
    if isinstance(df, pd.DataFrame):
        for _, row in df.iterrows():
            result = await download_html(browser, row["link"], row["selector"])
            # download_html is declared Optional[str]; nothing to tokenize
            # when the page yielded no text.
            if not result:
                continue
            words.extend(clean_up_words(tokenize(result)))

    return words

## Cleaners.
def tokenize(data: str):
    """Split *data* with ``nltk.word_tokenize`` and tag the result with ``nltk.pos_tag``.

    Returns whatever ``nltk.pos_tag`` produces for the tokenized text.
    """
    return nltk.pos_tag(nltk.word_tokenize(data))

def filter_words(x: tuple[str, str], fns: list[Callable[[tuple[str, str]], bool]], keep=True) -> bool:
    """Return True when *x* passes every predicate in *fns*.

    Predicates are applied in order and evaluation stops at the first one
    that returns a falsy value. A falsy *keep* rejects *x* without calling
    any predicate, and an empty *fns* accepts it.
    """
    if not keep:
        return False

    # ``fns or ()`` keeps a falsy ``fns`` (e.g. an empty list) accepting x.
    return all(predicate(x) for predicate in (fns or ()))
    
def filter_by_duplicate(x: tuple[str, str]) -> bool:
    """Return True when the two elements of *x* differ.

    NOTE(review): presumably meant to drop tokens whose tag is the token
    itself (e.g. punctuation) - confirm against the tagger's output.
    """
    first = x[0]
    second = x[1]
    return first != second

# Lazily-built cache of the English stop-word list, filled on first use so
# that defining this function does not require the corpus to be loaded.
_ENGLISH_STOP_WORDS = None


def filter_by_stop_word(x: tuple[str, str]) -> bool:
    """Return True when the token ``x[0]`` is not an English stop word.

    The stop-word list comes from ``stopwords.words("english")``. It is
    fetched once and kept as a frozenset, instead of being rebuilt and
    scanned linearly for every token.

    NOTE(review): the comparison is case-sensitive, as in the original;
    confirm whether capitalised tokens such as "The" should also be dropped.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return x[0] not in _ENGLISH_STOP_WORDS

# Compiled once at import time: tokens made up only of ASCII letters and
# apostrophes (e.g. "don't").
_ALPHABETIC_WORD = re.compile("^([a-zA-Z]|')+$")


def filter_by_alphabet(x: tuple[str, str]) -> bool:
    """Return True when the token ``x[0]`` has only ASCII letters and apostrophes.

    Returns a real ``bool`` as the annotation promises, rather than the
    ``re.Match`` object or ``None`` produced by ``Pattern.match``.
    """
    return _ALPHABETIC_WORD.match(x[0]) is not None
            
def clean_up_words(words: list[tuple[str, str]]) -> list[tuple[str, str]]:
    """Return the entries of *words* that pass all three word filters.

    An entry is kept only when ``filter_by_duplicate``,
    ``filter_by_stop_word`` and ``filter_by_alphabet`` all accept it, checked
    in that order through ``filter_words``. Input order is preserved.
    """
    checks = [filter_by_duplicate, filter_by_stop_word, filter_by_alphabet]
    return [tagged for tagged in words if filter_words(tagged, checks)]

In [10]:
# type: ignore
await (get_training_data_from_folder("../resources/sources"))

