In [3]:
#!/usr/bin/env python
import asyncio
import pyppeteer as ptr
from typing import Optional
import pandas as pd
from glob import glob
import nltk
from nltk.corpus import stopwords
import re
from typing import Callable
import csv

In [4]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [5]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/urmzd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/urmzd/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/urmzd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
PosTag = tuple[str, str]
PosTagList = list[PosTag]

In [10]:
## Scraper.
async def download_html(browser: ptr.browser.Browser, page: ptr.page.Page, url: str, selector: str) -> Optional[str]:
    await page.goto(url, waitUntil="load", timeout=0)
    content = await page.querySelector(selector)

    html = ''
    if content:
        html = await page.evaluate('(element) => element.textContent', content)
        
    return html

def write_to_resource_target(file_path: str, content: PosTagList) -> None:
    with open(file_path, "w") as file:
        writer = csv.writer(file)
        writer.writerow(["value", "tag"])
        writer.writerows(content)


async def get_training_data_from_folder(source_path: str, target_path: str, force=False) -> None:
    browser = await ptr.launch(headless=True)
    page = await browser.newPage()

    source_files = glob(source_path + '/**/*.csv', recursive=True)
    target_files = glob(target_path + "/**/*.csv", recursive=True)
    target_file_names = [file_path.split("/")[-1] for file_path in target_files]
    
    print(source_files, source_path, target_files, target_path, sep="\n")
    words = dict()
    
    for file_path in source_files:
        file_name = file_path.split("/")[-1]
        # ~ A ^ ~ B
        print(file_name, target_file_names)
        print(file_name in target_file_names)
        if not (file_name in target_file_names or force):
            result = await get_training_data(browser, page, file_path)
            write_to_resource_target(f"{target_path}/{file_name}", result)
            words[file_name] = result
        
                
    await browser.close()

async def get_training_data(browser: ptr.browser.Browser, page: ptr.page.Page, file_path: str) -> PosTagList:
    df = pd.read_csv(file_path)
    df.rename(columns=lambda x: x.strip(), inplace=True)

    words = []
    if isinstance(df, pd.DataFrame):
        for _, row in df.iterrows():
            print(row)
            result = await download_html(browser, page, row["link"], row["selector"])
            words.extend(clean_up_words(tokenize(result)))
    
    return words

In [11]:
## Cleaners.
def tokenize(data: str):
    tokenized_words = nltk.word_tokenize(data)
    tagged_words = nltk.pos_tag(tokenized_words)
    return tagged_words

def filter_words(x: PosTag, fns: list[Callable[[PosTag], bool]], keep=True) -> bool:
    if not keep:
        return False
    
    if fns:
        return filter_words(x, fns[1:], keep=fns[0](x))
    
    return True
    
def filter_by_duplicate(x: tuple[str, str]) -> bool:
    return x[0] != x[1]

def filter_by_stop_word(x: tuple[str, str]) -> bool:
    return x[0] not in stopwords.words("english")

def filter_by_alphabet(x: tuple[str, str]) -> bool:
    regex = re.compile("^([a-zA-Z]|')+$")
    return regex.match(x[0])
            
def clean_up_words(words: PosTagList) -> PosTagList:
    return list(filter(
        lambda x: filter_words(x, [filter_by_duplicate, filter_by_stop_word, filter_by_alphabet]),
        words
        )
    )

In [12]:
# type: ignore
await (get_training_data_from_folder("../resources/sources", "../resources/targets"))

['../resources/sources/depression.csv', '../resources/sources/migraine.csv', '../resources/sources/tetanus.csv']
../resources/sources
['../resources/targets/headache.csv', '../resources/targets/depression.csv']
../resources/targets
depression.csv ['headache.csv', 'depression.csv']
True
migraine.csv ['headache.csv', 'depression.csv']
False
selector                                           #topicText
link        https://www.uptodate.com/contents/acute-treatm...
Name: 0, dtype: object
selector                                        #main-content
link        https://www.mayoclinic.org/diseases-conditions...
Name: 1, dtype: object
selector                          #mw-content-text
link        http://en.wikipedia.org/wiki/Migraines
Name: 2, dtype: object
selector                 #maincontent > article > div > div
link        https://www.nhs.uk/conditions/migraine/symptoms
Name: 3, dtype: object
selector    #__next > div.css-fdjy12 > div.css-stl7tm > di...
link        https://www.healthline.