In [None]:
#!/usr/bin/env python
import asyncio
import pyppeteer as ptr
from typing import Optional
import pandas as pd
from glob import glob
import nltk


async def download_html(browser: ptr.browser.Browser, url: str, selector: str) -> Optional[str]:
    """Return the text content of the first element matching ``selector`` at ``url``.

    Opens a new page in ``browser``, navigates to ``url`` waiting for the
    ``load`` event, and reads ``textContent`` of the matched element.

    Args:
        browser: Running pyppeteer browser used to open the page.
        url: Address of the page to load.
        selector: Selector handed to ``page.querySelector``.

    Returns:
        The element's ``textContent``, or an empty string when nothing
        matches ``selector``.
    """
    page = await browser.newPage()
    try:
        await page.goto(url, waitUntil="load")
        content = await page.querySelector(selector)
        if not content:
            return ''
        return await page.evaluate('(element) => element.textContent', content)
    finally:
        # Close the page even when navigation or evaluation fails; otherwise
        # every processed URL leaves an open page behind in the browser.
        await page.close()

async def get_training_data_from_folder(folder_path: str) -> None:
    """Process every ``.csv`` file found recursively under ``folder_path``.

    A single headless browser is launched and shared by all files; each file
    is handed to ``get_training_data`` in turn.

    Args:
        folder_path: Directory that is searched (including sub-directories)
            for ``*.csv`` files.
    """
    files = glob(folder_path + '/**/*.csv', recursive=True)
    if not files:
        # Nothing to process, so do not start a browser at all.
        return

    browser = await ptr.launch(headless=True)
    try:
        for file in files:
            await get_training_data(browser, file)
    finally:
        # Always shut the browser down, even if one of the files raised,
        # so the browser process is not left running.
        await browser.close()

async def get_training_data(browser: ptr.browser.Browser, file_path: str) -> None:
    """Download and tokenize the page text for every row of a CSV file.

    The CSV is read with pandas and its column names are stripped of
    surrounding whitespace. For each row the ``link`` and ``selector``
    columns are passed to ``download_html`` and the tokenized result is
    printed.

    Args:
        browser: Running pyppeteer browser forwarded to ``download_html``.
        file_path: Path of the CSV file to read.
    """
    df = pd.read_csv(file_path)
    df = df.rename(columns=lambda name: name.strip())

    for _, row in df.iterrows():
        link, selector = row["link"], row["selector"]
        # Empty cells are read as NaN; they cannot be navigated to or
        # queried, so incomplete rows are skipped.
        if pd.isna(link) or pd.isna(selector):
            continue
        # Headers are stripped above; strip the cell values the same way so
        # padded columns do not leak whitespace into the URL or selector.
        result = await download_html(browser, str(link).strip(), str(selector).strip())
        print(await tokenize(result))
            
async def tokenize(data: str):
    """Split ``data`` into word tokens and tag each with its part of speech.

    Returns the result of ``nltk.pos_tag`` applied to the output of
    ``nltk.word_tokenize(data)``.
    """
    return nltk.pos_tag(nltk.word_tokenize(data))

            
async def clean_up_data():
    """Placeholder for a data clean-up step; currently does nothing."""
    # TODO: do nltk stuff
    return None

await (
    get_training_data_from_folder("../resources/sources")
)


[('What', 'WP'), ('is', 'VBZ'), ('depression', 'NN'), ('?', '.'), ('—', 'JJ'), ('Depression', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('disorder', 'NN'), ('that', 'WDT'), ('makes', 'VBZ'), ('you', 'PRP'), ('sad', 'JJ'), (',', ','), ('but', 'CC'), ('it', 'PRP'), ('is', 'VBZ'), ('different', 'JJ'), ('than', 'IN'), ('normal', 'JJ'), ('sadness', 'NN'), ('(', '('), ('figure', 'JJ'), ('1', 'CD'), (')', ')'), ('.', '.'), ('Depression', 'NN'), ('can', 'MD'), ('make', 'VB'), ('it', 'PRP'), ('hard', 'JJ'), ('for', 'IN'), ('you', 'PRP'), ('to', 'TO'), ('work', 'VB'), (',', ','), ('study', 'VB'), (',', ','), ('or', 'CC'), ('do', 'VBP'), ('everyday', 'JJ'), ('tasks.How', 'VB'), ('do', 'VBP'), ('I', 'PRP'), ('know', 'VBP'), ('if', 'IN'), ('I', 'PRP'), ('am', 'VBP'), ('depressed', 'VBN'), ('?', '.'), ('—', 'NNS'), ('Depressed', 'VBD'), ('people', 'NNS'), ('feel', 'VBP'), ('down', 'IN'), ('most', 'JJS'), ('of', 'IN'), ('the', 'DT'), ('time', 'NN'), ('for', 'IN'), ('at', 'IN'), ('least', 'JJS'), ('2', 'CD'

[('.mw-parser-output', 'JJ'), ('.infobox-subbox', 'JJ'), ('{', '('), ('padding:0', 'NN'), (';', ':'), ('border', 'NN'), (':', ':'), ('none', 'NN'), (';', ':'), ('margin', 'NN'), (':', ':'), ('-3px', 'NN'), (';', ':'), ('width', 'NN'), (':', ':'), ('auto', 'NN'), (';', ':'), ('min-width:100', 'JJ'), ('%', 'NN'), (';', ':'), ('font-size:100', 'JJ'), ('%', 'NN'), (';', ':'), ('clear', 'JJ'), (':', ':'), ('none', 'NN'), (';', ':'), ('float', 'NN'), (':', ':'), ('none', 'NN'), (';', ':'), ('background-color', 'NN'), (':', ':'), ('transparent', 'NN'), ('}', ')'), ('.mw-parser-output', 'JJ'), ('.infobox-3cols-child', 'JJ'), ('{', '('), ('margin', 'NN'), (':', ':'), ('auto', 'NN'), ('}', ')'), ('Major', 'NNP'), ('depressive', 'JJ'), ('disorderClassification', 'NN'), ('and', 'CC'), ('external', 'JJ'), ('resourcesDepression', 'NN'), ('is', 'VBZ'), ('common', 'JJ'), (',', ','), ('can', 'MD'), ('affect', 'VB'), ('anyone', 'NN'), (',', ','), ('and', 'CC'), ('can', 'MD'), ('be', 'VB'), ('treated.ICD