In [1]:
#!/usr/bin/env python
import asyncio
import re
import csv
import pyppeteer as ptr
import pandas as pd
import nltk
from typing import Optional, TypeVar
from typing import Callable
from glob import glob
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [2]:
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [3]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/urmzd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/urmzd/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/urmzd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
PosTag = tuple[str, str]
PosTagList = list[PosTag]
StemWord = str
StemWordList = list[StemWord]

In [5]:
def get_name_and_extension(file_path: str) -> tuple[str, str]:
    regex = re.compile(r"(.*)/(.*)\.(.*)")
    return regex.match(file_path).group(2,3)

In [6]:
## Scraper.
async def download_html(browser: ptr.browser.Browser, page: ptr.page.Page, url: str, selector: str) -> Optional[str]:
    await page.goto(url, waitUntil="load", timeout=0)
    content = await page.querySelector(selector)

    html = ''
    if content:
        html = await page.evaluate('(element) => element.textContent', content)
        
    return html

def write_to_resource_target(target_path: str, file_name: str, content: StemWordList, extension="txt") -> None:
    with open(f"{target_path}/{file_name}.{extension}", "w") as file:
        file.write("\n".join(content))


async def get_training_data_from_folder(source_path: str, target_path: str, force=False) -> None:
    browser = await ptr.launch(headless=True)
    page = await browser.newPage()
    
    glob_pattern = "/**/*.csv"
    source_files = glob(source_path + glob_pattern, recursive=True)
    target_files = glob(target_path + glob_pattern, recursive=True)
    target_file_names = [get_name_and_extension(file_path)[0] for file_path in target_files]
    
    for file_path in source_files:
        [file_name, file_extension] = get_name_and_extension(file_path)
        
        if not file_name in target_file_names or force:
            result = await get_training_data(browser, page, file_path)
            write_to_resource_target(target_path, file_name, result)        
                
    await browser.close()

async def get_training_data(browser: ptr.browser.Browser, page: ptr.page.Page, file_path: str) -> StemWordList:
    df = pd.read_csv(file_path)
    df.rename(columns=lambda x: x.strip(), inplace=True)

    words = []
    if isinstance(df, pd.DataFrame):
        for _, row in df.iterrows():
            print(row)
            result = await download_html(browser, page, row["link"], row["selector"])
            words.extend(clean_up_words(tokenize(result)))
    
    return words

In [7]:
## Cleaners.
T = TypeVar("T")
R = TypeVar("R")

TestValueStrCallable = Callable[[T], str]
TestValueBoolCallable = Callable[[T], bool]
ValueTestFnCallable = Callable[T, TestValueStrCallable]
FilterCallable = Callable[ValueTestFnCallable, bool]
MapCallable = Callable[ValueTestFnCallable, str]

def tokenize(data: str) -> PosTagList:
    tokenized_words = nltk.word_tokenize(data)
    mutated_words = nltk.pos_tag(tokenized_words)
    return mutated_words

def filter_words(x: T, test_value: TestValueStrCallable, *fns: FilterCallable) -> bool:
    if fns:
        if fns[0](x, test_value):
            return filter_words(x, test_value, *fns[1:])
        else:
            return False
    
    return True
    
def filter_by_punctuation(x: T, test_value: TestValueStrCallable = lambda t: t[0]) -> bool:
    return x[0] != x[1]

def filter_by_stop_word(x: T, test_value: TestValueStrCallable = lambda t: t[0]) -> bool:
    return test_value(x) not in stopwords.words("english")

def regex_filter(regex: str):
    def filter_by_regex(x: T, test_value: TestValueStrCallable = lambda t: t[0]):
        rgx = re.compile(regex)
        return rgx.match(test_value(x))
    return filter_by_regex

filter_by_alphabet = regex_filter(r"^([a-zA-Z]|')+$")
filter_by_apostrophe = regex_filter(r"^[^']*$")

def map_by_stem_words(x: PosTag, test_value: TestValueStrCallable = lambda t: t[0], ps=PorterStemmer()) -> StemWord:
    return ps.stem(test_value(x), True)

def map_words(x: T, test_value: TestValueStrCallable, *fns: MapCallable) -> StemWord:
    if fns:
        return map_words(fns[0](test_value(x)), test_value, *fns[1:])

    return x
            
def clean_up_words(words: PosTagList) -> StemWordList:
    filtered_words = list(
        filter(
        lambda x: filter_words(x, lambda x: x[0], filter_by_punctuation, filter_by_stop_word, filter_by_alphabet),
        words
        )
    )

    stem_words = list(map(lambda x: map_words(x, lambda x: x, map_by_stem_words), filtered_words))
    
    return list(
        filter(
            lambda x: filter_words(x, lambda x: x, filter_by_apostrophe), 
            stem_words)
    )

In [8]:
# type: ignore
await (get_training_data_from_folder("../resources/sources", "../resources/targets"))

selector                                           #topicText
link        https://www.uptodate.com/contents/depression-t...
Name: 0, dtype: object
selector                                        #main-content
link        https://www.mayoclinic.org/diseases-conditions...
Name: 1, dtype: object
selector                                     #mw-content-text
link        https://simple.wikipedia.org/wiki/Depression_(...
Name: 2, dtype: object
selector                                         #maincontent
link        https://www.nhs.uk/mental-health/conditions/cl...
Name: 3, dtype: object
selector    #__next > div.css-fdjy12 > div:nth-child(5) > ...
link        https://www.healthline.com/health/depression#s...
Name: 4, dtype: object
selector    #__next > div.css-fdjy12 > div:nth-child(5) > ...
link        https://www.medicalnewstoday.com/articles/3213...
Name: 5, dtype: object
selector                   #skip > div:nth-child(1) > article
link        https://www.hopkinsmedicine.org/health/condi

In [9]:
# Parsers
def get_cleaned_data_from_file(input_file_path: str) -> Optional[list[str]]:
    with open(input_file_path, "r") as file:
        return [word.strip("\n") for word in list(file.readlines())]

def get_cleaned_data_from_folder(input_path: str) -> dict[str, PosTagList]:
    input_files = glob(input_path + "/**/*", recursive=True)
    
    data = dict()
    for file_path in input_files:
        [file_name, file_extension] = get_name_and_extension(file_path)
        classification_data = get_cleaned_data_from_file(file_path)
        data[file_name] = classification_data
        
    return data


def group_by_tags(pos_tag_list: PosTagList) -> dict[str, list[str]]:
    """
      @unused
    """
    groups = dict()
    
    for value,tag in pos_tag_list:
        if tag in groups:
            groups[tag]
            groups[tag].append(value)
        else:
            groups[tag] = [value]
            
    return groups
    

result = get_cleaned_data_from_folder("../resources/targets")

{'migraine': ['introduct',
  'migrain',
  'common',
  'episod',
  'disord',
  'hallmark',
  'disabl',
  'headach',
  'gener',
  'associ',
  'nausea',
  'light',
  'sound',
  'sensit',
  'the',
  'acut',
  'treatment',
  'migrain',
  'adult',
  'review',
  'prevent',
  'treatment',
  'migrain',
  'adult',
  'discuss',
  'separ',
  'see',
  'prevent',
  'treatment',
  'episod',
  'migrain',
  'adult',
  'the',
  'pathophysiolog',
  'clinic',
  'manifest',
  'diagnosi',
  'migrain',
  'also',
  'discuss',
  'separ',
  'see',
  'pathophysiolog',
  'clinic',
  'manifest',
  'diagnosi',
  'migrain',
  'adult',
  'approach',
  'to',
  'treatment',
  'the',
  'abort',
  'symptomat',
  'therapi',
  'migrain',
  'rang',
  'use',
  'simpl',
  'analges',
  'nonsteroid',
  'drug',
  'nsaid',
  'acetaminophen',
  'triptan',
  'antiemet',
  'calcitonin',
  'peptid',
  'cgrp',
  'antagonist',
  'lasmiditan',
  'dihydroergotamin',
  'noninvas',
  'neuromodul',
  'devic',
  'typic',
  'use',
  'patient'

In [69]:
from nltk.probability import WittenBellProbDist, FreqDist,LaplaceProbDist

def generate_input(population: StemWordList, n_unique_words, chr_limit = 280, max_itr=100):
    freq_dist = FreqDist(population)
    prob_dist = WittenBellProbDist(freq_dist, n_unique_words)
    
    samples = []
    chr_count = 0
    
    for _ in range(max_itr):
        generated_v = prob_dist.generate()
        
        if len(generated_v) + chr_count < chr_limit:
            samples.append(generated_v)
            chr_count += 1
        else:
            continue
            
    return samples

def generate_samples(data: dict[str, StemWordList], n_samples = 100):
    n_unique_words = len({v for k in result for v in result[k]})
    
    return {k: [generate_input(data[k], n_unique_words) for _ in range(n_samples)] for k in data}









{'migraine': [['journal', 'june', 'limit', 'lack', 'updat', 'risk', 'cardiovascular', 'recurr', 'fatigu', 'cadasil', 'g', 'receptor', 'therapi', 'quit', 'buzzi', 'a', 'dihydroergotamin', 'associ', 'l', 'mayo', 'april', 'research', 'disord', 'reorgan', 'effect', 'rather', 'efficaci', 'bajwa', 'believ', 'kirthi', 'medic', 'it', 'refresh', 'caus', 'some', 'death', 'aura', 'migrain', 'toxin', 'activ', 'nsaid', 'eletriptan', 'often', 'buy', 'sometim', 'diffus', 'field', 'stage', 'nonor', 'pain', 'some', 'inhibitor', 'zolmitriptan', 'butterbur', 'significantli', 'differ', 'disturb', 'slower', 'degener', 'isbn', 'databas', 'effect', 'treatment', 'disambigu', 'octob', 'interfer', 'found', 'may', 'pattern', 'pmid', 'ketogen', 'k', 'counteract', 'access', 'irrit', 'adult', 'commerci', 'proven', 'white', 'aurafor', 'symptom', 'medicin', 'transmiss', 'supposedli', 'caus', 'possibl', 'review', 'pregnanc', 'greatli', 'may', 'jl', 'metoclopramid', 'migrain', 'trigemin', 'usual', 'vertosick', 'within'