# Known Keyword Extraction

In [29]:
import pandas as pd
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from urllib.request import Request, urlopen
from tldextract import extract
import progressbar
import pandas as pd
import certifi
from fake_useragent import UserAgent
import requests
import warnings
import re
import os
import pickle
import multiprocessing
import ast

# Silence every Python warning for the rest of the session.
# NOTE(review): presumably meant to hide the warnings triggered by the
# verify=False requests made by the fetch helpers in this file — confirm,
# since this also hides unrelated warnings.
warnings.filterwarnings('ignore')

# One browser-like User-Agent header, built once and reused for every request.
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}

# A crawled link is discarded when any of these substrings occurs anywhere in
# its URL (plain substring match, not a path-segment or extension match).
links_substring_blacklist = [
    'sitemap',
    'terms',
    'contact',
    'careers',
    'uploads',
    'policy',
    '/media',
    'events/',
    '.jpg',
    '.png',
    '.pdf'
]
# Maximum number of keywords returned per website.
keywords_limit = 15
# Maximum number of discovered links whose text is fetched per website.
crawl_limit = 30

def link_from_same_subdomain(url, domain):
    """Return True when ``url`` and ``domain`` share a registered domain.

    Despite the function name, the subdomain part is ignored: only the
    domain label and the public suffix reported by ``tldextract.extract``
    are compared.

    Args:
        url: Candidate link to test.
        domain: Reference URL (or bare domain) to compare against.

    Returns:
        bool: True if both the domain label and the suffix are equal.
    """
    candidate = extract(url)
    reference = extract(domain)

    # Use attribute access instead of tuple unpacking: it works whether
    # ExtractResult is a namedtuple or a dataclass (which newer tldextract
    # releases use and which cannot be unpacked).
    return (candidate.domain, candidate.suffix) == (reference.domain, reference.suffix)

def get_text_from_url(url):
    """Download ``url`` and return its longer text fragments as one string.

    Script, list-item, header, noscript and h1-h4 elements are removed
    before text extraction. Of the remaining stripped strings only those
    longer than 50 characters are kept; each is padded with one space on
    both sides and the fragments are joined with newlines.

    Args:
        url: Page to download.

    Returns:
        str: The extracted text, or ``""`` if the request failed.
    """
    print(f"getting text from url {url}")
    try:
        # NOTE: verify=False disables TLS certificate verification.
        html_page = requests.get(url, headers=header, verify=False, timeout=10).content
    except Exception as e:
        # Best-effort crawl: report the failure and contribute no text.
        print(f'Failed to fetch {url}, {e}\n')
        return ""

    soup = BeautifulSoup(html_page, 'html.parser')

    # Drop elements whose text should not count towards keyword matching.
    for elem in soup.find_all(['script', 'li', 'header', 'noscript', 'h1', 'h2', 'h3', 'h4']):
        elem.decompose()

    return '\n'.join(
        f' {fragment} ' for fragment in soup.stripped_strings if len(fragment) > 50
    )

def get_links_from_url(url):
    """Return the set of same-site links found on the page at ``url``.

    Every ``<a>`` element that carries an ``href`` is resolved against
    ``url``, stripped of its fragment and trailing slashes, and kept when its
    scheme starts with ``http`` and ``link_from_same_subdomain`` accepts it.

    Args:
        url: Page to download and scan for anchors.

    Returns:
        set[str]: Absolute links on the same site; empty if the request
        failed.
    """
    try:
        print(f"getting links url {url}")
        # NOTE: verify=False disables TLS certificate verification.
        html_page = requests.get(url, headers=header, verify=False, timeout=10).content
    except Exception as e:
        # Best-effort crawl: report the failure and contribute no links.
        print(f'Faile to fetch {url} {e}\n')
        return set()

    soup = BeautifulSoup(html_page, "lxml")

    # get all links with absolute paths
    links = set()
    # href=True skips anchors without an href; urljoin(url, None) would
    # otherwise hand back the page URL itself as a bogus "link".
    for anchor in soup.find_all('a', href=True):
        link_href = urljoin(url, anchor['href'])
        parsed = urlparse(link_href, scheme='https')._replace(fragment="")

        if parsed.scheme[:4] == 'http':
            links.add(parsed.geturl().rstrip('/'))

    # return urls with same base domain
    return {link for link in links if link_from_same_subdomain(link, url)}

def get_links_recursive(url, existing_links=None, max_depth=1, maxlen=20):
    """Collect same-site links reachable from ``url``, minus blacklisted ones.

    Each recursion level adds its own URL to the set handed to its children,
    so ``len(existing_links)`` acts as the current depth; recursion stops once
    it reaches ``max_depth``. With the default ``max_depth=1`` only the links
    on ``url`` itself are returned.

    Args:
        url: Page whose links are collected.
        existing_links: URLs already visited on the path to this call.
            ``None`` (the default) means none.
        max_depth: Maximum recursion depth; now forwarded to recursive calls.
        maxlen: Currently unused; kept for backward compatibility and
            forwarded unchanged.

    Returns:
        list[str]: Links containing none of the substrings in
        ``links_substring_blacklist``. Always a list, so callers can slice it.
    """
    # Work on a private copy so neither the caller's set nor a shared default
    # is ever mutated.
    visited = set() if existing_links is None else set(existing_links)
    if len(visited) >= max_depth:
        return []

    links = get_links_from_url(url)

    visited.add(url)
    # `links - visited` is a fresh set, so growing `links` inside the loop is
    # safe. update() accepts the list returned by deeper levels.
    for link in links - visited:
        links.update(get_links_recursive(link, visited, max_depth, maxlen))

    return [
        link for link in links
        if not any(fragment in link for fragment in links_substring_blacklist)
    ]

def get_text_from_website(website):
    """Return the lower-cased text of up to ``crawl_limit`` pages of a site.

    The pages are the first ``crawl_limit`` links reported by
    ``get_links_recursive``. Each page's text is followed by a newline, and
    every ``.`` in the combined text is replaced by a space before
    lower-casing.
    """
    pages = get_links_recursive(website)[:crawl_limit]
    combined = ''.join(get_text_from_url(page) + '\n' for page in pages)
    return combined.replace('.', ' ').lower()

def get_known_keywords_for_website(website):
    """Return the known keywords that occur in a website's crawled text.

    Keywords are read from ``known_keywords.csv`` (column ``keyword``),
    lower-cased, and counted as space-delimited occurrences in the text from
    ``get_text_from_website``. Keywords found at least once are ordered by
    number of words, then by occurrence count, both descending.

    Args:
        website: Root URL of the site to crawl.

    Returns:
        list[str]: At most ``keywords_limit`` keywords; ``[]`` if anything
        fails while crawling or reading the keyword file.
    """
    print(f"printing {website}")
    try:
        text = get_text_from_website(website)
        known_keywords = (
            pd.read_csv('known_keywords.csv')
            .drop_duplicates()
            # A blank keyword cell is read as NaN and would make .strip()
            # raise below, turning every site's result into [].
            .dropna(subset=['keyword'])
        )
        known_keywords = known_keywords.assign(
            keyword=known_keywords['keyword'].str.lower()
        )
        # Entries that differed only by case are duplicates after
        # lower-casing; keep one so the result has no repeated keywords.
        known_keywords = known_keywords.drop_duplicates(subset=['keyword'])
        known_keywords = known_keywords.assign(
            counts=known_keywords['keyword'].map(
                lambda kw: text.count(f' {kw.strip()} ')
            ),
            keyword_word_count=known_keywords['keyword'].map(
                lambda kw: len(kw.split(' '))
            ),
        )

        keyword_counts = known_keywords[known_keywords['counts'] > 0]
        keyword_counts = keyword_counts.sort_values(
            by=['keyword_word_count', 'counts'], ascending=False
        )

        return keyword_counts['keyword'].tolist()[:keywords_limit]
    except Exception as e:
        # Deliberate best-effort: one bad website must not stop the batch.
        print(f'got an error with parsing {website}. Skipping. {e}')
        return []

def get_title(url):
    """Return the capitalized domain label of ``url`` as a display name.

    The label is the ``domain`` field reported by ``tldextract.extract``.
    It is read by attribute rather than tuple unpacking, so this works
    whether ExtractResult is a namedtuple or a dataclass.
    """
    return extract(url).domain.capitalize()

def safe_url_path(url):
    """Return the cache-file path for ``url`` under ``keywords_parts/``.

    All runs of non-word characters are removed from the URL to form the
    file name.
    """
    file_name = re.sub(r'\W+', '', url)
    return "keywords_parts/" + file_name

def get_known_keywords_for_website_parallel(url):
    """Extract keywords for ``url`` and cache them as a pickle file.

    Intended as a worker function: the result is written to the path given
    by ``safe_url_path(url)`` instead of being returned. URLs whose cache
    file already exists are skipped, so an interrupted run can be resumed.
    Because ``get_known_keywords_for_website`` returns ``[]`` on failure,
    failed sites are cached as empty lists and are not retried either.

    Args:
        url: Root URL of the website to process.

    Returns:
        list: Always ``[]``.
    """
    keywords_part_path = safe_url_path(url)
    if os.path.exists(keywords_part_path):
        return []

    # Ensure the cache directory exists before doing the expensive crawl;
    # exist_ok makes this safe when several workers race to create it.
    cache_dir = os.path.dirname(keywords_part_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    keywords = get_known_keywords_for_website(url)
    with open(keywords_part_path, 'wb') as fp:
        pickle.dump(keywords, fp)
    return []

In [30]:
# Reduce every URL in domains.csv to "scheme://host" and keep each one once.
domains = pd.read_csv('domains.csv')
domains["domain"] = domains["url"].map(
    lambda raw_url: '{0.scheme}://{0.netloc}'.format(urlparse(raw_url))
)
domains = domains[["domain"]].drop_duplicates()

urls = domains["domain"]
print(urls.tolist())

['https://1library.net', 'https://3pbio.com', 'https://908devices.com', 'https://99bitcoins.com', 'https://aad.portal.azure.com', 'https://aatbs.com', 'https://abaqus-docs.mit.edu', 'https://abzena.com', 'https://academicjournals.org', 'https://access.clarivate.com', 'https://accessiblemeds.org', 'https://accr.natboard.edu.in', 'https://acs.figshare.com', 'https://acuraanalytical.com', 'https://adarepharmasolutions.com', 'https://adelphi-hp.com', 'https://aetsoft.net', 'https://agrojournal.org', 'https://air.unimi.it', 'https://akouos.com', 'http://alttox.org', 'https://ambrx.com', 'https://amplelogic.com', 'https://amubiochemicalsociety.org', 'https://angstromtechnology.com', 'https://anvipharma.com', 'https://api.grundfos.com', 'https://app.beapplied.com', 'https://app.wonder.me', 'https://approcess.com', 'https://apvma.gov.au', 'http://archive.org', 'https://archive.org', 'https://ardena.com', 'https://ascendiapharma.com', 'https://assets.kpmg', 'https://assets.thermofisher.com', 'h

In [31]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed. If re-enabled it would run get_known_keywords_for_website_parallel
# over all urls in a process pool, then load each per-URL pickle and write the
# combined rows (domain, keywords, company_name) to extracted_keywords.csv.
# NOTE(review): the pool is never closed or joined here, and pickle.load
# should only be used on files this notebook wrote itself.
"""pool = multiprocessing.Pool()
pool.map(get_known_keywords_for_website_parallel, urls)

extracted_keywords = []
for url in urls:
    with open(safe_url_path(url), "rb") as f:
        keywords = pickle.load(f)
    extracted_keywords.append({
        'domain': url,
        'keywords': keywords,
        'company_name': get_title(url)
    })

extracted_kw = pd.DataFrame(extracted_keywords)
extracted_kw.to_csv("extracted_keywords.csv", index=False)"""

'pool = multiprocessing.Pool()\npool.map(get_known_keywords_for_website_parallel, urls)\n\nextracted_keywords = []\nfor url in urls:\n    with open(safe_url_path(url), "rb") as f:\n        keywords = pickle.load(f)\n    extracted_keywords.append({\n        \'domain\': url,\n        \'keywords\': keywords,\n        \'company_name\': get_title(url)\n    })\n\nextracted_kw = pd.DataFrame(extracted_keywords)\nextracted_kw.to_csv("extracted_keywords.csv", index=False)'

In [54]:
# Drop domains with 8 or fewer keywords and overwrite the CSV in place.
extracted_kw = pd.read_csv('extracted_keywords.csv')
# The keywords column is read back as text; parse each cell into a Python
# literal (presumably a list of strings) so its length can be taken.
extracted_kw['keywords'] = extracted_kw['keywords'].apply(ast.literal_eval)

extracted_kw = extracted_kw.loc[extracted_kw["keywords"].map(len) > 8]
extracted_kw.to_csv("extracted_keywords.csv", index=False)

# Unknown Keywords Extraction

In [None]:
from flair.data import Sentence
from flair.models import MultiTagger
from flair.tokenization import SciSpacyTokenizer

# Example sentence.
# NOTE(review): this object is never passed to the tagger; `sentence` is
# rebound inside the loop below before any prediction is made.
sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")

# load biomedical tagger
tagger = MultiTagger.load("hunflair")

# Collect the text of every entity span predicted for each input string.
# NOTE(review): `text` is not defined at module level anywhere in the visible
# part of this file (it only exists as a local variable inside functions).
# Presumably it should be an iterable of strings; confirm before running.
tags = []
for s in text:
    # Skip entries that start with "http".
    if s[:4] == 'http':
        continue
    sentence = Sentence(s)
    tagger.predict(sentence)
    print(s)
    # Visit every annotation layer on the sentence and record each span's text.
    for annotation_layer in sentence.annotation_layers.keys():
        for entity in sentence.get_spans(annotation_layer):
            tags += [entity.text]
            print('TAG!', entity.text)