# Known Keyword Extraction

In [None]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from urllib.request import Request, urlopen
from tldextract import extract
import progressbar
import pandas as pd
import certifi
from fake_useragent import UserAgent
import requests

# Build a Chrome-like User-Agent header that is sent with every HTTP request
# made by the fetch helpers below.
ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}

# Any crawled link whose URL contains one of these substrings is dropped by
# get_links_recursive (plain substring match against the full URL).
links_substring_blacklist = [
    'sitemap',
    'terms',
    'contact',
    'careers',
    'uploads',
    'policy',
    '/media',
    'events/'
]
# Maximum number of keywords returned by get_known_keywords_for_website.
keywords_limit = 15
# Maximum number of links whose text get_text_from_website downloads.
crawl_limit = 40

def link_from_same_subdomain(url, domain):
    """Return True when *url* and *domain* share the same registered domain.

    Both arguments are parsed with ``tldextract.extract``; only the ``domain``
    and ``suffix`` parts are compared. Despite the function's name, the
    subdomain part is ignored, so ``https://blog.example.com`` and
    ``https://www.example.com`` are considered the same site.

    Args:
        url: The link to test.
        domain: The reference URL (typically the page the link was found on).

    Returns:
        bool: True if the domain and public suffix of both URLs are equal.
    """
    # Use attribute access rather than tuple unpacking: recent tldextract
    # releases return a result object that can no longer be unpacked as a
    # 3-tuple, while ``.domain`` / ``.suffix`` exist in old and new versions.
    link_parts = extract(url)
    site_parts = extract(domain)

    return (
        link_parts.domain == site_parts.domain
        and link_parts.suffix == site_parts.suffix
    )

def get_text_from_url(url):
    """Download *url* and return its long text fragments as one string.

    The page is parsed with BeautifulSoup; ``script``, ``li``, ``header``,
    ``noscript`` and ``h1``-``h4`` elements are removed, and of the remaining
    stripped text fragments only those longer than 50 characters are kept.
    Each kept fragment is padded with a space on both sides and the fragments
    are joined with newlines.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: The extracted text, or an empty string when the request fails
        (the failure is printed, not raised).
    """
    try:
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as-is because the crawl is best-effort, but confirm this is
        # intentional.
        html_page = requests.get(url, headers=header, verify=False).content
    except Exception as e:
        print('Failed to fetch', url, e, '\n')
        # Return a string (not a list) so the result type is consistent and
        # callers can safely concatenate it with other text.
        return ''

    soup = BeautifulSoup(html_page, 'html.parser')

    # Drop navigation-like / non-prose elements before collecting text.
    for elem in soup.find_all(['script', 'li', 'header', 'noscript', 'h1', 'h2', 'h3', 'h4']):
        elem.decompose()

    # Short fragments are skipped; the surrounding spaces let callers match
    # whole words with patterns such as ' keyword '.
    return '\n'.join(
        f' {strip} ' for strip in soup.stripped_strings if len(strip) > 50
    )

def get_links_from_url(url):
    """Collect the same-site links found in the anchors of the page at *url*.

    Every ``<a>`` element's ``href`` is resolved against *url*, its fragment
    is removed, and any trailing slash is stripped. Only links whose scheme
    starts with ``http`` and that pass ``link_from_same_subdomain`` against
    *url* are returned.

    Args:
        url: Address of the page to scan.

    Returns:
        set: Absolute link URLs; an empty set when the request fails (the
        failure is printed, not raised).
    """
    try:
        response = requests.get(url, headers=header, verify=False)
        html_page = response.content
    except Exception as e:
        print('faile to fetch', url, e, '\n')
        return set()

    soup = BeautifulSoup(html_page, "lxml")

    # Resolve each anchor to an absolute, fragment-free URL.
    candidates = set()
    for anchor in soup.find_all('a'):
        absolute = urljoin(url, anchor.get('href'))
        parts = urlparse(absolute, scheme='https')._replace(fragment="")

        if not parts.scheme.startswith('http'):
            continue
        candidates.add(parts.geturl().rstrip('/'))

    # Keep only URLs that belong to the same base domain as the page itself.
    return set(
        filter(lambda candidate: link_from_same_subdomain(candidate, url), candidates)
    )

def get_links_recursive(url, existing_links=None, max_depth=1, maxlen=20):
    """Crawl same-site links starting at *url* and return them as a list.

    ``existing_links`` holds the chain of pages already followed to reach
    *url*, so its size is the current depth; crawling stops once that size
    reaches ``max_depth``. Links containing any entry of
    ``links_substring_blacklist`` are removed from the result.

    Args:
        url: Page to start from.
        existing_links: URLs already visited on the way to *url*; ``None``
            (the default) means none.
        max_depth: Maximum number of pages to follow along one chain.
        maxlen: Currently unused; accepted and forwarded for backward
            compatibility.

    Returns:
        list: Discovered link URLs in arbitrary order. Always a list, also
        when the depth limit is hit immediately, so callers can slice it.
    """
    # Avoid a shared mutable default and never mutate the caller's set.
    visited = set() if existing_links is None else set(existing_links)
    if len(visited) >= max_depth:
        return []

    links = set(get_links_from_url(url))

    visited = visited | {url}
    # ``links - visited`` is evaluated once, so growing ``links`` inside the
    # loop does not affect the iteration.
    for link in links - visited:
        # Forward the limits so a non-default max_depth applies at every
        # level; update() accepts the list returned by the recursive call.
        links.update(get_links_recursive(link, visited, max_depth, maxlen))

    return [
        link for link in links
        if not any(blocked in link for blocked in links_substring_blacklist)
    ]

def get_text_from_website(website):
    """Return the combined, normalised text of pages linked from *website*.

    At most ``crawl_limit`` of the links returned by ``get_links_recursive``
    are downloaded with ``get_text_from_url``. Each page's text is followed
    by a newline; in the combined text every ``.`` is replaced by a space and
    the result is lower-cased.

    Args:
        website: Start URL of the site to crawl.

    Returns:
        str: The normalised text (empty when no links were found).
    """
    # list() keeps the truncation working even if a set is handed back.
    links = list(get_links_recursive(website))[:crawl_limit]

    pages = []
    for link in links:
        print('getting', link, end='\r')
        page_text = get_text_from_url(link)
        # A failed fetch may yield a non-string placeholder; treat it as
        # "no text" instead of crashing on concatenation.
        pages.append(page_text if isinstance(page_text, str) else '')

    # Join once instead of repeatedly extending a string inside the loop.
    text = ''.join(page + '\n' for page in pages)

    return text.replace('.', ' ').lower()

def get_known_keywords_for_website(website):
    """Return the known keywords that occur in the text of *website*.

    Keywords are read from the ``keyword`` column of ``known_keywords.csv``
    (resolved relative to the working directory), lower-cased and stripped.
    A keyword counts as occurring when it appears in the crawled text
    surrounded by single spaces. Matches are ordered by number of words in
    the keyword, then by occurrence count, both descending.

    Args:
        website: Start URL of the site to crawl.

    Returns:
        list: Up to ``keywords_limit`` keyword strings.
    """
    text = get_text_from_website(website)

    # Normalise first, then de-duplicate, so variants that differ only in
    # case or surrounding whitespace collapse into one entry. Missing values
    # are dropped because they cannot be matched as text.
    keywords = (
        pd.read_csv('known_keywords.csv')['keyword']
        .dropna()
        .astype(str)
        .str.lower()
        .str.strip()
    )
    keywords = keywords[keywords != ''].drop_duplicates()

    keyword_counts = pd.DataFrame({
        'keyword': keywords,
        # Space-delimited match so only whole words/phrases are counted.
        'counts': keywords.map(lambda kw: text.count(f' {kw} ')),
        # split() without an argument ignores repeated whitespace.
        'keyword_word_count': keywords.map(lambda kw: len(kw.split())),
    })

    keyword_counts = keyword_counts[keyword_counts['counts'] > 0]
    keyword_counts = keyword_counts.sort_values(by=['keyword_word_count', 'counts'], ascending=False)

    return keyword_counts['keyword'].tolist()[:keywords_limit]

def get_title(url):
    """Return the capitalised registered-domain name of *url*.

    Args:
        url: Any URL understood by ``tldextract.extract``.

    Returns:
        str: The ``domain`` part of the URL with its first letter upper-cased
        and the rest lower-cased (``str.capitalize``).
    """
    # Attribute access works whether or not the tldextract result can be
    # unpacked as a 3-tuple (newer releases no longer support unpacking).
    return extract(url).domain.capitalize()




In [None]:
# Run the known-keyword extraction on a list of hand-picked domains.

urls = [
    'https://bionicalemas.com/',
    'https://abzena.com/',
    'https://lonza.com',
    'https://www.merck.com/',
    'https://www.eurofins.com/',
    'https://www.sgs.com/',
]

# One record per site: the URL, its matched keywords and a display name.
extracted_keywords = []
for url in urls:
    extracted_keywords.append(dict(
        domain=url,
        keywords=get_known_keywords_for_website(url),
        company_name=get_title(url),
    ))

extracted_keywords

In [None]:
# Turn the per-site records into a table and save it as CSV without the
# DataFrame index column.
extracted_kw = pd.DataFrame(extracted_keywords)
extracted_kw.to_csv("extracted_keywords.csv", index=False)

In [None]:
# Bare expression: shows the extracted-keywords table as the cell's output.
extracted_kw

# Unknown Keywords Extraction

In [None]:
from flair.data import Sentence
from flair.models import MultiTagger
from flair.tokenization import SciSpacyTokenizer

# Example sentence. NOTE(review): this value is never tagged; `sentence` is
# rebound inside the loop below before the first `predict` call.
sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")

# load biomedical tagger
tagger = MultiTagger.load("hunflair")

# Collect the text of every entity span the tagger finds.
# NOTE(review): `text` is not assigned at top level in any visible cell (it
# only exists as a local inside the functions above), so this loop raises
# NameError unless it is defined elsewhere. Presumably it should be an
# iterable of sentence strings; iterating a single str would yield one
# character at a time. TODO confirm.
tags = []
for s in text:
    # Skip entries that start with 'http' (presumably bare URLs).
    if s[:4] == 'http':
        continue
    sentence = Sentence(s)
    tagger.predict(sentence)
    print(s)
    # Walk every annotation layer present on the sentence and record the
    # text of each span found in it.
    for annotation_layer in sentence.annotation_layers.keys():
        for entity in sentence.get_spans(annotation_layer):
            tags += [entity.text]
            print('TAG!', entity.text)