# Known Keyword Extraction

In [None]:
import pandas as pd
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from urllib.request import Request, urlopen
from tldextract import extract
import progressbar
import pandas as pd
import certifi
from fake_useragent import UserAgent
import requests
import warnings
import re
import os
import pickle
import multiprocessing
import ast

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Headers sent with each HTTP request: a Chrome User-Agent string
# supplied by fake_useragent.
ua = UserAgent()
header = {'User-Agent': str(ua.chrome)}

# A crawled link is discarded when it contains any of these substrings.
links_substring_blacklist = [
    # page sections
    'sitemap', 'terms', 'contact', 'careers',
    'uploads', 'policy', '/media', 'events/',
    # file extensions
    '.jpg', '.png', '.pdf',
]

# Upper bound on the number of keywords reported per text.
keywords_limit = 15
# Upper bound on the number of pages fetched per website.
crawl_limit = 30

def link_from_same_subdomain(url, domain):
    """Return True when ``url`` and ``domain`` share a registered domain and suffix.

    Only the ``domain`` and ``suffix`` parts reported by ``tldextract.extract``
    are compared; the subdomain part is ignored, so ``pharma.example.com`` and
    ``www.example.com`` are considered the same site.

    The parts are read by attribute name rather than by unpacking the result
    into three values, so the function does not depend on how many fields the
    extraction result carries.
    """
    link_parts = extract(url)
    site_parts = extract(domain)

    return (
        link_parts.domain == site_parts.domain
        and link_parts.suffix == site_parts.suffix
    )

def get_text_from_url(url):
    """Fetch ``url`` and return its longer text fragments, one per line.

    The page is downloaded with the module-level ``header`` (certificate
    verification disabled, 10 second timeout). Any failure while fetching is
    reported on stdout and yields an empty string instead of raising.

    Before text is collected, ``script``, ``li``, ``header``, ``noscript`` and
    ``h1``-``h4`` elements are removed from the parsed document. Of the
    remaining stripped strings only those longer than 50 characters are kept;
    each is padded with one space on both sides and the fragments are joined
    with newlines.
    """
    print(f"getting text from url {url}")
    try:
        html_page = requests.get(url, headers=header, verify=False, timeout=10).content
    except Exception as e:
        print(f'Failed to fetch {url}, {e}\n')
        return ""

    soup = BeautifulSoup(html_page, 'html.parser')

    # find_all returns a materialised result list, so removing elements from
    # the tree while looping over it is safe.
    for elem in soup.find_all(['script', 'li', 'header', 'noscript', 'h1', 'h2', 'h3', 'h4']):
        elem.decompose()

    # The surrounding spaces let callers search for whole words/phrases with
    # a " keyword " pattern even at the edges of a fragment.
    return '\n'.join(
        f' {fragment} ' for fragment in soup.stripped_strings if len(fragment) > 50
    )

def get_links_from_url(url):
    """Return the set of same-site http(s) links found on the page at ``url``.

    The page is downloaded with the module-level ``header`` (certificate
    verification disabled, 10 second timeout). A failed download is reported
    on stdout and yields an empty set.

    Every ``<a>`` element that carries an ``href`` is resolved against ``url``,
    its fragment is dropped and a trailing ``/`` is stripped. Anchors without
    an ``href`` are skipped (they are not links; resolving them would yield
    the page's own URL), as are hrefs that cannot be parsed as a URL. Only
    links whose scheme starts with ``http`` and that
    ``link_from_same_subdomain`` accepts relative to ``url`` are returned.
    """
    try:
        print(f"getting links url {url}")
        html_page = requests.get(url, headers=header, verify=False, timeout=10).content
    except Exception as e:
        print(f'Failed to fetch {url} {e}\n')
        return set()

    soup = BeautifulSoup(html_page, "lxml")

    # get all links with absolute paths
    links = set()
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if href is None:
            continue

        try:
            parsed = urlparse(urljoin(url, href), scheme='https')._replace(fragment="")
        except ValueError:
            # A single malformed href must not abort the whole page.
            continue

        if parsed.scheme[:4] == 'http':
            links.add(parsed.geturl().rstrip('/'))

    # return urls with same base domain
    return {link for link in links if link_from_same_subdomain(link, url)}

def get_links_recursive(url, existing_links=None, max_depth=1, maxlen=20):
    """Collect links reachable from ``url`` and return them as a list.

    Args:
        url: Page to start from.
        existing_links: URLs already visited on the current path; ``None``
            (the default) means none. Any iterable of URLs is accepted.
        max_depth: The crawl stops descending once ``existing_links`` holds
            at least this many URLs. With the default of 1 only ``url``
            itself is fetched. The value is forwarded to recursive calls.
        maxlen: Accepted for backward compatibility and forwarded to
            recursive calls; it does not currently limit the result.

    Returns:
        A list of links with every entry containing a substring from
        ``links_substring_blacklist`` removed. An empty list is returned when
        the depth limit has already been reached.
    """
    visited = set() if existing_links is None else set(existing_links)
    if len(visited) >= max_depth:
        return []

    links = set(get_links_from_url(url))

    visited = visited | {url}
    # The difference is computed once, so growing ``links`` inside the loop
    # does not affect which pages are descended into.
    for link in links - visited:
        links.update(get_links_recursive(link, visited, max_depth, maxlen))

    return [
        link for link in links
        if not any(fragment in link for fragment in links_substring_blacklist)
    ]

def get_text_from_website(website):
    """Return the combined, lower-cased text of the pages crawled from ``website``.

    At most ``crawl_limit`` of the links returned by ``get_links_recursive``
    are fetched. Each page's text is followed by a newline, every ``.`` is
    replaced by a space and the result is lower-cased.
    """
    pages = get_links_recursive(website)[:crawl_limit]
    combined = ''.join(get_text_from_url(page) + '\n' for page in pages)
    without_periods = combined.replace('.', ' ')
    return without_periods.lower()

def get_known_keywords_for_website(website):
    """Return up to ``keywords_limit`` known keywords found in ``website``'s text.

    Keywords are read from the ``keyword`` column of ``known_keywords.csv``.
    They are stripped and lower-cased *before* de-duplication, so variants
    that differ only in case or surrounding whitespace are reported once and
    stray whitespace does not inflate a keyword's word count. Missing and
    empty keywords are ignored.

    A keyword counts as found when it occurs in the crawled text surrounded
    by single spaces. Matches are ordered by number of words in the keyword
    (descending), then by number of occurrences (descending).

    Any error while crawling or matching is reported on stdout and results in
    an empty list, so one failing website does not stop a batch run.
    """
    print(f"printing {website}")
    try:
        text = get_text_from_website(website)

        known_keywords = pd.read_csv('known_keywords.csv')
        known_keywords['keyword'] = known_keywords['keyword'].str.strip().str.lower()
        known_keywords = known_keywords.dropna(subset=['keyword'])
        known_keywords = known_keywords[known_keywords['keyword'] != '']
        known_keywords = known_keywords.drop_duplicates(subset='keyword').copy()

        known_keywords['counts'] = known_keywords['keyword'].map(lambda x: text.count(f' {x} '))
        known_keywords['keyword_word_count'] = known_keywords['keyword'].map(lambda x: len(x.split(' ')))

        keyword_counts = known_keywords[known_keywords['counts'] > 0]
        keyword_counts = keyword_counts.sort_values(by=['keyword_word_count', 'counts'], ascending=False)

        return keyword_counts['keyword'].tolist()[:keywords_limit]
    except Exception as e:
        # Deliberately broad: this is the per-website boundary of a batch crawl.
        print(f'got an error with parsing {website}. Skipping. {e}')
        return []

def get_title(url):
    """Return the registered domain name of ``url`` with its first letter capitalised.

    The ``domain`` part reported by ``tldextract.extract`` is used (for
    example ``lonza`` for ``https://www.lonza.com``). It is read by attribute
    name rather than by unpacking the result into three values, so the
    function does not depend on how many fields the extraction result carries.
    """
    return extract(url).domain.capitalize()

def safe_url_path(url):
    """Return the path under ``keywords_parts/`` used to store results for ``url``.

    Every run of non-word characters is removed from ``url`` to form the
    file name.
    """
    file_name = re.sub(r'\W+', '', url)
    return "keywords_parts/" + file_name

def get_known_keywords_for_website_parallel(url):
    """Extract keywords for ``url`` and pickle them to ``safe_url_path(url)``.

    Intended as a worker for ``multiprocessing.Pool.map``: the result is
    written to disk and an empty list is always returned. When the output
    file already exists the website is skipped, which lets an interrupted
    run be resumed.

    The output directory is created up front if it is missing, so a crawl is
    not wasted on a write that is bound to fail.
    """
    keywords_part_path = safe_url_path(url)
    if os.path.exists(keywords_part_path):
        return []

    parent_dir = os.path.dirname(keywords_part_path)
    if parent_dir:
        # exist_ok avoids a race between workers creating the directory.
        os.makedirs(parent_dir, exist_ok=True)

    keywords = get_known_keywords_for_website(url)
    with open(keywords_part_path, 'wb') as fp:
        pickle.dump(keywords, fp)
    return []

### Read and process domains from domains.csv

In [None]:
domains = pd.read_csv('domains.csv')

# Reduce every URL to "<scheme>://<netloc>", dropping path, query and fragment.
domains["domain"] = domains["url"].map(
    lambda x: "{0.scheme}://{0.netloc}".format(urlparse(x))
)

# Keep one row per distinct domain.
domains = domains[["domain"]].drop_duplicates()
urls = domains["domain"]

print(urls.tolist())

### Perform hierarchical website scraping traversal

In [None]:
# The parallel crawl is disabled by wrapping it in a string literal, so this
# cell performs no work. When enabled, each worker writes its result to
# safe_url_path(url); the following cell reads those files back.
"""pool = multiprocessing.Pool()
pool.map(get_known_keywords_for_website_parallel, urls)
"""

### Build a dataframe with extracted keywords

In [None]:
# Collect one record per domain from the pickled keyword files.
extracted_keywords = []
for url in urls:
    with open(safe_url_path(url), "rb") as f:
        keywords = pickle.load(f)
    extracted_keywords.append(
        dict(domain=url, keywords=keywords, company_name=get_title(url))
    )

extracted_kw = pd.DataFrame(extracted_keywords)

In [None]:
# Keep only domains for which more than 8 keywords were found.
extracted_kw = extracted_kw[extracted_kw["keywords"].map(len) > 8]

In [None]:
import fnmatch

# Patterns of domains to exclude, one per line of domain_blacklist.txt.
blacklist = []
with open("domain_blacklist.txt") as f:
    blacklist = f.read().splitlines()

def filter_domains(domain):
    """Return True when ``domain`` matches none of the ``blacklist`` patterns.

    ``http://`` and ``https://`` are removed from ``domain`` before it is
    compared against each pattern with ``fnmatch.fnmatch``.
    """
    bare_domain = domain.replace('http://', '').replace('https://', '')
    return not any(fnmatch.fnmatch(bare_domain, pattern) for pattern in blacklist)

In [None]:
# Keep only the rows whose domain filter_domains accepts.
filtered_kw_list = extracted_kw[extracted_kw['domain'].map(filter_domains)]

In [None]:
# Display the filtered table.
filtered_kw_list

In [None]:
# Write the filtered table to extracted_keywords.csv without the index column.
filtered_kw_list.to_csv("extracted_keywords.csv", index=False)

# Unknown Keywords Extraction (POC use case)

In [13]:
# Crawl the site and keep its combined, lower-cased text for keyword discovery.
text = get_text_from_website("https://www.lonza.com")

getting links url https://www.lonza.com
getting text from url https://www.lonza.com/news-and-media/image-library
getting text from url https://www.lonza.com/public/conditions
getting text from url https://www.lonza.com/company-overview/our-history
getting text from url https://www.lonza.com/news-and-media/a-view-on-podcast
getting text from url https://www.lonza.com/news-and-media/leadership-portraits
getting text from url https://www.lonza.com/company-overview/leadership
getting text from url https://pharma.lonza.com
getting text from url https://www.lonza.com/news-and-media
getting text from url https://www.lonza.com/investor-relations/shareholders-and-stock-information
getting text from url https://www.lonza.com/sustainability/people
getting text from url https://www.lonza.com/company-overview/our-locations
getting text from url https://www.lonza.com/sustainability
getting text from url https://www.lonza.com/news-and-media/news-archive
getting text from url https://www.lonza.com/com

## POC Example: Perform new keywords discovery from www.lonza.com

In [12]:
import spacy

# Crawl the site again; this replaces any `text` left over from earlier cells.
text = get_text_from_website("https://www.lonza.com")

# Load the "en_ner_bionlp13cg_md" pipeline and run it over the crawled text.
# NOTE(review): presumably a biomedical NER model that must be installed
# separately from spaCy - confirm.
nlp = spacy.load("en_ner_bionlp13cg_md")
doc = nlp(text)

# Print the entities the pipeline recognised.
print(doc.ents)

(cell, patient-scale cell, patient-scale cell, patient-scale cell, calcium, nitrogen, ammonia, calcium, nitric acid, naphtha, niacin, vitamin, joint, niacinamide, mammalian cell cultures, mammalian cell culture, ucb, arabinogalactan, high-growth, u s, cerium, hydro-québec, isophthalic acid, amaxa  , amaxa, line, cell, joint, cell, l-carnitine, vitamin, gmp, glaxosmithkline, gsk, slough, cgmp, vitamin b3  , vitamin b3, human, mesoblast, mesoblast, stem cell, arch, gmp, gmp, urethanes, sartorius stedim, cell, sartorius stedim, niacinamide, wood, slough, hepatocyte, platinum, joint, retinal, l-asparaginase, patient, capsule, 3d, cannabinoids  , cancer  , cancer, cells, patients, antibody-drug, tumors, tumor cells, cancer cells, cells, patients, bioconjugates, bioconjugates, antibody-drug conjugates, vaccine conjugates, cells, cellular, cancerous cells, tumor, cell, tumor cells, cancer, patient, swiss, swiss, eth, human, chro, cell, cell, bioconjugates, cell, patients, cgmp, cell, cell, co

# PDF Keywords Extraction POC

In [88]:
import PyPDF2
import re

# Extract the text of every page; the file must stay open while pages are read.
with open("../mr.pdf", mode="rb") as in_file:
    pdf_reader = PyPDF2.PdfFileReader(in_file)
    the_number_of_pages = pdf_reader.getNumPages()
    page_texts = [
        pdf_reader.getPage(page_no).extractText()
        for page_no in range(the_number_of_pages)
    ]

# One newline after each page, then flatten newlines and collapse runs of spaces.
text = "".join(page_text + "\n" for page_text in page_texts)
text = re.sub(' +', ' ', text.replace("\n", " "))
# NOTE(review): "dsad" looks like a leftover placeholder - confirm it is intended.
text = text.replace("dsad", " ").lower()

In [89]:
# Display the normalised PDF text.
text

'sgs analytic s switzerland ag dr. stephan pelser ; dr . bernhar dschnu rr lifesciences life i nspir ed, quali ty dri ven 2 © s gs s a 2 0 2 0 a ll righ ts rese rve d sgs in brief testingandce rtifica tionco mpanyre now nedfor havingthehighestsc ientificandcompliance stand ar ds. \x0bù \x0bù \x0bù address: 1 place des alpes ch - 1211 geneva switzerland 3 © s gs s a 2 0 2 0 a ll righ ts rese rve d hea lt hscien cenet work dr ugdev elo pme nt am e r i c as e u r o p e as i a c an ad a toronto ( m a r k h a m , o n ) toronto ( m i s s i s s a u g a , o n ) u s a c h i c a g o ( l i n c o l n s h i r e , il) n e w j ersey ( f air f ield , n j ) p h i l a d e l ph i a ( w es tches ter , pa) b e l g i u m b r u s s e l s ( w avr e ) b r u s s e l s ( z e l l i k ) f r an c e pari s ( v i l l e n e u v e - l a - g a r e n n e ) poi ti ers g e r m an y b e r l i n frank fur t ( t aunus s tein) w i e s b a d e n a a c h e n ( s yn l a b ) b e r l i n ( s yn l a b ) m a r k k l e e b e r g ( s y

In [86]:
# Match the known keywords against the PDF text held in `text`.
known_keywords = pd.read_csv('known_keywords.csv').drop_duplicates()
known_keywords['keyword'] = known_keywords['keyword'].str.lower()

# Occurrences of each keyword surrounded by single spaces.
known_keywords['counts'] = known_keywords['keyword'].map(
    lambda kw: text.count(' ' + kw.strip() + ' ')
)
# Number of space-separated parts in each keyword.
known_keywords['keyword_word_count'] = known_keywords['keyword'].map(
    lambda kw: kw.count(' ') + 1
)

# Keywords that occur at least once, longest phrases first, then most frequent.
keyword_counts = known_keywords.loc[known_keywords['counts'] > 0].sort_values(
    by=['keyword_word_count', 'counts'], ascending=False
)

keyword_counts['keyword'].head(keywords_limit).tolist()

['validation',
 'ms',
 'ce',
 'qc',
 'gmp',
 'scale',
 'development',
 'ad',
 'qa',
 'clinical',
 'sec',
 'hplc']