# Known Keyword Extracton

In [7]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from urllib.request import Request, urlopen
from tldextract import extract
import progressbar
import pandas as pd
import certifi
from fake_useragent import UserAgent
import requests

ua = UserAgent()
header = {'User-Agent':str(ua.chrome)}

links_substring_blacklist = [
    'sitemap',
    'terms',
    'contact',
    'careers',
    'uploads',
    'policy',
    '/media',
    'events/'
]
keywords_limit = 15
crawl_limit = 40

def link_from_same_subdomain(url, domain):
    _, sub_hostname, sub_suffix = extract(url)
    _, hostname, suffix = extract(domain)
    
    return sub_hostname == hostname and sub_suffix == suffix

def get_text_from_url(url):
    try:
        html_page = requests.get(url, headers=header, verify=False).content
    except Exception as e:
        print('Failed to fetch', url, e, '\n')
        return []

    soup = BeautifulSoup(html_page, 'html.parser')
    text = soup.find_all(text=True)

    for elem in soup.find_all(['script', 'li', 'header', 'noscript', 'h1', 'h2', 'h3', 'h4']):
        elem.decompose()

    strips = list(soup.stripped_strings)
    
    return '\n'.join([f' {strip} ' for strip in strips if len(strip) > 50])

def get_links_from_url(url):
    try:
        html_page = requests.get(url, headers=header, verify=False).content
    except Exception as e:
        print('faile to fetch', url, e, '\n')
        return set()

    soup = BeautifulSoup(html_page, "lxml")

    # get all links with absolute paths
    links = set()
    for link in soup.findAll('a'):
        link_href = urljoin(url, link.get('href'))
        parsed = urlparse(link_href, scheme='https')._replace(fragment="")
        
        if parsed.scheme[:4] == 'http':
            links.add(parsed.geturl().rstrip('/'))

    # return urls with same base domain
    return {link for link in links if link_from_same_subdomain(link, url)}

def get_links_recursive(url, existing_links=set(), max_depth=1, maxlen=20):
    if (len(existing_links) >= max_depth):
        return set()

    links = get_links_from_url(url)
    
    existing_links = existing_links | {url}
    for link in links - existing_links:
        links |= get_links_recursive(link, existing_links)
        
    for blacklisted_domain_substring in links_substring_blacklist:
        links = [link for link in links if blacklisted_domain_substring not in link]
    
    return links

def get_text_from_website(website):
    text = ''
    for link in get_links_recursive(website)[:crawl_limit]:
        print('getting', link, end='\r')
        text += get_text_from_url(link) + '\n'
        
    return text.replace('.', ' ').lower()

def get_known_keywords_for_website(website):
    text = get_text_from_website(website)
    known_keywords = pd.read_csv('known_keywords.csv').drop_duplicates()
    known_keywords['keyword'] = known_keywords['keyword'].str.lower()
    known_keywords['counts'] = known_keywords['keyword'].map(lambda x: text.count(f' {x.strip()} '))
    known_keywords['keyword_word_count'] = known_keywords['keyword'].map(lambda x: len(x.split(' ')))

    keyword_counts = known_keywords[known_keywords['counts'] > 0]
    keyword_counts = keyword_counts.sort_values(by=['keyword_word_count', 'counts'], ascending=False)

    return keyword_counts['keyword'].tolist()[:keywords_limit]

def get_title(url):
    _, hostname, suffix = extract(url)
    return hostname.capitalize()




In [8]:
# run on a list of hanb-picked domains

urls = [
    'https://bionicalemas.com/',
    'https://abzena.com/',
    'https://lonza.com',
    'https://www.merck.com/',
    'https://www.eurofins.com/',
    'https://www.sgs.com/'
]

extracted_keywords = []
for url in urls:
    extracted_keywords.append({
        'domain': url,
        'keywords': get_known_keywords_for_website(url),
        'company_name': get_title(url)
    })
    
extracted_keywords



getting https://bionicalemas.com/aboutgetting https://bionicalemas.com/clinical-development/clinical-operations



getting https://bionicalemas.com/news/bionical-emas-stands-with-ukrainegetting https://bionicalemas.com/case-studies



getting https://bionicalemas.com/early-access-programs/real-world-data-collection-evidence-generation



getting https://bionicalemas.com/clinical-development/pharmacovigilance



getting https://bionicalemas.com/?s=



getting https://bionicalemas.comgetting https://bionicalemas.com/clinical-development/data-services



getting https://bionicalemas.com/news/a-statement-from-the-bionical-emas-group-on-guidance-measures-regarding-covid-19getting https://bionicalemas.com/news/amylyx-pharmaceuticals-announces-launch-of-u-s-expanded-access-program-for-amx0035



getting https://bionicalemas.com/news/ian-fraser-appointed-as-executive-vice-president-clinical-developmentgetting https://bionicalemas.com/clinical-trial-supply/comparator-ancillary-supply



getting https://bionicalemas.com/clinical-trial-supply/yourbionical-appgetting https://bionicalemas.com/clinical-trial-supply



getting https://bionicalemas.com/section-172-statementgetting https://bionicalemas.com/early-access-programs



getting https://bionicalemas.com/newsgetting https://bionicalemas.com/clinical-development



getting https://bionicalemas.com/clinical-development/medical-affairs



getting https://bionicalemas.com/clinical-trial-supply/biosimilar-servicesgetting https://bionicalemas.com/clinical-development/regulatory-health-economic-science



getting https://bionicalemas.com/eventsgetting https://bionicalemas.com/our-people



getting https://bionicalemas.com/news/auxora-vs-placebo-for-the-treatment-of-patients-with-severe-covid-19-pneumonia-a-randomized-controlled-clinical-trialgetting https://bionicalemas.com/clinical-development/quality-assurance



getting https://bionicalemas.com/early-access-programs/program-strategy-and-designgetting https://bionicalemas.com/clinical-development/project-management



getting https://bionicalemas.com/fact-sheetsgetting https://bionicalemas.com/anti-slavery-and-human-trafficking-statement-2021



getting https://bionicalemas.com/early-access-programs/program-delivery



getting https://abzena.com/development-services/protein-engineering/antibody-reformatting



getting https://abzena.com/about/about-us/management



getting https://abzena.com/development-services/bioconjugation/adc-cascade



getting https://abzena.com/manufacturing/mammalian-manufacturing



getting https://abzena.com/development-services/protein-engineering



getting https://abzena.com/manufacturing/bioconjugates-manufacturing/manufacturing-capabilities



getting https://abzena.com/development-services/bioassays/functional-assays/immuno-oncology-assays-phosphoflow-validation



getting https://abzena.com/development-services/bioanalytics



getting https://abzena.com/development-services/discovery



getting https://abzena.com/chemistry-home



getting https://abzena.com/development-services/cell-line-development/biosimilar-cell-line-development



getting https://abzena.com/manufacturing/chemistry/custom-synthesis



getting https://abzena.com/development-services/bioanalytics/analytical-method-development



getting https://abzena.com/manufacturing/peptides/peptide-synthesis



getting https://abzena.com/manufacturing/peptides



getting https://abzena.com/news/abzena-opens-a-new-biologics-testing-lab-to-support-partners-programs



getting https://abzena.com/development-services/protein-engineering/ig-class-and-isotype-switching



getting https://abzena.com/development-services/bioconjugation



getting https://abzena.com/development-services/bioconjugation/antibody-drug-conjugate-manufacturing



getting https://abzena.com/development-services/bioanalytics/biosimilarity



getting https://abzena.com/development-services/bioconjugation/payloads



getting https://abzena.com/development-services/discovery/developability-assessment



getting https://abzena.com/news/abzena-announces-appointment-of-petra-dieterich-as-scientific-lead



getting https://abzena.com/development-services/bioassays/cell-health



getting https://abzena.com/about/about-us



getting https://abzena.com/development-services/protein-engineering/antibody-production



getting https://abzena.com/development-services/bioassays/custom-assay-development



getting https://abzena.com/manufacturing/peptides/peptide-analytics



getting https://abzena.com/development-services/bioassays



getting https://abzena.com/development-services/bioanalytics/candidate-selection



getting https://abzena.com/manufacturing/chemistry/compound-library-synthesis



getting https://abzena.com/development-services/bioassays/fc-binding-and-function-assays



getting https://abzena.com/manufacturing/bioconjugates-manufacturing



getting https://abzena.com/development-services/bioconjugation/antibody-drug-conjugation



getting https://abzena.com/development-services/protein-engineering/affinity-maturation



getting https://abzena.com/development-services/protein-engineering/bespoke-projects



getting https://abzena.com/manufacturing/bioconjugates-manufacturing/process-development



getting https://abzena.com/development-services/immunology



getting https://abzena.com/development-services/bioassays/adc-characterisation



getting https://abzena.com/development-services/cell-line-development/host-cell-switching-and-recloning



getting https://lonza.com/company-overview/our-locations



getting https://lonza.com/company-overview/our-websites



getting https://lonza.com/sustainability/planet



getting https://lonza.com/sustainability/people



getting https://lonza.com/investor-relations/agenda-and-events



getting https://lonza.com/public/conditions



getting https://pharma.lonza.com:443



getting https://lonza.com/investor-relations/corporate-governance



getting https://lonza.com/news-and-media/news-archive



getting https://lonza.com/investor-relations/reporting-center



getting https://lonza.com/sustainability/performance



getting https://lonza.com/investor-relations/annual-report



getting https://lonza.com/sustainability/ethics-and-compliance



getting https://lonza.com/company-overview/strategy



getting https://lonza.com/company-overview/our-history



getting https://lonza.com/news-and-media/blog



getting https://lonza.com/company-overview



getting https://lonza.com/sustainability/community



getting https://lonza.com/news-and-media/leadership-portraits



getting https://lonza.com/news-and-media/image-library



getting https://lonza.com/sustainability/global-quality



getting https://lonza.com/news-and-media/a-view-on-podcast



getting https://lonza.com/company-overview/leadership



getting https://lonza.com



getting https://lonza.com/investor-relations/shareholders-and-stock-information



getting https://lonza.com/news-and-media/videos



getting https://lonza.com/news-and-media/logo-guidelines



getting https://lonza.com/events



getting https://lonza.com/news-and-media



getting https://lonza.com/sustainability



getting https://lonza.com/investor-relations



getting https://pharma.lonza.com



getting https://www.merck.com/research-and-products/products-list



getting https://www.merck.com/company-overview/culture-and-valuesgetting https://www.merck.com/research-and-products/infectious-diseases



getting https://jobs.merck.com/us/en/research-laboratories



getting https://www.merck.com/research-and-products/discovery-developmentgetting https://www.merck.com/company-overview/history



getting https://jobs.merck.com/us/en/manufacturing



getting https://www.merck.com/stories/3-ways-to-advance-cancer-health-care-equitygetting https://www.merck.com/company-overview/responsibility/merck-medical-outreach-program



getting https://www.merck.com/patients/patient-and-treatment-education



getting https://www.merck.com/research-and-products/covid-19getting https://jobs.merck.com/us/en



getting https://www.merck.com/company-overview/responsibility/philanthropy



getting https://jobs.merck.com/us/en/jointalentcommunity?applyType=JTC



getting https://www.merck.com/company-overview/responsibility/esg-resourcesgetting https://www.merck.com/research-and-products/clinical-trials



getting https://www.merck.com/investor-relations/events-and-presentations



getting https://www.merck.com/research-and-products/cardio-metabolic-disordersgetting https://www.merck.com/company-overview/business-development-licensing



getting https://www.merck.com/investor-relations/organon-resourcesgetting https://www.merck.com/research-and-products/vaccines



getting https://jobs.merck.com/us/en/home



getting https://www.merck.com/company-overview/culture-and-values/code-of-conductgetting https://www.merck.com/company-overview/responsibility/impact-investing



getting https://www.merck.com/research-and-productsgetting https://www.merck.com/company-overview/leadership/executive-team



getting https://jobs.merck.com/us/en/animal-health



getting https://www.merck.com/research-and-products/oncologygetting https://www.merck.com/our-covid-19-response



getting https://www.merck.com/company-overview/policies-and-positionsgetting https://jobs.merck.com/us/en/student-opportunities



getting https://www.merck.com/stories/celebrating-our-asian-american-and-pacific-islander-colleaguesgetting https://www.merck.com/investor-relations/financial-information



getting https://www.merck.com/investor-relations/stock-info



getting https://www.merck.com/research-and-products/distributors



getting https://www.merck.com/storiesgetting https://www.merck.com/research-and-products/patent



getting https://www.merck.com/company-overview/leadership



getting https://jobs.merck.com/us/en/veteran-opportunities



getting https://www.merck.com/company-overview/responsibility/transparency-disclosures



getting https://www.eurofins.com/food-and-feed-testing



getting https://www.eurofins.com/about-us/corporate-sustainability/governance/eurofins-core-compliance-documents



getting https://www.eurofins.com/investors/reports-and-presentations



getting http://www.eurofins.com/investor-relations



getting https://www.eurofins.com/our-services/clinical-diagnostics/usa



getting https://www.eurofins.com/investors/executives-dealings-disclosures



getting https://www.eurofins.com/our-services/agroscience-services/specialisms



getting https://www.eurofins.com/our-services/biopharma-services/biopharma-product-testing



getting https://www.eurofins.com/scientific-impact/scientific-innovation/the-future-of-bees-the-future-of-life



getting https://www.eurofins.com/our-services/genomic-services/genotyping-gene-expression



getting https://www.eurofins.com/about-us/corporate-sustainability/social-and-employee-matters/ft-diversity-leader-by-financial-times



getting https://www.eurofins.com/scientific-impact/scientific-innovation/helping-to-stop-zika-in-its-tracks



getting https://www.eurofins.com/investors/debt-hybrid-capital-instruments/credit-presentations/2017



getting https://www.eurofins.com/our-services/genomic-services/news-events



getting https://www.eurofins.com/scientific-impact/scientific-innovation



getting https://www.eurofins.com/about-us/corporate-sustainability/our-approach-to-corporate-sustainability/scientific-collaborations



getting https://www.eurofins.com/scientific-impact/scientific-innovation/testing-for-traceability-of-meat-from-farm-to-fork-down-to-individual-animals



getting https://www.eurofins.com/scientific-impact/scientific-publications



getting https://www.eurofins.com/covid-19-response/surface-testing-services



getting https://www.eurofins.com/covid-19-response/clinical-diagnostic-services



getting https://www.eurofins.com/scientific-impact/scientific-innovation/new-generation-of-non-invasive-prenatal-testing-methods



getting https://www.eurofins.com/about-us/corporate-sustainability/governance/whistleblowing



getting https://www.eurofins.com/our-services/genomic-services/dna-rna-oligonucleotides



getting https://www.eurofins.com/our-services/food-and-feed-testing/food-testing-flyers



getting https://www.eurofins.com/our-services/reach-services



getting https://www.eurofins.com/scientific-impact/scientific-innovation/algae-algae-everywhere-toxic-and-what-a-stink



getting https://www.eurofins.com/environment-testing



getting https://www.eurofins.com/materials-and-engineering-sciences



getting https://www.eurofins.com/about-us/corporate-sustainability/social-and-employee-matters/total-recordable-incidence-rate-trir



getting https://www.eurofins.com/our-services/consumer-product-testing/about-us



getting https://www.eurofins.com/investors/debt-hybrid-capital-instruments/rating



getting https://www.eurofins.com/our-services



getting https://www.eurofins.com/covid-19-response/product-testing-of-ppe



getting https://www.eurofins.com/scientific-impact/scientific-innovation/solving-old-crimes-with-new-technology



getting https://www.eurofins.com/our-services/environment-testing/environmental-testing-news



getting https://www.eurofins.com/scientific-impact/scientific-innovation/acrylamide-on-the-tip-of-everyone-s-tongue



getting https://www.eurofins.com/about-us



getting https://www.eurofins.com/our-services/environment-testing



getting https://www.eurofins.com/scientific-impact/scientific-innovation/beating-drug-assisted-rape-forensically



getting https://www.eurofins.com/scientific-impact/scientific-innovation/the-forefathers-of-pesticide-testing



getting https://www.sgs.com/en/privacy-at-sgsnd-products/product-inspection



getting https://www.sgs.com/en/sustainability-solutions/sustainable-infrastructures



getting https://www.sgs.com/en/icm



getting https://www.sgs.com/en/news/2022/03/understanding-complex-regulatory-landscapes-for-consumer-electricals-in-latin-america



getting https://www.sgs.com/en/natural-resources/agricultural-commodities



getting https://www.sgs.com/en/sustainability-solutions/sustainable-production



getting https://www.sgs.com/en/verify-sgs-documents



getting https://www.sgs.com/en/industries-and-environment



getting https://www.sgs.com/en/sustainability-solutions/sustainable-energy



getting https://www.sgs.com/en/health-and-nutrition/food



getting https://www.sgs.com/en/connectivity-and-products/e-commerce-product-compliance



getting https://www.sgs.com/en/natural-resources/laboratory-testing-petroleum-and-chemicals



getting https://www.sgs.com/en/connectivity-and-products/hardgoods-toys-and-juvenile-products



getting https://www.sgs.com/en/news/2022/04/earth-day-2022-sgs-invests-in-energy-efficiency-measures-around-the-globe



getting https://www.sgs.com/en/our-company/corporate-sustainability



getting https://www.sgs.com/en/social-media



getting https://www.sgs.com/en/white-paper-library



getting https://www.sgs.com/en/connectivity-and-products



getting https://www.sgs.com/en/health-and-nutrition/crop-science



getting https://www.sgs.com/en/connectivity-and-products/brand-protection



getting https://www.sgs.com/en/sustainability-solutions/sustainable-use-of-natural-resources



getting https://www.sgs.com/en/health-and-nutrition



getting https://www.sgs.com/en/industries-and-environment/transportation



getting https://www.sgs.com/en/news/2022/04/sgs-s-callao-city-food-testing-laboratory-upgraded-increasing-response-capacity-and-sustainability



getting https://www.sgs.com/en/knowledge-solutions/esg-assurance-services



getting https://www.sgs.com/en/connectivity-and-products/product-certification



getting https://www.sgs.com/en/health-and-nutrition/health-science



getting https://www.sgs.com/en/connectivity-and-products/digital-solutions-for-connectivity-and-products



getting https://www.sgs.com/en/natural-resources/oil-gas-and-chemical-commodities



getting https://www.sgs.com/en/health-and-nutrition/cosmetics-and-hygiene



getting https://www.sgs.com/en/knowledge-solutions/supply-chain-assurance



getting https://www.sgs.com/en/knowledge-solutions/technical-consulting



getting https://www.sgs.com/en/knowledge-solutions/medical-devices-solutions



getting https://www.sgs.com/en/connectivity-and-products/connectivity



getting https://www.sgs.com/en/our-company/compliance-and-integrity



getting https://www.sgs.com/en/digital-solutions/general-cybersecurity-services



getting https://www.sgs.com/en/news/2022/04/sgs-opens-new-marine-laboratory-in-panama



getting https://www.sgs.com/en/digital-solutions/pitch-an-idea



getting https://www.sgs.com/en/connectivity-and-products/softlines-and-accessories



[{'domain': 'https://bionicalemas.com/',
  'keywords': ['clinical trial',
   'clinical development',
   'quality assurance',
   'clinical supply',
   'clinical',
   'development',
   'compliance',
   'research',
   'qa',
   'solution',
   'gdp',
   'tested'],
  'company_name': 'Bionicalemas'},
 {'domain': 'https://abzena.com/',
  'keywords': ['cell line development',
   'cell line',
   'cell lines',
   'drug substance',
   'clinical development',
   'bispecific antibodies',
   'recombinant proteins',
   'monoclonal antibody',
   'monoclonal antibodies',
   'fusion proteins',
   'small molecules',
   'small molecule',
   'analytical development',
   'quality control',
   'development'],
  'company_name': 'Abzena'},
 {'domain': 'https://lonza.com',
  'keywords': ['highly potent apis',
   'drug substance',
   'contract development',
   'small molecules',
   'small molecule',
   'large molecules',
   'recombinant proteins',
   'quality control',
   'commercial molecules',
   'clinical supp

In [11]:
extracted_kw = pd.DataFrame(extracted_keywords)
extracted_kw.to_csv("extracted_keywords.csv", index=False)

In [12]:
extracted_kw

Unnamed: 0,domain,keywords,company_name
0,https://bionicalemas.com/,"[clinical trial, clinical development, quality...",Bionicalemas
1,https://abzena.com/,"[cell line development, cell line, cell lines,...",Abzena
2,https://lonza.com,"[highly potent apis, drug substance, contract ...",Lonza
3,https://www.merck.com/,"[research and development, clinical trial, cli...",Merck
4,https://www.eurofins.com/,"[research and development, contract developmen...",Eurofins
5,https://www.sgs.com/,"[research and development, quality control, qu...",Sgs


# Unknown Keywords Extraction

In [44]:
from flair.data import Sentence
from flair.models import MultiTagger
from flair.tokenization import SciSpacyTokenizer

sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")

# load biomedical tagger
tagger = MultiTagger.load("hunflair")

tags = []
for s in text:
    if s[:4] == 'http':
        continue
    sentence = Sentence(s)
    tagger.predict(sentence)
    print(s)
    for annotation_layer in sentence.annotation_layers.keys():
        for entity in sentence.get_spans(annotation_layer):
            tags += [entity.text]
            print('TAG!', entity.text)

Where might a career with Lonza take you? Join us and help to make a meaningful difference.
The vast majority of us are now going online to look for our next career move. Unfortunately, this has presented new opportunities for scammers. Several fraud attempts using the Lonza name have been reported to us in various countries. These activities range from fake Lonza job adverts being created, to fraudsters posing as Lonza recruiters. Their ultimate aim is almost always to obtain money and / or access personal and confidential information.
Over the past few years, online recruitment scams have increased substantially. They are very difficult to stop but, fortunately, they’re usually easy to identify. Please remain aware of the following practices, which define an authentic Lonza recruitment experience:
If you suspect that an attempt has been made to commit recruitment fraud, please report this to your local authority.
Belgium, Verviers; United States, Walkersville (Maryland)
Lab Technicia

A firm commitment to responsible
business and sustainability underpins everything we do. Minimizing our impact
on the environment, conserving energy and natural resources, and helping to
improve life quality are all central to our culture. Lonza's Vision Zero
initiative is a prime example, as we strive to achieve zero workplace accidents
and injuries, zero environmental incidents, zero product transportation
incidents and zero manufacturing process incidents. We work to attract and
retain the best talent, to make a meaningful difference to our own business, as
well as the communities in which we operate.
Lonza generated sales of CHF 5.4 billion with a CORE EBITDA of CHF 1.7 billion in Full-Year 2021. The Lonza shares are listed on the SIX Swiss Exchange and Swiss Market Index (SMI). We also maintain a secondary listing on the SGX Singapore Exchange.
TAG! CHF
TAG! CHF
Our journey to become the leading healthcare solution provider and help overcome some of the world’s greatest challenges

KeyboardInterrupt: 