In [None]:
# References on scraping JavaScript-rendered pages with Python, Selenium and a headless browser:
# https://stackoverflow.com/questions/28289699/python-web-scraping-for-javascript-generated-content?noredirect=1&lq=1
# https://www.mikejohnpage.com/blog/rendering-javascript-content-using-python-selenium-and-a-headless-browser/

In [2]:
# Import modules
import os
from datetime import datetime

import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.wait import WebDriverWait

In [3]:
# Site root; the glossary page and its script are addressed relative to it
base_url = 'https://clinicaltrials.gov'
content_url = f'{base_url}/ct2/about-studies/glossary'
js_url = f'{base_url}/ct2/html/js/main-glossary.js?v=44'

In [4]:
# HTTP request robots.txt file and check permissions
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops an HTTP error page from being printed
# as if it were the robots.txt content.
r = requests.get('%s/robots.txt'%base_url, timeout=30)
r.raise_for_status()
print(r.text)

# robots.txt - robot exclusion file - back-end server version - no robots!
User-agent: htdig/3.1.6
Disallow: /cgi-bin
Disallow: /entrez
Disallow: /COG
Disallow: /Entrez
Disallow: /myncbi
Disallow: /sutils
Disallow: /Taxonomy/Selector
Disallow: /Taxonomy/CommonTree
Disallow: /entrez/sutils
Disallow: /mapview
Disallow: /blast/BlastAlign.cgi 
Disallow: /blast/bl2seq/wblast2.cgi
Disallow: /portal
Disallow: /pmc/utilities/
Disallow: /pmc/issues/
Disallow: /pmc/articles/
Disallow: /pmc/ivip/
Disallow: /pmc/articlerender.fcgi
Disallow: /pmc/pagerender.fcgi
Disallow: /pmc/picrender.fcgi
Disallow: /pmc/tocrender.fcgi
Disallow: /pmc/*report=
Disallow: /pmc/*page=
Disallow: /pmc/articles/*/citedby/
Disallow: /pmc/publisherportal/api/
Disallow: /pmc/publisherportal/application/
Disallow: /pmc/publisherportal/download
Disallow: /pmc/publisherportal/journalmanager/
Disallow: /labs/pmc/
Disallow: /geo/tools
Disallow: /projects/geo/tools
Disallow: /geo/download
Disallow: /projects/geo/download
Disallo

In [5]:
# Requests library cannot return rendered javascript content, only the unmodified DOM (static web page).
# Use selenium and a web driver to automate a web browser and return rendered javascript content (dynamic web page).

# Location of the geckodriver executable. Set the GECKODRIVER_PATH environment
# variable to point at a different install; the fallback is the path this
# notebook was originally written with
# (C:\Users\172470\scripts\geckodriver-v0.29.0-win64).
geckodriver_path = os.environ.get(
    'GECKODRIVER_PATH',
    'C:/Users/172470/scripts/geckodriver-v0.29.0-win64/geckodriver',
)

# Initiate headless Firefox driver
# NOTE(review): the executable_path keyword is kept exactly as the notebook
# used it; newer Selenium releases may expect a Service object instead —
# confirm before upgrading.
options = Options()
options.headless = True
driver = webdriver.Firefox(executable_path = geckodriver_path, options = options)

In [6]:
# Load the glossary page and block until at least one JavaScript-rendered
# definition element exists, so the page source read afterwards is complete.
# If loading or waiting fails, shut the headless browser down before
# re-raising: it has no window, so a leaked process would go unnoticed.
try:
    driver.get(content_url)
    WebDriverWait(driver, 30).until(
        lambda d: d.execute_script(
            "return document.getElementsByClassName('glossary__definition').length > 0;"
        )
    )
except Exception:
    driver.quit()
    raise

In [7]:
# Take the browser's current page source and parse it with the lxml parser
rendered_html = driver.page_source
soup = BeautifulSoup(rendered_html, features='lxml')

In [8]:
# Quit driver: the page source has already been captured into `soup`,
# so the headless Firefox session is no longer needed.
driver.quit()

In [9]:
# Display the parsed document to inspect the rendered markup
soup

<html class="fa-events-icons-ready" lang="en" xmlns="https://www.w3.org/1999/xhtml"><head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="origin-when-cross-origin" name="referrer"/>
<title>Glossary of Common Site Terms - ClinicalTrials.gov</title>
<!-- Open Graph data -->
<meta content="article" property="og:type"/>
<meta content="Glossary of Common Site Terms - ClinicalTrials.gov" property="og:title"/>
<meta content="https://clinicaltrials.gov/ct2/html/images/ct.gov-nlm-nih-logo.png" property="og:image"/>
<link href="/ct2/html/images/favicon.ico" rel="shortcut icon"/>
<link href="/ct2/html/images/apple-touch-icon-144x144.png" rel="apple-touch-icon-precomposed" sizes="144x144"/>
<link href="/ct2/html/images/apple-touch-icon-114x114.png" rel="apple-touch-icon-precomposed" sizes="114x114"/>
<link href="/ct2/html/images/apple-tou

In [10]:
# Class strings used to locate glossary entries in the parsed page:
#   term       -> 'data-glossary-term glossary__term'
#   definition -> 'glossary__definition'

# Trial run: one flat list alternating term text and definition text
test = []

for term_tag in soup.find_all(class_='data-glossary-term glossary__term'):
    test.append(term_tag.get_text())
    # find_next returns the first matching element after the term in the document
    test.append(term_tag.find_next(class_='glossary__definition').get_text())

In [13]:
# Build the glossary table: one row per term element, paired with the first
# definition element that follows it in the parsed document.
term = []
description = []

for term_tag in soup.find_all(class_='data-glossary-term glossary__term'):
    definition_tag = term_tag.find_next(class_='glossary__definition')
    if definition_tag is None:
        # find_next returns None when nothing matches; report the offending
        # term instead of failing with an AttributeError on None.get_text().
        raise ValueError('No definition found for glossary term %r' % term_tag.get_text())
    term.append(term_tag.get_text())
    description.append(definition_tag.get_text())

# Construct the frame once from the finished columns rather than creating an
# empty frame and assigning the columns into it afterwards.
df = pd.DataFrame({'Term': term, 'Description': description})

In [14]:
# Display the assembled Term / Description table
df

Unnamed: 0,Term,Description
0,Accepts healthy volunteers,A type of eligibility criteria that indicates ...
1,Active comparator arm,An arm type in which a group of participants r...
2,Adverse event,An unfavorable change in the health of a parti...
3,Age or age group,A type of eligibility criteria that indicates ...
4,All-cause mortality,"A measure of all deaths, due to any cause, tha..."
...,...,...
116,Title,The official title of a protocol used to ident...
117,Title acronym,The acronym or initials used to identify a cli...
118,U.S. Agency for Healthcare Research and Qualit...,An agency within the U.S. Department of Health...
119,U.S. Food and Drug Administration (FDA),An agency within the U.S. Department of Health...


In [15]:
# Write the glossary table to CSV without the index column
output_csv = 'nih_clinical_trials_glossary.csv'
df.to_csv(output_csv, index=False)