In [1]:
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import concurrent.futures
import time
import csv
import pandas as pd
import lxml

Fetch page urls

In [2]:
def fetch_page_urls(page, driver):
    url = f'https://data.nasdaq.com/search?page={page + 1}' # urls of all core financial data
    #url = f'https://data.nasdaq.com/search?filters=%5B%22Prices%20%26%20Volumes%22%5D&page={page + 1}' # urls of Price&Volumns data 
    #url = f'https://data.nasdaq.com/search?filters=%5B%22Fundamentals%22%5D&page={page + 1}' #urls of Fundamental data
    #url = f'https://data.nasdaq.com/search?filters=%5B%22National%20Statistics%22%5D&page={page + 1}' #urls of National Statistics
    driver.get(url)
    time.sleep(10)  # wait until the webpage is completely loaded. Adjust the sleep time as needed
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser") # get data of webpage
    #driver.quit()
    product_cards = soup.findAll("a", attrs={"class": "product-card__overview-content"}) # search for needed urls of datatables
    return [card.get('href') for card in product_cards]

def scrape_and_save_urls():
    max_page = 15 # adjust the maximum number of pages according to different web pages
    driver = uc.Chrome(headless=True, use_subprocess=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: 
        futures = [executor.submit(fetch_page_urls, page, driver) for page in range(max_page)]
        results = [future.result() for future in concurrent.futures.as_completed(futures)]
    driver.quit()
    url_list = [href for sublist in results for href in sublist]

    filename = "url_list_Fundamentals.csv"
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        for item in url_list:
            item1 = f'https://data.nasdaq.com' + item
            writer.writerow([item1])
    
    print(f"Data has been written to {filename}")

In [None]:
scrape_and_save_urls()

Search datatable code in each webpage

In [3]:
with open('url_list.csv', mode='r') as file:
    reader = csv.reader(file)
    url_list = [row for row in reader]  # Read the rest of the rows

In [4]:
url_list

[['https://data.nasdaq.com/databases/RTAT'],
 ['https://data.nasdaq.com/databases/EVML'],
 ['https://data.nasdaq.com/databases/EVMLW'],
 ['https://data.nasdaq.com/databases/SEP'],
 ['https://data.nasdaq.com/databases/EVAF'],
 ['https://data.nasdaq.com/databases/EOD'],
 ['https://data.nasdaq.com/databases/SFA'],
 ['https://data.nasdaq.com/databases/ECD'],
 ['https://data.nasdaq.com/databases/SF1'],
 ['https://data.nasdaq.com/databases/ZFB'],
 ['https://data.nasdaq.com/databases/ZES'],
 ['https://data.nasdaq.com/databases/MF1'],
 ['https://data.nasdaq.com/databases/WVD'],
 ['https://data.nasdaq.com/databases/WVBBT'],
 ['https://data.nasdaq.com/databases/NDWTA'],
 ['https://data.nasdaq.com/databases/NDWFUNDTA'],
 ['https://data.nasdaq.com/databases/NDWEQTA'],
 ['https://data.nasdaq.com/databases/ETFC'],
 ['https://data.nasdaq.com/databases/ETFCCA'],
 ['https://data.nasdaq.com/databases/PTSR'],
 ['https://data.nasdaq.com/databases/SFB'],
 ['https://data.nasdaq.com/databases/SF3'],
 ['https

Example: https://data.nasdaq.com/databases/RTAT

In [5]:
driver = uc.Chrome(headless=True, use_subprocess=False)
driver.get('https://data.nasdaq.com/databases/RTAT')
page_source = driver.page_source
soup = BeautifulSoup(page_source, "html.parser")
driver.quit()

In [6]:
table = soup.find("section", attrs={"data-anchor": "anchor-data-organization"}).find("div", attrs={"class": "documentation-markdown"}).find("table")

In [7]:
table

<table><thead><tr><th>TABLE</th><th>TABLE CODE</th><th>TABLE DESCRIPTION</th></tr></thead><tbody><tr><td><a href="https://data.nasdaq.com/databases/RTAT/documentation?anchor=retail-trading-activity-tracker-daily-top-10-free-ndaq-rtat10-">Retail Trading Activity Tracker - Daily Top 10 (Free)</a></td><td>NDAQ/RTAT10</td><td>Daily ticker-level insights into retail activity and sentiment (top 10 tickers as ranked by activity)</td></tr><tr><td><a href="https://data.nasdaq.com/databases/RTAT/documentation?anchor=retail-trading-activity-tracker-daily-full-universe-premium-ndaq-rtat-">Retail Trading Activity Tracker - Daily Full Universe (Premium)</a></td><td>NDAQ/RTAT</td><td>Daily ticker-level insights into retail activity and sentiment</td></tr></tbody></table>

Transform the table element above to pandas dataframe

In [8]:
def table_to_df(table):
    try:
        header = [th.text for th in table.find('thead').find_all('th')]
        rows = []
        for tr in table.find('tbody').find_all('tr'):
            cells = []
            for td in tr.find_all('td'):
                a = td.find('a')
                if a:
                    cell_content = a.text
                else:
                    cell_content = td.text
                cells.append(cell_content)
            rows.append(cells)
        df = pd.DataFrame(rows, columns=header)
        
    except:
        table_code = None
        df = pd.DataFrame({'TABLE CODE': [table_code]})
    return df

In [50]:
dataframes = []
driver = uc.Chrome(headless=True,use_subprocess=False)
for page in url_list:
        driver.get(page[0])
        time.sleep(6)
        print('Success!' + page[0])
        page_source_test = driver.page_source
        soup_test = BeautifulSoup(page_source_test, "html.parser")
        try:
                table = soup_test.find("section", attrs={"data-anchor": "anchor-data-organization"}).find("div", attrs={"class": "documentation-markdown"}).find("table")
        except:
                table = None
        df = table_to_df(table)
        df['urls'] = page[0]
        dataframes.append(df)

could not detect version_main.therefore, we are assuming it is chrome 108 or higher


Success!https://data.nasdaq.com/databases/RTAT
Success!https://data.nasdaq.com/databases/EVML
Success!https://data.nasdaq.com/databases/EVMLW
Success!https://data.nasdaq.com/databases/SEP
Success!https://data.nasdaq.com/databases/EVAF
Success!https://data.nasdaq.com/databases/EOD
Success!https://data.nasdaq.com/databases/SFA
Success!https://data.nasdaq.com/databases/ECD
Success!https://data.nasdaq.com/databases/SF1
Success!https://data.nasdaq.com/databases/ZFB
Success!https://data.nasdaq.com/databases/ZES
Success!https://data.nasdaq.com/databases/MF1
Success!https://data.nasdaq.com/databases/WVD
Success!https://data.nasdaq.com/databases/WVBBT
Success!https://data.nasdaq.com/databases/NDWTA
Success!https://data.nasdaq.com/databases/NDWFUNDTA
Success!https://data.nasdaq.com/databases/NDWEQTA
Success!https://data.nasdaq.com/databases/ETFC


KeyboardInterrupt: 

In [9]:
tc = []
for i in dataframes:
    try: 
        a = i['TABLE CODE'].values.tolist()
        tc.append(i['TABLE CODE'].values.tolist())
    except:
        try:
            a = i['Quandl Code'].values.tolist()
            tc.append(i['TABLE CODE'].values.tolist())
        except:
            tc.append([None])

with open('table_code.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(tc)
print(f'Data saved to {'table_code.csv'}')

SyntaxError: invalid syntax (32758748.py, line 16)