# Scrape the table of spatial transcriptomics datasets from 10X Genomics website
The page content is rendered dynamically rather than served as static HTML, so we need to use Selenium to load it in a real browser.

`geckodriver` can be downloaded from https://github.com/mozilla/geckodriver/releases

In [40]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd

# Set up driver for browser
firefox_options = Options()
# firefox_options.add_argument("--headless")
s = Service("./geckodriver")
driver = webdriver.Firefox(service=s, options=firefox_options)

# Everything that talks to the browser runs inside try/finally so that the
# Firefox/geckodriver process is shut down even if loading, scrolling or
# parsing raises part-way through.
try:
    # Get table
    url = "https://www.10xgenomics.com/datasets?query=&page=1&configure%5Bfacets%5D%5B0%5D=chemistryVersionAndThroughput&configure%5Bfacets%5D%5B1%5D=pipeline.version&configure%5BhitsPerPage%5D=150&configure%5BmaxValuesPerFacet%5D=1000&menu%5Bproducts.name%5D=Spatial%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B0%5D=In%20Situ%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B1%5D=HD%20Spatial%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B2%5D=Spatial%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B3%5D=CytAssist%20Spatial%20Gene%20and%20Protein%20Expression"
    driver.get(url)
    # Fixed pause before touching the page, to let the initial content load.
    time.sleep(20)

    # Scroll because it's dynamically loaded
    scroll_increment = 500
    while True:
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        time.sleep(5)

        # Check if we've reached the bottom
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        current_position = driver.execute_script(
            "return window.pageYOffset + window.innerHeight;"
        )
        # The scroll offset can be fractional while scrollHeight is a rounded
        # integer, so allow 1px of slack; a strict comparison could otherwise
        # never succeed and the loop would not terminate.
        if current_position >= scroll_height - 1:
            break
    html = driver.page_source

    # Parse table
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", class_="css-118cv4k")
    if table is None:
        # The class name is generated by the site and may change; fail with a
        # clear message instead of an AttributeError on None below.
        raise RuntimeError(
            "Could not find the datasets table (class 'css-118cv4k'); "
            "the page's CSS class names may have changed."
        )
    headers = [th.get_text() for th in table.find_all("th")]

    # One list of cell texts per table row, read from the live DOM.
    rows = driver.find_elements(By.CSS_SELECTOR, "tr.css-1mz91s4") # might have to adjust when they change it
    dataset_information = [
        [td.text for td in row.find_elements(By.CSS_SELECTOR, "td")]
        for row in rows
    ]
finally:
    # Close the WebDriver
    driver.quit()

## Sanitise and convert to CSV

In [41]:
# Discard blank header cells so that only named columns remain.
headers = list(filter(lambda name: len(name) > 0, headers))

# Collect every non-empty scraped row, after checking that it has exactly one
# cell per named column.
rows_for_df = []
for row in dataset_information:
    # Rows with no cells at all carry no data; skip them.
    if len(row) == 0:
        continue

    # Some rows come back with one extra cell whose second entry is empty;
    # remove that entry (in place) so the row lines up with the headers.
    has_extra_blank = len(row) == len(headers) + 1 and len(row[1]) == 0
    if has_extra_blank:
        del row[1]

    # Any remaining length mismatch means the table layout is not what this
    # cell expects, so stop rather than build a misaligned frame.
    assert len(row) == len(headers)
    rows_for_df.append(row)

df = pd.DataFrame(data=rows_for_df, columns=headers)
df

Unnamed: 0,Datasets,Products,Chemistry Version,Additional Applications,Software,Pipeline Version,Subpipeline,Species,Disease State,Anatomical entity,...,Preservation Method,Staining Method,Selected Cell Types,Biomaterial type,Cell line,Feature Barcode,Donor count,Development Stage,10x Instrument(s),Publish Date
0,FFPE Human Brain Cancer Data with Human Immuno...,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v2.0.0,,Human,glioblastoma multiforme,brain,...,FFPE,,,Specimen from Organism,,,1,adult,Xenium Analyzer,2024-04-15
1,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v1.9.0,,Mouse,,bone,...,FFPE,,,Specimen from Organism,,,3,adult,Xenium Analyzer,2024-04-03
2,Human Bone and Bone Marrow Data with Custom Ad...,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v1.9.0,,Human,acute lymphoid leukemia,"bone, bone marrow",...,FFPE,,,Specimen from Organism,,,3,adult,Xenium Analyzer,2024-04-03
3,"Visium HD Spatial Gene Expression Library, Mou...",HD Spatial Gene Expression,v1,,Space Ranger,v3.0.0,spaceranger count,Mouse,Healthy,Brain,...,FFPE,H&E,,"Specimen from Organism, Imaged Specimen",,,1,juvenile,Visium CytAssist,2024-03-29
4,"Visium HD Spatial Gene Expression Library, Hum...",HD Spatial Gene Expression,v1,,Space Ranger,v3.0.0,spaceranger count,Human,Lung cancer,Lung,...,FFPE,IF,,"Specimen from Organism, Imaged Specimen",,,1,adult,Visium CytAssist,2024-03-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Mouse Brain Serial Section 2 (Sagittal-Posterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02
120,Mouse Brain Serial Section 2 (Sagittal-Anterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02
121,Mouse Brain Serial Section 1 (Sagittal-Posterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02
122,Mouse Brain Serial Section 1 (Sagittal-Anterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02


In [43]:
# Write the cleaned table to disk, leaving out the integer row index.
df.to_csv(path_or_buf="../data/10x_datasets.csv", index=False)