# Scrape the table of spatial transcriptomics datasets from 10X Genomics website
The website's content isn't static (the table is rendered dynamically), so we need to use Selenium to load it in a browser.

`geckodriver` can be downloaded from https://github.com/mozilla/geckodriver/releases

In [2]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import pandas as pd

# Set up a headless Firefox driver for the browser session
firefox_options = Options()
firefox_options.add_argument("--headless")
s = Service("./geckodriver")  # Update the path to your GeckoDriver
driver = webdriver.Firefox(service=s, options=firefox_options)

url = "https://www.10xgenomics.com/datasets?query=&page=2&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000&refinementList%5Bproduct.name%5D%5B0%5D=Spatial%20Gene%20Expression"

dataset_information = []
try:
    # Get table: load the page and wait a fixed time before reading it
    driver.get(url)
    time.sleep(20)  # Adjust time according to network speed
    html = driver.page_source

    # Collect the text of every <td> in each matching row. This must happen
    # while the browser session is still open, because the row elements are
    # live WebDriver handles.
    rows = driver.find_elements(By.CSS_SELECTOR, "tr.css-1mz91s4")  # might have to adjust when they change it
    for row in rows:
        tds = row.find_elements(By.CSS_SELECTOR, "td")
        row_data = [td.text for td in tds]
        dataset_information.append(row_data)
finally:
    # Close the WebDriver even when loading or scraping fails, so that no
    # headless browser process is left running.
    driver.quit()

# Parse the column headers out of the captured page source
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", class_="css-118cv4k")
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise RuntimeError(
        "Could not find the datasets table (class 'css-118cv4k') in the page source; "
        "the page may not have finished loading or the CSS class may have changed."
    )
headers = [th.get_text() for th in table.find_all("th")]

## Sanitise and convert to csv

In [19]:
# Keep only the non-empty header strings as column names
headers = [name for name in headers if name]
n_columns = len(headers)

rows_for_df = []
for cells in dataset_information:
    # Rows that yielded no cells at all are skipped
    if not cells:
        continue

    # Weirdly, some rows carry one extra cell whose 2nd element is empty;
    # drop that element so the row lines up with the headers
    if len(cells) == n_columns + 1 and not cells[1]:
        del cells[1]

    # Every remaining row must have exactly one value per column
    assert len(cells) == n_columns
    rows_for_df.append(cells)

df = pd.DataFrame(rows_for_df, columns=headers)
df

Unnamed: 0,Datasets,Products,Chemistry Version,Additional Applications,Software,Pipeline Version,Subpipeline,Species,Disease State,Anatomical entity,...,Preservation Method,Staining Method,Selected Cell Types,Biomaterial type,Cell line,Feature Barcode,Donor count,Development Stage,10x Instrument(s),Publish Date
0,Visium CytAssist Gene Expression Libraries of ...,Spatial Gene Expression,v2,,Space Ranger,v2.1.0,spaceranger count,Human,cancer; adenocarcinoma stage IIA,colon,...,FFPE,,,"Specimen from Organism, Imaged Specimen",,,1,,Visium CytAssist,2023-10-05
1,Visium CytAssist Gene Expression Libraries of ...,Spatial Gene Expression,v2,,Space Ranger,v2.1.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,,,"Specimen from Organism, Imaged Specimen",,,1,,Visium CytAssist,2023-10-05
2,Preservation Method Comparison on Visium CytAs...,Spatial Gene Expression,v2,,Space Ranger,v2.1.0,spaceranger count,Mouse,healthy,brain,...,Fixed Frozen,H&E,,"Specimen from Organism, Imaged Specimen",,,,,Visium CytAssist,2023-06-08
3,Preservation Method Comparison on Visium CytAs...,Spatial Gene Expression,v2,,Space Ranger,v2.1.0,spaceranger count,Mouse,healthy,brain,...,Fresh Frozen,H&E,,"Specimen from Organism, Imaged Specimen",,,,,Visium CytAssist,2023-06-08
4,Preservation Method Comparison on Visium CytAs...,Spatial Gene Expression,v2,,Space Ranger,v2.1.0,spaceranger count,Mouse,healthy,brain,...,FFPE,H&E,,"Specimen from Organism, Imaged Specimen",,,,,Visium CytAssist,2023-06-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,Mouse Brain Serial Section 2 (Sagittal-Posterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02
84,Mouse Brain Serial Section 2 (Sagittal-Anterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02
85,Mouse Brain Serial Section 1 (Sagittal-Posterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02
86,Mouse Brain Serial Section 1 (Sagittal-Anterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,Fresh Frozen,H&E,,Specimen from Organism,,,1,,,2019-12-02


In [20]:
# Save the table as CSV, leaving out the DataFrame's row index
df.to_csv(path_or_buf="../data/10x_datasets.csv", index=False)