# Scrape the table of spatial transcriptomics datasets from 10X Genomics website
The page content is rendered dynamically (it is not static HTML), so we need Selenium to load it in a real browser.

`geckodriver` can be downloaded from https://github.com/mozilla/geckodriver/releases

In [40]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd
import re
import json
import random
import string

# Seed Python's `random` module so the pseudo-random draws used for the
# dataset IDs further down start from a fixed state.
random.seed(8)

In [2]:
# Set up driver for browser
firefox_options = Options()

# Use my user-profile so I'm already logged in
# NOTE: this is a machine-specific absolute path; point it at your own profile.
firefox_options.add_argument("-profile")
firefox_options.add_argument("/Users/tim.treis/Library/Application Support/Firefox/Profiles/9qiwq385.default-release-1697557194190")
s = Service("./geckodriver")
driver = webdriver.Firefox(service=s, options=firefox_options)

# only 4 datasets for testing (overridden by the assignment below;
# comment out the second assignment to use this smaller query)
url = "https://www.10xgenomics.com/datasets?query=&page=1&configure%5Bfacets%5D%5B0%5D=chemistryVersionAndThroughput&configure%5Bfacets%5D%5B1%5D=pipeline.version&configure%5BhitsPerPage%5D=50&configure%5BmaxValuesPerFacet%5D=1000&menu%5Bproducts.name%5D=Spatial%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B0%5D=HD%20Spatial%20Gene%20Expression"

# all datasets
url = "https://www.10xgenomics.com/datasets?query=&page=1&configure%5Bfacets%5D%5B0%5D=chemistryVersionAndThroughput&configure%5Bfacets%5D%5B1%5D=pipeline.version&configure%5BhitsPerPage%5D=150&configure%5BmaxValuesPerFacet%5D=1000&menu%5Bproducts.name%5D=Spatial%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B0%5D=In%20Situ%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B1%5D=HD%20Spatial%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B2%5D=Spatial%20Gene%20Expression&refinementList%5Bproduct.name%5D%5B3%5D=CytAssist%20Spatial%20Gene%20and%20Protein%20Expression"

# try/finally guarantees the browser is closed even if scraping fails part-way
try:
    driver.get(url)
    time.sleep(10)

    # Scroll because it's dynamically loaded
    scroll_increment = 500
    while True:
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        time.sleep(10)

        # Check if we've reached the bottom
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        current_position = driver.execute_script(
            "return window.pageYOffset + window.innerHeight;"
        )
        if current_position >= scroll_height:
            break
    html = driver.page_source

    # Parse table
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", class_="css-118cv4k")
    if table is None:
        # Fail with a readable message instead of an AttributeError on None
        raise RuntimeError(
            "Dataset table not found: the CSS class 'css-118cv4k' may have changed."
        )
    headers = [th.get_text() for th in table.find_all("th")]

    rows = driver.find_elements(By.CSS_SELECTOR, "tr.css-1mz91s4") # might have to adjust when they change it
    dataset_information = []
    for row in rows:
        tds = row.find_elements(By.CSS_SELECTOR, "td")
        row_data = [td.text for td in tds]
        link = None
        for td in tds:
            # find_elements returns an empty list when the cell has no 'a' tag,
            # so no exception has to be caught (and none can be hidden)
            anchors = td.find_elements(By.TAG_NAME, "a")
            if not anchors:
                continue
            href = anchors[0].get_attribute("href")
            if not href:
                # An anchor without a target must not wipe a link found earlier
                continue
            # Get the full URL if the href attribute is a relative path
            if href.startswith("/"):
                href = driver.current_url.rsplit("/", 1)[0] + href
            # As before, the link from the last cell that has one wins
            link = href
        row_data.append(link)
        dataset_information.append(row_data)
finally:
    # Close the WebDriver
    driver.quit()

## Sanitise and convert to a DataFrame

In [3]:
# Keep only the non-empty table headers and add a column for the scraped URL
extended_headers = [header for header in headers if len(header) > 0] + ["dataset_link"]

rows_for_df = []
for row in dataset_information:
    # Ignore rows with no entries and rows whose single entry is a missing link
    if len(row) == 0 or (len(row) == 1 and row[0] is None):
        continue

    # weirdly, sometimes the 2nd element is empty
    one_too_long = len(row) == len(extended_headers) + 1
    if one_too_long and len(row[1]) == 0:
        row.pop(1)
    # Every surviving row has to line up with the column names
    assert len(row) == len(extended_headers)
    rows_for_df.append(row)

df = pd.DataFrame(data=rows_for_df, columns=extended_headers)
df

Unnamed: 0,Datasets,Products,Chemistry Version,Additional Applications,Software,Pipeline Version,Subpipeline,Species,Disease State,Anatomical entity,...,Staining Method,Selected Cell Types,Biomaterial type,Cell line,Feature Barcode,Donor count,Development Stage,10x Instrument(s),Publish Date,dataset_link
0,FFPE Human Brain Cancer Data with Human Immuno...,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v2.0.0,,Human,glioblastoma multiforme,brain,...,,,Specimen from Organism,,,1,adult,Xenium Analyzer,2024-04-15,https://www.10xgenomics.com/datasets/ffpe-huma...
1,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v1.9.0,,Mouse,,bone,...,,,Specimen from Organism,,,3,adult,Xenium Analyzer,2024-04-03,https://www.10xgenomics.com/datasets/mouse-bon...
2,Human Bone and Bone Marrow Data with Custom Ad...,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v1.9.0,,Human,acute lymphoid leukemia,"bone, bone marrow",...,,,Specimen from Organism,,,3,adult,Xenium Analyzer,2024-04-03,https://www.10xgenomics.com/datasets/human-bon...
3,"Visium HD Spatial Gene Expression Library, Mou...",HD Spatial Gene Expression,v1,,Space Ranger,v3.0.0,spaceranger count,Mouse,Healthy,Brain,...,H&E,,"Specimen from Organism, Imaged Specimen",,,1,juvenile,Visium CytAssist,2024-03-29,https://www.10xgenomics.com/datasets/visium-hd...
4,"Visium HD Spatial Gene Expression Library, Hum...",HD Spatial Gene Expression,v1,,Space Ranger,v3.0.0,spaceranger count,Human,Lung cancer,Lung,...,IF,,"Specimen from Organism, Imaged Specimen",,,1,adult,Visium CytAssist,2024-03-29,https://www.10xgenomics.com/datasets/visium-hd...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,Mouse Brain Serial Section 2 (Sagittal-Posterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,H&E,,Specimen from Organism,,,1,,,2019-12-02,https://www.10xgenomics.com/datasets/mouse-bra...
120,Mouse Brain Serial Section 2 (Sagittal-Anterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,H&E,,Specimen from Organism,,,1,,,2019-12-02,https://www.10xgenomics.com/datasets/mouse-bra...
121,Mouse Brain Serial Section 1 (Sagittal-Posterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,H&E,,Specimen from Organism,,,1,,,2019-12-02,https://www.10xgenomics.com/datasets/mouse-bra...
122,Mouse Brain Serial Section 1 (Sagittal-Anterior),Spatial Gene Expression,v1,,Space Ranger,v1.0.0,spaceranger count,Mouse,,brain,...,H&E,,Specimen from Organism,,,1,,,2019-12-02,https://www.10xgenomics.com/datasets/mouse-bra...


In [37]:
# Number of distinct dataset URLs collected from the table
len(df["dataset_link"].unique())

124

## Expand links if the data has replicates

In [27]:
def has_replicates(html_content):
    """Report whether a dataset page contains a ``<div>`` whose text is exactly "Files".

    Parameters
    ----------
    html_content : str
        HTML source of a dataset page.

    Returns
    -------
    bool
        ``True`` if such a ``<div>`` is found, ``False`` otherwise. The notebook
        treats the presence of this box as the sign that a page lists replicates.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # `string=` is the current name of the filter; `text=` is its deprecated alias
    return soup.find("div", string="Files") is not None


def escape_special_characters(name):
    """Return ``name`` with each single quote swapped for the ``&apos;`` entity."""
    apostrophe_entity = {ord("'"): "&apos;"}
    return name.translate(apostrophe_entity)


def _xpath_literal(value):
    """Return ``value`` as an XPath 1.0 string literal, whatever quotes it contains."""
    if '"' not in value:
        return f'"{value}"'
    if "'" not in value:
        return f"'{value}'"
    # XPath 1.0 has no escape syntax, so mixed quotes are stitched with concat()
    pieces = value.split('"')
    return "concat(" + ", '\"', ".join(f'"{piece}"' for piece in pieces) + ")"


def extract_replicate_links(html_content):
    """Click every replicate button on the open page and capture the resulting HTML.

    The buttons are discovered in ``html_content`` (inside the container two
    levels above the ``<div>`` whose text is "Files"). Each one is then located
    and clicked in the live browser through the module-level ``driver``, which
    must already be showing the same page.

    Parameters
    ----------
    html_content : str
        HTML source of the dataset page currently loaded in ``driver``.

    Returns
    -------
    list of dict
        One ``{"Replicate": <button label>, "HTML": <page source after click>}``
        entry per button, in document order. Empty if the page has no "Files" box.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # `string=` is the current name of the filter; `text=` is its deprecated alias
    files_box = soup.find("div", string="Files")
    if files_box is None:
        return []

    container_div = files_box.parent.parent
    replicate_data = []
    for button in container_div.find_all("button"):
        label_div = button.find("div")
        if label_div is None:
            # A button without a text label cannot be matched in the browser
            continue
        time.sleep(5)
        dataset_name = label_div.text.strip()
        # Quote the label safely: a raw f-string breaks on names containing '"'
        button_xpath = (
            f"//button[.//div[contains(text(), {_xpath_literal(dataset_name)})]]"
        )
        selenium_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, button_xpath))
        )
        selenium_button.click()
        # Give the page time to re-render before taking the snapshot
        time.sleep(5)
        replicate_data.append({"Replicate": dataset_name, "HTML": driver.page_source})

    return replicate_data

driver = webdriver.Firefox(service=s, options=firefox_options)

# Collect plain records and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies it on every iteration.
link_records = []

# try/finally guarantees the browser is closed even if a page fails to load
try:
    for _, row in df.iterrows():
        driver.get(row["dataset_link"])
        time.sleep(10)
        html_content = driver.page_source
        if has_replicates(html_content):
            # One record per replicate button, each with its own page snapshot
            for replicate_info in extract_replicate_links(html_content):
                link_records.append(
                    {
                        "Datasets": row["Datasets"],
                        "Replicate": replicate_info["Replicate"],
                        "HTML": replicate_info["HTML"],
                    }
                )
        else:
            # No replicates: a single record holding the page as loaded
            link_records.append(
                {
                    "Datasets": row["Datasets"],
                    "Replicate": None,
                    "HTML": html_content,
                }
            )
finally:
    driver.quit()

link_df = pd.DataFrame(link_records, columns=["Datasets", "Replicate", "HTML"])

In [30]:
def _collect_file_links(table):
    """Return a ``{"name", "url"}`` dict for every anchor with an href inside ``table``."""
    return [
        {"name": anchor.get_text(strip=True), "url": anchor["href"]}
        # `a[href]` skips anchors without a target instead of raising KeyError
        for anchor in table.select("a[href]")
    ]


def _extract_file_links(page_html):
    """Return ``(input_links_json, output_links_json)`` for one dataset page.

    Each element is a JSON string listing the links found in the table whose
    text contains "Input Files" / "Output Files", or ``None`` when no such
    table (or no link in it) exists. If several tables match, the last one wins.
    """
    page = BeautifulSoup(page_html, "html.parser")
    input_files = output_files = None
    for candidate in page.find_all("table"):
        if "Input Files" in candidate.text:
            input_files = _collect_file_links(candidate)
        elif "Output Files" in candidate.text:
            output_files = _collect_file_links(candidate)

    # Convert input_files and output_files to JSON strings
    return (
        json.dumps(input_files) if input_files else None,
        json.dumps(output_files) if output_files else None,
    )


# Parse each stored page once and assign the results positionally; this avoids
# comparing every full HTML string against every row to find where to write.
parsed_links = [_extract_file_links(page_html) for page_html in link_df["HTML"]]
link_df["input_links"] = [input_json for input_json, _ in parsed_links]
link_df["output_links"] = [output_json for _, output_json in parsed_links]

In [32]:
# Display link_df to inspect the stored pages and the link columns derived from them
link_df

Unnamed: 0,Datasets,Replicate,HTML,input_links,output_links
0,FFPE Human Brain Cancer Data with Human Immuno...,,"<html lang=""en"" class="" qisqwzyyz idc0_350""><h...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s..."
1,Mouse Bone Data with Custom Add-on Panel,10% Formic acid decalcification,"<html lang=""en"" class="" qwmodgos idc0_350""><he...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s..."
2,Mouse Bone Data with Custom Add-on Panel,0.5M EDTA decalcification,"<html lang=""en"" class="" qwmodgos idc0_350""><he...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s..."
3,Mouse Bone Data with Custom Add-on Panel,15% EDTA/0.4% PFA decalcification,"<html lang=""en"" class="" qwmodgos idc0_350""><he...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s..."
4,Human Bone and Bone Marrow Data with Custom Ad...,Acute Lymphoid Leukemia Bone Marrow,"<html lang=""en"" class="" krtgsdovr idc0_350""><h...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s..."
...,...,...,...,...,...
167,Mouse Brain Serial Section 2 (Sagittal-Posterior),,"<html lang=""en"" class="" wvfadltjbg idc0_350""><...","[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su..."
168,Mouse Brain Serial Section 2 (Sagittal-Anterior),,"<html lang=""en"" class="" etfvoyz idc0_350""><hea...","[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su..."
169,Mouse Brain Serial Section 1 (Sagittal-Posterior),,"<html lang=""en"" class="" pdpwbd idc0_350""><head...","[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su..."
170,Mouse Brain Serial Section 1 (Sagittal-Anterior),,"<html lang=""en"" class="" xmwsknxq idc0_350""><he...","[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su..."


## Merge links with metadata

In [31]:
# Left join: keep every metadata row and attach all of its page/replicate rows.
# NOTE(review): assumes each title in df["Datasets"] is unique; duplicated
# titles would multiply rows — TODO confirm.
all_data = pd.merge(df, link_df, how="left", on="Datasets")

## Prepare unique IDs

In [38]:
# Short technology label for each 10x product name; products missing from
# this mapping end up as NaN in the "tech" column.
tech_names = {
    "In Situ Gene Expression": "Xenium",
    "Spatial Gene Expression": "Visium",
    "HD Spatial Gene Expression": "VisiumHD",
    "CytAssist Spatial Gene and Protein Expression": "CytAssist",
}
all_data["tech"] = all_data["Products"].map(tech_names)

# Normalise the anatomical entity into a lower-case, underscore-separated token.
# regex=False makes every pattern a literal on all pandas versions; "(" and ")"
# are regex metacharacters and the default of `regex` differs between releases.
all_data["organ"] = (
    all_data["Anatomical entity"]
    .str.lower()
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.replace("mouse ", "", regex=False)
    .str.replace(" ", "_", regex=False)
)

# First four characters of the publish date (the year in the YYYY-MM-DD values
# shown above); .str slicing leaves missing dates as NaN instead of raising.
all_data["year"] = all_data["Publish Date"].str[:4]

In [41]:
def generate_unique_ids(
    n: int, size: int = 5, chars=string.ascii_lowercase + string.digits
) -> list:
    """Generate a list of n unique pseudo-random IDs.

    IDs are returned in the order they were drawn, so seeding ``random``
    gives the same list on every run. (Converting a set of strings to a list
    does not: its iteration order depends on per-process string hashing.)

    Parameters
    ----------
    n : int
        Number of IDs to generate.
    size : int, default 5
        Length of each ID.
    chars : sequence of str
        Characters to draw from (with replacement).

    Returns
    -------
    list of str
        ``n`` distinct IDs of length ``size``.

    Raises
    ------
    ValueError
        If ``n`` is negative or larger than the number of distinct IDs that
        can be formed from ``chars`` (the loop could otherwise never finish).
    """
    capacity = len(set(chars)) ** size
    if n < 0 or n > capacity:
        raise ValueError(
            f"Cannot generate {n} unique IDs of length {size}: "
            f"only {capacity} distinct IDs are possible."
        )

    seen = set()  # fast membership test
    ordered_ids = []  # preserves draw order for reproducibility
    while len(ordered_ids) < n:
        # Generate a new ID
        new_id = "".join(random.choices(chars, k=size))
        # Keep it only if it has not been drawn before
        if new_id not in seen:
            seen.add(new_id)
            ordered_ids.append(new_id)

    return ordered_ids


# One pseudo-random 5-character tag per row of the merged table
all_data["uid"] = generate_unique_ids(n=len(all_data), size=5)

In [43]:
def _compose_base_id(record):
    """Format the identifying fields of one row into a double-underscore separated ID."""
    return f"{record['uid']}__10X__{record['tech']}__{record['Species']}__{record['organ']}__{record['year']}__{record['Pipeline Version']}"


def _append_replicate(record):
    """Attach the row's replicate tag to its base ID."""
    return f"{record['id']}__{record['rep']}"


all_data["id"] = all_data.apply(_compose_base_id, axis=1)

# optionally add replicate information if available
replicate_tag = all_data["Replicate"]
for unwanted in (" ", "/", ","):
    replicate_tag = replicate_tag.str.replace(unwanted, "")
all_data["rep"] = replicate_tag

ids_with_replicate = all_data.apply(_append_replicate, axis=1)
# A missing replicate formats as "__nan", which is stripped again here.
# Note that this removes every "__nan" in the ID, not only a trailing one.
all_data["id"] = ids_with_replicate.str.replace("__nan", "").unique()
all_data.head(3)

Unnamed: 0,Datasets,Products,Chemistry Version,Additional Applications,Software,Pipeline Version,Subpipeline,Species,Disease State,Anatomical entity,...,Replicate,HTML,input_links,output_links,tech,organ,year,uid,id,rep
0,FFPE Human Brain Cancer Data with Human Immuno...,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v2.0.0,,Human,glioblastoma multiforme,brain,...,,"<html lang=""en"" class="" qisqwzyyz idc0_350""><h...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,brain,2024,ikt9n,ikt9n__10X__Xenium__Human__brain__2024__v2.0.0...,
1,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v1.9.0,,Mouse,,bone,...,10% Formic acid decalcification,"<html lang=""en"" class="" qwmodgos idc0_350""><he...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,bone,2024,20vj4,20vj4__10X__Xenium__Mouse__bone__2024__v1.9.0_...,10%Formicaciddecalcification
2,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v1.9.0,,Mouse,,bone,...,0.5M EDTA decalcification,"<html lang=""en"" class="" qwmodgos idc0_350""><he...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,bone,2024,mym6o,mym6o__10X__Xenium__Mouse__bone__2024__v1.9.0_...,0.5MEDTAdecalcification


## Write

In [44]:
# Save the merged table without the index; every column of all_data is
# written, including the stored page HTML.
all_data.to_csv(path_or_buf="../data/10x_datasets.csv", index=False)