# Download 10X data 

In [1]:
import pathlib as pl
import pandas as pd
import json
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from urllib.parse import urlparse
from datetime import datetime


OUTPUT_DIR = pl.Path("/lustre/groups/ml01/projects/2024_spatialdata_db/data")

In [4]:
data = pd.read_csv("/home/icb/tim.treis/projects/spatialdata-db/utils/data/10x_datasets.csv", sep=";")
data

Unnamed: 0,Datasets,Products,Chemistry Version,Additional Applications,Software,Pipeline Version,Subpipeline,Species,Disease State,Anatomical entity,...,dataset_link,Replicate,input_links,output_links,tech,organ,date,uid,id,rep
0,FFPE Human Brain Cancer Data with Human Immuno...,In Situ Gene Expression,v1,unknown,Xenium Onboard Analysis,v2.0.0,unknown,Human,glioblastoma multiforme,brain,...,https://www.10xgenomics.com/datasets/ffpe-huma...,,"[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,brain,20240415,ooaie,ooaie__10X__Xenium__Human__brain__20240415__v2...,
1,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,unknown,Xenium Onboard Analysis,v1.9.0,unknown,Mouse,unknown,bone,...,https://www.10xgenomics.com/datasets/mouse-bon...,10% Formic acid decalcification,"[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,bone,20240403,nqe4e,nqe4e__10X__Xenium__Mouse__bone__20240403__v1....,10Formicaciddecalcification
2,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,unknown,Xenium Onboard Analysis,v1.9.0,unknown,Mouse,unknown,bone,...,https://www.10xgenomics.com/datasets/mouse-bon...,0.5M EDTA decalcification,"[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,bone,20240403,56r5s,56r5s__10X__Xenium__Mouse__bone__20240403__v1....,0.5MEDTAdecalcification
3,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,unknown,Xenium Onboard Analysis,v1.9.0,unknown,Mouse,unknown,bone,...,https://www.10xgenomics.com/datasets/mouse-bon...,15% EDTA/0.4% PFA decalcification,"[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,bone,20240403,1lhfw,1lhfw__10X__Xenium__Mouse__bone__20240403__v1....,15EDTA0.4PFAdecalcification
4,Human Bone and Bone Marrow Data with Custom Ad...,In Situ Gene Expression,v1,unknown,Xenium Onboard Analysis,v1.9.0,unknown,Human,acute lymphoid leukemia,"bone, bone marrow",...,https://www.10xgenomics.com/datasets/human-bon...,Acute Lymphoid Leukemia Bone Marrow,"[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,bone_bone_marrow,20240403,fulvo,fulvo__10X__Xenium__Human__bone_bone_marrow__2...,AcuteLymphoidLeukemiaBoneMarrow
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,Mouse Brain Serial Section 1 (Sagittal-Anterior),Spatial Gene Expression,v1,unknown,Space Ranger,v1.0.0,spaceranger count,Mouse,unknown,brain,...,https://www.10xgenomics.com/datasets/mouse-bra...,,"[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su...",Visium,brain,20191202,6qjsq,6qjsq__10X__Visium__Mouse__brain__20191202__v1...,
208,Mouse Brain Serial Section 1 (Sagittal-Anterior),Spatial Gene Expression,v1,unknown,Space Ranger,v1.0.0,spaceranger count,Mouse,unknown,brain,...,https://www.10xgenomics.com/datasets/mouse-bra...,,"[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su...",Visium,brain,20191202,9zn0k,9zn0k__10X__Visium__Mouse__brain__20191202__v1...,
209,Mouse Brain Section (Coronal),Spatial Gene Expression,v1,unknown,Space Ranger,v1.0.0,spaceranger count,Mouse,unknown,brain,...,https://www.10xgenomics.com/datasets/mouse-bra...,,"[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su...",Visium,brain,20191202,09ohz,09ohz__10X__Visium__Mouse__brain__20191202__v1...,
210,Mouse Brain Section (Coronal),Spatial Gene Expression,v1,unknown,Space Ranger,v1.0.0,spaceranger count,Mouse,unknown,brain,...,https://www.10xgenomics.com/datasets/mouse-bra...,,"[{""name"": ""FASTQs"", ""url"": ""https://s3-us-west...","[{""name"": ""Format details"", ""url"": ""https://su...",Visium,brain,20191202,02uqv,02uqv__10X__Visium__Mouse__brain__20191202__v1...,


## Create folders and jobs for downloading

In [5]:
download_jobs = []

def create_directory(path: pl.Path):
    if not path.exists():
        path.mkdir()
        print(f"Created directory: {path}")

def sanitize_key(key: str):
    return (
        key
        .replace(" ", "_")
        .replace("(", "")
        .replace(")", "")
        .replace(":", "")
        .replace("&", "n")
    )

download_jobs = []

for idx, row in data.iterrows():
    dataset_path = OUTPUT_DIR / row["id"]
    create_directory(dataset_path) 

    raw_input_path = dataset_path / "raw_input"
    create_directory(raw_input_path)  
    raw_output_path = dataset_path / "raw_output"
    create_directory(raw_output_path)  

    for idx, field in enumerate(['input_links', 'output_links']):
        try:
            if row.get(field):  
                links = json.loads(row[field])
                links_dict = {sanitize_key(link['name']): link['url'] for link in links}
                for name, url in links_dict.items():
                    # filter out links to the support page and subsets
                    if "https" in url and "subset" not in name:
                        download_jobs.append({
                            "name": name,
                            "url": url,
                            "output_folder": [raw_input_path, raw_output_path][idx]
                        })
        except:
            pass

download_jobs

Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/ooaie__10X__Xenium__Human__brain__20240415__v2.0.0
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/ooaie__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/ooaie__10X__Xenium__Human__brain__20240415__v2.0.0/raw_output
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/nqe4e__10X__Xenium__Mouse__bone__20240403__v1.9.0__10Formicaciddecalcification
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/nqe4e__10X__Xenium__Mouse__bone__20240403__v1.9.0__10Formicaciddecalcification/raw_input
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/nqe4e__10X__Xenium__Mouse__bone__20240403__v1.9.0__10Formicaciddecalcification/raw_output
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/56r5s__10X__Xenium__Mouse__bone__20240403__v1.9.0__0.5MED

[{'name': 'Panel_JSON',
  'url': 'https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Brain_GBM_FFPE/Xenium_V1_Human_Brain_GBM_FFPE_gene_panel.json',
  'output_folder': PosixPath('/lustre/groups/ml01/projects/2024_spatialdata_db/data/ooaie__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input')},
 {'name': 'Supplemental_Post-Xenium_HnE_image_OME-TIFF',
  'url': 'https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Brain_GBM_FFPE/Xenium_V1_Human_Brain_GBM_FFPE_he_image.ome.tif',
  'output_folder': PosixPath('/lustre/groups/ml01/projects/2024_spatialdata_db/data/ooaie__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input')},
 {'name': 'Supplemental_HnE_Image_Alignment_File_CSV',
  'url': 'https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Brain_GBM_FFPE/Xenium_V1_Human_Brain_GBM_FFPE_he_imagealignment.csv',
  'output_folder': PosixPath('/lustre/groups/ml01/projects/2024_spatialdata_db/data/ooaie__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input'

## Download files

In [6]:
def download_file(job):
    """Download a file using curl and save it to a specified path."""
    # Parse the file name from the URL
    parsed_url = urlparse(job['url'])
    file_name = pl.Path(parsed_url.path).name
    output_path = job['output_folder'] / file_name

    curl_command = [
        'curl', '-o', str(output_path), 
        '-L', job['url']
    ]

    try:
        subprocess.run(curl_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return job, None
    except subprocess.CalledProcessError as e:
        return job, e

def main(download_jobs):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f'failed_downloads_{timestamp}.log'
    with open(log_file, 'w') as log:
        with ThreadPoolExecutor() adir
        s executor:
            futures = [executor.submit(download_file, job) for job in download_jobs]
            for future in tqdm(as_completed(futures), total=len(download_jobs), desc="Downloading files"):
                job, erro
                r = future.result()
                if error:
                    error_message = f"Failed to download {job['name']} from {job['url']}: {error}\n"
                    log.write(error_message)
                    print(error_message)

if __name__ == "__main__":
    main(download_jobs)

Downloading files:   0%|          | 0/2468 [00:00<?, ?it/s]

Failed to download Supplemental_Post-Xenium_HnE_image_OME-TIFF from https://cf.10xgenomics.com/samples/xenium/1.9.0/Xenium_V1_hTonsil_reactive_follicular_hyperplasia_section_FFPE/Xenium_V1_hTonsil_reactive_follicular_hyperplasia_section_FFPE_he_image.ome.tif: Command '['curl', '-o', '/lustre/groups/ml01/projects/2024_spatialdata_db/data/twgn1__10X__Xenium__Human__tonsil__20240305__v1.9.0__Reactivefollicularhyperplasia/raw_input/Xenium_V1_hTonsil_reactive_follicular_hyperplasia_section_FFPE_he_image.ome.tif', '-L', 'https://cf.10xgenomics.com/samples/xenium/1.9.0/Xenium_V1_hTonsil_reactive_follicular_hyperplasia_section_FFPE/Xenium_V1_hTonsil_reactive_follicular_hyperplasia_section_FFPE_he_image.ome.tif']' returned non-zero exit status 92.

Failed to download Loupe_file from https://cf.10xgenomics.com/samples/spatial-exp/3.0.0/Visium_HD_Human_Lung_Cancer/Visium_HD_Human_Lung_Cancer_cloupe_008um.cloupe: Command '['curl', '-o', '/lustre/groups/ml01/projects/2024_spatialdata_db/data/wxjgz_