# Download 10X data 

In [1]:
import pathlib as pl
import pandas as pd
import json
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from urllib.parse import urlparse
from datetime import datetime


# Root directory under which one sub-folder per dataset (named by its "id") is created.
# NOTE(review): absolute, cluster-specific path — adjust before running elsewhere.
OUTPUT_DIR = pl.Path("/lustre/groups/ml01/projects/2024_spatialdata_db/data")

In [2]:
# Load the table of 10x datasets and keep only its first two rows.
datasets_csv = "/home/icb/tim.treis/projects/spatialdata-db/utils/data/10x_datasets.csv"
data = pd.read_csv(datasets_csv).head(2)
data

Unnamed: 0,Datasets,Products,Chemistry Version,Additional Applications,Software,Pipeline Version,Subpipeline,Species,Disease State,Anatomical entity,...,Replicate,HTML,input_links,output_links,tech,organ,uid,id,rep,date
0,FFPE Human Brain Cancer Data with Human Immuno...,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v2.0.0,,Human,glioblastoma multiforme,brain,...,,"<html lang=""en"" class="" qisqwzyyz idc0_350""><h...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,brain,ikt9n,ikt9n__10X__Xenium__Human__brain__20240415__v2...,,20240415
1,Mouse Bone Data with Custom Add-on Panel,In Situ Gene Expression,v1,,Xenium Onboard Analysis,v1.9.0,,Mouse,,bone,...,10% Formic acid decalcification,"<html lang=""en"" class="" qwmodgos idc0_350""><he...","[{""name"": ""Panel (JSON)"", ""url"": ""https://cf.1...","[{""name"": ""Format details"", ""url"": ""/support/s...",Xenium,bone,20vj4,20vj4__10X__Xenium__Mouse__bone__20240403__v1....,10Formicaciddecalcification,20240403


## Create folders and jobs for downloading

In [3]:
# Collects one dict per file to download; it is reset to an empty list again
# right before the loop that fills it.
download_jobs = []

def create_directory(path: pl.Path):
    """Create ``path`` as a directory unless something already exists there.

    Missing parent directories are created as well, and a directory that
    appears between the existence check and the ``mkdir`` call (e.g. created
    by a concurrent run) is tolerated instead of raising ``FileExistsError``.

    Args:
        path: Location of the directory to create.
    """
    if path.exists():
        return
    # parents=True: do not fail when an intermediate folder is missing.
    # exist_ok=True: do not fail if the directory was created in the meantime.
    path.mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {path}")

def sanitize_key(key: str):
    """Return ``key`` rewritten for use as a job name.

    Spaces become underscores, ``&`` becomes ``n``, and parentheses and
    colons are dropped; every other character is kept unchanged.
    """
    # Single-pass character table; a value of None deletes the character.
    char_table = str.maketrans({
        " ": "_",
        "&": "n",
        "(": None,
        ")": None,
        ":": None,
    })
    return key.translate(char_table)

# Build the list of download jobs: one dict per file with its sanitized name,
# its URL and the folder it should be saved into.
download_jobs = []

for _, row in data.iterrows():
    dataset_path = OUTPUT_DIR / row["id"]
    create_directory(dataset_path)

    raw_input_path = dataset_path / "raw_input"
    create_directory(raw_input_path)
    raw_output_path = dataset_path / "raw_output"
    create_directory(raw_output_path)

    # Explicit column -> destination mapping instead of positional indexing,
    # which previously reused (and shadowed) the outer loop variable.
    folder_by_field = {
        "input_links": raw_input_path,
        "output_links": raw_output_path,
    }

    for field, output_folder in folder_by_field.items():
        raw_links = row.get(field)
        # Empty CSV cells are read by pandas as NaN, a truthy float that
        # json.loads cannot parse; only non-empty strings are decoded.
        if not isinstance(raw_links, str) or not raw_links.strip():
            continue

        links = json.loads(raw_links)
        # Keyed by sanitized name, so a later link with the same name
        # replaces an earlier one.
        links_dict = {sanitize_key(link['name']): link['url'] for link in links}
        for name, url in links_dict.items():
            # filter out links to the support page and subsets
            if "https" in url and "subset" not in name:
                download_jobs.append({
                    "name": name,
                    "url": url,
                    "output_folder": output_folder
                })

download_jobs

Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/ikt9n__10X__Xenium__Human__brain__20240415__v2.0.0
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/ikt9n__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/ikt9n__10X__Xenium__Human__brain__20240415__v2.0.0/raw_output
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/20vj4__10X__Xenium__Mouse__bone__20240403__v1.9.0__10Formicaciddecalcification
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/20vj4__10X__Xenium__Mouse__bone__20240403__v1.9.0__10Formicaciddecalcification/raw_input
Created directory: /lustre/groups/ml01/projects/2024_spatialdata_db/data/20vj4__10X__Xenium__Mouse__bone__20240403__v1.9.0__10Formicaciddecalcification/raw_output


[{'name': 'Panel_JSON',
  'url': 'https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Brain_GBM_FFPE/Xenium_V1_Human_Brain_GBM_FFPE_gene_panel.json',
  'output_folder': PosixPath('/lustre/groups/ml01/projects/2024_spatialdata_db/data/ikt9n__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input')},
 {'name': 'Supplemental_Post-Xenium_HnE_image_OME-TIFF',
  'url': 'https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Brain_GBM_FFPE/Xenium_V1_Human_Brain_GBM_FFPE_he_image.ome.tif',
  'output_folder': PosixPath('/lustre/groups/ml01/projects/2024_spatialdata_db/data/ikt9n__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input')},
 {'name': 'Supplemental_HnE_Image_Alignment_File_CSV',
  'url': 'https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_Human_Brain_GBM_FFPE/Xenium_V1_Human_Brain_GBM_FFPE_he_imagealignment.csv',
  'output_folder': PosixPath('/lustre/groups/ml01/projects/2024_spatialdata_db/data/ikt9n__10X__Xenium__Human__brain__20240415__v2.0.0/raw_input'

## Download files

In [5]:
def download_file(job):
    """Download a file using curl and save it to a specified path.

    The target file name is the last path component of the URL (query string
    and fragment are ignored) and the file is written into
    ``job['output_folder']``.

    Args:
        job: Mapping with at least the keys ``'url'`` (address to fetch) and
            ``'output_folder'`` (``pathlib.Path`` of the destination folder).

    Returns:
        A ``(job, error)`` tuple. ``error`` is ``None`` on success; otherwise
        it is the exception describing why curl failed or could not be
        started.
    """
    # Parse the file name from the URL
    parsed_url = urlparse(job['url'])
    file_name = pl.Path(parsed_url.path).name
    output_path = job['output_folder'] / file_name

    curl_command = [
        'curl',
        # Without --fail curl exits with status 0 on HTTP errors (e.g. 404)
        # and stores the server's error page as if it were the requested file.
        '--fail',
        '-o', str(output_path),
        '-L', job['url'],
    ]

    try:
        subprocess.run(curl_command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return job, None
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the curl executable cannot be started;
        # it is reported like any other failed download instead of escaping
        # from the worker.
        return job, e

def main(download_jobs):
    """Run all download jobs in a thread pool and record the failures.

    Every job is passed to ``download_file``. Each failure is printed and
    appended to a log file named ``failed_downloads_<timestamp>.log`` in the
    current working directory.

    Args:
        download_jobs: Iterable of job dicts providing the keys ``'name'``,
            ``'url'`` and ``'output_folder'``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = f'failed_downloads_{timestamp}.log'
    with open(log_file, 'w') as log:
        with ThreadPoolExecutor() as executor:
            # Remember which job belongs to which future so a failure can be
            # attributed even if the worker itself raised.
            future_to_job = {executor.submit(download_file, job): job for job in download_jobs}
            for future in tqdm(as_completed(future_to_job), total=len(future_to_job), desc="Downloading files"):
                try:
                    job, error = future.result()
                except Exception as exc:
                    # An unexpected error in one worker must not abort the
                    # reporting of the remaining downloads.
                    job, error = future_to_job[future], exc
                if error is not None:
                    error_message = f"Failed to download {job['name']} from {job['url']}: {error}\n"
                    log.write(error_message)
                    # Flush so failures are visible in the log while the
                    # remaining downloads are still running.
                    log.flush()
                    print(error_message)

# Start the downloads for the jobs collected above. The guard keeps the call
# from running if this code is imported as a module rather than executed directly.
if __name__ == "__main__":
    main(download_jobs)

Downloading files:   0%|          | 0/9 [00:00<?, ?it/s]