In [3]:
import sys
sys.path.append('../src/')

import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

# data download

Run the cell below to download the datasets from Google Drive. We do not include `gdown` in our [environment file](../env.yml) because it occasionally causes conflicts on some systems. If you cannot install or run `gdown`, you can instead download the folder manually from [this link](https://drive.google.com/drive/folders/1fWWaxBLUdacBT9r-1CymdyRICMPStvBJ?usp=share_link).

In [4]:
# gdown is imported in this cell rather than with the other imports because
# it is not part of the project environment file and may not be installed.
import gdown

# Google Drive folder that holds the processed .h5ad datasets.
GDRIVE_FOLDER_ID = "1fWWaxBLUdacBT9r-1CymdyRICMPStvBJ"
gdown.download_folder(id=GDRIVE_FOLDER_ID)

Retrieving folder contents


Processing file 10EZmVvgewZdw4o5LiUSxn3yx95dPUxTK ALM.h5ad
Processing file 1cmjBPzL1GonsK0juv5ZzIa1GYvs4wu54 filtered_gut_atlas_tcell.h5ad
Processing file 1Qpe2xEG45Os080498op0wuWWAF-riB32 gut_atlas_tcell.h5ad
Processing file 1IdVD5n8echNyrrf8vrF1ccGyNRqDeuzq LGN_human_exon.h5ad
Processing file 1p52MdM5BAQtJ_eO0D0sVOmXuvOZ6-hlg LGN_human_intron.h5ad
Processing file 1uSjtqhSOKh2xjt0QjcdJuV0cWJoLBeEu LGN_macaque_exon.h5ad
Processing file 1XtKUUKGEcMhKaLTe9KMHQGR1kFLlxKSx LGN_macaque_intron.h5ad
Processing file 1Yfjmk6fmmGF3WhrKdXBGG_5YptIsP1dE LGN_mouse_exon.h5ad
Processing file 1QPfRMdtXQCaPN2bq3A_G4Tuvr5P_Jitk LGN_mouse_intron.h5ad
Processing file 1IV7_1YUb22jX4GtbNgi4W5HWX1YpHlfQ MTG.h5ad
Processing file 1V7V4LeRq4i6Hss0ms55XGZmPPcGjXRuU pancreas_celseq.h5ad
Processing file 1-8FIY4XfqFNEZ6slJATMczlpgkBeTXIL pancreas_celseq2.h5ad
Processing file 1kRoM9bst9S5NvlQqhFdMWlEbOlfqylrA pancreas_fluidigmc1.h5ad
Processing file 177vMhCfo2JIZTeaP1A5F3YSMfcnubUOO pancreas_inDrop1.h5ad
Processing 

Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=10EZmVvgewZdw4o5LiUSxn3yx95dPUxTK
From (redirected): https://drive.google.com/uc?id=10EZmVvgewZdw4o5LiUSxn3yx95dPUxTK&confirm=t&uuid=3c916fcb-6a23-45f1-ad4c-719ec5c0bc67
To: /Users/valerio/Desktop/RCM/data/data/ALM.h5ad
100%|██████████| 161M/161M [00:04<00:00, 38.2MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1cmjBPzL1GonsK0juv5ZzIa1GYvs4wu54
To: /Users/valerio/Desktop/RCM/data/data/filtered_gut_atlas_tcell.h5ad
100%|██████████| 85.8M/85.8M [00:02<00:00, 39.6MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1Qpe2xEG45Os080498op0wuWWAF-riB32
From (redirected): https://drive.google.com/uc?id=1Qpe2xEG45Os080498op0wuWWAF-riB32&confirm=t&uuid=cf1c6bd5-7726-419d-940e-ba1f8adf206d
To: /Users/valerio/Desktop/RCM/data/data/gut_atlas_tcell.h5ad
100%|██████████| 118M/118M [00:03<00:00, 38.9MB/s] 
Dow

# Data setup recreation

The following blocks of code document where the datasets were obtained and how they were processed (to obtain raw counts in the `.X` attribute) into the final format we have made available for download. You do not need to run them if you have already downloaded the data from the Google Drive folder provided above.

* Allen-Brain datasets; obtained from [this link](https://zenodo.org/records/3357167#.ZFWrsC_ML5g).

In [13]:
# Combined expression matrix holding the VISp, ALM and MTG cells.
adata = sc.read_csv('MouseV1_MouseALM_HumanMTG.csv')

# Two label tables that annotate the rows of the same matrix.
labels3 = pd.read_csv('MouseV1_MouseALM_HumanMTG_Labels3.csv')
labels34 = pd.read_csv('MouseV1_MouseALM_HumanMTG_Labels34.csv')

# Labels are matched to cells by position: each label table takes over the
# cell index of the matrix. `adata.obs_names` is the same index that
# `adata.to_df().index` exposes, but reading it does not build a DataFrame
# of the whole expression matrix.
labels3.index = adata.obs_names
labels34.index = adata.obs_names

adata.obs['labels3'] = labels3
adata.obs['labels34'] = labels34

# splits into the ALM/MTG/VISp files based on the original paper's readme information
# (half-open [start, end) row ranges into the combined matrix)
ixs = {'ALM': (12552, 20680), 'MTG': (20680, 34735), 'VISp': (0, 12552)}
for ds, (s, e) in ixs.items():
    # .copy() turns the sliced view into a standalone object before writing
    adata[s:e].copy().write_h5ad(f'{ds}.h5ad', compression='gzip')


* The PBMC datasets, from the same source.

In [14]:
# (directory name, file-name prefix) for each PBMC sequencing protocol
pbmc_sources = [
    ('10Xv2', '10Xv2'),
    ('10Xv3', '10Xv3'),
    ('CEL-Seq', 'CL'),
    ('Drop-Seq', 'DR'),
    ('inDrop', 'iD'),
    ('Seq-Well', 'SW'),
    ('Smart-Seq2', 'SM2')
]

for dr, name in pbmc_sources:
    adata = sc.read_csv(f'PbmcBench/{dr}/{name}_pbmc1.csv')
    labels = pd.read_csv(f'PbmcBench/{dr}/{name}_pbmc1Labels.csv')

    # add labels to anndata, and save to file
    # Labels are matched to cells by position. `adata.obs_names` is the index
    # that `adata.to_df().index` exposes, without building a DataFrame of the
    # whole expression matrix on every iteration.
    labels.index = adata.obs_names
    adata.obs['labels'] = labels
    adata.write_h5ad(f'pbmc_{dr}.h5ad', compression='gzip')

* Pancreas datasets: [link](https://figshare.com/articles/dataset/Benchmarking_atlas-level_data_integration_in_single-cell_genomics_-_integration_task_datasets_Immune_and_pancreas_/12420968)

In [1]:
path_to_pancreas = "human_pancreas_norm_complexBatch.h5ad"

# Rebuild the object from its 'counts' layer so that .X holds the counts.
# Only X and obs are handed to the new object; the remaining annotations of
# the downloaded file (e.g. var) are not carried over.
source = sc.read_h5ad(path_to_pancreas)
adata = ad.AnnData(X=source.layers['counts'], obs=source.obs)
del source

# Write one file per distinct value found in obs['tech'].
for t in set(adata.obs['tech']):
    is_tech = adata.obs['tech'] == t
    subset = adata[is_tech].copy()
    subset.write_h5ad(f'pancreas_{t}.h5ad', compression='gzip')

* LGN datasets: [link](https://portal.brain-map.org/atlases-and-data/rnaseq/comparative-lgn)

In [20]:
for species in ['macaque', 'human', 'mouse']:

    # read and transpose the two count matrices; the metadata table is
    # re-indexed by its 'sample_name' column so all three share one key
    exon_pd = pd.read_csv(f"{species}_LGN_2021_exon-matrix.csv", index_col=0).T
    intron_pd = pd.read_csv(f'{species}_LGN_2021_intron-matrix.csv', index_col=0).T
    obs = pd.read_csv(f'{species}_LGN_2021_metadata.csv', index_col=0).set_index('sample_name')
    obs.index.name = None

    # there's some discrepancy where some of these dataframes have a few more
    # cells than others, so keep only the cells present in all three tables,
    # and filter out the low quality cluster labels at the same time
    in_all_tables = obs.index.isin(exon_pd.index) & obs.index.isin(intron_pd.index)
    good_quality = (obs['cluster_label'] != "Low Quality").to_numpy()
    obs = obs[in_all_tables & good_quality]

    # Select the matrix rows by label, in the row order of `obs`, so the
    # annotations line up with the counts even if the CSV files list the
    # cells in different orders.
    exon_pd = exon_pd.loc[obs.index]
    intron_pd = intron_pd.loc[obs.index]

    # create corresponding anndata objects
    intron_adata = ad.AnnData(X=intron_pd, obs=obs)
    exon_adata = ad.AnnData(X=exon_pd, obs=obs)

    # write objects back out
    intron_adata.write_h5ad(f'LGN_{species}_intron.h5ad', compression='gzip')
    exon_adata.write_h5ad(f'LGN_{species}_exon.h5ad', compression='gzip')
    

* Gut cell atlas: [link](https://www.gutcellatlas.org)

In [21]:
path_to_gut_atlas_tcells = "Tcell_raw_counts02_v2.h5ad"
adata = sc.read_h5ad(path_to_gut_atlas_tcells)

# The downloaded object is first re-saved as-is, with gzip compression.
adata.write_h5ad('gut_atlas_tcell.h5ad', compression='gzip')

# filtering to preserve only relevant cell type for our specific application
relevant_cell_types = [
    "Activated CD4 T",
    "Activated CD8 T",
    "CD8 Tmem",
    "CX3CR1+ CD8 Tmem",
    "SELL+ CD4 T",
    "SELL+ CD8 T",
    "Tfh",
    "Th1",
    "Th17",
    "Treg"
]
is_relevant = adata.obs['annotation'].isin(relevant_cell_types)

# Subsetting yields a view of `adata`; copy it into a standalone object
# before writing, as the other subset-writing cells in this notebook do.
filtered_adata = adata[is_relevant].copy()
filtered_adata.write_h5ad('filtered_gut_atlas_tcell.h5ad', compression='gzip')

* Celltypist datasets: [link](https://www.celltypist.org/organs)

In [3]:
# NOTE: the celltypist datasets are all normalized to 10,000 counts per row,
# so the raw counts can only be recovered up to that per-row scaling factor.
# expm1 is the inverse of log1p; this assumes the downloaded matrices were
# log1p-transformed -- TODO confirm against the celltypist documentation.

organs = (
    "Blood",
    "Bone_marrow",
    "Heart",
    "Hippocampus",
    "Intestine",
    "Kidney",
    "Liver",
    "Lung",
    "Lymph_node",
    "Pancreas",
    "Skeletal_muscle",
    "Spleen",
)
datasets = [f"{organ}.h5ad" for organ in organs]

for fname in datasets:
    organ_adata = sc.read_h5ad(f'celltypist/{fname}')
    # element-wise exp(x) - 1 on the expression matrix
    organ_adata.X = organ_adata.X.expm1()
    # the processed copy keeps the original file name, under data/
    organ_adata.write_h5ad(f'data/{fname}', compression='gzip')