# Human Cell Landscape Preprocessing
This Jupyter notebook contains the code required to prepare the full Human Cell Landscape single-cell dataset (Han et al., 2020. doi: 10.1038/s41586-020-2157-4) for use through sfaira dataloaders. The code downloads, annotates and cleans the provided adata object, saves it by sample and copies it into the right folders in your local sfaira dataset repository, so you can use it with sfaira dataloaders. The notebook saves all files in its working directory and requires you to provide the path to your local sfaira dataset repository in the last cell of this notebook.

In [1]:
import pandas as pd
import urllib.request
import numpy as np
import anndata as ad
import scipy.sparse
import os
import zipfile
from sfaira.versions.genome_versions.class_interface import SuperGenomeContainer

  from pandas.core.index import RangeIndex


In [2]:
# download required files from human cell landscape publication data: https://figshare.com/articles/HCL_DGE_Data/7235471
# each entry pairs a figshare download URL with the local file name it is stored under
figshare_downloads = (
    ('https://ndownloader.figshare.com/files/17727365', 'HCL_Fig1_adata.h5ad'),
    ('https://ndownloader.figshare.com/files/21758835', 'HCL_Fig1_cell_Info.xlsx'),
    ('https://ndownloader.figshare.com/files/22447898', 'annotation_rmbatch_data_revised417.zip'),
)
for download_url, local_name in figshare_downloads:
    # urlretrieve returns a (filename, headers) tuple; printing it gives a simple download log
    print(urllib.request.urlretrieve(download_url, local_name))

('HCL_Fig1_adata.h5ad', <http.client.HTTPMessage object at 0x7faa38db8e90>)
('HCL_Fig1_cell_Info.xlsx', <http.client.HTTPMessage object at 0x7faa38dc5090>)
('annotation_rmbatch_data_revised417.zip', <http.client.HTTPMessage object at 0x7faa38dc5390>)


In [3]:
# unpack the downloaded annotation archive into the working directory
annotation_archive = 'annotation_rmbatch_data_revised417.zip'
with zipfile.ZipFile(annotation_archive, mode='r') as archive:
    archive.extractall(path='./')

In [4]:
# load data file
# `ad.read` is a deprecated alias; `ad.read_h5ad` is the explicit reader for .h5ad files
adata = ad.read_h5ad('HCL_Fig1_adata.h5ad')

In [5]:
# store the expression matrix in compressed sparse row format
sparse_counts = scipy.sparse.csr_matrix(adata.X)
adata.X = sparse_counts.copy()

# harmonise annotations
# spelling variants of three tissue names, each mapped to the single form used from here on
tissue_name_fixes = {
    'AdultJeJunum': 'AdultJejunum',
    'AdultGallBladder': 'AdultGallbladder',
    'FetalFemaleGonald': 'FetalFemaleGonad',
}
for obs_col in ('batch', 'tissue'):
    adata.obs[obs_col] = adata.obs[obs_col].astype('str')
# fix the spelling inside the cell names ...
harmonised_names = adata.obs.index
for misspelt, corrected in tissue_name_fixes.items():
    harmonised_names = harmonised_names.str.replace(misspelt, corrected, regex=True)
adata.obs.index = harmonised_names
# ... and inside the obs columns
adata.obs.replace(tissue_name_fixes, regex=True, inplace=True)
# drop the last '-'-separated field of every cell name
adata.obs.index = ["-".join(cell_name.split('-')[:-1]) for cell_name in adata.obs.index]

# load celltype labels and harmonise them
fig1_anno = pd.read_excel('HCL_Fig1_cell_Info.xlsx', index_col='cellnames')
# apply the same tissue-name spelling fixes to the cell names of the label table
anno_names = fig1_anno.index
for misspelt, corrected in (
    ('AdultJeJunum', 'AdultJejunum'),
    ('AdultGallBladder', 'AdultGallbladder'),
    ('FetalFemaleGonald', 'FetalFemaleGonad'),
):
    anno_names = anno_names.str.replace(misspelt, corrected, regex=True)
fig1_anno.index = anno_names

# check that the order of cells and cell labels is the same
assert np.all(fig1_anno.index == adata.obs.index)

# add annotations to adata object and rename columns
global_labels = fig1_anno[['cluster', 'stage', 'donor', 'celltype']]
adata.obs = pd.concat([adata.obs, global_labels], axis=1)
# column names are assigned by position: the existing obs columns first, then the four label columns
adata.obs.columns = [
    'sample', 'tissue', 'n_genes', 'n_counts',
    'cluster_global', 'stage', 'donor', 'celltype_global',
]

# add sample-wise annotations to the full adata object
annotation_dir = 'annotation_rmbatch_data_revised417/'
# empty frame listing the expected columns, so they exist in the result even if a file lacks them
expected_columns = pd.DataFrame(columns=['Cell_barcode', 'Sample', 'Batch', 'Cell_id', 'Cluster_id', 'Ages', 'Development_stage', 'Method', 'Gender', 'Source', 'Biomaterial', 'Name', 'ident', 'Celltype'])
# read every per-sample table first and concatenate once; concatenating inside the loop
# re-copies the growing frame for each file
sample_tables = [
    pd.read_csv(annotation_dir + f, encoding='unicode_escape')
    for f in os.listdir(annotation_dir)
]
df = pd.concat([expected_columns] + sample_tables, sort=True)
df = df.set_index('Cell_id')
# keep only the cells that have a sample-wise annotation
adata = adata[adata.obs.index.isin(df.index)].copy()
a_idx = adata.obs.index.copy()
adata.obs = pd.concat([adata.obs, df[['Ages', 'Celltype', 'Cluster_id', 'Gender', 'Method', 'Source']]], axis=1)
# the column-wise concat aligns on cell names; make sure the cell order was not changed by it
assert np.all(a_idx == adata.obs.index)

# remove mouse cells from the object
# cells whose 'Source' annotation is 'MCA2.0' are dropped
is_not_mca = adata.obs['Source'] != 'MCA2.0'
adata = adata[is_not_mca].copy()

# tidy up the column names of the obs annotations
# only the columns listed here change their name; all other columns keep theirs
obs_column_renames = {
    'tissue': 'sub_tissue',
    'stage': 'dev_stage',
    'Ages': 'age',
    'Celltype': 'celltype_specific',
    'Cluster_id': 'cluster_specific',
    'Gender': 'gender',
    'Method': 'protocol',
    'Source': 'source',
}
adata.obs = adata.obs.rename(columns=obs_column_renames)

# create some annotations that are used in sfaira
# constant columns are broadcast to every cell; the ontology class is copied from the global cell type
sfaira_columns = {
    "healthy": True,
    "state_exact": 'healthy',
    "cell_ontology_class": adata.obs["celltype_global"],
    "cell_ontology_id": None,
}
for column_name, column_value in sfaira_columns.items():
    adata.obs[column_name] = column_value

# convert gene ids to ensembl ids and store both
gc = SuperGenomeContainer(species='human', genome='Homo_sapiens_GRCh38_97')
id_dict = gc.names_to_id_dict
# move the current var index into a regular 'names' column
adata.var = adata.var.reset_index().rename(columns={'index': 'names'})
# names without an entry in the lookup get the placeholder 'n/a'
adata.var['ensembl'] = [id_dict.get(gene_name, 'n/a') for gene_name in adata.var['names']]
# NOTE: every unmatched name shares the 'n/a' placeholder, so the new var index may contain duplicates
adata.var.index = adata.var['ensembl'].values

# create a tidy organ annotation which is then used in sfaira
# (old, new) substitutions applied to 'sub_tissue' strictly in this order; the order matters,
# e.g. "Cord" is removed before "Spinal" is expanded to "SpinalCord"
# NOTE(review): "Intestine" is mapped to "Stomach" -- confirm this is the intended organ grouping
organ_substitutions = [
    ("Adult", ""),
    ("Fetal", ""),
    ("Neonatal", ""),
    ("Transverse", ""),
    ("Sigmoid", ""),
    ("Ascending", ""),
    ("Cord", ""),
    ("Peripheral", ""),
    ("CD34P", ""),
    ("Cerebellum", "Brain"),
    ("TemporalLobe", "Brain"),
    ("BoneMarrow", "Bone"),
    ("Spinal", "SpinalCord"),
    ("Intestine", "Stomach"),
    ("Eyes", "Eye"),
]
organ_labels = adata.obs['sub_tissue']
for old_token, new_token in organ_substitutions:
    organ_labels = organ_labels.str.replace(old_token, new_token)
adata.obs['organ'] = organ_labels.str.lower()

# print the number of cells per organ
adata.obs['organ'].value_counts()

adrenalgland       43476
stomach            41963
kidney             40691
blood              35533
lung               33698
brain              30493
liver              28501
pancreas           28473
colon              22301
pleura             19695
spleen             15806
malegonad          13211
omentum            12812
thyroid            12647
esophagus          11364
heart              10783
trachea             9949
chorionicvillus     9898
gallbladder         9769
artery              9652
placenta            9595
bladder             9048
bone                8704
cervix              8096
muscle              7775
uterus              7694
skin                6991
femalegonad         6941
fallopiantube       6556
rib                 5992
spinalcord          5916
rectum              5718
jejunum             5549
calvaria            5129
duodenum            4681
thymus              4516
epityphlon          4486
ileum               3367
prostate            2445
ureter              2390


In [6]:
# save the complete processed object as a single h5ad file in the working directory
processed_file = 'HCL_processed.h5ad'
adata.write_h5ad(processed_file)

... storing 'sample' as categorical
... storing 'sub_tissue' as categorical
... storing 'dev_stage' as categorical
... storing 'donor' as categorical
... storing 'celltype_global' as categorical
... storing 'age' as categorical
... storing 'celltype_specific' as categorical
... storing 'cluster_specific' as categorical
... storing 'gender' as categorical
... storing 'protocol' as categorical
... storing 'source' as categorical
... storing 'state_exact' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'organ' as categorical
... storing 'ensembl' as categorical


In [7]:
# write separate files per sample as used in sfaira
# exist_ok=True lets this cell be re-run without failing on the already existing output folder
os.makedirs('hcl_organs/', exist_ok=True)
for sample_id in adata.obs['sample'].unique():
    # subset to the cells of one sample and store them in their own h5ad file
    sample_adata = adata[adata.obs['sample'] == sample_id].copy()
    sample_adata.write('hcl_organs/hcl_{}.h5ad'.format(sample_id))

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Vari

In [8]:
# copy the separate h5ad files into your sfaira data repository
your_datarepository = '/path/to/repository' # path to the folder that contains the 'human' and 'mouse' directories
for samplefile in os.listdir('hcl_organs/'):
    if samplefile.startswith('hcl_'):
        # `ad.read` is a deprecated alias; `ad.read_h5ad` is the explicit reader for .h5ad files
        a = ad.read_h5ad('hcl_organs/'+samplefile)
        # the organ of the first cell decides the target folder
        # (assumes all cells of one sample file share the same organ -- TODO confirm)
        # .iloc makes the positional lookup explicit; obs is indexed by cell name, not by integers
        organ = a.obs['organ'].iloc[0]
        organ_dir = f"{your_datarepository}/human/{organ}"
        # create the organ folder on first use; this still fails if the 'human' directory is missing
        if not os.path.isdir(organ_dir):
            os.mkdir(organ_dir)
        a.write(f'{your_datarepository}/human/{organ}/{samplefile}')