# Merging all human datasets: PBMCs + Bone Marrow

In [2]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import glob

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [3]:
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()

results_file = './write/Immune_ALL_human.h5ad'

scanpy==1.4.4+40.gbd5f862 anndata==0.6.22.post1 umap==0.3.9 numpy==1.15.4 scipy==1.3.0 pandas==0.24.2 scikit-learn==0.21.2 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [5]:
file_paths = './write/'
adatas_pp = []
for i in glob.glob(file_paths+'*.h5ad'):
    print(i)
    adatas_pp.append(sc.read(i, cache=True)) 

./write/Oetjen_hum_BM_pp.h5ad
./write/Freytag_hum_PBMCs_pp.h5ad
./write/10X_hum_PBMCs_pp.h5ad
./write/Sun_hum_PBMCs_pp.h5ad
./write/Villani_hum_PBMCs_pp.h5ad


In [9]:
for i in range(len(adatas_pp)):
    print(adatas_pp[i].shape)

(9581, 22946)
(3347, 24576)
(10727, 22966)
(8829, 20390)
(1022, 22321)


In [12]:
# Concatenate to unique adata object
adata_pp = adatas_pp[0].concatenate(adatas_pp[1:], batch_key='sample_ID',
                                                        index_unique=None)

layers are inconsistent - only layers that are shared among all AnnData objects are included.


In [13]:
adata_pp.var.head()

Unnamed: 0,gene_id-0,n_cells-0,gene_ids-1,n_cells-1,gene_ids-2,feature_types-2,n_cells-2,gene_id-3,n_cells-3,n_cells-4
LINC00115,ENSG00000225880,192,ENSG00000225880,18,ENSG00000225880,Gene Expression,338,ENSG00000225880,101,52
FAM41C,ENSG00000230368,524,ENSG00000230368,8,ENSG00000230368,Gene Expression,307,ENSG00000230368,240,44
SAMD11,ENSG00000187634,17,ENSG00000187634,1,ENSG00000187634,Gene Expression,9,ENSG00000187634,11,45
NOC2L,ENSG00000188976,1739,ENSG00000188976,396,ENSG00000188976,Gene Expression,2554,ENSG00000188976,1283,653
KLHL17,ENSG00000187961,52,ENSG00000187961,10,ENSG00000187961,Gene Expression,251,ENSG00000187961,30,24


In [14]:
adata_pp.obs.head()

Unnamed: 0_level_0,batch,chemistry,data_type,dpt_pseudotime,final_annotation,mt_frac,n_counts,n_genes,sample_ID,size_factors,species,study,tissue
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAACCTGCAGCGAACA-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD16+ Monocytes,0.04797,6379.0,1862.0,0,0.957719,Human,Oetjen,Bone_Marrow
AAACCTGCATGTCCTC-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD4+ T cells,0.024928,4172.0,1082.0,0,0.425532,Human,Oetjen,Bone_Marrow
AAACCTGGTCGACTGC-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD14+ Monocytes,0.051907,6608.0,1618.0,0,0.773111,Human,Oetjen,Bone_Marrow
AAACCTGGTCGCTTCT-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD14+ Monocytes,0.041716,5034.0,1413.0,0,0.641188,Human,Oetjen,Bone_Marrow
AAACCTGTCCCGACTT-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,NKT cells,0.043522,3998.0,1127.0,0,0.452426,Human,Oetjen,Bone_Marrow


In [15]:
adata_pp.shape

(33506, 12303)

In [16]:
adata_pp.write(results_file)

... storing 'batch' as categorical
... storing 'chemistry' as categorical
... storing 'data_type' as categorical
... storing 'final_annotation' as categorical
... storing 'study' as categorical
... storing 'tissue' as categorical
... storing 'feature_types-2' as categorical
