# Merging all mouse datasets: PB + Bone Marrow

In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import glob

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [2]:
# Ignore R warning messages
#Note: this can be commented out to get more verbose R output
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
sc.logging.print_versions()

results_file = './write/Immune_ALL_mouse.h5ad'

scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.0 pandas==0.24.2 scikit-learn==0.21.2 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [10]:
file_paths = './write/'
adatas_pp = []
for i in glob.glob(file_paths+'*.h5ad'):
    print(i)
    adatas_pp.append(sc.read(i, cache=False)) 

./write/Dahlin_mou_BM_pp.h5ad
./write/MCA_mou_PB_pp.h5ad
./write/MCA_mou_BM_pp.h5ad


In [11]:
for i in range(len(adatas_pp)):
    print(adatas_pp[i].shape)

(30405, 18854)
(7576, 11346)
(26465, 15455)


In [12]:
# Concatenate to unique adata object
adata_pp = adatas_pp[0].concatenate(adatas_pp[1:], batch_key='sample_ID',
                                                        index_unique=None)

In [13]:
adata_pp.shape

(64446, 10108)

In [14]:
adata_pp.var.index.rename('mouse_gene_symbol', inplace=True)

In [15]:
adata_pp.var.head()

Unnamed: 0_level_0,gene_ensembl-0,n_cells-0,n_cells-1,n_cells-2
mouse_gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mrpl15,ENSMUSG00000033845,20003,618,5321
Lypla1,ENSMUSG00000025903,10938,411,1846
Tcea1,ENSMUSG00000033813,18486,208,826
Atp6v1h,ENSMUSG00000033793,8053,140,911
Rb1cc1,ENSMUSG00000025907,11350,82,390


In [16]:
adata_pp.obs.head()

Unnamed: 0_level_0,batch,chemistry,data_type,dpt_pseudotime_y,final_annotation,mt_frac,n_counts,n_genes,sample_ID,size_factors,species,study,tissue
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAACCTGAGGCAGGTT-1-Dahlin_1,Dahlin_1,v2_10X,UMI,0.865444,Erythrocytes,0.027973,30744.0,4452.0,0,1.976381,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCAAGCTGAG-1-Dahlin_1,Dahlin_1,v2_10X,UMI,,Neutrophils,0.022213,16657.0,3369.0,0,1.049823,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCAGATGGGT-1-Dahlin_1,Dahlin_1,v2_10X,UMI,,Lymphocyte progenitors,0.015106,16484.0,3372.0,0,1.081016,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCATACAGCT-1-Dahlin_1,Dahlin_1,v2_10X,UMI,0.750666,Erythrocytes,0.024411,34820.0,4648.0,0,2.257881,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCATCACGTA-1-Dahlin_1,Dahlin_1,v2_10X,UMI,0.887669,Erythrocytes,0.020295,26213.0,4258.0,0,1.75134,Mouse,Dahlin_BM,Bone_Marrow


## Human orthologous genes

In [17]:
mapMouse_Human_orth = pd.read_csv('../mapping_mouse_human.txt', sep='\t')
mapMouse_Human_orth.head()

Unnamed: 0,mouse_genes,HGNC.symbol
0,0610010F05Rik,KIAA1841
1,0610010K14Rik,C17orf49
2,0610010K14Rik,RNASEK-C17orf49
3,0610012G03Rik,NCBP2AS2
4,0610030E20Rik,C2orf68


In [18]:
mapMouse_Human_orth.rename(columns={'mouse_genes':'mouse_gene_symbol', 'HGNC.symbol':'human_gene_symbol'}, inplace=True)

In [19]:
adata_var_tmp = adata_pp.var.reset_index()
#merge 
adata_var_merged = adata_var_tmp.merge(mapMouse_Human_orth, how='left', on='mouse_gene_symbol')
adata_var_merged.head()

Unnamed: 0,mouse_gene_symbol,gene_ensembl-0,n_cells-0,n_cells-1,n_cells-2,human_gene_symbol
0,Mrpl15,ENSMUSG00000033845,20003,618,5321,MRPL15
1,Lypla1,ENSMUSG00000025903,10938,411,1846,LYPLA1
2,Tcea1,ENSMUSG00000033813,18486,208,826,TCEA1
3,Atp6v1h,ENSMUSG00000033793,8053,140,911,ATP6V1H
4,Rb1cc1,ENSMUSG00000025907,11350,82,390,RB1CC1


In [20]:
adata_var_merged.drop_duplicates(subset='mouse_gene_symbol', keep='first', inplace=True)

In [21]:
adata_var_merged.shape

(10108, 6)

In [22]:
adata_var_merged.set_index('mouse_gene_symbol', inplace = True)
adata_pp.var = adata_var_merged

In [23]:
adata_pp = adata_pp[:, ~pd.isnull(adata_pp.var['human_gene_symbol'])]
adata_pp.shape

(64446, 9161)

In [24]:
adata_pp.write(results_file)

Trying to set attribute `.obs` of view, making a copy.
... storing 'batch' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'chemistry' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'final_annotation' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'study' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'tissue' as categorical
Trying to set attribute `.var` of view, making a copy.
... storing 'human_gene_symbol' as categorical
