# Merging all immune cells across species: human + mouse

In [7]:
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import glob

import rpy2.rinterface_lib.callbacks
import logging

from rpy2.robjects import pandas2ri
import anndata2ri

In [8]:
# Silence R warning messages by raising the rpy2 callback logger to ERROR.
# Note: comment this line out to get more verbose R output.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Activate the rpy2 converters for pandas and AnnData objects, then load the
# rpy2 IPython extension.
# NOTE(review): none of the cells visible in this notebook run R code —
# confirm whether this R setup is still needed here.
pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython

plt.rcParams['figure.figsize']=(8,8) # default figure size (width, height) for all plots
sc.settings.verbosity = 3
# Optional higher-resolution figure settings, currently disabled:
#sc.set_figure_params(dpi=200, dpi_save=300)
# Print the versions of scanpy and its core dependencies for reproducibility.
sc.logging.print_versions()

# Output path of the merged human + mouse object that is written near the end
# of the notebook.
results_file = './write/merged/Immune_ALL_hum_mou.h5ad'

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
scanpy==1.4.4.post1 anndata==0.6.22.post1 umap==0.3.10 numpy==1.17.3 scipy==1.3.0 pandas==0.24.2 scikit-learn==0.21.2 statsmodels==0.10.1 python-igraph==0.7.1 louvain==0.6.1


In [9]:
# Load human data: the merged human immune cells, read from an .h5ad file.
# NOTE(review): `cache=True` presumably has no effect for .h5ad input —
# confirm against the scanpy version in use.
file_path = './write/merged/Immune_ALL_human.h5ad'
adata_hum = sc.read(file_path, cache=True)

In [10]:
# Preview the first rows of the human per-cell metadata (obs).
adata_hum.obs.head()

Unnamed: 0_level_0,batch,chemistry,data_type,dpt_pseudotime_y,final_annotation,mt_frac,n_counts,n_genes,sample_ID,size_factors,species,study,tissue
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAACCTGCAGCGAACA-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD16+ Monocytes,0.04797,6379.0,1862.0,0,0.939366,Human,Oetjen,Bone_Marrow
AAACCTGCATGTCCTC-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD4+ T cells,0.024928,4172.0,1082.0,0,0.450271,Human,Oetjen,Bone_Marrow
AAACCTGGTCGACTGC-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD14+ Monocytes,0.051907,6608.0,1618.0,0,0.747701,Human,Oetjen,Bone_Marrow
AAACCTGGTCGCTTCT-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD14+ Monocytes,0.041716,5034.0,1413.0,0,0.620114,Human,Oetjen,Bone_Marrow
AAACCTGTCCCGACTT-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,NKT cells,0.043522,3998.0,1127.0,0,0.468185,Human,Oetjen,Bone_Marrow


In [11]:
# Load mouse data: the merged mouse immune cells, read from an .h5ad file.
# `file_path` is reused here and now points at the mouse file.
# NOTE(review): `cache=True` presumably has no effect for .h5ad input —
# confirm against the scanpy version in use.
file_path = './Mouse/write/Immune_ALL_mouse.h5ad'
adata_mou = sc.read(file_path, cache=True)

In [12]:
# Preview the first rows of the mouse per-cell metadata (obs); it should carry
# the same columns as the human obs table so the two can be concatenated.
adata_mou.obs.head()

Unnamed: 0_level_0,batch,chemistry,data_type,dpt_pseudotime_y,final_annotation,mt_frac,n_counts,n_genes,sample_ID,size_factors,species,study,tissue
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAACCTGAGGCAGGTT-1-Dahlin_1,Dahlin_1,v2_10X,UMI,0.865444,Erythrocytes,0.027973,30744.0,4452.0,0,1.976381,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCAAGCTGAG-1-Dahlin_1,Dahlin_1,v2_10X,UMI,,Neutrophils,0.022213,16657.0,3369.0,0,1.049823,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCAGATGGGT-1-Dahlin_1,Dahlin_1,v2_10X,UMI,,Lymphocyte progenitors,0.015106,16484.0,3372.0,0,1.081016,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCATACAGCT-1-Dahlin_1,Dahlin_1,v2_10X,UMI,0.750666,Erythrocytes,0.024411,34820.0,4648.0,0,2.257881,Mouse,Dahlin_BM,Bone_Marrow
AAACCTGCATCACGTA-1-Dahlin_1,Dahlin_1,v2_10X,UMI,0.887669,Erythrocytes,0.020295,26213.0,4258.0,0,1.75134,Mouse,Dahlin_BM,Bone_Marrow


In [13]:
# Preview the mouse per-gene metadata (var), including the
# 'human_gene_symbol' column used below to re-index the genes.
adata_mou.var.head()

Unnamed: 0_level_0,gene_ensembl-0,n_cells-0,n_cells-1,n_cells-2,human_gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mrpl15,ENSMUSG00000033845,20003,618,5321,MRPL15
Lypla1,ENSMUSG00000025903,10938,411,1846,LYPLA1
Tcea1,ENSMUSG00000033813,18486,208,826,TCEA1
Atp6v1h,ENSMUSG00000033793,8053,140,911,ATP6V1H
Rb1cc1,ENSMUSG00000025907,11350,82,390,RB1CC1


In [14]:
# Mouse gene names before re-indexing (mouse gene symbols).
adata_mou.var_names

Index(['Mrpl15', 'Lypla1', 'Tcea1', 'Atp6v1h', 'Rb1cc1', 'Pcmtd1', 'Rrs1',
       'Vcpip1', 'Sgk3', 'Ppp1r42',
       ...
       'Csf2ra', 'mt-Nd1', 'mt-Nd2', 'mt-Co1', 'mt-Nd3', 'mt-Nd4', 'mt-Nd5',
       'mt-Nd6', 'mt-Cytb', 'Vamp7'],
      dtype='object', name='index', length=9161)

In [15]:
# Cast the human ortholog symbols to dtype 'object' before they become the
# gene index in the next cell.
# NOTE(review): presumably the column is loaded as categorical from the .h5ad
# file and a categorical index is not wanted — confirm.
adata_mou.var['human_gene_symbol'] = adata_mou.var['human_gene_symbol'].astype('object')

In [16]:
# Re-index the mouse genes by their human gene symbol so that mouse and human
# var_names share one namespace for the concatenation. The previous index
# (mouse gene symbols) is discarded by this step.
# NOTE(review): if several mouse genes map to the same human symbol the index
# will contain duplicates, which concatenation then has to make unique —
# verify which ortholog ends up being matched to the human gene.
adata_mou.var.set_index('human_gene_symbol', inplace=True)
adata_mou.var.head()

Unnamed: 0_level_0,gene_ensembl-0,n_cells-0,n_cells-1,n_cells-2
human_gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MRPL15,ENSMUSG00000033845,20003,618,5321
LYPLA1,ENSMUSG00000025903,10938,411,1846
TCEA1,ENSMUSG00000033813,18486,208,826
ATP6V1H,ENSMUSG00000033793,8053,140,911
RB1CC1,ENSMUSG00000025907,11350,82,390


In [17]:
# Label the human gene index 'human_gene_symbol' so that it carries the same
# index name as the re-indexed mouse genes; the gene symbols themselves are
# left unchanged.
adata_hum.var.index.name = 'human_gene_symbol'
adata_hum.var.head()

Unnamed: 0_level_0,gene_id-0,n_cells-0,gene_ids-1,n_cells-1,gene_ids-2,feature_types-2,n_cells-2,gene_id-3,n_cells-3,n_cells-4
human_gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
LINC00115,ENSG00000225880,192,ENSG00000225880,18,ENSG00000225880,Gene Expression,338,ENSG00000225880,101,52
FAM41C,ENSG00000230368,524,ENSG00000230368,8,ENSG00000230368,Gene Expression,307,ENSG00000230368,240,44
SAMD11,ENSG00000187634,17,ENSG00000187634,1,ENSG00000187634,Gene Expression,9,ENSG00000187634,11,45
NOC2L,ENSG00000188976,1739,ENSG00000188976,396,ENSG00000188976,Gene Expression,2554,ENSG00000188976,1283,653
KLHL17,ENSG00000187961,52,ENSG00000187961,10,ENSG00000187961,Gene Expression,251,ENSG00000187961,30,24


## Concatenate human and mouse

In [18]:
# Human gene names going into the concatenation (human gene symbols).
adata_hum.var_names

Index(['LINC00115', 'FAM41C', 'SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'HES4',
       'ISG15', 'AGRN', 'TTLL10',
       ...
       'COL6A2', 'FTCD', 'LSS', 'MCM3AP-AS1', 'MCM3AP', 'YBEY', 'PCNT',
       'DIP2A', 'S100B', 'PRMT2'],
      dtype='object', name='human_gene_symbol', length=12303)

In [19]:
# Mouse gene names after re-indexing: now human gene symbols as well.
adata_mou.var_names

Index(['MRPL15', 'LYPLA1', 'TCEA1', 'ATP6V1H', 'RB1CC1', 'PCMTD1', 'RRS1',
       'VCPIP1', 'SGK3', 'PPP1R42',
       ...
       'CSF2RA', 'MT-ND1', 'MT-ND2', 'MT-CO1', 'MT-ND3', 'MT-ND4', 'MT-ND5',
       'MT-ND6', 'MT-CYB', 'VAMP7'],
      dtype='object', name='human_gene_symbol', length=9161)

In [20]:
# Concatenate human and mouse cells into a single object.
# - batch_key='sample_ID': the label that `concatenate` generates is written
#   into the 'sample_ID' column, which is dropped on the next line. The net
#   effect is that the merged obs has no 'sample_ID' column and the existing
#   per-sample 'batch' column is left as it was.
# - index_unique=None: cell barcodes are kept as they are (no suffix added).
# The merged object has fewer genes than either input; presumably only genes
# present in both species are retained (default join) — confirm.
adata_immune = adata_hum.concatenate(adata_mou, index_unique=None, batch_key='sample_ID')
adata_immune.obs.drop(columns='sample_ID', inplace=True)
adata_immune.shape

Making variable names unique for controlled concatenation.


(97952, 8135)

In [21]:
# Preview the merged per-cell metadata; 'sample_ID' should no longer be present.
adata_immune.obs.head()

Unnamed: 0_level_0,batch,chemistry,data_type,dpt_pseudotime_y,final_annotation,mt_frac,n_counts,n_genes,size_factors,species,study,tissue
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AAACCTGCAGCGAACA-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD16+ Monocytes,0.04797,6379.0,1862.0,0.939366,Human,Oetjen,Bone_Marrow
AAACCTGCATGTCCTC-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD4+ T cells,0.024928,4172.0,1082.0,0.450271,Human,Oetjen,Bone_Marrow
AAACCTGGTCGACTGC-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD14+ Monocytes,0.051907,6608.0,1618.0,0.747701,Human,Oetjen,Bone_Marrow
AAACCTGGTCGCTTCT-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,CD14+ Monocytes,0.041716,5034.0,1413.0,0.620114,Human,Oetjen,Bone_Marrow
AAACCTGTCCCGACTT-1-Oetjen_A,Oetjen_A,v2_10X,UMI,,NKT cells,0.043522,3998.0,1127.0,0.468185,Human,Oetjen,Bone_Marrow


In [22]:
# Preview the merged per-gene metadata; columns from the human and mouse
# objects are kept side by side with a numeric suffix per input object.
adata_immune.var.head()

Unnamed: 0,gene_id-0-0,n_cells-0-0,gene_ids-1-0,n_cells-1-0,gene_ids-2-0,feature_types-2-0,n_cells-2-0,gene_id-3-0,n_cells-3-0,n_cells-4-0,gene_ensembl-0-1,n_cells-0-1,n_cells-1-1,n_cells-2-1
NOC2L,ENSG00000188976,1739,ENSG00000188976,396,ENSG00000188976,Gene Expression,2554,ENSG00000188976,1283,653,ENSMUSG00000095567,17315,124,1166
KLHL17,ENSG00000187961,52,ENSG00000187961,10,ENSG00000187961,Gene Expression,251,ENSG00000187961,30,24,ENSMUSG00000078485,228,44,286
ISG15,ENSG00000187608,3498,ENSG00000187608,811,ENSG00000187608,Gene Expression,4176,ENSG00000187608,1970,566,ENSMUSG00000035692,5487,410,410
AGRN,ENSG00000188157,88,ENSG00000188157,2,ENSG00000188157,Gene Expression,109,ENSG00000188157,21,13,ENSMUSG00000041936,506,17,95
TNFRSF18,ENSG00000186891,377,ENSG00000186891,134,ENSG00000186891,Gene Expression,602,ENSG00000186891,208,5,ENSMUSG00000041954,4335,220,429


In [23]:
# List the layers that are present on the merged object.
adata_immune.layers.keys()

KeysView(Layers with keys: counts)

## Rename cell labels for B cells, Monocytes and HSPCs

In [24]:
# Collapse fine-grained annotations into broader cell-type labels.
# Each entry is (label prefix, merged label). The prefixes are mutually
# exclusive and no merged label starts with any prefix, so one pass over the
# labels gives the same result as applying the rules one after another.
prefix_to_merged_label = (
    ('CD20+ B cells', 'B cells'),
    ('CD10+ B cells', 'B cells'),
    ('CD14+', 'Monocytes'),
    ('CD16+', 'Monocytes'),
    ('HSPCs + Lymphocyte', 'HSPCs'),
)


def _merge_annotation(label):
    """Return the merged label for `label`, or `label` unchanged if no prefix matches."""
    for prefix, merged_label in prefix_to_merged_label:
        if label.startswith(prefix):
            return merged_label
    return label


# Assign a plain list so the column is rebuilt from the relabelled values.
adata_immune.obs['final_annotation'] = [
    _merge_annotation(label) for label in adata_immune.obs['final_annotation']
]

In [25]:
# Show the distinct cell-type labels that remain after relabelling.
set(adata_immune.obs['final_annotation'])

{'B cells',
 'Basophils',
 'CD4+ T cells',
 'CD8+ T cells',
 'CD8+ T cells + CD20+ B cells',
 'Eosinophil progenitor cell',
 'Erythrocytes',
 'Erythroid progenitors',
 'HSPCs',
 'Lymphocyte progenitors',
 'Megakaryocyte progenitors',
 'Monocyte progenitors',
 'Monocyte-derived dendritic cells',
 'Monocytes',
 'NK cells',
 'NKT cells',
 'Neutrophils',
 'Plasma cells',
 'Plasmacytoid dendritic cells',
 'T cells'}

In [26]:
# Shape (cells, genes) before removing any cells, for comparison with the
# shape after filtering.
adata_immune.shape

(97952, 8135)

In [27]:
# Remove the cells that carry the combined 'CD8+ T cells + CD20+ B cells'
# label, keeping every other cell.
# Boolean subsetting of an AnnData returns a view of the parent object, so
# .copy() is used to materialise the subset as an independent object. Without
# it, the later `write` has to convert the view itself and anndata logs
# "Trying to set attribute `.obs` of view, making a copy." for each column it
# touches.
adata_immune = adata_immune[
    adata_immune.obs['final_annotation'] != 'CD8+ T cells + CD20+ B cells'
].copy()
adata_immune.shape

(97861, 8135)

In [28]:
# Persist the merged, relabelled and filtered object to `results_file`
# (the .h5ad path set in the configuration cell).
adata_immune.write(results_file)

Trying to set attribute `.obs` of view, making a copy.
... storing 'batch' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'chemistry' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'data_type' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'final_annotation' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'species' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'study' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'tissue' as categorical
Trying to set attribute `.var` of view, making a copy.
... storing 'feature_types-2-0' as categorical


In [29]:
# Additionally export the per-cell metadata table (obs) as a pandas pickle.
# NOTE(review): this is written to the working directory rather than next to
# `results_file` — confirm that is intended. Pickle files should only be
# loaded from trusted sources.
adata_immune.obs.to_pickle("./adataOBS_immune_mou_hum.pkl")