In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scIB

In [2]:
adata = sc.read('/storage/groups/ml01/workspace/group.daniela/atlases_merged_st.h5ad')

## Clean up merged adata

In [3]:
adata

AnnData object with n_obs × n_vars = 133340 × 18756 
    obs: 'barcode', 'batch', 'cell_type', 'cell_type_union', 'channel', 'log_counts', 'marker_gene', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'tissue', 'study+tissue'
    var: 'highly_variable-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'gene_ids-1-1', 'highly_variable-1', 'means-1', 'dispersions-1', 'dispersions_norm-1'
    layers: 'counts'

In [4]:
# remove computed gene info
adata.var.drop(columns=adata.var.columns, inplace=True)
adata.var.head()

0610007P14Rik
0610009B22Rik
0610009L18Rik
0610009O20Rik
0610010F05Rik


In [5]:
adata.obs.index = range(0, adata.n_obs)

In [6]:
# add counts
scIB.pp.summarize_counts(adata, count_matrix=adata.layers['counts'])

In [7]:
adata.obs

Unnamed: 0,barcode,batch,cell_type,cell_type_union,channel,log_counts,marker_gene,n_counts,n_genes,percent_mito,sample,sample_id,sex,size_factors,study,tissue,study+tissue
0,ACCTGAAGTCGTGTTGCC,0,Pre T cell,pre t cell,,7.390799,,1621.0,983,0.0,,0,,2.247378,MCA,Thymus,ThymusMCA
1,CCAGACTGTGCGGGGTTT,0,B Cell,b cell,,8.682538,,5899.0,2459,0.0,,0,,7.477643,MCA,Thymus,ThymusMCA
2,CGGCAGCGTGGCATGCTT,0,Pre T cell,pre t cell,,8.601718,,5441.0,2398,0.0,,0,,6.237622,MCA,Thymus,ThymusMCA
3,GAACGCGGACATACGAGC,0,B Cell,b cell,,8.506941,,4949.0,2133,0.0,,0,,6.422959,MCA,Thymus,ThymusMCA
4,TGATCATTCATAGTGGTA,0,B Cell,b cell,,8.602269,,5444.0,2272,0.0,,0,,6.081328,MCA,Thymus,ThymusMCA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133335,,1,immature T cell,t cell,10X_P7_11,8.270781,,3908.0,1673,0.0,3-F-56,8,F,0.634096,TM-droplet,Thymus,ThymusTM-droplet
133336,,1,immature T cell,t cell,10X_P7_11,9.019785,,8265.0,2710,0.0,3-F-56,8,F,1.382807,TM-droplet,Thymus,ThymusTM-droplet
133337,,1,DN1 thymic pro-T cell,t cell,10X_P7_11,8.516994,,4999.0,1438,0.0,3-F-56,8,F,0.517985,TM-droplet,Thymus,ThymusTM-droplet
133338,,1,immature T cell,t cell,10X_P7_11,8.138272,,3423.0,1398,0.0,3-F-56,8,F,0.518015,TM-droplet,Thymus,ThymusTM-droplet


## Add missing annotations to MCA

In [8]:
anno_file = '/storage/groups/ml01/workspace/group.daniela/MCA/adata_annotation.csv'
anno = pd.read_csv(anno_file, index_col=0)

In [9]:
anno.head()

Unnamed: 0,batch,tissue,sample,barcode,cell_type,marker_gene
0,0,BoneMarrow,BoneMarrow_1,AAGCGGAGGACTGTGGTA,Monocyte,Mif high
1,0,BoneMarrow,BoneMarrow_1,CCGACGGGTACATAGTCG,Neutrophil,Cebpe high
2,0,BoneMarrow,BoneMarrow_1,AGGACTATCTCTACCTGA,Neutrophil,Cebpe high
3,0,BoneMarrow,BoneMarrow_1,TATGTAACTTATTCACTT,Neutrophil,Fcnb high
4,0,BoneMarrow,BoneMarrow_1,AAGCGGATCTCTTGCAAT,Neutrophil,Ngp high


In [10]:
anno.drop(columns=['batch'], inplace=True)

In [11]:
obs = adata.obs.copy()
obs['idx'] = obs.index

In [12]:
obs.head()

Unnamed: 0,barcode,batch,cell_type,cell_type_union,channel,log_counts,marker_gene,n_counts,n_genes,percent_mito,sample,sample_id,sex,size_factors,study,tissue,study+tissue,idx
0,ACCTGAAGTCGTGTTGCC,0,Pre T cell,pre t cell,,7.390799,,1621.0,983,0.0,,0,,2.247378,MCA,Thymus,ThymusMCA,0
1,CCAGACTGTGCGGGGTTT,0,B Cell,b cell,,8.682538,,5899.0,2459,0.0,,0,,7.477643,MCA,Thymus,ThymusMCA,1
2,CGGCAGCGTGGCATGCTT,0,Pre T cell,pre t cell,,8.601718,,5441.0,2398,0.0,,0,,6.237622,MCA,Thymus,ThymusMCA,2
3,GAACGCGGACATACGAGC,0,B Cell,b cell,,8.506941,,4949.0,2133,0.0,,0,,6.422959,MCA,Thymus,ThymusMCA,3
4,TGATCATTCATAGTGGTA,0,B Cell,b cell,,8.602269,,5444.0,2272,0.0,,0,,6.081328,MCA,Thymus,ThymusMCA,4


In [13]:
cols = ['tissue', 'barcode', 'cell_type', 'marker_gene', 'idx']
merged_anno = pd.merge(anno, obs[cols], on=cols[:-1])

In [14]:
merged_anno.index = merged_anno['idx']

In [15]:
adata.obs.loc[merged_anno.index]['sample'] = merged_anno['sample']

In [28]:
adata.obs

Unnamed: 0,barcode,batch,cell_type,cell_type_union,channel,log_counts,marker_gene,n_counts,n_genes,percent_mito,sample,sample_id,sex,size_factors,study,tissue,study+tissue
0,ACCTGAAGTCGTGTTGCC,0,Pre T cell,pre t cell,,7.390799,,1621.0,983,0.0,,0,,2.247378,MCA,Thymus,ThymusMCA
1,CCAGACTGTGCGGGGTTT,0,B Cell,b cell,,8.682538,,5899.0,2459,0.0,,0,,7.477643,MCA,Thymus,ThymusMCA
2,CGGCAGCGTGGCATGCTT,0,Pre T cell,pre t cell,,8.601718,,5441.0,2398,0.0,,0,,6.237622,MCA,Thymus,ThymusMCA
3,GAACGCGGACATACGAGC,0,B Cell,b cell,,8.506941,,4949.0,2133,0.0,,0,,6.422959,MCA,Thymus,ThymusMCA
4,TGATCATTCATAGTGGTA,0,B Cell,b cell,,8.602269,,5444.0,2272,0.0,,0,,6.081328,MCA,Thymus,ThymusMCA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133335,,1,immature T cell,t cell,10X_P7_11,8.270781,,3908.0,1673,0.0,3-F-56,8,F,0.634096,TM-droplet,Thymus,ThymusTM-droplet
133336,,1,immature T cell,t cell,10X_P7_11,9.019785,,8265.0,2710,0.0,3-F-56,8,F,1.382807,TM-droplet,Thymus,ThymusTM-droplet
133337,,1,DN1 thymic pro-T cell,t cell,10X_P7_11,8.516994,,4999.0,1438,0.0,3-F-56,8,F,0.517985,TM-droplet,Thymus,ThymusTM-droplet
133338,,1,immature T cell,t cell,10X_P7_11,8.138272,,3423.0,1398,0.0,3-F-56,8,F,0.518015,TM-droplet,Thymus,ThymusTM-droplet


In [33]:
sum(adata.obs['sample'] == 'nan')

72502

In [34]:
adata.obs[adata.obs['sample'] == 'nan']

Unnamed: 0,barcode,batch,cell_type,cell_type_union,channel,log_counts,marker_gene,n_counts,n_genes,percent_mito,sample,sample_id,sex,size_factors,study,tissue,study+tissue
0,ACCTGAAGTCGTGTTGCC,0,Pre T cell,pre t cell,,7.390799,,1621.0,983,0.0,,0,,2.247378,MCA,Thymus,ThymusMCA
1,CCAGACTGTGCGGGGTTT,0,B Cell,b cell,,8.682538,,5899.0,2459,0.0,,0,,7.477643,MCA,Thymus,ThymusMCA
2,CGGCAGCGTGGCATGCTT,0,Pre T cell,pre t cell,,8.601718,,5441.0,2398,0.0,,0,,6.237622,MCA,Thymus,ThymusMCA
3,GAACGCGGACATACGAGC,0,B Cell,b cell,,8.506941,,4949.0,2133,0.0,,0,,6.422959,MCA,Thymus,ThymusMCA
4,TGATCATTCATAGTGGTA,0,B Cell,b cell,,8.602269,,5444.0,2272,0.0,,0,,6.081328,MCA,Thymus,ThymusMCA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72497,GAGATCTTGGACGCCTAG,0,Microglia,microglia,,6.156979,,472.0,337,0.0,,10,,0.477032,MCA,Brain,BrainMCA
72498,AAGCGGGTTGCCGAACGC,0,Myelinating oligodendrocyte,oligodendrocyte,,6.242223,,514.0,362,0.0,,10,,0.710107,MCA,Brain,BrainMCA
72499,ATGGCGCTCCATAGGGTC,0,Microglia,microglia,,6.180017,,483.0,313,0.0,,10,,0.459761,MCA,Brain,BrainMCA
72500,TGAAGCGCCTAGTGTGCG,0,Myelinating oligodendrocyte,oligodendrocyte,,6.167517,,477.0,351,0.0,,10,,0.681747,MCA,Brain,BrainMCA


In [35]:
adata.write('/storage/groups/ml01/workspace/group.daniela/atlases_merged_anno.h5ad')