In [1]:
import os
os.chdir("../../")

In [2]:
import numpy as np
import scanpy as sc
import pandas as pd

  from pandas.core.index import RangeIndex


In [3]:
def hvg_batch(adata, batch_key=None, target_genes=2000, flavor='cell_ranger', n_bins=20, adataOut=False):
    """

    Method to select HVGs based on mean dispersions of genes that are highly 
    variable genes in all batches. Using a the top target_genes per batch by
    average normalize dispersion. If target genes still hasn't been reached, 
    then HVGs in all but one batches are used to fill up. This is continued 
    until HVGs in a single batch are considered.
    """
    
    adata_hvg = adata if adataOut else adata.copy()

    n_batches = len(adata_hvg.obs[batch_key].cat.categories)

    # Calculate double target genes per dataset
    sc.pp.highly_variable_genes(adata_hvg,
                                flavor=flavor, 
                                n_top_genes=target_genes,
                                n_bins=n_bins, 
                                batch_key=batch_key)

    nbatch1_dispersions = adata_hvg.var['dispersions_norm'][adata_hvg.var.highly_variable_nbatches >
                                                           len(adata_hvg.obs[batch_key].cat.categories)-1]
    
    nbatch1_dispersions.sort_values(ascending=False, inplace=True)

    if len(nbatch1_dispersions) > target_genes:
        hvg = nbatch1_dispersions.index[:target_genes]
    
    else:
        enough = False
        print(f'Using {len(nbatch1_dispersions)} HVGs from full intersect set')
        hvg = nbatch1_dispersions.index[:]
        not_n_batches = 1
        
        while not enough:
            target_genes_diff = target_genes - len(hvg)

            tmp_dispersions = adata_hvg.var['dispersions_norm'][adata_hvg.var.highly_variable_nbatches ==
                                                                (n_batches-not_n_batches)]

            if len(tmp_dispersions) < target_genes_diff:
                print(f'Using {len(tmp_dispersions)} HVGs from n_batch-{not_n_batches} set')
                hvg = hvg.append(tmp_dispersions.index)
                not_n_batches += 1

            else:
                print(f'Using {target_genes_diff} HVGs from n_batch-{not_n_batches} set')
                tmp_dispersions.sort_values(ascending=False, inplace=True)
                hvg = hvg.append(tmp_dispersions.index[:target_genes_diff])
                enough=True

    print(f'Using {len(hvg)} HVGs')

    if not adataOut:
        del adata_hvg
        return hvg.tolist()
    else:
        return adata_hvg[:,hvg].copy()

In [4]:
def subsample_selection(adata, tissue_key, frac=0.1):
    tissues = adata.obs[tissue_key].unique().tolist()
    subsampled_adata = None
    for tissue in tissues:
        tissue_adata = adata[adata.obs[tissue_key] == tissue]
        n_samples = tissue_adata.shape[0]
        subsample_idx = np.random.choice(n_samples, int(frac * n_samples), replace=False)
        tissue_adata_subsampled = tissue_adata[subsample_idx, :]
        subsampled_adata = tissue_adata_subsampled if subsampled_adata is None else subsampled_adata.concatenate(tissue_adata_subsampled)
    return subsampled_adata

In [5]:
base_path = "/media/pgdrive/sharif/exosomians/mohsen/data/"

In [None]:
adata = sc.read(os.path.join(base_path, "./MCA/adata_raw_annotated.h5ad"))
adata

In [7]:
adata.X.min(), adata.X.max()

(0.0, 607.0)

In [8]:
sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=True)
sc.pp.log1p(adata)

In [9]:
adata.X.min(), adata.X.max()

(0.0, 10.627431)

In [10]:
adata.obs['Tech'] = 'MCA'
adata.obs['method'] = 'MCA'
adata.obs['age'] = '1m'
adata.obs['method_age'] = 'MCA - 1m'
adata.obs['tissue_age'] = pd.Categorical(adata.obs['tissue'].str.cat(adata.obs['age'], sep=' - '))

In [11]:
adata.write_h5ad(os.path.join(base_path, "./MCA/MCA_normalized_all.h5ad"))

... storing 'Tech' as categorical
... storing 'method' as categorical
... storing 'age' as categorical
... storing 'method_age' as categorical


In [13]:
tabula_senis = sc.read(os.path.join(base_path, "./tabula_senis/tabula_senis_normalized_all.h5ad"))
tabula_senis

AnnData object with n_obs × n_vars = 356213 × 20116 
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_counts', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'method_tissue', 'method_age', 'Tech'
    var: 'n_cells-0', 'n_cells-1'

In [14]:
shared_genes = list(set(adata.var_names).intersection(set(tabula_senis.var_names)))

In [16]:
adata[:, shared_genes].write_h5ad(os.path.join(base_path, "./MCA/MCA_shared_normalized_all.h5ad"))

In [17]:
del adata

In [20]:
tabula_senis[:, shared_genes].write_h5ad(os.path.join(base_path, "./tabula_senis/tabula_senis_shared_normalized_all.h5ad"))

In [21]:
del tabula_senis

In [6]:
adata = sc.read(os.path.join(base_path, "./MCA/MCA_shared_normalized_all.h5ad"))
adata

AnnData object with n_obs × n_vars = 233445 × 17721 
    obs: 'batch', 'tissue', 'sample', 'barcode', 'cell_type', 'marker_gene', 'Tech', 'method', 'age', 'method_age', 'tissue_age'

In [7]:
tabula_senis = sc.read(os.path.join(base_path, "./tabula_senis/tabula_senis_shared_normalized_all.h5ad"))
tabula_senis

AnnData object with n_obs × n_vars = 356213 × 17721 
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_counts', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'method_tissue', 'method_age', 'Tech'
    var: 'n_cells-0', 'n_cells-1'

In [8]:
sorted(adata.obs['tissue'].value_counts().index.tolist())

['Bladder',
 'BoneMarrow',
 'BoneMarrowcKit',
 'Brain',
 'EmbryonicMesenchyme',
 'EmbryonicStemCells',
 'FetalBrain',
 'FetalFemaleGonad',
 'FetalIntestine',
 'FetalLiver',
 'FetalLung',
 'FetalStomach',
 'Kidney',
 'Liver',
 'Lung',
 'Male(fetal)Gonad',
 'MammaryGland',
 'Muscle',
 'NeonatalBrain',
 'NeonatalCalvaria',
 'NeonatalHeart',
 'NeonatalMuscle',
 'NeonatalPancreas',
 'NeonatalRib',
 'NeonatalSkin',
 'Ovary',
 'Pancreas',
 'PeripheralBlood',
 'Placenta',
 'Prostate',
 'SmallIntestine',
 'Spleen',
 'Stomach',
 'Thymus',
 'TrophoblastStemCells',
 'Uterus']

In [10]:
adata.obs['old_tissue'] = adata.obs['tissue'].values

In [11]:
adata.obs['tissue'].replace("BoneMarrowcKit", "BoneMarrow", inplace=True)

In [12]:
tabula_senis.obs['tissue'].value_counts()

Marrow               54737
Spleen               39552
Limb_Muscle          32722
Lung                 29758
Tongue               24415
Kidney               23480
Mammary_Gland        15577
Brain_Myeloid        13417
Thymus               13322
Bladder              11377
Trachea              11136
Large_Intestine      10198
Liver                10153
Heart                 9669
Pancreas              9585
Skin                  9314
Heart_and_Aorta       8613
Brain_Non-Myeloid     7249
Fat                   6777
SCAT                  3755
GAT                   3406
MAT                   3014
BAT                   2223
Diaphragm             1858
Aorta                  906
Name: tissue, dtype: int64

In [13]:
tabula_senis.obs['old_tissue'] = tabula_senis.obs['tissue'].values

In [14]:
tabula_senis.obs['tissue'].replace("Marrow", "BoneMarrow", inplace=True)
tabula_senis.obs['tissue'].replace("Brain_Myeloid", "Brain", inplace=True)
tabula_senis.obs['tissue'].replace("Brain_Non-Myeloid", "Brain", inplace=True)
tabula_senis.obs['tissue'].replace("Mammary_Gland", "MammaryGland", inplace=True)

In [15]:
shared_tissues = list(set(adata.obs['tissue'].unique().tolist()).intersection(set(tabula_senis.obs['tissue'].unique().tolist())))
shared_tissues

['MammaryGland',
 'Spleen',
 'Liver',
 'Lung',
 'Kidney',
 'BoneMarrow',
 'Bladder',
 'Brain',
 'Thymus',
 'Pancreas']

In [16]:
shared_tissues += ['Stomach']

In [17]:
tabula_senis = tabula_senis[tabula_senis.obs['tissue'].isin(shared_tissues)]
tabula_senis

View of AnnData object with n_obs × n_vars = 228207 × 17721 
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_counts', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'method_tissue', 'method_age', 'Tech', 'old_tissue'
    var: 'n_cells-0', 'n_cells-1'

In [18]:
adata = adata[adata.obs['tissue'].isin(shared_tissues)]
adata

View of AnnData object with n_obs × n_vars = 100309 × 17721 
    obs: 'batch', 'tissue', 'sample', 'barcode', 'cell_type', 'marker_gene', 'Tech', 'method', 'age', 'method_age', 'tissue_age', 'old_tissue'

In [19]:
tabula_senis.obs['cell_type'] = tabula_senis.obs['cell_ontology_class']

Trying to set attribute `.obs` of view, copying.


In [20]:
adata.obs['old_tissue'] = adata.obs['tissue'].values

Trying to set attribute `.obs` of view, copying.


In [21]:
adata.concatenate(tabula_senis).write_h5ad(os.path.join(base_path, "./tabula_senis_mca/tabula_senis_mca_normalized_all.h5ad"))

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'old_tissue' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_age' as categorical
... storing 'tissue_free_annotation' as categorical


In [22]:
del tabula_senis

In [6]:
adata = sc.read(os.path.join(base_path, "./MCA/atlases_merged_anno_new.h5ad"))
adata

AnnData object with n_obs × n_vars = 133340 × 18756 
    obs: 'barcode', 'batch', 'cell_type', 'cell_type_union', 'channel', 'log_counts', 'marker_gene', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'tissue'
    var: 'gene_ids-1-1'
    layers: 'counts'

In [7]:
adata.obs['tissue'].value_counts()

Mammary_Gland    28618
Kidney           17368
Lung             14020
Spleen           13201
Brain            11875
Marrow            9199
Liver             7244
Thymus            7067
Mammary           6886
Bladder           6624
Muscle            6064
Pancreas          5174
Name: tissue, dtype: int64

In [8]:
adata.X = adata.layers['counts'].A

In [9]:
del adata.layers['counts']

In [10]:
sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=True)
sc.pp.log1p(adata)

In [11]:
adata.X.min(), adata.X.max()

(0.0, 10.850981)

In [12]:
adata.obs['age'] = '3m'
adata.obs['method'] = adata.obs['study'].values
adata.obs['method'].replace('TM-smartseq2', 'facs', inplace=True)
adata.obs['method'].replace('TM-droplet', 'droplet', inplace=True)

In [13]:
adata.obs['Tech'] = 'MCA'
adata.obs.loc[adata.obs['method'] != 'MCA', 'Tech'] = 'Tabula Muris'

In [14]:
adata.obs['method_age'] = pd.Categorical(adata.obs['method'].str.cat(adata.obs['age'], sep=' - '))

In [15]:
adata.obs['method'].value_counts()

MCA        72502
droplet    35553
facs       25285
Name: method, dtype: int64

In [16]:
adata.obs['method_age'].value_counts()

MCA - 3m        72502
droplet - 3m    35553
facs - 3m       25285
Name: method_age, dtype: int64

In [17]:
adata.obs['Tech'].value_counts()

MCA             72502
Tabula Muris    60838
Name: Tech, dtype: int64

In [18]:
adata_old = sc.read(os.path.join(base_path, "./tabula_senis_mca/tabula_senis_mca_normalized_all.h5ad"))
adata_old

AnnData object with n_obs × n_vars = 328516 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'sample', 'sex', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1', 'n_cells-1-1'

In [19]:
muris_old = adata_old[((adata_old.obs['age'] == '3m') & (adata_old.obs['method'] != 'MCA'))]
muris_old

View of AnnData object with n_obs × n_vars = 49528 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'sample', 'sex', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1', 'n_cells-1-1'

In [20]:
senis = adata_old[adata_old.obs['age'] != '3m']
senis = senis[senis.obs['method'] != "MCA"]
senis

View of AnnData object with n_obs × n_vars = 178679 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'sample', 'sex', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1', 'n_cells-1-1'

In [21]:
senis.obs['Tech'] = 'Tabula Senis'

Trying to set attribute `.obs` of view, copying.


In [22]:
muris_new = adata[adata.obs['Tech'] == 'Tabula Muris']
muris_new

View of AnnData object with n_obs × n_vars = 60838 × 18756 
    obs: 'barcode', 'batch', 'cell_type', 'cell_type_union', 'channel', 'log_counts', 'marker_gene', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'tissue', 'age', 'method', 'Tech', 'method_age'
    var: 'gene_ids-1-1'
    uns: 'log1p'

In [23]:
muris_old_indices = [index.split("-")[0] for index in muris_old.obs.index.tolist()]
muris_new_indices = [index.split("-")[0] for index in muris_new.obs.index.tolist()]

In [24]:
len(set(muris_old_indices).intersection(muris_new_indices))

23128

In [25]:
adata.obs['tissue'].value_counts()

Mammary_Gland    28618
Kidney           17368
Lung             14020
Spleen           13201
Brain            11875
Marrow            9199
Liver             7244
Thymus            7067
Mammary           6886
Bladder           6624
Muscle            6064
Pancreas          5174
Name: tissue, dtype: int64

In [26]:
adata.obs['tissue'].replace('Mammary', 'Mammary_Gland', inplace=True)
adata.obs['tissue'].replace('MammaryGland', 'Mammary_Gland', inplace=True)

In [29]:
adata.obs['tissue'].value_counts()

Mammary_Gland    35504
Kidney           17368
Lung             14020
Spleen           13201
Brain            11875
Marrow            9199
Liver             7244
Thymus            7067
Bladder           6624
Muscle            6064
Pancreas          5174
Name: tissue, dtype: int64

In [28]:
adata.obs['old_tissue'] = adata.obs['tissue'].values

In [31]:
adata = senis.concatenate(adata)
adata

AnnData object with n_obs × n_vars = 312019 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'gene_ids-1-1-1'

In [32]:
adata.obs['Tech'].value_counts()

Tabula Senis    178679
MCA              72502
Tabula Muris     60838
Name: Tech, dtype: int64

In [33]:
adata.obs['method'].value_counts()

droplet    175684
MCA         72502
facs        63833
Name: method, dtype: int64

In [34]:
adata.obs['age'].value_counts()

3m     133340
18m     50933
30m     44183
24m     41829
21m     26964
1m      14770
Name: age, dtype: int64

In [35]:
adata.obs['method_age'].value_counts()

MCA - 3m         72502
droplet - 30m    44183
droplet - 3m     35553
droplet - 18m    30194
droplet - 21m    26236
facs - 3m        25285
droplet - 24m    24748
facs - 18m       20739
facs - 24m       17081
droplet - 1m     14770
facs - 21m         728
Name: method_age, dtype: int64

In [36]:
adata.obs['tissue'].value_counts()

BoneMarrow       46178
Spleen           43914
Lung             38785
Kidney           38016
Mammary_Gland    35504
Brain            24567
Thymus           17600
Liver            15640
Bladder          14197
Pancreas         13171
Marrow            9199
MammaryGland      9184
Muscle            6064
Name: tissue, dtype: int64

In [37]:
adata.obs['tissue'].replace('Mammary', 'Mammary_Gland', inplace=True)
adata.obs['tissue'].replace('MammaryGland', 'Mammary_Gland', inplace=True)

In [38]:
adata.obs['tissue'].value_counts()

BoneMarrow       46178
Mammary_Gland    44688
Spleen           43914
Lung             38785
Kidney           38016
Brain            24567
Thymus           17600
Liver            15640
Bladder          14197
Pancreas         13171
Marrow            9199
Muscle            6064
Name: tissue, dtype: int64

In [41]:
adata.obs['method_age'].replace('MCA - 3m', 'Microwell-mix', inplace=True)

In [42]:
adata.write_h5ad(os.path.join(base_path, "./tabula_senis_mca/tabula_senis_mca_new_normalized_all.h5ad"))

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_union' as categorical
... storing 'channel' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'old_tissue' as categorical
... storing 'sample' as categorical
... storing 'sample_id' as categorical
... storing 'sex' as categorical
... storing 'study' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_age' as categorical
... storing 'tissue_free_annotation' as categorical


In [43]:
del muris_new, muris_old

In [46]:
del adata

In [44]:
adata_new = sc.read(os.path.join(base_path, "./tabula_senis_mca/tabula_senis_mca_new_normalized_all.h5ad"))
adata_new

AnnData object with n_obs × n_vars = 312019 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'gene_ids-1-1-1'

In [45]:
adata_old = sc.read(os.path.join(base_path, "./tabula_senis_mca/tabula_senis_mca_normalized_all.h5ad"))
adata_old

AnnData object with n_obs × n_vars = 328516 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'sample', 'sex', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1', 'n_cells-1-1'

In [47]:
adata_new.obs['Tech'].value_counts()

Tabula Senis    178679
MCA              72502
Tabula Muris     60838
Name: Tech, dtype: int64

In [48]:
adata_new[adata_new.obs['Tech'] == 'Tabula Senis']

View of AnnData object with n_obs × n_vars = 178679 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'gene_ids-1-1-1'

In [49]:
adata_old.obs['Tech'].value_counts()

Tabula Senis    178679
MCA             100309
Tabula Muris     49528
Name: Tech, dtype: int64

In [50]:
mca_new = adata_new[adata_new.obs['Tech'] == 'MCA']
mca_old = adata_old[adata_old.obs['Tech'] == 'MCA']

In [51]:
muris_new = adata_new[adata_new.obs['Tech'] == 'Tabula Muris']
muris_old = adata_old[adata_old.obs['Tech'] == 'Tabula Muris']

In [52]:
mca_new.obs.index = mca_new.obs.index.map(lambda x: x.split("-")[0])
mca_old.obs.index = mca_old.obs.index.map(lambda x: x.split("-")[0])

In [53]:
muris_new.obs.index = muris_new.obs.index.map(lambda x: x.split("-")[0])
muris_old.obs.index = muris_old.obs.index.map(lambda x: x.split("-")[0])

In [54]:
mca_new.shape, mca_old.shape

((72502, 17721), (100309, 17721))

In [55]:
muris_new.shape, muris_old.shape

((60838, 17721), (49528, 17721))

In [56]:
muris_old_index = set(muris_old.obs.index.tolist())
muris_new_index = set(muris_new.obs.index.tolist())

In [57]:
mca_old_index = set(mca_old.obs.index.tolist())
mca_new_index = set(mca_new.obs.index.tolist())

In [58]:
muris_shared_index = list(muris_old_index.intersection(muris_new_index))
mca_shared_index = list(mca_old_index.intersection(mca_new_index))

In [59]:
muris_shared = muris_old[muris_shared_index]
muris_shared

View of AnnData object with n_obs × n_vars = 23128 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'sample', 'sex', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1', 'n_cells-1-1'

In [60]:
muris_new.obs_names_make_unique()

In [61]:
muris_shared.obs['cell_type_union'] = muris_new[muris_shared_index].obs['cell_type_union'].values

Trying to set attribute `.obs` of view, copying.


In [62]:
muris_dict = muris_shared.obs.groupby(['cell_ontology_class', 'cell_type_union']).size().to_dict()

In [63]:
cell_type_map = {}
cell_type_map_count = {}
for (cell_type_lvl2, cell_type_lvl1) in muris_dict.keys():
    cell_type_lvl2_population = len(muris_shared[muris_shared.obs['cell_ontology_class'] == cell_type_lvl2])
    if muris_dict[(cell_type_lvl2, cell_type_lvl1)] > 0: 
        if cell_type_lvl2 not in cell_type_map:
            cell_type_map[cell_type_lvl2] = cell_type_lvl1
            cell_type_map_count[cell_type_lvl2] = muris_dict[(cell_type_lvl2, cell_type_lvl1)]
        elif cell_type_map_count[cell_type_lvl2] < muris_dict[(cell_type_lvl2, cell_type_lvl1)]:
            cell_type_map[cell_type_lvl2] = cell_type_lvl1  
            cell_type_map_count[cell_type_lvl2] = muris_dict[(cell_type_lvl2, cell_type_lvl1)]

In [64]:
senis = adata_new[adata_new.obs['Tech'] == 'Tabula Senis']
senis.obs['old_tissue'] = pd.Categorical(adata_old[adata_old.obs['Tech'] == 'Tabula Senis'].obs['old_tissue'].values)
senis

Trying to set attribute `.obs` of view, copying.


AnnData object with n_obs × n_vars = 178679 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'gene_ids-1-1-1'

In [65]:
senis.obs['cell_type_union'] = senis.obs['cell_ontology_class'].map(lambda x: cell_type_map[x] if x in cell_type_map.keys() else x)
muris_old.obs['cell_type_union'] = muris_old.obs['cell_ontology_class'].map(lambda x: cell_type_map[x] if x in cell_type_map.keys() else x)

Trying to set attribute `.obs` of view, copying.


In [66]:
mca_shared = mca_new[mca_shared_index]
mca_shared

View of AnnData object with n_obs × n_vars = 71437 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'gene_ids-1-1-1'

In [67]:
mca_dict = mca_shared.obs.groupby(['cell_type', 'cell_type_union']).size().to_dict()

In [68]:
cell_type_map = {}
for (cell_type_lvl2, cell_type_lvl1) in mca_dict.keys():
    cell_type_lvl2_population = len(mca_shared[mca_shared.obs['cell_type'] == cell_type_lvl2])
    if mca_dict[(cell_type_lvl2, cell_type_lvl1)] > 0:
        if cell_type_lvl2 in cell_type_map:
            print((cell_type_lvl2, cell_type_lvl1, mca_dict[(cell_type_lvl2, cell_type_lvl1)], cell_type_lvl2_population))
        cell_type_map[cell_type_lvl2] = cell_type_lvl1

In [69]:
mca_old.obs['cell_type_union'] = mca_old.obs['cell_type'].map(lambda x: cell_type_map[x] if x in cell_type_map.keys() else x)

Trying to set attribute `.obs` of view, copying.


In [70]:
mca_old.obs['Tech'] = 'MCA'

In [71]:
adata = senis.concatenate(muris_old, mca_old)
adata

AnnData object with n_obs × n_vars = 328516 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0-0', 'n_cells-1-1-0-0', 'gene_ids-1-1-1-0', 'n_cells-0-1-1', 'n_cells-1-1-1', 'n_cells-0-1-2', 'n_cells-1-1-2'

In [72]:
adata.obs['Tech'].value_counts()

Tabula Senis    178679
MCA             100309
Tabula Muris     49528
Name: Tech, dtype: int64

In [73]:
adata.obs['tissue'].value_counts()

BoneMarrow       81730
Spleen           41504
Kidney           37548
Lung             36698
MammaryGland     35011
Brain            24685
Thymus           17611
Liver            14838
Bladder          14123
Pancreas         13195
Mammary_Gland     9184
Stomach           2389
Name: tissue, dtype: int64

In [74]:
adata.obs.groupby(['tissue', 'Tech']).size()

tissue         Tech        
Bladder        MCA              2746
               Tabula Muris     3804
               Tabula Senis     7573
BoneMarrow     MCA             26993
               Tabula Muris     8559
               Tabula Senis    46178
Brain          MCA              4019
               Tabula Muris     7974
               Tabula Senis    12692
Kidney         MCA             14068
               Tabula Muris     2832
               Tabula Senis    20648
Liver          MCA              4685
               Tabula Muris     1757
               Tabula Senis     8396
Lung           MCA              6940
               Tabula Muris     4993
               Tabula Senis    24765
MammaryGland   MCA             28618
               Tabula Muris     6393
Mammary_Gland  Tabula Senis     9184
Pancreas       MCA              3610
               Tabula Muris     1588
               Tabula Senis     7997
Spleen         MCA              1952
               Tabula Muris     8839
          

In [75]:
adata.write_h5ad(os.path.join(base_path, "./tabula_senis_mca/tabula_senis_mca_new_label_consistent_normalized_all.h5ad"))

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_union' as categorical
... storing 'channel' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'old_tissue' as categorical
... storing 'sample' as categorical
... storing 'sample_id' as categorical
... storing 'sex' as categorical
... storing 'study' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_age' as categorical
... storing 'tissue_free_annotation' as categorical


In [76]:
adata = adata[~adata.obs['tissue'].isin(['Mammary_Gland', 'Marrow', 'Muscle', 'MammaryGland'])]

In [77]:
adata

View of AnnData object with n_obs × n_vars = 284321 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0-0', 'n_cells-1-1-0-0', 'gene_ids-1-1-1-0', 'n_cells-0-1-1', 'n_cells-1-1-1', 'n_cells-0-1-2', 'n_cells-1-1-2'

In [78]:
cell_types_to_keep = []
for cell_type in adata.obs['cell_type_union'].unique().tolist():
    if len(adata[adata.obs['cell_type_union'] == cell_type]) > 50:
        cell_types_to_keep.append(cell_type)

In [79]:
adata = adata[adata.obs['cell_type_union'].isin(cell_types_to_keep)]
adata

View of AnnData object with n_obs × n_vars = 283811 × 17721 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'old_tissue', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0-0', 'n_cells-1-1-0-0', 'gene_ids-1-1-1-0', 'n_cells-0-1-1', 'n_cells-1-1-1', 'n_cells-0-1-2', 'n_cells-1-1-2'

In [80]:
adata.obs['Tech'].value_counts()

Tabula Senis    169425
MCA              71259
Tabula Muris     43127
Name: Tech, dtype: int64

In [81]:
adata.write_h5ad(os.path.join(base_path, "./tabula_senis_mca/tabula_senis_mca_new_label_consistent_celltype_filtered_normalized_all.h5ad"))

In [82]:
adata.obs['old_tissue'].value_counts()

Marrow               54724
Spleen               41504
Kidney               37491
Lung                 36551
BoneMarrow           26973
Thymus               17593
Liver                14797
Bladder              14110
Brain_Myeloid        13417
Pancreas             13150
Brain_Non-Myeloid     7249
Brain                 3977
Stomach               2275
Name: old_tissue, dtype: int64