In [1]:
import os
# Move the working directory two levels up; every path below is written relative to it ("./data/...").
# NOTE(review): the path is relative to the *current* directory, so running this cell a second
# time in the same kernel moves up two more levels — run it exactly once per kernel.
os.chdir("../../")

In [2]:
import numpy as np
import scanpy as sc
import pandas as pd

In [3]:
def hvg_batch(adata, batch_key=None, target_genes=2000, flavor='cell_ranger', n_bins=20, adataOut=False):
    """
    Select highly variable genes (HVGs) that are shared across batches.

    ``sc.pp.highly_variable_genes`` is run with ``batch_key``; the resulting
    ``adata.var['highly_variable_nbatches']`` and ``adata.var['dispersions_norm']``
    columns are then used to collect genes tier by tier:

    1. genes that are highly variable in every batch, best normalised
       dispersion first;
    2. if that is not enough, genes that are highly variable in all but one
       batch, then all but two, and so on. A tier that fits completely is
       appended as-is; the tier that overshoots is sorted by normalised
       dispersion and only its top genes are taken.

    The search walks down to the tier of genes flagged in zero batches. If the
    object simply does not contain ``target_genes`` genes, all available genes
    are returned instead of looping forever.

    Parameters
    ----------
    adata
        Annotated data matrix; ``adata.obs[batch_key]`` must be categorical.
    batch_key
        Name of the ``adata.obs`` column holding the batch labels. Required.
    target_genes
        Number of genes to select.
    flavor, n_bins
        Forwarded unchanged to ``sc.pp.highly_variable_genes``.
    adataOut
        If False (default) the computation runs on a copy of ``adata`` and a
        list of gene names is returned. If True the HVG annotation is written
        to ``adata`` itself and a copy of ``adata`` restricted to the selected
        genes is returned.

    Returns
    -------
    list of gene names, or a gene-subsetted copy of ``adata`` when ``adataOut``
    is True.

    Raises
    ------
    ValueError
        If ``batch_key`` is None.
    """
    if batch_key is None:
        raise ValueError('hvg_batch requires `batch_key` to name a categorical column of `adata.obs`')

    adata_hvg = adata if adataOut else adata.copy()

    n_batches = len(adata_hvg.obs[batch_key].cat.categories)

    # Annotate every gene with its per-batch HVG statistics
    sc.pp.highly_variable_genes(adata_hvg,
                                flavor=flavor,
                                n_top_genes=target_genes,
                                n_bins=n_bins,
                                batch_key=batch_key)

    dispersions = adata_hvg.var['dispersions_norm']
    n_hv_batches = adata_hvg.var['highly_variable_nbatches']

    # Genes that are highly variable in all batches, best first.
    # sort_values returns a new Series, so adata.var is never modified through a slice.
    full_intersect = dispersions[n_hv_batches > n_batches - 1].sort_values(ascending=False)

    if len(full_intersect) > target_genes:
        hvg = full_intersect.index[:target_genes]

    else:
        print(f'Using {len(full_intersect)} HVGs from full intersect set')
        hvg = full_intersect.index[:]
        not_n_batches = 1

        # Stop once the tier count would drop below zero; without this bound the
        # loop never ends when fewer than target_genes genes exist in total.
        while not_n_batches <= n_batches:
            target_genes_diff = target_genes - len(hvg)

            tier = dispersions[n_hv_batches == (n_batches - not_n_batches)]

            if len(tier) < target_genes_diff:
                # Whole tier fits: take it and move on to the next, weaker tier
                print(f'Using {len(tier)} HVGs from n_batch-{not_n_batches} set')
                hvg = hvg.append(tier.index)
                not_n_batches += 1

            else:
                # Tier overshoots: keep only its best genes and finish
                print(f'Using {target_genes_diff} HVGs from n_batch-{not_n_batches} set')
                tier = tier.sort_values(ascending=False)
                hvg = hvg.append(tier.index[:target_genes_diff])
                break
        else:
            print(f'Only {len(hvg)} genes available, fewer than the requested {target_genes}')

    print(f'Using {len(hvg)} HVGs')

    if not adataOut:
        return hvg.tolist()
    return adata_hvg[:, hvg].copy()

In [4]:
def subsample_selection(adata, tissue_key, frac=0.1, random_state=None):
    """
    Randomly keep a fixed fraction of the cells of every tissue.

    For each distinct value of ``adata.obs[tissue_key]`` the function draws
    ``int(frac * n_cells)`` cells of that tissue without replacement and
    concatenates the per-tissue selections pairwise, in order of first
    appearance of the tissue.

    Parameters
    ----------
    adata
        Annotated data matrix to subsample.
    tissue_key
        Name of the ``adata.obs`` column used to stratify the sampling.
    frac
        Fraction of cells kept per tissue, between 0 and 1.
    random_state
        Seed for a private ``np.random.RandomState``. With the default ``None``
        the global NumPy random state is used, exactly as before this
        parameter existed.

    Returns
    -------
    The concatenated subsample, or ``None`` if ``adata.obs[tissue_key]`` has no
    values at all.

    Raises
    ------
    ValueError
        If ``frac`` is not within [0, 1].
    """
    if not 0 <= frac <= 1:
        raise ValueError(f'frac must be between 0 and 1, got {frac}')

    # np.random (module) and RandomState both expose .choice with the same signature
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    subsampled_adata = None
    for tissue in adata.obs[tissue_key].unique().tolist():
        tissue_adata = adata[adata.obs[tissue_key] == tissue]
        n_samples = tissue_adata.shape[0]
        subsample_idx = rng.choice(n_samples, int(frac * n_samples), replace=False)
        tissue_adata_subsampled = tissue_adata[subsample_idx, :]

        # NOTE(review): pairwise `concatenate` re-copies the accumulated result for every
        # tissue and presumably rewrites obs names / the batch column each time — confirm
        # downstream code does not rely on the original values.
        if subsampled_adata is None:
            subsampled_adata = tissue_adata_subsampled
        else:
            subsampled_adata = subsampled_adata.concatenate(tissue_adata_subsampled)
    return subsampled_adata

In [5]:
adata = sc.read("./data/MCA/adata_raw_annotated.h5ad")
adata

AnnData object with n_obs × n_vars = 233445 × 34749 
    obs: 'batch', 'tissue', 'sample', 'barcode', 'cell_type', 'marker_gene'

In [4]:
adata.X.min(), adata.X.max()

(0.0, 607.0)

In [5]:
# Scale every cell to a total of 1e4 counts, then apply log(1 + x).
# Both calls modify `adata.X` in place, so this cell must not be run twice on the same object.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [6]:
adata.X.min(), adata.X.max()

(0.0, 8.617314)

In [7]:
# Add constant per-cell annotations for the MCA dataset so its `.obs` carries the same
# 'method' / 'age' / 'method_age' columns that the Tabula Senis object has.
adata.obs['method'] = 'MCA'
adata.obs['age'] = '1m'
adata.obs['method_age'] = 'MCA - 1m'
# 'tissue_age' is "<tissue> - <age>" per cell, stored as a categorical
adata.obs['tissue_age'] = pd.Categorical(adata.obs['tissue'].str.cat(adata.obs['age'], sep=' - '))

In [8]:
adata.write_h5ad("./data/MCA/MCA_normalized_all.h5ad")

... storing 'method' as categorical
... storing 'age' as categorical
... storing 'method_age' as categorical


In [9]:
tabula_senis = sc.read("./data/tabula_senis/tabula_senis_normalized_all.h5ad")
tabula_senis

AnnData object with n_obs × n_vars = 356213 × 20116 
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_counts', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'method_tissue', 'method_age'
    var: 'n_cells-0', 'n_cells-1'

In [10]:
# Genes present in both datasets. Sorted so that the gene (column) order of the files written
# from this list is the same on every run; a bare set has no stable iteration order for
# strings across Python sessions.
shared_genes = sorted(set(adata.var_names).intersection(tabula_senis.var_names))

In [11]:
adata[:, shared_genes].write_h5ad("./data/MCA/MCA_shared_normalized_all.h5ad")

In [12]:
del adata

In [13]:
tabula_senis[:, shared_genes].write_h5ad("./data/tabula_senis/tabula_senis_shared_normalized_all.h5ad")

In [14]:
del tabula_senis

In [6]:
adata = sc.read("./data/MCA/MCA_shared_normalized_all.h5ad")
adata

AnnData object with n_obs × n_vars = 233445 × 17721 
    obs: 'batch', 'tissue', 'sample', 'barcode', 'cell_type', 'marker_gene', 'method', 'age', 'method_age', 'tissue_age'

In [5]:
tabula_senis = sc.read("./data/tabula_senis/tabula_senis_shared_normalized_all.h5ad")
tabula_senis

AnnData object with n_obs × n_vars = 356213 × 17721 
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_counts', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'method_tissue', 'method_age'
    var: 'n_cells-0', 'n_cells-1'

In [6]:
adata = adata[:, tabula_senis.var_names]
adata

View of AnnData object with n_obs × n_vars = 233445 × 2000 
    obs: 'batch', 'tissue', 'sample', 'barcode', 'cell_type', 'marker_gene', 'method', 'age', 'method_age', 'tissue_age'

In [13]:
sorted(adata.obs['tissue'].value_counts().index.tolist())

['Bladder',
 'BoneMarrow',
 'BoneMarrowcKit',
 'Brain',
 'EmbryonicMesenchyme',
 'EmbryonicStemCells',
 'FetalBrain',
 'FetalFemaleGonad',
 'FetalIntestine',
 'FetalLiver',
 'FetalLung',
 'FetalStomach',
 'Kidney',
 'Liver',
 'Lung',
 'Male(fetal)Gonad',
 'MammaryGland',
 'Muscle',
 'NeonatalBrain',
 'NeonatalCalvaria',
 'NeonatalHeart',
 'NeonatalMuscle',
 'NeonatalPancreas',
 'NeonatalRib',
 'NeonatalSkin',
 'Ovary',
 'Pancreas',
 'PeripheralBlood',
 'Placenta',
 'Prostate',
 'SmallIntestine',
 'Spleen',
 'Stomach',
 'Thymus',
 'TrophoblastStemCells',
 'Uterus']

In [14]:
# Fold the cKit-sorted bone-marrow samples into the plain 'BoneMarrow' tissue label.
# Assign the result back instead of using inplace=True on a column pulled out of `.obs`:
# an in-place update of that intermediate Series is not guaranteed to reach `adata.obs`.
adata.obs['tissue'] = adata.obs['tissue'].replace("BoneMarrowcKit", "BoneMarrow")

In [15]:
tabula_senis.obs['tissue'].value_counts()

Marrow               54737
Spleen               39552
Limb_Muscle          32722
Lung                 29758
Tongue               24415
Kidney               23480
Mammary_Gland        15577
Brain_Myeloid        13417
Thymus               13322
Bladder              11377
Trachea              11136
Large_Intestine      10198
Liver                10153
Heart                 9669
Pancreas              9585
Skin                  9314
Heart_and_Aorta       8613
Brain_Non-Myeloid     7249
Fat                   6777
SCAT                  3755
GAT                   3406
MAT                   3014
BAT                   2223
Diaphragm             1858
Aorta                  906
Name: tissue, dtype: int64

In [16]:
# Rename Tabula Senis tissues to the spelling used by MCA so the two can be intersected;
# both brain fractions collapse into a single 'Brain' label.
# The result is assigned back rather than relying on inplace=True on an intermediate Series,
# which is not guaranteed to update `tabula_senis.obs`.
tabula_senis.obs['tissue'] = (
    tabula_senis.obs['tissue']
    .replace("Marrow", "BoneMarrow")
    .replace("Brain_Myeloid", "Brain")
    .replace("Brain_Non-Myeloid", "Brain")
    .replace("Mammary_Gland", "MammaryGland")
)

In [17]:
shared_tissues = list(set(adata.obs['tissue'].unique().tolist()).intersection(set(tabula_senis.obs['tissue'].unique().tolist())))
shared_tissues

['Spleen',
 'Bladder',
 'Brain',
 'Pancreas',
 'BoneMarrow',
 'Liver',
 'Thymus',
 'MammaryGland',
 'Lung',
 'Kidney']

In [18]:
# Also keep 'Stomach' even though it is not in the intersection computed above.
# Guarded so that re-running the cell does not append a duplicate entry.
if 'Stomach' not in shared_tissues:
    shared_tissues.append('Stomach')

In [19]:
tabula_senis = tabula_senis[tabula_senis.obs['tissue'].isin(shared_tissues)]
tabula_senis

View of AnnData object with n_obs × n_vars = 228207 × 5000 
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'free_annotation', 'method', 'mouse.id', 'n_counts', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation', 'method_tissue', 'method_age'
    var: 'n_cells-0', 'n_cells-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [20]:
adata = adata[adata.obs['tissue'].isin(shared_tissues)]
adata

View of AnnData object with n_obs × n_vars = 100309 × 5000 
    obs: 'batch', 'tissue', 'sample', 'barcode', 'cell_type', 'marker_gene', 'method', 'age', 'method_age', 'tissue_age'

In [21]:
tabula_senis.obs['cell_type'] = tabula_senis.obs['cell_ontology_class']

Trying to set attribute `.obs` of view, making a copy.


In [22]:
adata.concatenate(tabula_senis).write_h5ad("./data/tabula_senis_mca/tabula_senis_mca_normalized_hvg_all.h5ad")

... storing 'FACS.selection' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_age' as categorical
... storing 'tissue_free_annotation' as categorical


In [23]:
subsampled_adata = subsample_selection(tabula_senis, "tissue", 0.5)
subsampled_adata

AnnData object with n_obs × n_vars = 114101 × 5000 
    obs: 'FACS.selection', 'age', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'sex', 'subtissue', 'tissue', 'tissue_free_annotation'
    var: 'n_cells-0', 'n_cells-1', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [24]:
subsampled_adata.concatenate(adata).write_h5ad("./data/tabula_senis_mca/tabula_senis_mca_normalized_hvg_subsampled.h5ad")

... storing 'FACS.selection' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'sample' as categorical
... storing 'sex' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_age' as categorical
... storing 'tissue_free_annotation' as categorical


In [4]:
adata = sc.read("./data/atlases_merged_anno_new.h5ad")
adata

AnnData object with n_obs × n_vars = 133340 × 18756 
    obs: 'barcode', 'batch', 'cell_type', 'cell_type_union', 'channel', 'log_counts', 'marker_gene', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'tissue'
    var: 'gene_ids-1-1'
    layers: 'counts'

In [5]:
# Restore the raw counts as a dense array in `.X`.
# `.toarray()` is the supported spelling of the sparse-matrix `.A` shortcut, which newer
# SciPy releases no longer provide.
adata.X = adata.layers['counts'].toarray()

In [6]:
del adata.layers['counts']

In [7]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [8]:
adata.X.min(), adata.X.max()

(0.0, 8.826167)

In [9]:
adata.obs['age'] = '3m'
# 'method' is the study label with the two Tabula Muris protocols renamed to the
# 'facs' / 'droplet' names used elsewhere in this notebook. Built in one assignment so the
# renaming does not depend on an in-place update of a column pulled out of `.obs`.
adata.obs['method'] = (
    adata.obs['study']
    .replace('TM-smartseq2', 'facs')
    .replace('TM-droplet', 'droplet')
)

In [10]:
# Label every cell by atlas: MCA cells keep 'MCA', everything else is 'Tabula Muris'.
adata.obs['Tech'] = np.where(adata.obs['method'] != 'MCA', 'Tabula Muris', 'MCA')

In [11]:
adata.obs['method_age'] = pd.Categorical(adata.obs['method'].str.cat(adata.obs['age'], sep=' - '))

In [12]:
adata.obs['method'].value_counts()

MCA        72502
droplet    35553
facs       25285
Name: method, dtype: int64

In [13]:
adata.obs['method_age'].value_counts()

MCA - 3m        72502
droplet - 3m    35553
facs - 3m       25285
Name: method_age, dtype: int64

In [14]:
adata.obs['Tech'].value_counts()

MCA             72502
Tabula Muris    60838
Name: Tech, dtype: int64

In [None]:
adata_old = sc.read("./data/tabula_senis_mca/tabula_senis_mca_normalized_hvg_all.h5ad")
adata_old

In [None]:
# Keep only cells that are neither 3-month-old nor from MCA.
is_not_3m = adata_old.obs['age'] != '3m'
is_not_mca = adata_old.obs['method'] != "MCA"
adata_old = adata_old[is_not_3m & is_not_mca]
adata_old

In [None]:
adata_old.obs['Tech'] = 'Tabula Senis'

In [None]:
adata = adata_old.concatenate(adata)
adata

In [None]:
adata.obs['Tech'].value_counts()

In [None]:
adata.obs['method'].value_counts()

In [None]:
adata.obs['age'].value_counts()

In [None]:
adata.obs['method_age'].value_counts()

In [None]:
# Unify the two spellings of the mammary tissue under 'Mammary_Gland'.
# Assigned back instead of inplace=True on an intermediate Series, which is not guaranteed
# to update `adata.obs`.
adata.obs['tissue'] = (
    adata.obs['tissue']
    .replace('Mammary', 'Mammary_Gland')
    .replace('MammaryGland', 'Mammary_Gland')
)

In [None]:
adata.obs['tissue'].value_counts()

In [28]:
adata.write_h5ad("./data/tabula_senis_mca/tabula_senis_mca_new_normalized_hvg_all.h5ad")

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_union' as categorical
... storing 'channel' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'sample' as categorical
... storing 'sample_id' as categorical
... storing 'sex' as categorical
... storing 'study' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_age' as categorical
... storing 'tissue_free_annotation' as categorical


In [29]:
senis = adata[adata.obs['age'] != '3m']
senis

View of AnnData object with n_obs × n_vars = 178679 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [30]:
senis = subsample_selection(senis, 'tissue', 0.6)
senis

AnnData object with n_obs × n_vars = 107202 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [31]:
adata = senis.concatenate(subsample_selection(adata[adata.obs['age'] == '3m'], 'tissue', 0.8))
adata

AnnData object with n_obs × n_vars = 213871 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [32]:
adata.write_h5ad("./data/tabula_senis_mca/tabula_senis_mca_new_normalized_hvg_subsampled.h5ad")

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_union' as categorical
... storing 'channel' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'sample' as categorical
... storing 'sample_id' as categorical
... storing 'sex' as categorical
... storing 'study' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_free_annotation' as categorical


In [16]:
adata_new = sc.read("./data/tabula_senis_mca/tabula_senis_mca_new_normalized_hvg_all.h5ad")
adata_new

AnnData object with n_obs × n_vars = 312019 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [17]:
adata_old = sc.read("./data/tabula_senis_mca/tabula_senis_mca_normalized_hvg_all.h5ad")
adata_old

AnnData object with n_obs × n_vars = 328516 × 5000 
    obs: 'FACS.selection', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'sample', 'sex', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1', 'n_cells-1-1', 'highly_variable-1', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'highly_variable_nbatches-1', 'highly_variable_intersection-1'

In [18]:
adata_new.obs['Tech'].value_counts()

Tabula Senis    178679
MCA              72502
Tabula Muris     60838
Name: Tech, dtype: int64

In [41]:
adata_old.obs['method_age'].value_counts()

MCA - 1m         100309
droplet - 30m     44183
droplet - 18m     30194
droplet - 21m     26236
droplet - 3m      26004
droplet - 24m     24748
facs - 3m         23524
facs - 18m        20739
facs - 24m        17081
droplet - 1m      14770
facs - 21m          728
Name: method_age, dtype: int64

In [21]:
mca_new = adata_new[adata_new.obs['Tech'] == 'MCA']
mca_old = adata_old[adata_old.obs['method'] == 'MCA']

In [22]:
muris_new = adata_new[adata_new.obs['Tech'] == 'Tabula Muris']
muris_old = adata_old[adata_old.obs['age'] == '3m']

In [23]:
# Keep only the part of each cell name before the first "-" so that cells from the new and
# the old object can be matched by name.
# NOTE(review): presumably this strips the batch suffixes appended by `concatenate`; a name
# that itself contains "-" is truncated as well — confirm this cannot merge distinct cells.
mca_new.obs.index = mca_new.obs.index.map(lambda x: x.split("-")[0])
mca_old.obs.index = mca_old.obs.index.map(lambda x: x.split("-")[0])

In [24]:
# Same name clean-up for the Tabula Muris subsets: keep the text before the first "-".
# NOTE(review): names that legitimately contain "-" are truncated too — confirm.
muris_new.obs.index = muris_new.obs.index.map(lambda x: x.split("-")[0])
muris_old.obs.index = muris_old.obs.index.map(lambda x: x.split("-")[0])

In [25]:
mca_new.shape, mca_old.shape

((72502, 5000), (100309, 5000))

In [26]:
muris_new.shape, muris_old.shape

((60838, 5000), (49528, 5000))

In [42]:
# Cell names found in both the new and the old object, obtained by joining the two `.obs`
# tables on the index level called 'index'.
# NOTE(review): this assumes both obs indexes are named 'index'; names that occur more than
# once on either side are presumably multiplied by the join — confirm the lists are as intended.
muris_shared_index = muris_new.obs.merge(muris_old.obs, on='index').index.tolist()
mca_shared_index = mca_new.obs.merge(mca_old.obs, on='index').index.tolist()

In [28]:
muris_shared = muris_old[muris_shared_index]
muris_shared

View of AnnData object with n_obs × n_vars = 23128 × 5000 
    obs: 'FACS.selection', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'free_annotation', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'sample', 'sex', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1', 'n_cells-1-1', 'highly_variable-1', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'highly_variable_nbatches-1', 'highly_variable_intersection-1'

In [29]:
muris_shared.obs['cell_type_union'] = muris_new[muris_shared_index].obs['cell_type_union']

Trying to set attribute `.obs` of view, making a copy.


In [30]:
muris_dict = muris_shared.obs.groupby(['cell_ontology_class', 'cell_type_union']).size().to_dict()

In [31]:
# Map each Tabula Muris `cell_ontology_class` to the `cell_type_union` label that at least
# half of its cells carry.
# Class sizes are counted once up front instead of slicing the AnnData for every pair.
muris_class_sizes = muris_shared.obs['cell_ontology_class'].value_counts()

cell_type_map = {}
for (cell_type_lvl2, cell_type_lvl1), n_cells in muris_dict.items():
    # Pairs that never occur may still be listed with size 0; skip them so an empty class
    # cannot satisfy the ">= half" test with 0 >= 0.
    if n_cells > 0 and n_cells >= 0.5 * muris_class_sizes[cell_type_lvl2]:
        cell_type_map[cell_type_lvl2] = cell_type_lvl1

In [33]:
senis = adata_new[adata_new.obs['Tech'] == 'Tabula Senis']
senis

View of AnnData object with n_obs × n_vars = 178679 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [36]:
# Translate each ontology class through the map; classes without an entry keep their own name.
senis.obs['cell_type_union'] = senis.obs['cell_ontology_class'].map(lambda x: cell_type_map.get(x, x))

In [50]:
mca_shared = mca_new[mca_shared_index]
mca_shared

View of AnnData object with n_obs × n_vars = 71437 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [51]:
mca_dict = mca_shared.obs.groupby(['cell_type', 'cell_type_union']).size().to_dict()

In [54]:
# Map each MCA `cell_type` to the `cell_type_union` label that at least half of its cells
# carry. This must be built from the MCA tables (`mca_dict`, `mca_shared`): the map is
# applied to `mca_old.obs['cell_type']` right after, and the Tabula Muris tables are keyed
# by `cell_ontology_class` instead.
mca_class_sizes = mca_shared.obs['cell_type'].value_counts()

cell_type_map = {}
for (cell_type_lvl2, cell_type_lvl1), n_cells in mca_dict.items():
    # Pairs that never occur may still be listed with size 0; skip them so an empty class
    # cannot satisfy the ">= half" test with 0 >= 0.
    if n_cells > 0 and n_cells >= 0.5 * mca_class_sizes[cell_type_lvl2]:
        cell_type_map[cell_type_lvl2] = cell_type_lvl1

In [67]:
# Translate each MCA cell type through the map; types without an entry keep their own name.
mca_old.obs['cell_type_union'] = mca_old.obs['cell_type'].map(lambda x: cell_type_map.get(x, x))

In [70]:
mca_old.obs['Tech'] = 'MCA'

In [80]:
adata = senis.concatenate(muris_new, mca_new)
adata

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Or pass `index_unique!=None` to `.concatenate`.


AnnData object with n_obs × n_vars = 312019 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [81]:
adata.obs['Tech'].value_counts()

Tabula Senis    178679
MCA              72502
Tabula Muris     60838
Name: Tech, dtype: int64

In [83]:
adata.write_h5ad("./data/tabula_senis_mca/tabula_senis_mca_new_label_consistent_normalized_hvg_all.h5ad")

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_union' as categorical
... storing 'channel' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'sample' as categorical
... storing 'sample_id' as categorical
... storing 'sex' as categorical
... storing 'study' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_free_annotation' as categorical


In [84]:
adata = subsample_selection(senis, 'tissue', 0.4).concatenate(muris_new, mca_new)
adata

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Or pass `index_unique!=None` to `.concatenate`.


AnnData object with n_obs × n_vars = 204808 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [85]:
adata.write_h5ad("./data/tabula_senis_mca/tabula_senis_mca_new_label_consistent_normalized_hvg_subsampled.h5ad")

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_union' as categorical
... storing 'channel' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'sample' as categorical
... storing 'sample_id' as categorical
... storing 'sex' as categorical
... storing 'study' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_free_annotation' as categorical


In [5]:
adata = sc.read("./data/tabula_senis_mca/tabula_senis_mca_new_label_consistent_normalized_hvg_all.h5ad")
adata

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


AnnData object with n_obs × n_vars = 312019 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [7]:
adata = adata[~adata.obs['tissue'].isin(['Mammary_Gland', 'Marrow', 'Muscle'])]

In [8]:
adata

View of AnnData object with n_obs × n_vars = 205890 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [9]:
# Keep only cell types with more than 50 cells.
# Sizes come from a single value_counts() instead of slicing the AnnData once per cell type;
# labels that value_counts() does not list (e.g. missing values) count as 0 and are dropped.
cell_type_counts = adata.obs['cell_type_union'].value_counts()
cell_types_to_keep = [
    cell_type
    for cell_type in adata.obs['cell_type_union'].unique().tolist()
    if cell_type_counts.get(cell_type, 0) > 50
]

In [10]:
adata = adata[adata.obs['cell_type_union'].isin(cell_types_to_keep)]
adata

View of AnnData object with n_obs × n_vars = 205522 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [11]:
adata.obs['Tech'].value_counts()

Tabula Senis    123260
MCA              42049
Tabula Muris     40213
Name: Tech, dtype: int64

In [12]:
subsampled_adata = subsample_selection(adata[adata.obs['Tech'] == 'Tabula Senis'], 'tissue', 0.4)
subsampled_adata

AnnData object with n_obs × n_vars = 49300 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [14]:
subsampled_adata = adata[adata.obs['Tech'] != 'Tabula Senis'].concatenate(subsampled_adata)
subsampled_adata

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Or pass `index_unique!=None` to `.concatenate`.


AnnData object with n_obs × n_vars = 131562 × 5000 
    obs: 'FACS.selection', 'Tech', 'age', 'barcode', 'batch', 'cell', 'cell_ontology_class', 'cell_ontology_id', 'cell_type', 'cell_type_union', 'channel', 'free_annotation', 'log_counts', 'marker_gene', 'method', 'method_age', 'method_tissue', 'mouse.id', 'n_counts', 'n_genes', 'percent_mito', 'sample', 'sample_id', 'sex', 'size_factors', 'study', 'subtissue', 'tissue', 'tissue_age', 'tissue_free_annotation'
    var: 'n_cells-0-1-0', 'n_cells-1-1-0', 'highly_variable-1-0', 'means-1-0', 'dispersions-1-0', 'dispersions_norm-1-0', 'highly_variable_nbatches-1-0', 'highly_variable_intersection-1-0', 'gene_ids-1-1-1'

In [15]:
subsampled_adata.obs['Tech'].value_counts()

Tabula Senis    49300
MCA             42049
Tabula Muris    40213
Name: Tech, dtype: int64

In [16]:
subsampled_adata.write_h5ad("./data/tabula_senis_mca/tabula_senis_mca_new_label_consistent_normalized_hvg_subsampled_notebook.h5ad")

... storing 'FACS.selection' as categorical
... storing 'Tech' as categorical
... storing 'age' as categorical
... storing 'barcode' as categorical
... storing 'cell' as categorical
... storing 'cell_ontology_class' as categorical
... storing 'cell_ontology_id' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_union' as categorical
... storing 'channel' as categorical
... storing 'free_annotation' as categorical
... storing 'marker_gene' as categorical
... storing 'method' as categorical
... storing 'method_age' as categorical
... storing 'method_tissue' as categorical
... storing 'mouse.id' as categorical
... storing 'sample' as categorical
... storing 'sample_id' as categorical
... storing 'sex' as categorical
... storing 'study' as categorical
... storing 'subtissue' as categorical
... storing 'tissue' as categorical
... storing 'tissue_free_annotation' as categorical
