In [115]:
import scanpy as sc
import surgeon
import numpy as np
sc.settings.set_figure_params(dpi=200, frameon=False)

In [116]:
condition_key = "Method"
cell_type_key = 'CellType'
target_condition = "Drop-seq"

In [117]:
adata_broad = sc.read("./data/PBMC/broad_pbmc_count.h5ad")
adata_pbmc_bbknn = sc.read("./data/PBMC/bbknn_pbmc_raw.h5ad")
adata_pbmc_68k = sc.read("./data/PBMC/pbmc_68k.h5ad")
adata_small_3k = sc.read("./data/PBMC/pbmc3k.h5ad")

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [118]:
adata_small_3k.obs[cell_type_key] = adata_small_3k.obs['louvain']
adata_small_3k.obs[cell_type_key].value_counts()

CD4 T                1153
CD14+ Monocytes       480
B                     341
CD8 T                 303
NK                    157
FCGR3A+ Monocytes     153
Dendritic              36
Megakaryocytes         15
Name: CellType, dtype: int64

In [119]:
new_categories = ['CD4+T', 'CD14+Monocyte', 'B', 'CD8+T', 'NK', 'FCGR3A+Monocytes',
       'Dendritic', 'Megakaryocyte']
adata_small_3k.rename_categories(cell_type_key, new_categories)
adata_small_3k.obs[cell_type_key].value_counts()

CD4+T               1153
CD14+Monocyte        480
B                    341
CD8+T                303
NK                   157
FCGR3A+Monocytes     153
Dendritic             36
Megakaryocyte         15
Name: CellType, dtype: int64

In [120]:
adata_small_3k.obs[condition_key] = 'Small 3K'

In [121]:
adata_broad.obs[cell_type_key].value_counts()

Cytotoxic T cell               6094
CD4+ T cell                    4867
CD14+ monocyte                 3501
B cell                         3269
Natural killer cell            1147
CD16+ monocyte                  653
Megakaryocyte                   634
Dendritic cell                  282
Plasmacytoid dendritic cell     108
Name: CellType, dtype: int64

In [122]:
new_categories = ['B', 'CD4+T', 'CD14+Monocyte', 'CD16+Monocyte',
       'Cytotoxic+T', 'Dendritic', 'Megakaryocyte',
       'NK', 'Plasmacytoid dendritic']
adata_broad.rename_categories(cell_type_key, new_categories)
adata_broad.obs[cell_type_key].value_counts()

Cytotoxic+T               6094
CD4+T                     4867
CD14+Monocyte             3501
B                         3269
NK                        1147
CD16+Monocyte              653
Megakaryocyte              634
Dendritic                  282
Plasmacytoid dendritic     108
Name: CellType, dtype: int64

In [123]:
adata_pbmc_68k.obs[cell_type_key] = adata_pbmc_68k.obs['bulk_labels']
adata_pbmc_68k.obs[cell_type_key].value_counts()

CD8+/CD45RA+ Naive Cytotoxic    21975
CD4+/CD25 T Reg                 14112
CD8+ Cytotoxic T                11445
CD56+ NK                         5859
CD19+ B                          3817
CD14+ Monocyte                   3306
CD4+/CD45RO+ Memory              3126
CD4+/CD45RA+/CD25- Naive T       2793
Dendritic                        1865
CD34+                             262
CD4+ T Helper2                     19
Name: CellType, dtype: int64

In [124]:
new_categories = ['CD4+T', 'CD4+/CD25+T', 'CD4+/CD45RA+/CD25-Naive T',
       'CD4+/CD45RO+Memory', 'CD8+Cytotoxic T',
       'CD8+/CD45RA+Naive Cytotoxic', 'CD14+Monocyte', 'CD19+B', 'CD34+',
       'CD56+NK', 'Dendritic']
adata_pbmc_68k.rename_categories(cell_type_key, new_categories)
adata_pbmc_68k.obs[cell_type_key].value_counts()

CD8+/CD45RA+Naive Cytotoxic    21975
CD4+/CD25+T                    14112
CD8+Cytotoxic T                11445
CD56+NK                         5859
CD19+B                          3817
CD14+Monocyte                   3306
CD4+/CD45RO+Memory              3126
CD4+/CD45RA+/CD25-Naive T       2793
Dendritic                       1865
CD34+                            262
CD4+T                             19
Name: CellType, dtype: int64

In [125]:
adata_pbmc_68k.obs[condition_key] = '68K'

In [126]:
adata_small_3k = sc.AnnData(adata_small_3k.raw.X, var=adata_small_3k.raw.var, obs=adata_small_3k.obs) 

In [127]:
adata_broad.X.min(), adata_broad.X.max() # Count

(0.0, 157681.0)

In [128]:
adata_pbmc_bbknn.X.min(), adata_pbmc_bbknn.X.max() # Normalized

(0.0, 8.5225315)

In [129]:
adata_pbmc_68k.X.min(), adata_pbmc_68k.X.max() # Count

(0.0, 260.0)

In [130]:
adata_small_3k.X.min(), adata_small_3k.X.max() # Normalized

(0.0, 7.4695992)

## Normalize Broad & 68K adata

In [131]:
sc.pp.normalize_per_cell(adata_broad)
sc.pp.log1p(adata_broad)

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


In [132]:
sc.pp.normalize_per_cell(adata_pbmc_68k)
sc.pp.log1p(adata_pbmc_68k)

In [133]:
adata_broad.X.min(), adata_broad.X.max() 

(0.0, 9.3747)

In [134]:
adata_pbmc_68k.X.min(), adata_pbmc_68k.X.max()

(0.0, 5.483166)

In [135]:
adata_pbmc_68k.X = adata_pbmc_68k.X.A

In [136]:
adata_small_3k.X = adata_small_3k.X.A

In [137]:
pbmc = adata_broad.concatenate(adata_pbmc_68k, adata_small_3k)
pbmc

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
Or pass `index_unique!=None` to `.concatenate`.


AnnData object with n_obs × n_vars = 91772 × 13157 
    obs: 'CBC', 'CellType', 'Experiment', 'Method', 'batch', 'bulk_labels', 'louvain', 'n_counts', 'n_genes', 'percent_mito'
    var: 'ENS-0', 'Name-0', 'n_counts-1', 'gene_ids-2', 'n_cells-2'

In [138]:
pbmc.obs[cell_type_key].value_counts()

CD8+/CD45RA+Naive Cytotoxic    21975
CD4+/CD25+T                    14112
CD8+Cytotoxic T                11445
CD14+Monocyte                   7287
Cytotoxic+T                     6094
CD4+T                           6039
CD56+NK                         5859
CD19+B                          3817
B                               3610
CD4+/CD45RO+Memory              3126
CD4+/CD45RA+/CD25-Naive T       2793
Dendritic                       2183
NK                              1304
CD16+Monocyte                    653
Megakaryocyte                    649
CD8+T                            303
CD34+                            262
FCGR3A+Monocytes                 153
Plasmacytoid dendritic           108
Name: CellType, dtype: int64

In [139]:
pbmc.obs[condition_key].value_counts()

68K                  68579
Drop-seq              6584
inDrops               6584
10x Chromium V2 A     3222
10x Chromium V2 B     3222
Small 3K              2638
Smart-seq2             526
CEL-Seq2               417
Name: Method, dtype: int64

## Select Highly Variable Genes

In [140]:
sc.pp.highly_variable_genes(pbmc, n_top_genes=2000)

In [141]:
pbmc = pbmc[:, pbmc.var['highly_variable']]
pbmc

View of AnnData object with n_obs × n_vars = 91772 × 2000 
    obs: 'CBC', 'CellType', 'Experiment', 'Method', 'batch', 'bulk_labels', 'louvain', 'n_counts', 'n_genes', 'percent_mito'
    var: 'ENS-0', 'Name-0', 'n_counts-1', 'gene_ids-2', 'n_cells-2', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'

In [142]:
pbmc.X.min(), pbmc.X.max()

(ArrayView(0., dtype=float32), ArrayView(9.3747, dtype=float32))

In [143]:
pbmc.obs[condition_key].value_counts()

68K                  68579
Drop-seq              6584
inDrops               6584
10x Chromium V2 A     3222
10x Chromium V2 B     3222
Small 3K              2638
Smart-seq2             526
CEL-Seq2               417
Name: Method, dtype: int64

In [144]:
pbmc.obs[cell_type_key].value_counts()

CD8+/CD45RA+Naive Cytotoxic    21975
CD4+/CD25+T                    14112
CD8+Cytotoxic T                11445
CD14+Monocyte                   7287
Cytotoxic+T                     6094
CD4+T                           6039
CD56+NK                         5859
CD19+B                          3817
B                               3610
CD4+/CD45RO+Memory              3126
CD4+/CD45RA+/CD25-Naive T       2793
Dendritic                       2183
NK                              1304
CD16+Monocyte                    653
Megakaryocyte                    649
CD8+T                            303
CD34+                            262
FCGR3A+Monocytes                 153
Plasmacytoid dendritic           108
Name: CellType, dtype: int64

In [145]:
pbmc.obs_names_make_unique()

In [146]:
pbmc.write_h5ad("./data/pbmc/pbmc.h5ad")

Trying to set attribute `.obs` of view, making a copy.
... storing 'CBC' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'CellType' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'Experiment' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'Method' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'bulk_labels' as categorical
Trying to set attribute `.obs` of view, making a copy.
... storing 'louvain' as categorical
