In [1]:
import scgen
import scanpy as sc
import pandas as pd
import numpy as np
from metrics import compute_entropy, silhouette_coeff_ASW

sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=100)
sc.settings.set_figure_params(dpi_save=300)
sc.logging.print_version_and_date()

Running Scanpy 1.7.2, on 2021-06-15 18:16.


In [2]:
train_scgen = sc.read_h5ad('data/HCA.h5ad')
train_scgen.obs['batch'] = train_scgen.obs['tissue']

celltypes = train_scgen.obs.celltype.tolist()
consider_celltypes = ['CLP', 'CMP', 'DC', 'Erythrocytes', 'Fibroblasts', 'GMP', 'HSC', 'Macrophages', 
                      'Megakaryocytes', 'Memory.B.cells', 'MEP',
                      'Mono', 'MPP', 'Myeloid.progenitor', 'naive.B.cells', 'PBMC_cd4mem', 'PBMC_cd4naive', 'PBMC_cd8mem',
                      'PBMC_cd8naive', 'PBMC_nk', 'PBMC_treg', 'Plasma.cells']
celltypes=['DC' if i.startswith('DC') else i for i in celltypes]
celltypes=['Mono' if i.startswith('Mono') else i for i in celltypes]
celltypes=['remove' if i not in consider_celltypes else i for i in celltypes]
train_scgen.obs['celltype'] = celltypes
train_scgen = train_scgen[train_scgen.obs.celltype != 'remove']

sc.pp.normalize_total(train_scgen)
sc.pp.log1p(train_scgen)
sc.pp.highly_variable_genes(train_scgen, n_top_genes=7000)
train_scgen = train_scgen[:,train_scgen.var.highly_variable]

  view_to_actual(adata)
normalizing counts per cell
    finished (0:00:02)
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
    finished (0:00:11)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)


In [3]:
train_scgen.obs["cell_type"] = train_scgen.obs["celltype"].tolist()

Trying to set attribute `.obs` of view, copying.


In [4]:
train_scgen = scgen.setup_anndata(train_scgen, batch_key="batch", labels_key="cell_type",  copy=True)

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"batch"[0m[1m][0m                                               
[34mINFO    [0m Using labels from adata.obs[1m[[0m[32m"cell_type"[0m[1m][0m                                            
[34mINFO    [0m Using data from adata.X                                                             
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m528014[0m cells, [1;36m7000[0m vars, [1;36m2[0m        
         batches, [1;36m22[0m labels, and [1;36m0[0m proteins. Also registered [1;36m0[0m extra categorical covariates  
         and [1;36m0[0m extra continuous covariates.                                                  
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [5]:
model = scgen.SCGEN(train_scgen)

  "Make sure the registered X field in anndata contains unnormalized count data."


In [6]:
model.train(
    max_epochs=100,
    batch_size=1024,
    early_stopping=True,
    early_stopping_patience=25,
    use_gpu=True
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Epoch 31/100:  31%|███       | 31/100 [28:21<1:03:07, 54.90s/it, loss=32.6, v_num=1]


In [8]:
adata_scgen = model.batch_removal()

RuntimeError: CUDA out of memory. Tried to allocate 13.77 GiB (GPU 0; 10.92 GiB total capacity; 3.73 GiB already allocated; 6.43 GiB free; 3.76 GiB reserved in total by PyTorch)

In [7]:
adata_scgen = model.batch_removal()

adata_scgen.obsm['X_pca'] = adata_scgen.obsm['corrected_latent']
sc.pp.neighbors(adata_scgen)
sc.tl.umap(adata_scgen)

  "Make sure the registered X field in anndata contains unnormalized count data."


RuntimeError: CUDA out of memory. Tried to allocate 13.77 GiB (GPU 0; 10.92 GiB total capacity; 1.96 GiB already allocated; 8.20 GiB free; 1.99 GiB reserved in total by PyTorch)

In [None]:
sc.pl.umap(adata_scgen,color='batch',save='_figure6_scgen_batch_1024.pdf')
sc.pl.umap(adata_scgen,color='celltype',save='_figure6_scgen_celltype_1024.pdf')

In [None]:
asw_scgen = silhouette_coeff_ASW(adata_scgen)
entropy_scgen = compute_entropy(adata_scgen)

In [None]:
asw_scgen.to_csv('asw_scgen_fig6_1024.csv',index=0)
entropy_scgen.to_csv('entropy_scgen_fig6_1024.csv',index=0)