In [37]:
%load_ext autoreload
%autoreload 2
%aimport anndata
%aimport os
%aimport torch
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from typing import Dict, Union, List, Tuple
from torch.utils.data import Dataset
# Control UMAP numba warnings
import warnings; warnings.simplefilter('ignore')
import copy
import scvi
from scvi.dataset import PurifiedPBMCDataset
from scvi.models.scanvi import SCANVI

from scvi.inference import UnsupervisedTrainer, load_posterior, SemiSupervisedTrainer
from scvi import set_seed
from scvi.dataset.utils import setup_anndata

# Sets torch and numpy random seeds, run after all scvi imports
set_seed(0)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
use_cuda = False

cell_types = ["regulatory_t", "naive_t", "memory_t",  "naive_cytotoxic"]
cell_types = [1,2,3,5]
#cell_types should probably be a list of str instead of ints
pbmc = PurifiedPBMCDataset(subset_datasets=cell_types)

[2020-06-23 23:13:43,872] INFO - scvi.dataset.dataset | File /Users/galen/scVI/galen/data/regulatory_t/filtered_gene_bc_matrices.tar.gz already downloaded
[2020-06-23 23:13:43,873] INFO - scvi.dataset.dataset10X | Preprocessing dataset
[2020-06-23 23:13:43,876] INFO - scvi.dataset.dataset10X | Extracting tar file
[2020-06-23 23:13:52,738] INFO - scvi.dataset.dataset10X | Finished preprocessing dataset
[2020-06-23 23:13:52,792] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-06-23 23:13:52,793] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-06-23 23:13:52,903] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-06-23 23:13:52,944] INFO - scvi.dataset.dataset | Downsampled from 10263 to 10263 cells
[2020-06-23 23:13:52,968] INFO - scvi.dataset.dataset | File /Users/galen/scVI/galen/data/naive_t/filtered_gene_bc_matrices.tar.gz already downloaded
[2020-06-23 23:13:52,969] INFO - scvi.dataset.dataset10X | Preprocessing dataset

In [4]:
import scanpy
subset = ["CD4", "FOXP3", "TNFRSF18", "IL2RA", "CTLA4", "CD44", "TCF7", "CD8B", "CCR7", "CD69", "PTPRC", "S100A4"]

adata = pbmc.to_anndata()
adata.var_names_make_unique()

#where to subset
# adata = adata[:,subset]

#target sum follows what chenling uses
scanpy.pp.normalize_total(adata, target_sum = 1e4) 
scanpy.pp.log1p(adata)
#corresponding chenling code
#expression = np.log(1 + 1e4 * expression[:, idx] / np.sum(expression, axis=1)[:, np.newaxis])

#second subset option
adata = adata[:,subset]

#scale data to mean 0, std 1
scanpy.pp.scale(adata)

#chenling code:
# expression = expression - np.mean(expression, axis=0)
# expression = expression / np.std(expression, axis=0)

#make sure no inf values
assert len(np.where(np.isinf(adata.X))[0]) == 0


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [5]:
#get the score for a set of genes
def get_score(normalized_adata, gene_set):
    score = np.zeros(normalized_adata.n_obs)
    for gene in gene_set['positive']:
        expression = np.array(adata[:, gene].X)
        score = np.add(score, expression.flatten())
    for gene in gene_set['negative']:
        expression = np.array(adata[:, gene].X)
        score = np.subtract(score, expression.flatten())
    return score

#get a mask over the genes
def get_gene_mask(normalized_adata, gene_set):
    score = get_score(normalized_adata, gene_set)
    cell_idx = score.argsort()[-50:]
    mask = np.zeros(normalized_adata.n_obs)
    mask[cell_idx] = 1
    return mask.astype(bool)

    

In [6]:
cd4_reg_geneset = {"positive":["TNFRSF18", "CTLA4", "FOXP3", "IL2RA"],
                   "negative":["S100A4" ,"PTPRC" ,"CD8B"]}

cd8_naive_geneset = {"positive":["CD8B", "CCR7"],
                   "negative":["CD4"]}

cd4_naive_geneset = {"positive":["CCR7","CD4"],
                   "negative":["S100A4", "PTPRC", "FOXP3", "IL2RA", "CD69" ]}

cd4_mem_geneset = {"positive":["S100A4"],
                   "negative":["IL2RA" ,"FOXP3","TNFRSF18", "CCR7"]}

cd4_reg_mask = get_gene_mask(adata, cd4_reg_geneset,) #38, 4
cd8_naive_mask = get_gene_mask(adata, cd8_naive_geneset,) #100, 0
cd4_naive_mask = get_gene_mask(adata, cd4_naive_geneset,)#72, 0 
cd4_mem_mask = get_gene_mask(adata, cd4_mem_geneset,)#100, 0 
full_mask = cd4_reg_mask | cd8_naive_mask | cd4_naive_mask | cd4_mem_mask
indicator_lab = np.zeros(len(pbmc))
indicator_lab[full_mask] = 1

In [30]:
np.sum(indicator_lab)

200.0

In [29]:
gene_dataset = pbmc

#indicator mask
full_mask = cd4_reg_mask | cd8_naive_mask | cd4_naive_mask | cd4_mem_mask
indicator_lab = np.zeros(len(gene_dataset))
indicator_lab[full_mask] = 1

scanvi = SCANVI(gene_dataset.nb_genes,n_batch = 0, n_labels=6, n_layers=2, classifier_parameters = {'dropout_rate':0.2, 'n_hidden':256, 'n_layers':2})
trainer_scanvi = SemiSupervisedTrainer(scanvi, gene_dataset, classification_ratio=100,
                                       n_epochs_classifier=1, lr_classification=5 * 1e-3,  frequency=5)

trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(indicator_lab == 1))
trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(indicator_lab == 0))
trainer_scanvi.labelled_set.to_monitor = ['reconstruction_error', 'accuracy']
trainer_scanvi.unlabelled_set.to_monitor = ['reconstruction_error', 'accuracy']

trainer_scanvi.train(n_epochs=1)
trainer_scanvi.model.eval()

full_scanvi = trainer_scanvi.create_posterior(trainer_scanvi.model, gene_dataset, indices=np.arange(len(gene_dataset)))




In [32]:
gene_dataset


GeneExpressionDataset object with n_cells x nb_genes = 42919 x 18443
    gene_attribute_names: 'gene_names'
    cell_attribute_names: 'batch_indices', 'local_means', 'local_vars', 'labels', 'barcodes'
    cell_categorical_attribute_names: 'batch_indices', 'labels'

In [35]:
import time
a = time.time()
i = 0
for a, b in enumerate(full_scanvi):
    i+=1
#     print(b)
b = time.time()
print(b-a)
print(i)

AttributeError: 'tuple' object has no attribute 'items'

In [39]:
raw_adata = pbmc.to_anndata()
raw_adata.obs['scanvi_labeled_mask'] = indicator_lab

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [64]:
import pandas as pd
isinstance(raw_adata.X, pd.Series)

False

In [46]:
raw_adata.X = raw_adata.X.toarray()

In [79]:
%timeit raw_adata.X[10].toarray()


74.9 µs ± 330 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [70]:
raw_adata.layers['rawX'] = raw_adata.X.toarray()

In [73]:
%timeit raw_adata.layers['rawX'][10].flatten().astype(np.float32)

6.72 µs ± 48.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [48]:
from sys import getsizeof
getsizeof(raw_adata.X)

3166220580

In [40]:
setup_anndata(raw_adata, batch_key = 'batch_indices', scanvi_labeled_idx_key = 'scanvi_labeled_mask', labels_key = 'cell_types')



[2020-06-23 23:57:21,444] INFO - scvi.dataset.utils | Using data from adata.X
[2020-06-23 23:57:21,445] INFO - scvi.dataset.utils | Using batches from adata.obs["batch_indices"]
[2020-06-23 23:57:21,446] INFO - scvi.dataset.utils | Using labels from adata.obs["cell_types"]
[2020-06-23 23:57:21,451] INFO - scvi.dataset.utils | Computing library size prior per batch
[2020-06-23 23:57:21,662] INFO - scvi.dataset.utils | Successfully registered anndata object containing 42919 cells, 18443 genes, and 4 batches 
Registered keys:['X', 'batch_indices', 'local_l_mean', 'local_l_var', 'labels', 'scanvi_labeled_idx']


In [58]:
raw_adata.X.shape

(42919, 18443)

In [42]:
scanvi = SCANVI(raw_adata.uns['scvi_summary_stats']['n_genes'], n_batch = 0, n_labels=6, n_layers=2, classifier_parameters = {'dropout_rate':0.2, 'n_hidden':256, 'n_layers':2})
trainer_scanvi = SemiSupervisedTrainer(scanvi, raw_adata, classification_ratio=100,
                                       n_epochs_classifier=1, lr_classification=5 * 1e-3,  frequency=5)

trainer_scanvi.labelled_set = trainer_scanvi.create_posterior(indices=(indicator_lab == 1))
trainer_scanvi.unlabelled_set = trainer_scanvi.create_posterior(indices=(indicator_lab == 0))
trainer_scanvi.labelled_set.to_monitor = ['reconstruction_error', 'accuracy']
trainer_scanvi.unlabelled_set.to_monitor = ['reconstruction_error', 'accuracy']

# trainer_scanvi.train(n_epochs=1)
trainer_scanvi.model.eval()

full_scanvi = trainer_scanvi.create_posterior(trainer_scanvi.model, raw_adata, )
# latent_scanvi, batch_indices, labels = full_scanvi.sequential().get_latent()


ValueError: too many values to unpack (expected 5)

In [45]:
import time
a = time.time()
i = 0
for e, d in enumerate(full_scanvi):
    i+=1
#     print(b)
b = time.time()
print(b-a)
print(i)

20.95548391342163
336


In [44]:
print(i)

336


In [75]:
scanvi = SCANVI(raw_adata.uns['scvi_summary_stats']['n_genes'], n_batch = 0, n_labels=6, n_layers=2, classifier_parameters = {'dropout_rate':0.2, 'n_hidden':256, 'n_layers':2})
trainer_scanvi = SemiSupervisedTrainer(scanvi, raw_adata, classification_ratio=100,
                                       n_epochs_classifier=1, lr_classification=5 * 1e-3,  frequency=5)

full = trainer_scanvi.create_posterior()


KeyError: 'scvi_summary_stats'

In [74]:
import time
a = time.time()
for d, e in enumerate(full):
    pass
b = time.time()
b-a

18.18271803855896

In [None]:
#time for regular: 1 sec
#csr get_from_registry(self.adata, key)[idx]
            .toarray() 22.8 sec
#from dense 14 sec

In [31]:
raw_adata.X = raw_adata.X.toarray()


In [37]:
b


1592954082.626915

In [38]:
a

335