In [1]:
%load_ext autoreload
%autoreload 2
%aimport anndata
%aimport os
%aimport torch
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
from typing import Dict, Union, List, Tuple
from torch.utils.data import Dataset
# Control UMAP numba warnings
import warnings; warnings.simplefilter('ignore')
import copy
import scvi
from scvi.dataset import PurifiedPBMCDataset
from scvi.models.scanvi import SCANVI

from scvi.inference import UnsupervisedTrainer, load_posterior, SemiSupervisedTrainer
from scvi import set_seed
from scvi.dataset.utils import setup_anndata

# Sets torch and numpy random seeds, run after all scvi imports
set_seed(0)


  data = yaml.load(f.read()) or {}


In [3]:
use_cuda = False

cell_types = ["regulatory_t", "naive_t", "memory_t",  "naive_cytotoxic"]
cell_types = [1,2,3,5]
#cell_types should probably be a list of str instead of ints
pbmc = PurifiedPBMCDataset(subset_datasets=cell_types)
import scanpy
subset = ["CD4", "FOXP3", "TNFRSF18", "IL2RA", "CTLA4", "CD44", "TCF7", "CD8B", "CCR7", "CD69", "PTPRC", "S100A4"]

adata = pbmc.to_anndata()
adata.var_names_make_unique()

#where to subset
# adata = adata[:,subset]

#target sum follows what chenling uses
scanpy.pp.normalize_total(adata, target_sum = 1e4) 
scanpy.pp.log1p(adata)
#corresponding chenling code
#expression = np.log(1 + 1e4 * expression[:, idx] / np.sum(expression, axis=1)[:, np.newaxis])

#second subset option
adata = adata[:,subset]

#scale data to mean 0, std 1
scanpy.pp.scale(adata)

#chenling code:
# expression = expression - np.mean(expression, axis=0)
# expression = expression / np.std(expression, axis=0)

#make sure no inf values
assert len(np.where(np.isinf(adata.X))[0]) == 0
#get the score for a set of genes
def get_score(normalized_adata, gene_set):
    score = np.zeros(normalized_adata.n_obs)
    for gene in gene_set['positive']:
        expression = np.array(adata[:, gene].X)
        score = np.add(score, expression.flatten())
    for gene in gene_set['negative']:
        expression = np.array(adata[:, gene].X)
        score = np.subtract(score, expression.flatten())
    return score

#get a mask over the genes
def get_gene_mask(normalized_adata, gene_set):
    score = get_score(normalized_adata, gene_set)
    cell_idx = score.argsort()[-50:]
    mask = np.zeros(normalized_adata.n_obs)
    mask[cell_idx] = 1
    return mask.astype(bool)

cd4_reg_geneset = {"positive":["TNFRSF18", "CTLA4", "FOXP3", "IL2RA"],
                   "negative":["S100A4" ,"PTPRC" ,"CD8B"]}

cd8_naive_geneset = {"positive":["CD8B", "CCR7"],
                   "negative":["CD4"]}

cd4_naive_geneset = {"positive":["CCR7","CD4"],
                   "negative":["S100A4", "PTPRC", "FOXP3", "IL2RA", "CD69" ]}

cd4_mem_geneset = {"positive":["S100A4"],
                   "negative":["IL2RA" ,"FOXP3","TNFRSF18", "CCR7"]}

cd4_reg_mask = get_gene_mask(adata, cd4_reg_geneset,) #38, 4
cd8_naive_mask = get_gene_mask(adata, cd8_naive_geneset,) #100, 0
cd4_naive_mask = get_gene_mask(adata, cd4_naive_geneset,)#72, 0 
cd4_mem_mask = get_gene_mask(adata, cd4_mem_geneset,)#100, 0 
full_mask = cd4_reg_mask | cd8_naive_mask | cd4_naive_mask | cd4_mem_mask
indicator_lab = np.zeros(len(pbmc))
indicator_lab[full_mask] = 1


[2020-06-23 22:41:04,629] INFO - scvi.dataset.dataset | File /Users/galen/scVI/galen/data/regulatory_t/filtered_gene_bc_matrices.tar.gz already downloaded
[2020-06-23 22:41:04,631] INFO - scvi.dataset.dataset10X | Preprocessing dataset
[2020-06-23 22:41:04,634] INFO - scvi.dataset.dataset10X | Extracting tar file
[2020-06-23 22:41:13,213] INFO - scvi.dataset.dataset10X | Finished preprocessing dataset
[2020-06-23 22:41:13,319] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-06-23 22:41:13,320] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-06-23 22:41:13,458] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-06-23 22:41:13,535] INFO - scvi.dataset.dataset | Downsampled from 10263 to 10263 cells
[2020-06-23 22:41:13,560] INFO - scvi.dataset.dataset | File /Users/galen/scVI/galen/data/naive_t/filtered_gene_bc_matrices.tar.gz already downloaded
[2020-06-23 22:41:13,561] INFO - scvi.dataset.dataset10X | Preprocessing dataset

Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [4]:
raw_adata = pbmc.to_anndata()
raw_adata.obs['scanvi_labeled_mask'] = indicator_lab
setup_anndata(raw_adata, batch_key = 'batch_indices', scanvi_labeled_idx_key = 'scanvi_labeled_mask', labels_key = 'cell_types')


Transforming to str index.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


[2020-06-23 22:51:54,842] INFO - scvi.dataset.utils | Using data from adata.X
[2020-06-23 22:51:54,842] INFO - scvi.dataset.utils | Using batches from adata.obs["batch_indices"]
[2020-06-23 22:51:54,844] INFO - scvi.dataset.utils | Using labels from adata.obs["cell_types"]
[2020-06-23 22:51:54,856] INFO - scvi.dataset.utils | Computing library size prior per batch
[2020-06-23 22:51:55,152] INFO - scvi.dataset.utils | Successfully registered anndata object containing 42919 cells, 18443 genes, and 4 batches 
Registered keys:['X', 'batch_indices', 'local_l_mean', 'local_l_var', 'labels', 'scanvi_labeled_idx']


In [5]:
from scvi.dataset.utils import get_from_registry

In [14]:
# %timeit get_from_registry(raw_adata, 'X') # 1.2 us 
# %timeit get_from_registry(raw_adata, 'X')[10] #74 us 
# %timeit get_from_registry(raw_adata, 'X')[10].toarray() #87.4 us
# %timeit get_from_registry(raw_adata, 'X')[10].toarray().flatten() #90.5 us
# %timeit get_from_registry(raw_adata, 'X')[10].toarray().flatten() #90.5 us
# %timeit get_from_registry(raw_adata, 'X')[10].toarray().flatten().astype(np.float32) #99.7 us
# %timeit torch.from_numpy(get_from_registry(raw_adata, 'X')[10].toarray().flatten().astype(np.float32)) #105 us



105 µs ± 3.23 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [28]:
csr = raw_adata.X

In [31]:
# %timeit csr[10] #60us
%timeit torch.from_numpy(csr[10].toarray().flatten().astype(np.float32))

86.7 µs ± 1.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [35]:
raw_adata.shape

(42919, 18443)

In [33]:
%%time
for idx in range(n_cells):
    attributes_and_types = {'X': np.float32}
    [csr[idx].toarray().flatten().astype(dtype) for key, dtype in attributes_and_types.items()]

#     data_numpy = {
#                 key: get_from_registry(raw_adata, key)[idx].astype(dtype)
#                 if isinstance(get_from_registry(raw_adata, key), np.ndarray)
#                 else get_from_registry(raw_adata, key)[idx]
#                 .toarray()
#                 .flatten()
#                 .astype(dtype)
#                 for key, dtype in attributes_and_types.items()
#                 }


CPU times: user 3.5 s, sys: 49.9 ms, total: 3.55 s
Wall time: 3.56 s


In [27]:
get_from_registry(raw_adata, key)[idx]

NameError: name 'key' is not defined