In [1]:
from scvi.dataset import (
    PreFrontalCortexStarmapDataset,
    FrontalCortexDropseqDataset,
    SmfishDataset,
    CortexDataset,
)
from scvi.models import JVAE, Classifier
from scvi.inference import JVAETrainer

import notebooks.utils.gimvi_tutorial as gimvi_utils

ModuleNotFoundError: No module named 'notebooks'

In [2]:
import numpy as np
import copy

In [6]:
test_mode = True

save_path = "../../data"

data_spatial = SmfishDataset(save_path=save_path)
data_seq = CortexDataset(
    save_path=save_path, total_genes=None
)
# make sure gene names have the same case
data_spatial.make_gene_names_lower()
data_seq.make_gene_names_lower()
# filters genes by gene_names
data_seq.filter_genes_by_attribute(data_spatial.gene_names)
if test_mode:
    data_seq = data_spatial

[2020-06-25 12:59:25,908] INFO - scvi.dataset.dataset | File /Users/galen/data/osmFISH_SScortex_mouse_all_cell.loom already downloaded
[2020-06-25 12:59:25,909] INFO - scvi.dataset.smfish | Loading smFISH dataset
[2020-06-25 12:59:25,960] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-06-25 12:59:25,962] INFO - scvi.dataset.dataset | Downsampled from 6471 to 4530 cells
[2020-06-25 12:59:25,964] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-06-25 12:59:25,967] INFO - scvi.dataset.dataset | Remapping batch_indices to [0,N]
[2020-06-25 12:59:25,968] INFO - scvi.dataset.dataset | File /Users/galen/data/expression.bin already downloaded
[2020-06-25 12:59:25,969] INFO - scvi.dataset.cortex | Loading Cortex data
[2020-06-25 12:59:34,107] INFO - scvi.dataset.cortex | Finished preprocessing Cortex data
[2020-06-25 12:59:34,970] INFO - scvi.dataset.dataset | Remapping labels to [0,N]
[2020-06-25 12:59:34,971] INFO - scvi.dataset.dataset | Remappin

In [7]:
data_seq.filter_cells_by_count(1)
data_spatial.filter_cells_by_count(1)

[2020-06-25 12:59:35,278] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-06-25 12:59:35,279] INFO - scvi.dataset.dataset | Downsampled from 4530 to 4530 cells
[2020-06-25 12:59:35,283] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-06-25 12:59:35,285] INFO - scvi.dataset.dataset | Downsampled from 4530 to 4530 cells


In [8]:
train_size = 0.8

gene_names_rnaseq = data_seq.gene_names
np.random.seed(0)
n_genes = len(gene_names_rnaseq)
gene_ids_train = sorted(
    np.random.choice(range(n_genes), int(n_genes * train_size), False)
)
gene_ids_test = sorted(set(range(n_genes)) - set(gene_ids_train))

gene_names_fish = gene_names_rnaseq[gene_ids_train]

# Create copy of the fish dataset with hidden genes
data_spatial_partial = copy.deepcopy(data_spatial)
data_spatial_partial.filter_genes_by_attribute(gene_names_fish)
data_spatial_partial.batch_indices += data_seq.n_batches

[2020-06-25 13:00:06,107] INFO - scvi.dataset.dataset | Downsampling from 33 to 26 genes
[2020-06-25 13:00:06,111] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-06-25 13:00:06,113] INFO - scvi.dataset.dataset | Filtering non-expressing cells.
[2020-06-25 13:00:06,117] INFO - scvi.dataset.dataset | Computing the library size for the new data
[2020-06-25 13:00:06,119] INFO - scvi.dataset.dataset | Downsampled from 4530 to 4530 cells


In [9]:
datasets = [data_seq, data_spatial_partial]
generative_distributions = ["zinb", "nb"]
gene_mappings = [slice(None), np.array(gene_ids_train)]
n_inputs = [d.nb_genes for d in datasets]
total_genes = data_seq.nb_genes
n_batches = sum([d.n_batches for d in datasets])

model_library_size = [True, False]

n_latent = 8
kappa = 1

In [12]:
seq_adata = data_seq.to_anndata()



In [13]:
spatial_adata = data_spatial_partial.to_anndata()

In [16]:
seq_adata

AnnData object with n_obs × n_vars = 4530 × 33
    obs: 'x_coord', 'batch_indices', 'cell_types', 'y_coord'
    uns: 'cell_measurements_col_mappings'

In [15]:
from scvi.dataset.utils import setup_anndata

In [17]:
setup_anndata(seq_adata, 'batch_indices')

[2020-06-25 13:07:13,124] INFO - scvi.dataset.utils | Using data from adata.X
[2020-06-25 13:07:13,125] INFO - scvi.dataset.utils | Using batches from adata.obs["batch_indices"]
[2020-06-25 13:07:13,126] INFO - scvi.dataset.utils | No label_key inputted, assuming all cells have same label
[2020-06-25 13:07:13,138] INFO - scvi.dataset.utils | Computing library size prior per batch
[2020-06-25 13:07:13,149] INFO - scvi.dataset.utils | Successfully registered anndata object containing 4530 cells, 33 genes, and 1 batches 
Registered keys:['X', 'batch_indices', 'local_l_mean', 'local_l_var', 'labels']


In [18]:
spatial_adata

AnnData object with n_obs × n_vars = 4530 × 26
    obs: 'batch_indices', 'x_coord', 'cell_types', 'y_coord'
    uns: 'cell_measurements_col_mappings'

In [10]:
import torch

torch.manual_seed(0)

model = JVAE(
    n_inputs,
    total_genes,
    gene_mappings,
    generative_distributions,
    model_library_size,
    n_layers_decoder_individual=0,
    n_layers_decoder_shared=0,
    n_layers_encoder_individual=1,
    n_layers_encoder_shared=1,
    dim_hidden_encoder=64,
    dim_hidden_decoder_shared=64,
    dropout_rate_encoder=0.2,
    dropout_rate_decoder=0.2,
    n_batch=n_batches,
    n_latent=n_latent,
)

discriminator = Classifier(n_latent, 32, 2, 3, logits=True)

trainer = JVAETrainer(model, discriminator, datasets, 0.95, frequency=1, kappa=kappa)

AttributeError: 'SmfishDataset' object has no attribute 'uns_keys'

In [11]:
n_epochs = if_not_test_else(200, 1)
trainer.train(n_epochs=n_epochs)

NameError: name 'if_not_test_else' is not defined