In [26]:
import sys

# Install scvi-tools, but only when running inside Google Colab.
# If branch is "stable" the released package is installed from PyPI;
# any other value installs from source at that git branch.
branch = "stable"
# True when the google.colab module is already loaded in this interpreter.
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB and branch == "stable":
    !pip install --quiet scvi-tools[tutorials]
elif IN_COLAB and branch != "stable":
    # NOTE(review): jsonschema is upgraded first, presumably to avoid a dependency conflict -- confirm.
    !pip install --quiet --upgrade jsonschema
    # `$branch` is expanded by IPython from the Python variable defined above.
    !pip install --quiet git+https://github.com/yoseflab/scvi-tools@$branch#egg=scvi-tools[tutorials]
# Outside Colab nothing is installed here.
# NOTE(review): no version is pinned, so a Colab run may pull a different scvi-tools release each time.

In [27]:
def Barplot(which_var, adata, var='clusters', height=3, color = False):
    """Draw a horizontal stacked bar chart of the composition of each `var` group.

    For every value of ``adata.obs[var]`` one bar is drawn; its segments show
    the percentage of cells in that group falling into each value of
    ``adata.obs[which_var]`` (each bar sums to 100 %).

    Parameters
    ----------
    which_var : str
        Column of ``adata.obs`` whose values become the stacked segments.
    adata : object exposing an ``obs`` DataFrame (e.g. an AnnData object)
        Source of the two annotation columns.
    var : str, default 'clusters'
        Column of ``adata.obs`` that defines the bars.
    height : number, default 3
        Figure height passed to ``figsize``; the width is fixed at 6.
    color : optional
        Colours forwarded to the pandas plotting call. Any falsy value
        (the default ``False``) keeps the default colours.

    Returns
    -------
    None
        The figure is produced as a side effect.
    """
    # Row-normalised contingency table, expressed in percent.
    plotdata = pd.crosstab(adata.obs[var], adata.obs[which_var], normalize='index') * 100

    if 'category' in plotdata.index.dtype.name:
        # barh draws the first row at the bottom, so rows are put in reversed
        # category order to show the first category on top. The previous code
        # called index.reorder_categories() and discarded the result, which
        # left the row order untouched. Rows are selected by position so that
        # categories absent from the table are simply skipped.
        row_position = {label: pos for pos, label in enumerate(plotdata.index)}
        reversed_categories = adata.obs[var].cat.categories[::-1]
        plotdata = plotdata.iloc[
            [row_position[cat] for cat in reversed_categories if cat in row_position]
        ]

    plot_kwargs = dict(stacked = True, edgecolor = 'none', zorder = 3,
                       figsize = (6, height), fontsize = 14, grid = False)
    if color:
        # Only forward `color` when the caller supplied one.
        plot_kwargs['color'] = color
    ax1 = plotdata.plot.barh(**plot_kwargs)
    ax1.set_title(which_var+' %')
    ax1.set_ylabel(var)
    # Anchor the legend at the top-right corner of the axes.
    horiz_offset = 1
    vert_offset = 1.
    ax1.legend(bbox_to_anchor = (horiz_offset, vert_offset))
#     ax1.figure.savefig(str(sc.settings.figdir)+'/barplot_'+var+'_proportions_'+which_var+'.pdf', bbox_inches='tight',
#                        dpi=300, orientation='landscape', format= 'pdf', optimize=True)

In [28]:
 # Twenty hex colour codes, four per row.
 palette = [
     "#E31A1C", "#1F78B4", "#A6CEE3", "#B2DF8A",
     "#33A02C", "#FB9A99", "#FDBF6F", "#FF7F00",
     "#CAB2D6", "#6A3D9A", "#FFFF99", "#B15928",
     "#66C2A5", "#FC8D62", "#8DA0CB", "#B3B3B3",
     "#A6D854", "#FFD92F", "#E5C494", "#E78AC3",
 ]

#### Set up scVI environment

In [29]:
import scvi
import scanpy as sc

# Apply scanpy's figure settings with a 4 x 4 default figure size.
sc.set_figure_params(figsize=(4, 4))

In [30]:
import pandas as pd 
import numpy as np

# FEMALE

### Read in dataset

In [31]:
# Directory used for both the input .h5ad and the exported latent CSV.
# Keep the trailing '/': the read cell appends the file name by plain string concatenation.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
path_to_gonads = '/nfs/team292/lg18/with_valentina/gonadsV2_revision/'

In [32]:
# Load the dataset from the .h5ad file (the file name suggests raw counts -- confirm).
adata = sc.read(path_to_gonads + 'FCA-gonads_rawcounts.h5ad')

In [33]:
# Keep only the cells whose `sex` annotation contains the substring 'female'.
is_female = ['female' in i for i in adata.obs.sex]
# .copy() turns the subset into a standalone object, so the in-place
# filtering steps that follow do not operate on a view of the full dataset.
adata = adata[is_female].copy()
adata.X.shape

(243972, 28820)

In [34]:
# Drop genes with fewer than 3 counts in the retained cells (modifies `adata` in place),
# then show the resulting matrix shape.
sc.pp.filter_genes(adata, min_counts=3)
adata.X.shape

Trying to set attribute `.var` of view, copying.
  container[idx] = value


(243972, 28381)

### OPTIONAL - Remove cc genes

In [35]:
# ccgs defined in M1
# Remove every gene listed in adata.uns['ccgs']. Index.isin builds the lookup
# once instead of scanning the gene list again for each of the ~28k genes.
is_ccg = adata.var_names.isin(list(adata.uns['ccgs']))
# .copy() makes the result a standalone object rather than a view, so the
# later in-place steps (HVG annotation, normalisation) can write to it directly.
adata = adata[:, ~is_ccg].copy()
print('Total number of genes after ccg filter: {:d}'.format(adata.n_vars))

Total number of genes after ccg filter: 28266


### Identify HVGs

!!! Unlike before (at least I think), they now recommend subsetting to highly variable genes before running scVI.

--> The tutorial says anywhere between 1,000 and 10,000 genes should be fine, so I arbitrarily chose 2,000.

In [36]:
# Flag the top 2,000 highly variable genes, using `donor` as the batch key.
# subset=False keeps all genes in `adata` and only annotates them; the
# `highly_variable` column written to adata.var is used for subsetting in the next cell.
# NOTE(review): per the scanpy docs the "seurat_v3" flavor expects raw counts;
# `adata` has not been normalised at this point in the notebook -- keep it that way.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=False,
    flavor="seurat_v3",
    batch_key="donor"
)

Trying to set attribute `.uns` of view, copying.
  'highly_variable_nbatches'


In [37]:
# subset object for scVI
# Take an explicit copy of the highly-variable-gene subset: `bdata` must be a
# standalone object (not a view of `adata`) because a layer is added to it here
# and `adata` itself is normalised in place afterwards.
bdata = adata[:, adata.var['highly_variable']].copy()
bdata.layers["counts"] = bdata.X.copy() # preserve counts

## Normalize - optional

In [38]:
# Scale every cell of `adata` to 1e4 total counts, then log1p-transform, both in place.
# This touches `adata` only; bdata.layers["counts"] was copied before this step.
# NOTE(review): normalize_per_cell is deprecated in scanpy in favour of normalize_total.
# Per the scanpy docs it also filters out cells below `min_counts` (default 1) and records
# per-cell totals in adata.obs -- confirm no cell is dropped here, since the scVI latent
# space is later attached to `adata` by position.
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)

## scVI

!!! Here you can specify the covariates you want to correct for.

In [40]:
# Register `bdata` with scvi-tools: read counts from the "counts" layer, use `donor`
# as the batch key and `sample_source` as an extra categorical covariate.
# The continuous covariate below is left disabled.
# NOTE(review): scvi.data.setup_anndata is the older module-level entry point; newer
# scvi-tools releases expose it as scvi.model.SCVI.setup_anndata -- confirm against the
# installed version before upgrading.
scvi.data.setup_anndata(
    bdata,
    layer="counts",
    batch_key='donor',
    categorical_covariate_keys=["sample_source"] #,
#     continuous_covariate_keys=["percent_mito"]#
)

[34mINFO    [0m Using batches from adata.obs[1m[[0m[32m"donor"[0m[1m][0m                                               
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"counts"[0m[1m][0m                                              
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m243972[0m cells, [1;36m2000[0m vars, [1;36m30[0m       
         batches, [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates   
         and [1;36m0[0m extra continuous covariates.                                                  
[34mINFO    [0m Please do not further modify adata until model is trained.                          


In [41]:
# Build the scVI model on the registered HVG subset with a 60-dimensional latent space.
model = scvi.model.SCVI(bdata, n_latent= 60)

In [42]:
# Display the model object as the cell output.
model



In [43]:
# Train with the library's default settings.
# NOTE(review): no random seed is set in the visible cells, so the latent space
# may differ between runs -- consider fixing a seed for reproducibility.
model.train()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 33/33: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 33/33 [09:13<00:00, 16.77s/it, loss=568, v_num=1]


In [44]:
# Extract the latent representation from the trained model.
latent = model.get_latent_representation()

In [45]:
# Attach the latent space to the full object `adata` (the model was built on `bdata`).
# NOTE(review): this assumes `adata` and `bdata` still hold the same cells in the same
# order; `bdata` was created as a gene-only subset of `adata` -- confirm nothing has
# dropped cells from `adata` since.
adata.obsm["X_scVI"] = latent

### Save latent space

In [46]:
# Export the latent space as CSV next to the input data.
# rstrip('/') avoids a doubled separator, because path_to_gonads already ends with '/'.
# NOTE(review): the rows get a positional index, not cell barcodes -- consider passing
# index=adata.obs_names to pd.DataFrame if downstream code needs to join by cell.
pd.DataFrame(adata.obsm["X_scVI"]).to_csv(path_to_gonads.rstrip('/') + '/FCA-gonads_XscVI_latent_female.csv')