# Running scVI on donor P13 data:

- SN (all) + multiome, all cell states

Correcting by `sample`

In [1]:
from __future__ import print_function
import torch

In [2]:
import sys, os
# float width that is written into the flag string below
data_type = 'float32'
# NOTE(review): THEANO_FLAGS only configures Theano, while the models in this notebook come from scvi
# (torch is imported in the first cell) — this looks like a leftover; confirm before removing.
os.environ["THEANO_FLAGS"] = ','.join(('device=cuda', 'floatX=' + data_type, 'force_device=True'))
#sys.path.insert(1, '/nfs/team292/aa22/scVI_related/202105_troph_organoids/')

**Set up scVI environment**

In [3]:
import scvi
import scanpy as sc

# use a 4 x 4 default figure size for scanpy plots in this notebook
sc.set_figure_params(figsize=(4, 4))

In [4]:
import pandas as pd 
import numpy as np

**Read in raw counts for donor P13 (all SN + multiome samples)**

In [6]:
# Root folder of the donor P13 analysis; the input tables in the cells below are read from here
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Object with raw counts and filtered cells and genes,
# saved before any doublet exclusion
adata = sc.read(f'{save_path}adata_raw_filtered.h5ad')


In [8]:
# Table of the final cells to keep (doublets removed, including the GEX-based exclusion done in notebook M3);
# the first CSV column is used as the index
nodoublet_cell_IDs = pd.read_csv(f'{save_path}obs_table_nodoublets.csv', index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [9]:
# restrict adata to the final cells, taking the cell IDs from the index of the no-doublet table
adata = adata[nodoublet_cell_IDs.index.tolist(), :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [10]:
# transfer every column of the no-doublet table onto adata.obs, looked up by cell ID
for col in list(nodoublet_cell_IDs):
    print(col)
    adata.obs[col] = nodoublet_cell_IDs[col].loc[adata.obs_names]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [11]:
# Gene table written by notebook S3: the genes left after removing cell cycle-associated genes
# (computed after all doublet exclusion). The first CSV column is used as the index.
cleaned_up_genes = pd.read_csv(
    f'{save_path}genes_without_CC_assoc_genes_for_scVI_20211109.csv',
    index_col=0,
)
cleaned_up_genes

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-2,feature_types-2,...,genome-5,n_cells-5,gene_ids-6,feature_types-6,genome-6,n_cells-6,gene_ids-7,feature_types-7,genome-7,n_cells-7
A1BG,ENSG00000121410,Gene Expression,GRCh38-3.0.0_premrna,105.0,ENSG00000121410,Gene Expression,GRCh38-3.0.0_premrna,119.0,ENSG00000121410,Gene Expression,...,GRCh38-3.0.0_premrna,101.0,ENSG00000121410,Gene Expression,GRCh38,159.0,ENSG00000121410,Gene Expression,GRCh38,160.0
A1BG-AS1,ENSG00000268895,Gene Expression,GRCh38-3.0.0_premrna,162.0,ENSG00000268895,Gene Expression,GRCh38-3.0.0_premrna,211.0,ENSG00000268895,Gene Expression,...,GRCh38-3.0.0_premrna,152.0,ENSG00000268895,Gene Expression,GRCh38,223.0,ENSG00000268895,Gene Expression,GRCh38,217.0
A1CF,ENSG00000148584,Gene Expression,GRCh38-3.0.0_premrna,12.0,ENSG00000148584,Gene Expression,GRCh38-3.0.0_premrna,11.0,ENSG00000148584,Gene Expression,...,GRCh38-3.0.0_premrna,9.0,ENSG00000148584,Gene Expression,GRCh38,12.0,ENSG00000148584,Gene Expression,GRCh38,15.0
A2M,ENSG00000175899,Gene Expression,GRCh38-3.0.0_premrna,1302.0,ENSG00000175899,Gene Expression,GRCh38-3.0.0_premrna,1356.0,ENSG00000175899,Gene Expression,...,GRCh38-3.0.0_premrna,1074.0,ENSG00000175899,Gene Expression,GRCh38,1429.0,ENSG00000175899,Gene Expression,GRCh38,1333.0
A2M-AS1,ENSG00000245105,Gene Expression,GRCh38-3.0.0_premrna,20.0,ENSG00000245105,Gene Expression,GRCh38-3.0.0_premrna,18.0,ENSG00000245105,Gene Expression,...,GRCh38-3.0.0_premrna,10.0,ENSG00000245105,Gene Expression,GRCh38,36.0,ENSG00000245105,Gene Expression,GRCh38,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,Gene Expression,GRCh38-3.0.0_premrna,1994.0,ENSG00000070476,Gene Expression,GRCh38-3.0.0_premrna,2274.0,ENSG00000070476,Gene Expression,...,GRCh38-3.0.0_premrna,2410.0,ENSG00000070476,Gene Expression,GRCh38,2541.0,ENSG00000070476,Gene Expression,GRCh38,2641.0
ZYG11A,ENSG00000203995,Gene Expression,GRCh38-3.0.0_premrna,218.0,ENSG00000203995,Gene Expression,GRCh38-3.0.0_premrna,276.0,ENSG00000203995,Gene Expression,...,GRCh38-3.0.0_premrna,229.0,ENSG00000203995,Gene Expression,GRCh38,216.0,ENSG00000203995,Gene Expression,GRCh38,226.0
ZYG11B,ENSG00000162378,Gene Expression,GRCh38-3.0.0_premrna,2320.0,ENSG00000162378,Gene Expression,GRCh38-3.0.0_premrna,2526.0,ENSG00000162378,Gene Expression,...,GRCh38-3.0.0_premrna,2920.0,ENSG00000162378,Gene Expression,GRCh38,2735.0,ENSG00000162378,Gene Expression,GRCh38,2810.0
ZYX,ENSG00000159840,Gene Expression,GRCh38-3.0.0_premrna,1274.0,ENSG00000159840,Gene Expression,GRCh38-3.0.0_premrna,1402.0,ENSG00000159840,Gene Expression,...,GRCh38-3.0.0_premrna,1124.0,ENSG00000159840,Gene Expression,GRCh38,1795.0,ENSG00000159840,Gene Expression,GRCh38,1690.0


In [12]:
# keep only the cleaned-up genes, i.e. the index of the gene table read in the previous cell
adata = adata[:, cleaned_up_genes.index.tolist()].copy()

In [13]:
adata

AnnData object with n_obs × n_vars = 74244 × 28854
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [14]:
# also getting rid of soupy genes 
# optional, trying without for now
#gene_stats = pd.read_csv('./gene_filter_cells_by_sample_Pla_Camb10123930_and_Pla_Camb10123931.csv', index_col=0)
#gene_stats

In [15]:
#np.unique(gene_stats['Selected'], return_counts=True)

In [16]:
#genes2keep = list(gene_stats[gene_stats['Selected'] == True].index)
#genes2keep

In [17]:
#adata

In [18]:
# subsetting to only cleaned up genes
#adata = adata[:, list(set(genes2keep) & set(adata.var_names))].copy()

In [19]:
adata

AnnData object with n_obs × n_vars = 74244 × 28854
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [20]:
# number of cells per sample — `sample` is the covariate handed to scVI further down
# (the metadata shown above lists donor P13 only, and no separate donor key is passed to scVI)
np.unique(adata.obs['sample'], return_counts=True)

(array(['Pla_Camb10691970', 'Pla_Camb10691971',
        'Pla_Camb10714919_and_40110_Pla_Camb10687915',
        'Pla_Camb10714920_and_40110_Pla_Camb10687916', 'WSSS_PLA8764121',
        'WSSS_PLA8764122', 'WSSS_PLA8810750', 'WSSS_PLA8810751'],
       dtype=object),
 array([ 9740,  9840,  5172,  5407,  8658,  8411, 14455, 12561]))

In [21]:
# Basic preprocessing. Statement order matters: each step works on the state left by the previous one.
adata.layers["raw_counts"] = adata.X.copy() # preserve counts in a layer before X is normalised; scVI is set up on this layer later
# normalise every cell to a total of 1e4, then log1p-transform X in place
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# snapshot the normalised, log-transformed object into .raw before the HVG subsetting in the next cell
adata.raw = adata.copy()

In [22]:
# creating a joint donor + sample key for the most correct HVG calculation in case of donor + sample correction
#adata.obs['donor_sample'] = [i + '_' + j for i,j in zip(adata.obs['donor'], adata.obs['sample'])]

In [23]:
# Subset adata to the top 2000 highly variable genes, using `sample` as the batch key
# (the `donor` alternative is left commented out below).
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,  # adata itself is reduced to the selected genes (2000 vars in the next output)
    #layer="raw_counts",
    flavor="seurat",  # presumably relies on the log-normalised X prepared in the previous cell — TODO confirm
    batch_key="sample",
    #batch_key="donor"
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [24]:
adata

AnnData object with n_obs × n_vars = 74244 × 2000
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variab

#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [25]:
# Register adata with scVI: counts are taken from the "raw_counts" layer and `sample` is passed as an
# extra categorical covariate. No batch_key is given, so (per the log printed below) all cells are
# treated as a single batch.
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m74244[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 09.11.2021 Running scVI after:

- removing cell cycle-associated genes (calculated in S3 notebook)
- exclusion of all doublets (scrublet and souporcell, done in S2 notebook)

In [30]:
# Candidate latent sizes — just a few values; 17 is the number of PCs that was
# used as optimal in the conventional analysis
n_latent_values = [5, 10, 15, 17, 20, 30, 40]

# one (still untrained) SCVI model per latent size, keyed by n_latent
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent=n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 17
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [31]:
models[40]



In [28]:
# Train one model per latent size, store each embedding in adata.obsm and write it to CSV.
latent_representations = {}

# to_csv does not create missing parent folders; make sure the output folder exists up front
# so a long training run is not lost to a failed write
results_dir = './results/'
os.makedirs(results_dir, exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # get the latent representation from the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # save the latent representation
    # NOTE(review): the frame is built without cell IDs, so the CSV presumably carries only a positional
    # index — downstream readers must rely on adata's row order; confirm against notebook S4
    curr_df.to_csv(results_dir + '20211109_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_all_SN_and_multiome.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 108/108: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [08:49<00:00,  4.90s/it, loss=589, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 108/108: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [08:49<00:00,  4.90s/it, loss=603, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 108/108: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [08:51<00:00,  4.92s/it, loss=590, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 108/108: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [08:50<00:00,  4.91s/it, loss=601, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 108/108: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [08:51<00:00,  4.92s/it, loss=583, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 108/108: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [08:51<00:00,  4.92s/it, loss=592, v_num=1]


In [32]:
# calculating with n_latent = 17 since 17 PCs best explained the variance in the object in conventional analysis

# NOTE: this resets the dict, so embeddings collected by an earlier cell stay available only via adata.obsm
latent_representations = {}

for n_latent_value in [17]:
    # 17 is also part of n_latent_values, so on a top-to-bottom run the preceding training loop has
    # already trained this model; calling train() again would train it a second time, unlike the
    # other models. scvi-tools keeps an `is_trained_` flag on the model, which is checked here.
    if getattr(models[n_latent_value], 'is_trained_', False):
        print('model already trained, skipping training for n_latent_value:', n_latent_value)
    else:
        print('training model for n_latent_value:', n_latent_value)
        models[n_latent_value].train()

    # get the latent representation from the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])

    # save the latent representation
    # NOTE(review): this writes to './' whereas the loop over all n_latent_values writes to './results/' —
    # confirm which location the downstream notebook reads before unifying the two
    curr_df.to_csv('./20211109_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_all_SN_and_multiome.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 17
Epoch 108/108: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 108/108 [08:50<00:00,  4.91s/it, loss=592, v_num=1]


# after this, going to notebook S4 to calculate a manifold and have a look at it etc.

# Code below is old - pls ignore

# Running on only invading trophoblast 


Subsetted by unbiased clustering of the manifold with all cell states, as calculated above with n_latent=17

In [52]:
# Root folder of the donor P13 analysis; the input tables in the cells below are read from here
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Start again from the object with raw counts and filtered cells and genes,
# saved before any doublet exclusion
adata = sc.read(f'{save_path}adata_raw_filtered.h5ad')


In [53]:
# Table of the final cells to keep (doublets removed, including the GEX-based exclusion done in notebook M3);
# the first CSV column is used as the index
nodoublet_cell_IDs = pd.read_csv(f'{save_path}obs_table_nodoublets.csv', index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [54]:
# restrict adata to the final cells, taking the cell IDs from the index of the no-doublet table
adata = adata[nodoublet_cell_IDs.index.tolist(), :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [55]:
# transfer every column of the no-doublet table onto adata.obs, looked up by cell ID
for col in list(nodoublet_cell_IDs):
    print(col)
    adata.obs[col] = nodoublet_cell_IDs[col].loc[adata.obs_names]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [56]:
# Preparation for subsetting to the invading trophoblast compartment:
# attach the louvain labels from the obs table saved in notebook S1; clusters 6 and 8 (VCT_CCC and all the invading troph) are then kept in the next cell
obs_table_with_louvain = pd.read_csv(f'{save_path}obs_table_adata_scvi_SN_and_multiome_with_louvain.csv', index_col=0)
adata.obs['louvain_scvi'] = obs_table_with_louvain['louvain_scvi_n_latent_17'].loc[adata.obs_names]

In [57]:
# keep only the cells whose scVI louvain cluster is 6 or 8
# NOTE(review): unlike the all-cell-states run at the top of the notebook, this section never subsets
# adata to `cleaned_up_genes` after re-reading it, although the output file name and the markdown below
# say "after CCG removal" — confirm whether the cell cycle gene filter was meant to be applied here.
adata = adata[adata.obs['louvain_scvi'].isin([6,8])].copy()

In [58]:
# figuring out how many PCs best explain variance in this data - to choose n_latent optimally
# done in S1a notebook in jhub

# optimal n_PCs = 18 here as well

In [59]:
# Basic preprocessing. Statement order matters: each step works on the state left by the previous one.
adata.layers["raw_counts"] = adata.X.copy() # preserve counts in a layer before X is normalised; scVI is set up on this layer later
# normalise every cell to a total of 1e4, then log1p-transform X in place
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# snapshot the normalised, log-transformed object into .raw before the HVG subsetting in the next cell
adata.raw = adata.copy()

In [60]:
# Subset adata to the top 2000 highly variable genes, using `sample` as the batch key
# (the `donor` alternative is left commented out below).
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,  # adata itself is reduced to the selected genes
    #layer="raw_counts",
    flavor="seurat",  # presumably relies on the log-normalised X prepared in the previous cell — TODO confirm
    batch_key="sample",
    #batch_key="donor"
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [61]:
# Register adata with scVI: counts are taken from the "raw_counts" layer and `sample` is passed as an
# extra categorical covariate. No batch_key is given, so (per the log printed below) all cells are
# treated as a single batch.
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m5992[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 09.11.2021 Running scVI after:

- removing cell cycle-associated genes (calculated in M2 notebook)
- exclusion of all doublets (scrublet and souporcell)

In [62]:
# Candidate latent sizes — just a few values; 18 PCs were used in the conventional analysis.
# 30 and 40 are left out for this subset.
n_latent_values = [5, 10, 15, 18, 20]

# one (still untrained) SCVI model per latent size, keyed by n_latent
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent=n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 18
n_latent_value 20


In [64]:
models[18]



In [65]:
# Train every model built above, keep its embedding in adata.obsm and write it to a CSV in the working directory.
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # pull the latent representation out of the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # store it on the adata object under a key that encodes the latent size
    obsm_key = f"X_scVI_n_latent_{n_latent_value}"
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # write the latent representation to disk
    curr_df.to_csv(f'./20211109_obsm_with_scVI_latent_representation_n_{n_latent_value}_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_all_SN_and_multiome_only_invading_trophoblast.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 400/400: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:43<00:00,  2.44it/s, loss=998, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 400/400: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:43<00:00,  2.44it/s, loss=986, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 400/400: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:43<00:00,  2.44it/s, loss=990, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 18
Epoch 400/400: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:43<00:00,  2.44it/s, loss=980, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 400/400: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:43<00:00,  2.44it/s, loss=967, v_num=1]


# Rerunning for invading troph after excluding SCT contaminants (notebook S1a1)

In [73]:
# Root folder of the donor P13 analysis; the input tables in the cells below are read from here
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Start again from the object with raw counts and filtered cells and genes,
# saved before any doublet exclusion
adata = sc.read(f'{save_path}adata_raw_filtered.h5ad')


In [74]:
# Table of the final cells to keep (doublets removed, including the GEX-based exclusion done in notebook M3);
# the first CSV column is used as the index
nodoublet_cell_IDs = pd.read_csv(f'{save_path}obs_table_nodoublets.csv', index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [75]:
# Keep only the cells listed in the no-doublet table, as an independent copy
# rather than a view of the full object.
final_cell_ids = nodoublet_cell_IDs.index.tolist()
adata = adata[final_cell_ids, :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [76]:
# Copy every column of the no-doublet table into adata.obs, looked up by cell
# ID; columns that already exist in adata.obs are overwritten.
kept_cell_ids = adata.obs_names
for col in nodoublet_cell_IDs.columns:
    print(col)
    column_values = nodoublet_cell_IDs.loc[kept_cell_ids, col]
    adata.obs[col] = column_values

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [77]:
# Restrict adata to the invading trophoblast compartment.
# The obs table exported from notebook S1 carries the scVI louvain labels;
# presumably it was already limited there to clusters 6 and 8 (VCT_CCC and all
# the invading troph) - no cluster filter is applied in this cell, verify in S1.
louvain_table_file = save_path + 'obs_table_adata_scvi_SN_and_multiome_with_louvain_invading_troph_only.csv'
obs_table_with_louvain = pd.read_csv(louvain_table_file, index_col=0)

# keep exactly the cells present in that table
invading_troph_cell_ids = obs_table_with_louvain.index
adata = adata[invading_troph_cell_ids, :].copy()

# attach the louvain labels from the n_latent = 18 column under a shorter name
louvain_labels = obs_table_with_louvain.loc[adata.obs_names, 'louvain_scvi_n_latent_18']
adata.obs['louvain_scvi'] = louvain_labels


In [78]:
# List the louvain_scvi cluster labels present before the clean-up;
# clusters 2, 6 and 7 are SCT contaminants and are dropped in the next cell.
np.unique(adata.obs['louvain_scvi'])

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [79]:
# Remove the cells of the SCT contaminant clusters (2, 6 and 7).
# The mask is inverted with `~`, the boolean-NOT operator for pandas masks;
# unary `-` on booleans is rejected by NumPy and only works on a pandas Series
# through special-casing, so it is avoided here.
sct_contaminant_clusters = [2, 6, 7]
is_sct_contaminant = adata.obs['louvain_scvi'].isin(sct_contaminant_clusters)
adata = adata[~is_sct_contaminant].copy()

In [80]:
# Sanity check: labels 2, 6 and 7 should no longer be listed.
np.unique(adata.obs['louvain_scvi'])

array([0, 1, 3, 4, 5, 8, 9])

In [81]:
# figuring out how many PCs best explain variance in this data - to choose n_latent optimally
# done in S1a notebook in jhub

# optimal n_PCs = 18 here as well

In [82]:
# Basic preprocessing. Statement order matters: every step below changes
# adata in place.
# Stash the untouched counts in a layer first; scVI is later pointed at this
# layer via setup_anndata(layer="raw_counts").
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
# Scale every cell to a total of 10,000 counts, then log1p-transform adata.X.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Snapshot the log-normalised object in .raw before the next cell subsets the
# genes to HVGs.
adata.raw = adata.copy()

In [83]:
# Subset adata to the top 2000 highly variable genes (subset=True drops all
# other genes from adata in place).
# Batches for the HVG selection are the individual samples
# (batch_key="sample"), not donors; the donor variant is left commented out.
# flavor="seurat" works on adata.X, which was log-normalised in the previous
# cell; the raw_counts layer option is left commented out.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,
    #layer="raw_counts",
    flavor="seurat",
    batch_key="sample",
    #batch_key="donor"
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Here you can specify the covariates you want to correct for (categorical or continuous)

In [84]:
# Register adata with scVI, reading counts from the "raw_counts" layer.
# "sample" is passed as an extra categorical covariate, not as batch_key, so
# scVI reports a single batch ("No batch_key inputted" in the log below).
# NOTE(review): confirm that covariate-only correction (no batch_key) is the
# intended way of correcting by sample.
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m4626[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 09.11.2021 Running scVI after:

- removing cell cycle-associated genes (calculated in M2 notebook)
- exclusion of all doublets (scrublet and souporcell)

In [85]:
# Untrained SCVI models, keyed by their number of latent dimensions.
models = {}

# Latent dimensionalities to build models for. Only 18 is listed for this
# rerun - it was 18 PCs I used in the conventional analysis.
n_latent_values = [18]

# Build one model per requested latent size on the registered adata;
# training happens in a later cell.
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 18


In [86]:
# Display the model built for n_latent = 18 (not trained yet at this point).
models[18]



In [87]:
# Latent representations of the trained models, keyed by n_latent.
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)

    # train the model that was built for this latent size
    scvi_model_curr = models[n_latent_value]
    scvi_model_curr.train()

    # fetch the per-cell latent representation and keep it in the dict
    latent_curr = scvi_model_curr.get_latent_representation()
    latent_representations[n_latent_value] = latent_curr

    # store it on the adata object under a key that encodes n_latent
    latent_obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[latent_obsm_key] = latent_curr

    # write the latent representation to a CSV in the working directory
    # NOTE(review): the DataFrame is built without an index, so the CSV rows
    # are identified by position only, not by cell barcode - downstream code
    # must rely on the row order matching adata.obs_names.
    curr_df = pd.DataFrame(adata.obsm[latent_obsm_key])
    curr_df.to_csv('./20211109_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_all_SN_and_multiome_only_invading_trophoblast_cleanup_1.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 18
Epoch 400/400: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:07<00:00,  3.14it/s, loss=1.02e+03, v_num=1]
