# Running scVI on donor P13 data:

- SN (all) + multiome, only trophoblast nuclei states (all)

Correcting by `sample`

In [1]:
from __future__ import print_function
import torch

In [2]:
import sys, os
# Floating point precision requested through the Theano flags below.
data_type = 'float32'
# Build the flag string first, then export it to the process environment.
theano_flags = 'device=cuda,floatX=' + data_type + ',force_device=True'
os.environ["THEANO_FLAGS"] = theano_flags
# NOTE(review): the visible cells only import torch, scvi and scanpy, so this
# Theano setting is presumably a leftover from another notebook - confirm before removing.
#sys.path.insert(1, '/nfs/team292/aa22/scVI_related/202105_troph_organoids/')

**Set up scVI environment**

In [3]:
import scvi
import scanpy as sc

# Set scanpy's default figure size to 4 x 4 for the plots of this session.
sc.set_figure_params(figsize=(4, 4))

In [4]:
import pandas as pd 
import numpy as np

**Read in raw counts for donor P13 (SN + multiome samples)**

In [5]:
# Folder that holds every input table/object read by this notebook.
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Object with raw counts, already filtered for cells and genes,
# saved before any doublet exclusion.
raw_filtered_file = save_path + 'adata_raw_filtered.h5ad'
adata = sc.read(raw_filtered_file)


In [6]:
# Table of the final cells to keep (doublets by GEX already excluded, done in notebook M3);
# its row labels are the cell IDs used to subset `adata`.
nodoublet_table_file = save_path + 'obs_table_nodoublets.csv'
nodoublet_cell_IDs = pd.read_csv(nodoublet_table_file, index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [9]:
# Keep only the cells listed in the no-doublet table (in that table's order).
final_cell_ids = nodoublet_cell_IDs.index.tolist()
adata = adata[final_cell_ids, :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [10]:
# Copy every column of the no-doublet table into adata.obs, matched by cell ID.
cell_ids = adata.obs_names
for column_name in nodoublet_cell_IDs.columns:
    print(column_name)
    adata.obs[column_name] = nodoublet_cell_IDs.loc[cell_ids, column_name]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [12]:
# Gene table left after removing cell cycle-associated genes
# (computed after all doublet exclusion); row labels are gene names.
cleaned_genes_file = save_path + 'genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_trophoblast_20211110.csv'
cleaned_up_genes = pd.read_csv(cleaned_genes_file, index_col=0)
cleaned_up_genes

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-2,feature_types-2,...,genome-5,n_cells-5,gene_ids-6,feature_types-6,genome-6,n_cells-6,gene_ids-7,feature_types-7,genome-7,n_cells-7
A1BG,ENSG00000121410,Gene Expression,GRCh38-3.0.0_premrna,105.0,ENSG00000121410,Gene Expression,GRCh38-3.0.0_premrna,119.0,ENSG00000121410,Gene Expression,...,GRCh38-3.0.0_premrna,101.0,ENSG00000121410,Gene Expression,GRCh38,159.0,ENSG00000121410,Gene Expression,GRCh38,160.0
A1BG-AS1,ENSG00000268895,Gene Expression,GRCh38-3.0.0_premrna,162.0,ENSG00000268895,Gene Expression,GRCh38-3.0.0_premrna,211.0,ENSG00000268895,Gene Expression,...,GRCh38-3.0.0_premrna,152.0,ENSG00000268895,Gene Expression,GRCh38,223.0,ENSG00000268895,Gene Expression,GRCh38,217.0
A1CF,ENSG00000148584,Gene Expression,GRCh38-3.0.0_premrna,12.0,ENSG00000148584,Gene Expression,GRCh38-3.0.0_premrna,11.0,ENSG00000148584,Gene Expression,...,GRCh38-3.0.0_premrna,9.0,ENSG00000148584,Gene Expression,GRCh38,12.0,ENSG00000148584,Gene Expression,GRCh38,15.0
A2M,ENSG00000175899,Gene Expression,GRCh38-3.0.0_premrna,1302.0,ENSG00000175899,Gene Expression,GRCh38-3.0.0_premrna,1356.0,ENSG00000175899,Gene Expression,...,GRCh38-3.0.0_premrna,1074.0,ENSG00000175899,Gene Expression,GRCh38,1429.0,ENSG00000175899,Gene Expression,GRCh38,1333.0
A2M-AS1,ENSG00000245105,Gene Expression,GRCh38-3.0.0_premrna,20.0,ENSG00000245105,Gene Expression,GRCh38-3.0.0_premrna,18.0,ENSG00000245105,Gene Expression,...,GRCh38-3.0.0_premrna,10.0,ENSG00000245105,Gene Expression,GRCh38,36.0,ENSG00000245105,Gene Expression,GRCh38,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDB,ENSG00000198455,Gene Expression,GRCh38-3.0.0_premrna,97.0,ENSG00000198455,Gene Expression,GRCh38-3.0.0_premrna,103.0,ENSG00000198455,Gene Expression,...,GRCh38-3.0.0_premrna,104.0,ENSG00000198455,Gene Expression,GRCh38,231.0,ENSG00000198455,Gene Expression,GRCh38,239.0
ZXDC,ENSG00000070476,Gene Expression,GRCh38-3.0.0_premrna,1994.0,ENSG00000070476,Gene Expression,GRCh38-3.0.0_premrna,2274.0,ENSG00000070476,Gene Expression,...,GRCh38-3.0.0_premrna,2410.0,ENSG00000070476,Gene Expression,GRCh38,2541.0,ENSG00000070476,Gene Expression,GRCh38,2641.0
ZYG11B,ENSG00000162378,Gene Expression,GRCh38-3.0.0_premrna,2320.0,ENSG00000162378,Gene Expression,GRCh38-3.0.0_premrna,2526.0,ENSG00000162378,Gene Expression,...,GRCh38-3.0.0_premrna,2920.0,ENSG00000162378,Gene Expression,GRCh38,2735.0,ENSG00000162378,Gene Expression,GRCh38,2810.0
ZYX,ENSG00000159840,Gene Expression,GRCh38-3.0.0_premrna,1274.0,ENSG00000159840,Gene Expression,GRCh38-3.0.0_premrna,1402.0,ENSG00000159840,Gene Expression,...,GRCh38-3.0.0_premrna,1124.0,ENSG00000159840,Gene Expression,GRCh38,1795.0,ENSG00000159840,Gene Expression,GRCh38,1690.0


In [13]:
# Subset to only the cleaned-up genes (row labels of `cleaned_up_genes`).
genes_to_keep = cleaned_up_genes.index.tolist()
adata = adata[:, genes_to_keep].copy()

In [14]:
# Display the AnnData summary (dimensions, obs and var columns) as a sanity check.
adata

AnnData object with n_obs × n_vars = 74244 × 28458
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [15]:
# Subset to trophoblast cells using the labels stored in an earlier obs table.
obs_table_with_louvain = pd.read_csv(
    save_path + 'obs_table_adata_scvi_SN_and_multiome_with_trophoblast_labels.csv',
    index_col=0,
)

# Only cells present in both objects can receive a label; the assignment aligns on
# cell ID, so cells missing from the table end up without a label and are dropped below.
shared_cell_ids = list(set(adata.obs_names) & set(obs_table_with_louvain.index))
trophoblast_labels = obs_table_with_louvain.loc[shared_cell_ids, 'trophoblast_or_rest']
adata.obs['trophoblast_or_rest'] = trophoblast_labels

is_trophoblast = adata.obs['trophoblast_or_rest'] == 'trophoblast'
adata = adata[is_trophoblast].copy()
adata

AnnData object with n_obs × n_vars = 49361 × 28458
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'trophoblast_or_rest'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [16]:
# `sample` is the covariate corrected for below: list its values and cells per value.
sample_labels = adata.obs['sample']
np.unique(sample_labels, return_counts=True)

(array(['Pla_Camb10691970', 'Pla_Camb10691971',
        'Pla_Camb10714919_and_40110_Pla_Camb10687915',
        'Pla_Camb10714920_and_40110_Pla_Camb10687916', 'WSSS_PLA8764121',
        'WSSS_PLA8764122', 'WSSS_PLA8810750', 'WSSS_PLA8810751'],
       dtype=object),
 array([ 6406,  6102,  3827,  3896,  4113,  4106, 10980,  9931]))

In [17]:
# Basic preprocessing; the order of these statements matters.
adata.layers["raw_counts"] = adata.X.copy() # keep a copy of the current adata.X (the counts) before it is normalised
sc.pp.normalize_total(adata, target_sum=1e4) # scale every cell to a total of 1e4
sc.pp.log1p(adata) # log(1 + x) transform of the normalised values
adata.raw = adata.copy() # snapshot of the log-normalised object as it is at this point

In [18]:
# Subset to the 2000 most highly variable genes, using `sample` (not donor) as the batch key.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,  # adata itself is reduced to the selected genes
    #layer="raw_counts",  # disabled: selection runs on adata.X, which was log-normalised in the previous cell
    flavor="seurat",
    batch_key="sample",
    #batch_key="donor"  # alternative batch definition, disabled
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
... storing 'trophoblast_or_rest' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [19]:
# Display the AnnData summary to check the object after highly variable gene selection.
adata

AnnData object with n_obs × n_vars = 49361 × 2000
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'trophoblast_or_rest'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nb

#### Specify the covariates to correct for here (categorical or continuous)

In [20]:
# Register `adata` with scvi-tools before any model is built.
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",  # use the counts stored in this layer rather than adata.X
    categorical_covariate_keys=["sample"],  # `sample` is registered as an extra categorical covariate
    # no batch_key is passed, so all cells are treated as one batch (see the INFO log of this cell)
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m49361[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S4 notebook)
- exclusion of all doublets (scrublet and souporcell, done in S2 notebook)

In [21]:
# One untrained scVI model per latent dimensionality, keyed by n_latent.
models = {}

# just a few values here: 6 or 16 PCs looked optimal in the conventional analysis
n_latent_values = [6, 16, 20]

for n_latent in n_latent_values:
    print('n_latent_value', n_latent)
    scvi_model = scvi.model.SCVI(adata, n_latent = n_latent)
    models[n_latent] = scvi_model

n_latent_value 6
n_latent_value 16
n_latent_value 20


In [22]:
# Display the model stored under key 6 (n_latent = 6).
models[6]



In [21]:
# Train every scVI model, attach its latent space to adata.obsm and export it as CSV.
latent_representations = {}

# DataFrame.to_csv does not create missing folders: make sure the output folder
# exists up front, so minutes of training are not followed by a failed write.
results_dir = './results/'
os.makedirs(results_dir, exist_ok=True)

for n_latent_value in n_latent_values:
    curr_model = models[n_latent_value]

    # Calling .train() on an already trained model would keep training it, so a
    # re-run of this cell must not train again.
    # NOTE(review): relies on the `is_trained_` flag of scvi-tools models; if the
    # installed version lacks it, the fallback is to always train (original behaviour).
    if getattr(curr_model, 'is_trained_', False):
        print('model already trained, skipping training for n_latent_value:', n_latent_value)
    else:
        print('training model for n_latent_value:', n_latent_value)
        curr_model.train()

    # get the latent representation of the cells in adata
    latent_representations[n_latent_value] = curr_model.get_latent_representation()

    # add it to the adata object
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]

    # save the latent representation
    # NOTE(review): the DataFrame is built without an index, so the CSV rows are
    # numbered by position and carry no cell IDs; readers must rely on row order.
    curr_df = pd.DataFrame(adata.obsm[obsm_key])
    curr_df.to_csv(results_dir + '20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_trophoblast.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 6
Epoch 162/162: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [08:54<00:00,  3.30s/it, loss=718, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 16
Epoch 162/162: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [08:55<00:00,  3.31s/it, loss=696, v_num=1]


In [23]:
# Train the additional n_latent=20 model, attach its latent space to adata.obsm and export it.

# Keep the representations stored by an earlier training cell instead of wiping
# them; only start a new dict when none exists yet.
try:
    latent_representations
except NameError:
    latent_representations = {}

# DataFrame.to_csv does not create missing folders: make sure the output folder
# exists up front, so minutes of training are not followed by a failed write.
results_dir = './results/'
os.makedirs(results_dir, exist_ok=True)

# for additional n_latent=20
for n_latent_value in [20]:
    curr_model = models[n_latent_value]

    # If this model was already trained (e.g. by a loop over all n_latent_values),
    # calling .train() again would keep training it and change the saved result.
    # NOTE(review): relies on the `is_trained_` flag of scvi-tools models; if the
    # installed version lacks it, the fallback is to always train (original behaviour).
    if getattr(curr_model, 'is_trained_', False):
        print('model already trained, skipping training for n_latent_value:', n_latent_value)
    else:
        print('training model for n_latent_value:', n_latent_value)
        curr_model.train()

    # get the latent representation of the cells in adata
    latent_representations[n_latent_value] = curr_model.get_latent_representation()

    # add it to the adata object
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]

    # save the latent representation
    # NOTE(review): the DataFrame is built without an index, so the CSV rows are
    # numbered by position and carry no cell IDs; readers must rely on row order.
    curr_df = pd.DataFrame(adata.obsm[obsm_key])
    curr_df.to_csv(results_dir + '20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_trophoblast.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 162/162: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 162/162 [07:05<00:00,  2.63s/it, loss=700, v_num=1]


# Running on all trophoblast again after cleanup in notebook S5-1


Subsetted by unbiased clustering of the manifold with all trophoblast states

In [24]:
# Folder that holds every input table/object read by this section.
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Start again from the object with raw counts, already filtered for cells and genes,
# saved before any doublet exclusion.
raw_filtered_file = save_path + 'adata_raw_filtered.h5ad'
adata = sc.read(raw_filtered_file)


In [25]:
# Table of the final cells to keep (doublets by GEX already excluded, done in notebook M3);
# its row labels are the cell IDs used to subset `adata`.
nodoublet_table_file = save_path + 'obs_table_nodoublets.csv'
nodoublet_cell_IDs = pd.read_csv(nodoublet_table_file, index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [26]:
# Keep only the cells listed in the no-doublet table (in that table's order).
final_cell_ids = nodoublet_cell_IDs.index.tolist()
adata = adata[final_cell_ids, :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [27]:
# Copy every column of the no-doublet table into adata.obs, matched by cell ID.
cell_ids = adata.obs_names
for column_name in nodoublet_cell_IDs.columns:
    print(column_name)
    adata.obs[column_name] = nodoublet_cell_IDs.loc[cell_ids, column_name]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [28]:
# Subset to the trophoblast cells kept after cleanup, using the labels of an
# earlier obs table (original note: louvain labels from notebook S3-1).
obs_table_with_labels = pd.read_csv(
    save_path + 'obs_table_adata_scvi_SN_and_multiome_with_trophoblast_contaminant_v1_labels.csv',
    index_col=0,
)

# Only cells present in both objects can receive a label; the assignment aligns on
# cell ID, so cells missing from the table end up without a label and are dropped below.
shared_cell_ids = list(set(adata.obs_names) & set(obs_table_with_labels.index))
cleanup_labels = obs_table_with_labels.loc[shared_cell_ids, 'cleanup_v1_contaminant']
adata.obs['cleanup_v1_contaminant'] = cleanup_labels

is_kept = adata.obs['cleanup_v1_contaminant'] == 'keeping'
adata = adata[is_kept].copy()
adata

AnnData object with n_obs × n_vars = 41166 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'cleanup_v1_contaminant'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [29]:
# Gene table without cell cycle-associated genes for the cleanup_v1 cell set
# (original note: CCGs recalculated in notebook S3-1, using 8 and 16 PCs? - TODO confirm).
cleaned_genes_file = save_path + 'genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_trophoblast_cleanup_v1_20211110.csv'
cleaned_up_genes = pd.read_csv(cleaned_genes_file, index_col=0)

# keep only the genes listed in that table
genes_to_keep = cleaned_up_genes.index
adata = adata[:, genes_to_keep].copy()
adata

AnnData object with n_obs × n_vars = 41166 × 28847
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'cleanup_v1_contaminant'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [30]:
# Basic preprocessing; the order of these statements matters.
adata.layers["raw_counts"] = adata.X.copy() # keep a copy of the current adata.X (the counts) before it is normalised
sc.pp.normalize_total(adata, target_sum=1e4) # scale every cell to a total of 1e4
sc.pp.log1p(adata) # log(1 + x) transform of the normalised values
adata.raw = adata.copy() # snapshot of the log-normalised object as it is at this point

In [31]:
# Subset to the 2000 most highly variable genes, using `sample` (not donor) as the batch key.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,  # adata itself is reduced to the selected genes
    #layer="raw_counts",  # disabled: selection runs on adata.X, which was log-normalised in the previous cell
    flavor="seurat",
    batch_key="sample",
    #batch_key="donor"  # alternative batch definition, disabled
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
... storing 'cleanup_v1_contaminant' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Specify the covariates to correct for here (categorical or continuous)

In [32]:
# Register `adata` with scvi-tools before any model is built.
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",  # use the counts stored in this layer rather than adata.X
    categorical_covariate_keys=["sample"],  # `sample` is registered as an extra categorical covariate
    # no batch_key is passed, so all cells are treated as one batch (see the INFO log of this cell)
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m41166[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S5-1 notebook)
- exclusion of all doublets (scrublet and souporcell, done in S2 notebook)

In [33]:
# One untrained scVI model per latent dimensionality, keyed by n_latent.
models = {}

# just a few values here, it was 18 PCs I used in the conventional analysis
# NOTE(review): 18 itself is not among the values below - confirm this is intended.
n_latent_values = [8, 16, 19]

for n_latent in n_latent_values:
    print('n_latent_value', n_latent)
    scvi_model = scvi.model.SCVI(adata, n_latent = n_latent)
    models[n_latent] = scvi_model

n_latent_value 8
n_latent_value 16
n_latent_value 19


In [34]:
# Display the model stored under key 8 (n_latent = 8).
models[8]



In [34]:
# Train every scVI model, attach its latent space to adata.obsm and export it as CSV.
latent_representations = {}

# DataFrame.to_csv does not create missing folders: make sure the output folder
# exists up front, so minutes of training are not followed by a failed write.
results_dir = './results/'
os.makedirs(results_dir, exist_ok=True)

for n_latent_value in n_latent_values:
    curr_model = models[n_latent_value]

    # Calling .train() on an already trained model would keep training it, so a
    # re-run of this cell must not train again.
    # NOTE(review): relies on the `is_trained_` flag of scvi-tools models; if the
    # installed version lacks it, the fallback is to always train (original behaviour).
    if getattr(curr_model, 'is_trained_', False):
        print('model already trained, skipping training for n_latent_value:', n_latent_value)
    else:
        print('training model for n_latent_value:', n_latent_value)
        curr_model.train()

    # get the latent representation of the cells in adata
    latent_representations[n_latent_value] = curr_model.get_latent_representation()

    # add it to the adata object
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]

    # save the latent representation
    # NOTE(review): the DataFrame is built without an index, so the CSV rows are
    # numbered by position and carry no cell IDs; readers must rely on row order.
    curr_df = pd.DataFrame(adata.obsm[obsm_key])
    curr_df.to_csv(results_dir + '20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_trophoblast_cleanup_v1.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 8
Epoch 194/194: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 194/194 [08:55<00:00,  2.76s/it, loss=889, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 16
Epoch 194/194: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 194/194 [08:56<00:00,  2.76s/it, loss=880, v_num=1]


In [None]:
# Train the n_latent=19 model on its own, attach its latent space to adata.obsm and export it.

# Keep the representations stored by an earlier training cell instead of wiping
# them; only start a new dict when none exists yet.
try:
    latent_representations
except NameError:
    latent_representations = {}

# DataFrame.to_csv does not create missing folders: make sure the output folder
# exists up front, so minutes of training are not followed by a failed write.
results_dir = './results/'
os.makedirs(results_dir, exist_ok=True)

for n_latent_value in [19]:
    curr_model = models[n_latent_value]

    # If this model was already trained (e.g. by a loop over all n_latent_values),
    # calling .train() again would keep training it and change the saved result.
    # NOTE(review): relies on the `is_trained_` flag of scvi-tools models; if the
    # installed version lacks it, the fallback is to always train (original behaviour).
    if getattr(curr_model, 'is_trained_', False):
        print('model already trained, skipping training for n_latent_value:', n_latent_value)
    else:
        print('training model for n_latent_value:', n_latent_value)
        curr_model.train()

    # get the latent representation of the cells in adata
    latent_representations[n_latent_value] = curr_model.get_latent_representation()

    # add it to the adata object
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]

    # save the latent representation
    # NOTE(review): the DataFrame is built without an index, so the CSV rows are
    # numbered by position and carry no cell IDs; readers must rely on row order.
    curr_df = pd.DataFrame(adata.obsm[obsm_key])
    curr_df.to_csv(results_dir + '20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_trophoblast_cleanup_v1.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 19
Epoch 145/194:  74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 144/194 [05:15<01:51,  2.23s/it, loss=881, v_num=1]

# Running on all trophoblast again after cleanup in notebook S5-2


Subsetted by unbiased clustering of the manifold with all trophoblast states

In [57]:
# Folder that holds every input table/object read by this section.
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Start again from the object with raw counts, already filtered for cells and genes,
# saved before any doublet exclusion.
raw_filtered_file = save_path + 'adata_raw_filtered.h5ad'
adata = sc.read(raw_filtered_file)


In [58]:
# Metadata table of the final cells to keep; per the original note these are the
# cells left after also excluding doublets by GEX (done in notebook M3).
nodoublet_table_file = save_path + 'obs_table_nodoublets.csv'
nodoublet_cell_IDs = pd.read_csv(nodoublet_table_file, index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [59]:
# Keep only the cells listed in the no-doublet table (selected by cell ID)
final_cell_ids = nodoublet_cell_IDs.index.tolist()
adata = adata[final_cell_ids, :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [60]:
# Copy every column of the no-doublet table into adata.obs.
# The table rows are put in the order of the cells in adata once, up front,
# instead of once per column.
obs_aligned = nodoublet_cell_IDs.loc[adata.obs_names]
for col in obs_aligned.columns:
    print(col)
    adata.obs[col] = obs_aligned[col]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [61]:
# Keep only the trophoblast cells retained by the cleanup.
# The table read here carries the 'cleanup_v2_contaminant' annotation
# (per the original note: labels exported from notebook S3-1).
labels_file = save_path + 'obs_table_adata_scvi_SN_and_multiome_with_trophoblast_contaminant_v2_labels.csv'
obs_table_with_labels = pd.read_csv(labels_file, index_col=0)

# Only cells present in both objects can be annotated; the assignment aligns on
# cell ID, so every other cell gets NaN and is dropped by the filter below.
shared_cells = adata.obs_names.intersection(obs_table_with_labels.index)
adata.obs['cleanup_v2_contaminant'] = obs_table_with_labels.loc[shared_cells, 'cleanup_v2_contaminant']

is_kept = adata.obs['cleanup_v2_contaminant'] == 'keeping'
adata = adata[is_kept].copy()
adata

AnnData object with n_obs × n_vars = 40198 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'cleanup_v2_contaminant'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [62]:
# Gene list with the cell cycle-associated genes (CCGs) removed; per the original
# note the CCGs were recalculated in notebook S3-1 using 8 and 18 PCs.
genes_file = save_path + 'genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_trophoblast_cleanup_v2_20211110.csv'
cleaned_up_genes = pd.read_csv(genes_file, index_col=0)

# the index of that table is used to select the genes to keep
genes_to_keep = cleaned_up_genes.index
adata = adata[:, genes_to_keep].copy()
adata

AnnData object with n_obs × n_vars = 40198 × 28022
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'cleanup_v2_contaminant'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [63]:
# do some basic preprocessing
# Statement order matters here: the counts are copied into a layer first,
# then adata.X is normalised and log-transformed.
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# keep a copy of the normalised, log-transformed object in adata.raw
adata.raw = adata.copy()

In [64]:
# Subset the object to the 2000 most highly variable genes.
# The batch key is `sample` (not donor), matching the covariate given to scVI.
sc.pp.highly_variable_genes(
    adata,
    batch_key="sample",  # alternative left unused: batch_key="donor"
    flavor="seurat",     # no layer is passed; layer="raw_counts" was left unused
    n_top_genes=2000,
    subset=True,
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
... storing 'cleanup_v2_contaminant' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [65]:
# Register the object with scVI: counts come from the "raw_counts" layer and
# `sample` is passed as a categorical covariate.
# No continuous covariates for now (regressing n_genes is a possible future try).
scvi.data.setup_anndata(adata, categorical_covariate_keys=["sample"], layer="raw_counts")

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m40198[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S5-1 notebook)
- exclusion of all doublets (scrublet and souporcell, done in S2 notebook)

In [66]:
# Latent sizes to try (just a few values).
# NOTE(review): the original note says 18 PCs were used in the conventional
# analysis, but the list contains 19 - confirm this is intended.
n_latent_values = [8, 19]

# one SCVI model per latent size, keyed by that size
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    model_for_size = scvi.model.SCVI(adata, n_latent=n_latent_value)
    models[n_latent_value] = model_for_size

n_latent_value 8
n_latent_value 19


In [67]:
# show the model that was built with n_latent = 8
models[8]



In [68]:
# Train every model in `models` and export each latent space.
latent_representations = {}

# Make sure the output folder exists before the (long) training starts:
# otherwise a missing ./results only shows up as a failed write afterwards.
os.makedirs('./results', exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # latent representation produced by the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object under a key that records the latent size
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    # NOTE(review): the frame is built without cell barcodes, so the CSV presumably
    # matches cells only by row order of `adata` - confirm downstream readers rely on that
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # save the latent representation
    curr_df.to_csv('./results/20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_trophoblast_cleanup_v2.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 8
Epoch 199/199: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 199/199 [08:56<00:00,  2.70s/it, loss=868, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 19
Epoch 199/199: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 199/199 [08:56<00:00,  2.70s/it, loss=871, v_num=1]


# Running on all trophoblast again after cleanup in notebook S5-3


Subsetted by unbiased clustering of the manifold with all trophoblast states

In [69]:
# Folder with the donor P13 analysis objects. The trailing slash is required
# because file names are appended directly to this string.
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Reload the object with raw counts: cells and genes are already filtered,
# doublets have not been excluded yet.
raw_filtered_file = save_path + 'adata_raw_filtered.h5ad'
adata = sc.read(raw_filtered_file)


In [70]:
# Metadata table of the final cells to keep; per the original note these are the
# cells left after also excluding doublets by GEX (done in notebook M3).
nodoublet_table_file = save_path + 'obs_table_nodoublets.csv'
nodoublet_cell_IDs = pd.read_csv(nodoublet_table_file, index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [71]:
# Keep only the cells listed in the no-doublet table (selected by cell ID)
final_cell_ids = nodoublet_cell_IDs.index.tolist()
adata = adata[final_cell_ids, :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [72]:
# Copy every column of the no-doublet table into adata.obs.
# The table rows are put in the order of the cells in adata once, up front,
# instead of once per column.
obs_aligned = nodoublet_cell_IDs.loc[adata.obs_names]
for col in obs_aligned.columns:
    print(col)
    adata.obs[col] = obs_aligned[col]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [73]:
# Keep only the trophoblast cells retained by the cleanup.
# The table read here carries the 'cleanup_v3_contaminant' annotation
# (per the original note: labels exported from notebook S3-1).
labels_file = save_path + 'obs_table_adata_scvi_SN_and_multiome_with_trophoblast_contaminant_v3_labels.csv'
obs_table_with_labels = pd.read_csv(labels_file, index_col=0)

# Only cells present in both objects can be annotated; the assignment aligns on
# cell ID, so every other cell gets NaN and is dropped by the filter below.
shared_cells = adata.obs_names.intersection(obs_table_with_labels.index)
adata.obs['cleanup_v3_contaminant'] = obs_table_with_labels.loc[shared_cells, 'cleanup_v3_contaminant']

is_kept = adata.obs['cleanup_v3_contaminant'] == 'keeping'
adata = adata[is_kept].copy()
adata

AnnData object with n_obs × n_vars = 39298 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'cleanup_v3_contaminant'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [74]:
# Gene list with the cell cycle-associated genes (CCGs) removed; per the original
# note the CCGs were recalculated in notebook S3-1 (number of PCs left as "X?").
genes_file = save_path + 'genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_trophoblast_cleanup_v3_20211110.csv'
cleaned_up_genes = pd.read_csv(genes_file, index_col=0)

# the index of that table is used to select the genes to keep
genes_to_keep = cleaned_up_genes.index
adata = adata[:, genes_to_keep].copy()
adata

AnnData object with n_obs × n_vars = 39298 × 28040
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'cleanup_v3_contaminant'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [75]:
# do some basic preprocessing
# Statement order matters here: the counts are copied into a layer first,
# then adata.X is normalised and log-transformed.
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# keep a copy of the normalised, log-transformed object in adata.raw
adata.raw = adata.copy()

In [76]:
# Subset the object to the 2000 most highly variable genes.
# The batch key is `sample` (not donor), matching the covariate given to scVI.
sc.pp.highly_variable_genes(
    adata,
    batch_key="sample",  # alternative left unused: batch_key="donor"
    flavor="seurat",     # no layer is passed; layer="raw_counts" was left unused
    n_top_genes=2000,
    subset=True,
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
... storing 'cleanup_v3_contaminant' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [77]:
# Register the object with scVI: counts come from the "raw_counts" layer and
# `sample` is passed as a categorical covariate.
# No continuous covariates for now (regressing n_genes is a possible future try).
scvi.data.setup_anndata(adata, categorical_covariate_keys=["sample"], layer="raw_counts")

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m39298[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S3-1 notebook)
- exclusion of all doublets (scrublet and souporcell)

In [78]:
# Latent sizes to try (just a few values); per the original note, 18 PCs were
# used in the conventional analysis.
n_latent_values = [8, 18]

# one SCVI model per latent size, keyed by that size
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    model_for_size = scvi.model.SCVI(adata, n_latent=n_latent_value)
    models[n_latent_value] = model_for_size

n_latent_value 8
n_latent_value 18


In [79]:
# show the model that was built with n_latent = 8
models[8]



In [80]:
# Train every model in `models` and export each latent space.
latent_representations = {}

# Make sure the output folder exists before the (long) training starts:
# otherwise a missing ./results only shows up as a failed write afterwards.
os.makedirs('./results', exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # latent representation produced by the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object under a key that records the latent size
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    # NOTE(review): the frame is built without cell barcodes, so the CSV presumably
    # matches cells only by row order of `adata` - confirm downstream readers rely on that
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # save the latent representation
    curr_df.to_csv('./results/20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_trophoblast_cleanup_v3.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 8
Epoch 204/204: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 204/204 [08:59<00:00,  2.64s/it, loss=873, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 18
Epoch 204/204: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 204/204 [08:59<00:00,  2.65s/it, loss=868, v_num=1]


# Running on invading trophoblast after selecting them in notebook S5-4


Subsetted by unbiased clustering of the manifold with all trophoblast states

In [81]:
# Folder with the donor P13 analysis objects. The trailing slash is required
# because file names are appended directly to this string.
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Reload the object with raw counts: cells and genes are already filtered,
# doublets have not been excluded yet.
raw_filtered_file = save_path + 'adata_raw_filtered.h5ad'
adata = sc.read(raw_filtered_file)


In [82]:
# Metadata table of the final cells to keep; per the original note these are the
# cells left after also excluding doublets by GEX (done in notebook M3).
nodoublet_table_file = save_path + 'obs_table_nodoublets.csv'
nodoublet_cell_IDs = pd.read_csv(nodoublet_table_file, index_col=0)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [83]:
# Keep only the cells listed in the no-doublet table (selected by cell ID)
final_cell_ids = nodoublet_cell_IDs.index.tolist()
adata = adata[final_cell_ids, :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [84]:
# Copy every column of the no-doublet table into adata.obs.
# The table rows are put in the order of the cells in adata once, up front,
# instead of once per column.
obs_aligned = nodoublet_cell_IDs.loc[adata.obs_names]
for col in obs_aligned.columns:
    print(col)
    adata.obs[col] = obs_aligned[col]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [85]:
# Keep only the cells labelled as invading trophoblast.
# The table read here carries the 'inv_troph_labels' annotation.
# NOTE(review): the file name points to the S3-4 notebook, while the original
# comment in this cell said notebook S3-1 - confirm which one produced it.
labels_file = save_path + 'obs_table_adata_scvi_SN_and_multiome_with_inv_troph_labels_S3-4_notebook.csv'
obs_table_with_labels = pd.read_csv(labels_file, index_col=0)

# Only cells present in both objects can be annotated; the assignment aligns on
# cell ID, so every other cell gets NaN and is dropped by the filter below.
shared_cells = adata.obs_names.intersection(obs_table_with_labels.index)
adata.obs['inv_troph_labels'] = obs_table_with_labels.loc[shared_cells, 'inv_troph_labels']

is_inv_troph = adata.obs['inv_troph_labels'] == 'inv_troph'
adata = adata[is_inv_troph].copy()
adata

AnnData object with n_obs × n_vars = 6338 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'inv_troph_labels'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [86]:
# Gene list with the cell cycle-associated genes (CCGs) removed; per the original
# note the CCGs were recalculated in notebook S3-1 (number of PCs left as "X?").
genes_file = save_path + 'genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_invading_trophoblast_20211110.csv'
cleaned_up_genes = pd.read_csv(genes_file, index_col=0)

# the index of that table is used to select the genes to keep
genes_to_keep = cleaned_up_genes.index
adata = adata[:, genes_to_keep].copy()
adata

AnnData object with n_obs × n_vars = 6338 × 28632
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'inv_troph_labels'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [87]:
# do some basic preprocessing
# Statement order matters here: the counts are copied into a layer first,
# then adata.X is normalised and log-transformed.
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# keep a copy of the normalised, log-transformed object in adata.raw
adata.raw = adata.copy()

In [88]:
# Subset the object to the 2000 most highly variable genes.
# The batch key is `sample` (not donor), matching the covariate given to scVI.
sc.pp.highly_variable_genes(
    adata,
    batch_key="sample",  # alternative left unused: batch_key="donor"
    flavor="seurat",     # no layer is passed; layer="raw_counts" was left unused
    n_top_genes=2000,
    subset=True,
)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
... storing 'inv_troph_labels' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Here you can specify the covariates you want to correct for (categorical or continuous)

In [89]:
# Register the object with scVI: counts are read from the "raw_counts" layer and
# `sample` is supplied as an extra categorical covariate.
# No continuous covariates for now (regressing n_genes is an idea for the future).
scvi.data.setup_anndata(adata, layer="raw_counts", categorical_covariate_keys=["sample"])

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m6338[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S3 notebook)
- exclusion of all doublets (scrublet and souporcell, done in S2)

In [90]:
# Latent dimensionalities to try: just a few values
# (the conventional analysis used 18 PCs).
n_latent_values = [9, 11]

# one scVI model per latent dimensionality, keyed by that dimensionality
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent=n_latent_value)

n_latent_value 9
n_latent_value 11


In [94]:
# display the model object stored for n_latent = 9
models[9]



In [93]:
# Train every model in `models`, keep its latent representation on `adata`
# and write it to ./results as a CSV.
latent_representations = {}

# Create the output folder up front (`os` is imported at the top of the notebook),
# so that a missing directory cannot make `to_csv` fail only after the training has run.
os.makedirs('./results', exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # get the latent representation of the cells from the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object, under a key that encodes the latent dimensionality
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # save the latent representation
    # NOTE(review): the frame has a positional 0..n-1 index, not cell barcodes, so readers of
    # this file presumably rely on the row order matching adata.obs_names - confirm downstream.
    curr_df.to_csv('./results/20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_invading_trophoblast.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 9
Epoch 400/400: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:54<00:00,  2.29it/s, loss=968, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 11
Epoch 400/400: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:54<00:00,  2.30it/s, loss=968, v_num=1]


# Running on invading trophoblast after cleanup v1 in notebook S6-1


Subsetted by unbiased clustering of the manifold with invading trophoblast states

In [95]:
# Folder prefix used for every input read below
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Start again from the raw-count object: cells and genes are already filtered,
# doublets have not been excluded yet
adata = sc.read(f'{save_path}adata_raw_filtered.h5ad')


In [96]:
# obs table of the final cells to keep, indexed by its first column
# (doublets, including those called by GEX in notebook M3, are already excluded)
nodoublet_cell_IDs = pd.read_csv(
    f'{save_path}obs_table_nodoublets.csv',
    index_col=0,
)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [97]:
# keep only the cells listed in the no-doublet table (row order follows that table)
adata = adata[nodoublet_cell_IDs.index.tolist(), :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [98]:
# copy every column of the no-doublet obs table onto adata.obs,
# looked up by cell name so the values follow adata's own row order
for meta_col in nodoublet_cell_IDs.columns:
    print(meta_col)
    adata.obs[meta_col] = nodoublet_cell_IDs[meta_col].loc[adata.obs_names]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [99]:
# Restrict to the trophoblast cells retained by cleanup v1.
# The obs table carries the 'inv_troph_cleanup_v1' label (the original note attributes it to notebook S3-1).
obs_table_with_labels = pd.read_csv(
    f'{save_path}obs_table_adata_scvi_SN_and_multiome_with_inv_troph_cleanup_v1.csv',
    index_col=0,
)

# copy the label for cells present in both objects; cells missing from the table get NaN
adata.obs['inv_troph_cleanup_v1'] = obs_table_with_labels.loc[
    obs_table_with_labels.index.isin(adata.obs_names), 'inv_troph_cleanup_v1'
]

# keep only the cells flagged as 'keeping'
adata = adata[adata.obs['inv_troph_cleanup_v1'].eq('keeping')].copy()
adata

AnnData object with n_obs × n_vars = 4848 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'inv_troph_cleanup_v1'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [100]:
# Gene list with the cell-cycle-associated genes (CCGs) already removed
# (recalculated in notebook S4-1; the number of PCs used there is not recorded here).
cleaned_up_genes = pd.read_csv(
    f'{save_path}genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_invading_trophoblast_cleanup_v1_20211110.csv',
    index_col=0,
)

# keep only the genes listed in that table
adata = adata[:, cleaned_up_genes.index].copy()
adata

AnnData object with n_obs × n_vars = 4848 × 28577
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'inv_troph_cleanup_v1'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [101]:
# Basic preprocessing.
# 1) stash the untouched count matrix in its own layer before X is transformed
adata.layers["raw_counts"] = adata.X.copy()

# 2) scale every cell to a total of 1e4 and log1p-transform X
sc.pp.normalize_total(adata, target_sum=10000.0)
sc.pp.log1p(adata)

# 3) snapshot the log-normalised object (all genes still present) in .raw
adata.raw = adata.copy()

In [102]:
# Subset the object to the 2000 most highly variable genes (Seurat flavour),
# using `sample` as the batch key. The alternatives that were considered
# (layer="raw_counts", batch_key="donor") are not used.
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, batch_key="sample", subset=True)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
... storing 'inv_troph_cleanup_v1' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Here you can specify the covariates you want to correct for (categorical or continuous)

In [103]:
# Register the object with scVI: counts are read from the "raw_counts" layer and
# `sample` is supplied as an extra categorical covariate.
# No continuous covariates for now (regressing n_genes is an idea for the future).
scvi.data.setup_anndata(adata, layer="raw_counts", categorical_covariate_keys=["sample"])

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m4848[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S3-1 notebook)
- exclusion of all doublets (scrublet and souporcell)

In [104]:
# Latent dimensionalities to try: just a few values
# (the conventional analysis used 18 PCs).
n_latent_values = [8, 20]

# one scVI model per latent dimensionality, keyed by that dimensionality
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent=n_latent_value)

n_latent_value 8
n_latent_value 20


In [106]:
# display the model object stored for n_latent = 8
models[8]



In [107]:
# Train every model in `models`, keep its latent representation on `adata`
# and write it to ./results as a CSV.
latent_representations = {}

# Create the output folder up front (`os` is imported at the top of the notebook),
# so that a missing directory cannot make `to_csv` fail only after the training has run.
os.makedirs('./results', exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # get the latent representation of the cells from the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object, under a key that encodes the latent dimensionality
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # save the latent representation
    # NOTE(review): the frame has a positional 0..n-1 index, not cell barcodes, so readers of
    # this file presumably rely on the row order matching adata.obs_names - confirm downstream.
    curr_df.to_csv('./results/20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_invading_trophoblast_cleanup_v1.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 8
Epoch 400/400: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:15<00:00,  2.94it/s, loss=1.03e+03, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 400/400: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:16<00:00,  2.92it/s, loss=1.04e+03, v_num=1]


# Running on invading trophoblast after cleanup v2 in notebook S6-2


Subsetted by unbiased clustering of the manifold with invading trophoblast states

In [5]:
# Folder prefix used for every input read below
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Start again from the raw-count object: cells and genes are already filtered,
# doublets have not been excluded yet
adata = sc.read(f'{save_path}adata_raw_filtered.h5ad')


In [6]:
# obs table of the final cells to keep, indexed by its first column
# (doublets, including those called by GEX in notebook M3, are already excluded)
nodoublet_cell_IDs = pd.read_csv(
    f'{save_path}obs_table_nodoublets.csv',
    index_col=0,
)
nodoublet_cell_IDs

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,barcode,souporcell_assignment,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGATACATG-1_WSSS_PLA8764121,1253,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,1820.0,snRNA-seq,10X,0.191617,...,AAACCCAAGATACATG-1_WSSS_PLA8764121,WSSS_PLA8764121_0,0,0.053412,0.056947,G2M,0_none,16,0,F
AAACCCACAAATCCCA-1_WSSS_PLA8764121,1279,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000535,1868.0,snRNA-seq,10X,0.032895,...,AAACCCACAAATCCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.120729,-0.113529,G1,dM2,10,0,M
AAACCCACAACTGTGT-1_WSSS_PLA8764121,473,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000000,556.0,snRNA-seq,10X,0.091892,...,AAACCCACAACTGTGT-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,0.042173,-0.027934,S,0_none,14,0,M
AAACCCACAAGCTGCC-1_WSSS_PLA8764121,1360,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000473,2116.0,snRNA-seq,10X,0.012335,...,AAACCCACAAGCTGCC-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.121624,0.050396,G2M,dNK1,14,0,M
AAACCCACATAACCCA-1_WSSS_PLA8764121,1891,P13,PU8-9_B5,8-9_PCW,WSSS_PLA8764121,0.000305,3274.0,snRNA-seq,10X,0.073634,...,AAACCCACATAACCCA-1_WSSS_PLA8764121,WSSS_PLA8764121_1,0,-0.125064,-0.066023,G1,dS1,8,0,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,2409,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.007270,5227.0,snRNA-seq,10X,0.013699,...,TTTGTGTTCGTCAAGT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_0,0,0.033632,-0.079893,S,0_none,7,0,M
TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4129,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.002532,17379.0,snRNA-seq,10X,0.102190,...,TTTGTGTTCTTAGTCT-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.059688,-0.098701,G1,0_none,2,0,F
TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3116,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.004499,7779.0,snRNA-seq,10X,0.036876,...,TTTGTTGGTCACAGCG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,0.066281,-0.038885,S,0_none,30,0,F
TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,3696,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.001992,20078.0,snRNA-seq,10X,0.109137,...,TTTGTTGGTTTACTTG-1_Pla_Camb10714920_and_40110_...,Pla_Camb10714920_and_40110_Pla_Camb10687916_1,0,-0.031627,-0.068024,G1,0_none,2,0,F


In [7]:
# keep only the cells listed in the no-doublet table (row order follows that table)
adata = adata[nodoublet_cell_IDs.index.tolist(), :].copy()
adata

AnnData object with n_obs × n_vars = 74244 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [8]:
# copy every column of the no-doublet obs table onto adata.obs,
# looked up by cell name so the values follow adata's own row order
for meta_col in nodoublet_cell_IDs.columns:
    print(meta_col)
    adata.obs[meta_col] = nodoublet_cell_IDs[meta_col].loc[adata.obs_names]

n_genes
donor
tissue_block
age
sample
percent_mito
n_counts
dataset
technique
scrublet_score
scrublet_cluster_score
bh_pval
batch
is_doublet
barcode_sample_copy
barcode
souporcell_assignment
inter_ind_doublet
S_score
G2M_score
phase
annotation_prev
louvain
is_doublet_propagate
origin_M_F


In [9]:
# Restrict to the trophoblast cells retained by cleanup v2.
# The obs table carries the 'inv_troph_cleanup_v2' label (the original note attributes it to notebook S3-1).
obs_table_with_labels = pd.read_csv(
    f'{save_path}obs_table_adata_scvi_SN_and_multiome_with_inv_troph_cleanup_v2.csv',
    index_col=0,
)

# copy the label for cells present in both objects; cells missing from the table get NaN
adata.obs['inv_troph_cleanup_v2'] = obs_table_with_labels.loc[
    obs_table_with_labels.index.isin(adata.obs_names), 'inv_troph_cleanup_v2'
]

# keep only the cells flagged as 'keeping'
adata = adata[adata.obs['inv_troph_cleanup_v2'].eq('keeping')].copy()
adata

AnnData object with n_obs × n_vars = 4715 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'inv_troph_cleanup_v2'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [10]:
# Gene list with the cell-cycle-associated genes (CCGs) already removed
# (recalculated in notebook S4-2, using 8 and 20 PCs).
cleaned_up_genes = pd.read_csv(
    f'{save_path}genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_invading_trophoblast_cleanup_v2_20211110.csv',
    index_col=0,
)

# keep only the genes listed in that table
adata = adata[:, cleaned_up_genes.index].copy()
adata

AnnData object with n_obs × n_vars = 4715 × 26535
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'barcode_sample_copy', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'S_score', 'G2M_score', 'phase', 'annotation_prev', 'louvain', 'is_doublet_propagate', 'origin_M_F', 'inv_troph_cleanup_v2'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [11]:
# Basic preprocessing.
# 1) stash the untouched count matrix in its own layer before X is transformed
adata.layers["raw_counts"] = adata.X.copy()

# 2) scale every cell to a total of 1e4 and log1p-transform X
sc.pp.normalize_total(adata, target_sum=10000.0)
sc.pp.log1p(adata)

# 3) snapshot the log-normalised object (all genes still present) in .raw
adata.raw = adata.copy()

In [12]:
# Subset the object to the 2000 most highly variable genes (Seurat flavour),
# using `sample` as the batch key. The alternatives that were considered
# (layer="raw_counts", batch_key="donor") are not used.
sc.pp.highly_variable_genes(adata, flavor="seurat", n_top_genes=2000, batch_key="sample", subset=True)

... storing 'donor' as categorical
... storing 'tissue_block' as categorical
... storing 'age' as categorical
... storing 'sample' as categorical
... storing 'dataset' as categorical
... storing 'technique' as categorical
... storing 'souporcell_assignment' as categorical
... storing 'phase' as categorical
... storing 'annotation_prev' as categorical
... storing 'origin_M_F' as categorical
... storing 'inv_troph_cleanup_v2' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Here you can specify the covariates you want to correct for (categorical or continuous)

In [13]:
# Register the object with scVI: counts are read from the "raw_counts" layer and
# `sample` is supplied as an extra categorical covariate.
# No continuous covariates for now (regressing n_genes is an idea for the future).
scvi.data.setup_anndata(adata, layer="raw_counts", categorical_covariate_keys=["sample"])

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m4715[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches, 
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S3-1 notebook)
- exclusion of all doublets (scrublet and souporcell)

In [14]:
# Latent dimensionalities to try: just a few values
# (the conventional analysis used 18 PCs).
n_latent_values = [8, 20, 30]

# one scVI model per latent dimensionality, keyed by that dimensionality
models = {}
for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent=n_latent_value)

n_latent_value 8
n_latent_value 20
n_latent_value 30


In [15]:
# display the model object stored for n_latent = 30
models[30]



In [119]:
# Train every model in `models`, keep its latent representation on `adata`
# and write it to ./results as a CSV.
latent_representations = {}

# Create the output folder up front (`os` is imported at the top of the notebook),
# so that a missing directory cannot make `to_csv` fail only after the training has run.
os.makedirs('./results', exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # get the latent representation of the cells from the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object, under a key that encodes the latent dimensionality
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # save the latent representation
    # NOTE(review): the frame has a positional 0..n-1 index, not cell barcodes, so readers of
    # this file presumably rely on the row order matching adata.obs_names - confirm downstream.
    curr_df.to_csv('./results/20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_invading_trophoblast_cleanup_v2.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 8
Epoch 400/400: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:12<00:00,  3.02it/s, loss=977, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 400/400: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:13<00:00,  3.00it/s, loss=972, v_num=1]


In [16]:
# doing for 30 in addition
# (this cell was added to train the n_latent = 30 model on its own)
latent_representations = {}

# Create the output folder up front (`os` is imported at the top of the notebook),
# so that a missing directory cannot make `to_csv` fail only after the training has run.
os.makedirs('./results', exist_ok=True)

for n_latent_value in [30]:
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)

    # When the notebook is run top to bottom, the loop over `n_latent_values` in an earlier
    # cell has already trained and saved this model. Calling train() again would train it
    # a second time and overwrite the saved CSV, so reuse the stored embedding instead.
    if obsm_key in adata.obsm:
        print('latent representation already computed for n_latent_value:', n_latent_value)
        latent_representations[n_latent_value] = adata.obsm[obsm_key]
        continue

    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()

    # get the latent representation of the cells from the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()

    # add it to the adata object
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm[obsm_key])

    # save the latent representation
    # NOTE(review): the frame has a positional 0..n-1 index, not cell barcodes, so readers of
    # this file presumably rely on the row order matching adata.obs_names - confirm downstream.
    curr_df.to_csv('./results/20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_invading_trophoblast_cleanup_v2.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 400/400: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [01:44<00:00,  3.84it/s, loss=991, v_num=1]


# Running on all trophoblast after final cleanup of the invading compartment in notebook S6-3


In [120]:
# Folder prefix used for every input read below
save_path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202007_snRNA-seq_MFI/snRNA_seq_MFI_202007_adatas/202012_FINAL_reanalysis_with_souporcell_deconvolution_common_variants/donor_P13_all_samples_analysis_202110/'

# Start again from the raw-count object: cells and genes are already filtered,
# doublets have not been excluded yet
adata = sc.read(f'{save_path}adata_raw_filtered.h5ad')


In [121]:
# obs table of the final cells to keep (all trophoblast, cleanup v5 final),
# indexed by its first column
obs_to_keep = pd.read_csv(
    f'{save_path}obs_table_for_scVI_SN_and_multiome_all_trophoblast_cleanup_v5_final_20211110.csv',
    index_col=0,
)
obs_to_keep

Unnamed: 0_level_0,n_genes,donor,tissue_block,age,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,inter_ind_doublet,S_score,G2M_score,phase,annotation_prev,louvain,is_doublet_propagate,origin_M_F,annotation_prev_or_removed,inv_troph_labels_initial
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACCTGAAAGGATGGCT-1_Pla_Camb10691970,2595,P13,unknown,8-9_PCW,Pla_Camb10691970,0.002652,3433.4832,snRNA-seq,10X,0.118000,...,0,-0.155819,0.002528,G2M,0_none,1,0,F,0_none,others
CTACAGAAGAGGCTGT-1_WSSS_PLA8810751,3291,P13,PU8-9_B2,8-9_PCW,WSSS_PLA8810751,0.000409,3979.8735,snRNA-seq,10X,0.030408,...,0,-0.070639,-0.061709,G1,VCT_p,5,0,F,VCT_p,others
ATTTACCCATGGAACG-1_WSSS_PLA8810750,2387,P13,PU8-9_B2,8-9_PCW,WSSS_PLA8810750,0.000000,3165.4448,snRNA-seq,10X,0.091914,...,0,-0.017344,-0.001856,G1,SCT,0,0,F,SCT,others
ACCATTTGTGTCTTCC-1_Pla_Camb10691970,1862,P13,unknown,8-9_PCW,Pla_Camb10691970,0.001475,2897.3274,snRNA-seq,10X,0.074830,...,0,-0.073606,-0.074652,G1,0_none,1,0,F,0_none,others
TCATTTGGTCCAGTTA-1_WSSS_PLA8810750,2518,P13,PU8-9_B2,8-9_PCW,WSSS_PLA8810750,0.000900,3417.3792,snRNA-seq,10X,0.047110,...,0,-0.053986,-0.041936,G1,iEVT,4,0,F,iEVT,inv_troph
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTGCTTCTCCCGTAAA-1_WSSS_PLA8810751,2391,P13,PU8-9_B2,8-9_PCW,WSSS_PLA8810751,0.000997,3568.1180,snRNA-seq,10X,0.232955,...,0,0.085361,0.077971,S,0_none,1,0,F,removed,others
CGATGCGCAATTGCCA-1_WSSS_PLA8810751,3525,P13,PU8-9_B2,8-9_PCW,WSSS_PLA8810751,0.001118,4057.3357,snRNA-seq,10X,0.025431,...,0,-0.116954,-0.147250,G1,VCT,5,0,F,VCT,others
TTACAGCAGGAAACTG-1_Pla_Camb10714920_and_40110_Pla_Camb10687916,4764,P13,unknown,8-9_PCW,Pla_Camb10714920_and_40110_Pla_Camb10687916,0.035067,3999.3276,snRNA-seq,10X,0.050289,...,0,-0.181669,-0.209082,G1,0_none,23,0,F,0_none,inv_troph
AGCATCACATATGAAG-1_WSSS_PLA8810750,2431,P13,PU8-9_B2,8-9_PCW,WSSS_PLA8810750,0.001215,3604.9640,snRNA-seq,10X,0.058760,...,0,0.073197,0.476664,G2M,EVT_1,4,0,F,EVT_1,inv_troph


In [122]:
# Keep only the final cells, selected by the index of obs_to_keep;
# .copy() turns the resulting view into an independent AnnData object
final_cell_ids = obs_to_keep.index.tolist()
adata = adata[final_cell_ids, :].copy()
adata

AnnData object with n_obs × n_vars = 37675 × 29058
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [123]:
# Gene table used for the cell cycle gene (CCG) clean-up
# (CCGs recalculated in notebook S4-2, using 8 and 20 PCs)
cleaned_up_genes_csv = save_path + 'genes_without_CC_assoc_genes_for_scVI_SN_and_multiome_all_trophoblast_cleanup_v5_final_20211110.csv'
cleaned_up_genes = pd.read_csv(cleaned_up_genes_csv, index_col=0)

# restrict the object to the genes listed in the index of that table
genes_to_keep = cleaned_up_genes.index
adata = adata[:, genes_to_keep].copy()
adata

AnnData object with n_obs × n_vars = 37675 × 28465
    obs: 'n_genes', 'donor', 'tissue_block', 'age', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gene_ids-7', 'feature_types-7', 'genome-7', 'n_cells-7'

In [124]:
# Basic preprocessing -- the order of these statements matters.
# First stash a copy of the current X in a layer, before X is normalised in
# place (the scVI setup cell further down reads layer="raw_counts").
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
# normalise each cell with target_sum=1e4, then log1p-transform X
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# snapshot the normalised, log-transformed object into .raw; this happens
# before the HVG cell (subset=True) reduces the genes
adata.raw = adata.copy()

In [125]:
# Subset the object to the top 2000 highly variable genes, using `sample`
# as the batch key (batch_key="sample"; "donor" would be the alternative).
# No `layer` is passed, so the "seurat" flavour runs on adata.X, which was
# normalised and log1p-transformed in the preprocessing cell above.
hvg_settings = dict(
    flavor="seurat",
    n_top_genes=2000,
    batch_key="sample",
    subset=True,
)
sc.pp.highly_variable_genes(adata, **hvg_settings)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


#### Specify the covariates you want to correct for here (categorical or continuous)

In [126]:
# Register adata for scVI: counts are read from the "raw_counts" layer and
# `sample` is passed as an extra categorical covariate (no batch_key is given).
# No continuous covariates for now -- regressing n_genes could be tried in the future.
covariates_to_correct = ["sample"]
scvi.data.setup_anndata(
    adata,
    categorical_covariate_keys=covariates_to_correct,
    layer="raw_counts",
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m37675[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m1[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 10.11.2021 Running scVI on all trophoblast after:

- removing cell cycle-associated genes (calculated in S5-1 notebook)
- exclusion of all doublets (scrublet and souporcell, done in S2 notebook)

In [127]:
# latent space sizes to compare
n_latent_values = [8, 12, 17]

# one (still untrained) SCVI model per latent size, keyed by n_latent
models = {}
for n_latent in n_latent_values:
    print('n_latent_value', n_latent)
    models[n_latent] = scvi.model.SCVI(adata, n_latent=n_latent)

n_latent_value 8
n_latent_value 12
n_latent_value 17


In [128]:
# display the model stored under key 8 (n_latent = 8) as a quick sanity check
models[8]



In [129]:
# Train every model in `models`, store its latent representation in
# `latent_representations` and in adata.obsm, and write it to ./results/ as CSV.
latent_representations = {}

# Create the output folder up front: without it, to_csv would raise only after
# a model has finished training and the loop would abort half-way.
os.makedirs('./results', exist_ok=True)

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # latent representation of the cells, as returned by the trained model
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object under a key that encodes the latent size
    obsm_key = "X_scVI_n_latent_" + str(n_latent_value)
    adata.obsm[obsm_key] = latent_representations[n_latent_value]
    
    # NOTE(review): no index is passed to the DataFrame, so the CSV rows are
    # presumably labelled by position rather than by cell barcode -- they are
    # only meaningful together with the row order of this adata object.
    curr_df = pd.DataFrame(adata.obsm[obsm_key])
    
    # save the latent representation
    curr_df.to_csv('./results/20211110_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_after_CCG_removal_and_all_doublet_exclusion_by_sample_donor_P13_SN_and_multiome_all_trophoblast_cleanup_v5_final.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 8
Epoch 212/212: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 212/212 [08:58<00:00,  2.54s/it, loss=918, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 12
Epoch 212/212: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 212/212 [08:58<00:00,  2.54s/it, loss=906, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 17
Epoch 212/212: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 212/212 [08:58<00:00,  2.54s/it, loss=913, v_num=1]
