# Running scVI on troph organoid data

### No cytokine data here

In 2 versions of correction: by `donor` or by `sample` + `donor`

In [1]:
from __future__ import print_function
import torch

In [2]:
import sys, os
data_type = 'float32'
os.environ["THEANO_FLAGS"] = 'device=cuda,floatX=' + data_type + ',force_device=True'
sys.path.insert(1, '/nfs/team292/aa22/scVI_related/202105_troph_organoids/')

**Set up scVI environment**

In [3]:
import scvi
import scanpy as sc

sc.set_figure_params(figsize=(4, 4))

In [4]:
import pandas as pd 
import numpy as np

**Read in raw counts of the trophoblast organoids**

In [52]:
path = '/lustre/scratch117/cellgen/team292/aa22/adata_objects/202105_troph_organoids_exp_1_2/'

# Loading object with raw counts, filtered cells and genes object
# after exclusion of doublets by genotype but not by GEX yet
adata = sc.read_h5ad(path + 'adata_raw_filtered.h5ad')

In [53]:
# reading IDs of final cells to keep (after also excluding doublets by GEX, done in notebook M3)
final_cell_IDs = pd.read_csv(path + '20210521_final_cell_IDs.csv', index_col=0)
final_cell_IDs

AAACCCAAGCTAGTTC-1_6044STDY8640561
AAACCCACAAAGTGTA-1_6044STDY8640561
AAACCCAGTGCACGCT-1_6044STDY8640561
AAACCCAGTTGGCCTG-1_6044STDY8640561
AAACGAAAGCTCCACG-1_6044STDY8640561
...
TTTGTTGCAGGTTCGC-1_Pla_Camb10123935
TTTGTTGCAGTTGCGC-1_Pla_Camb10123935
TTTGTTGCATGATAGA-1_Pla_Camb10123935
TTTGTTGGTGCTCCGA-1_Pla_Camb10123935
TTTGTTGTCTTTGCTA-1_Pla_Camb10123935


In [54]:
# setting obs_names and obs.index to 'barcode_sample'
adata.obs.set_index('barcode_sample', inplace=True)

In [55]:
# subsetting the adata object to final cells
adata = adata[list(final_cell_IDs.index), :].copy()
adata

AnnData object with n_obs × n_vars = 92045 × 23281
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'ge

In [56]:
# addition - subsetting to genes after cell cycle-associated gene removal
# read in cleaned up genes, after all doublet exclusion
# these genes have been calculated in the course of analysis of all organoids data (stimulated and unstimulated)
cleaned_up_genes = pd.read_csv('/nfs/team292/aa22/scVI_related/202105_troph_organoids/genes_for_scVI_after_CCGs_removal_after_all_doublet_exclusion.csv', 
                              index_col=0)
cleaned_up_genes

Unnamed: 0,gene_ids-0,feature_types-0,genome-0,n_cells-0,gene_ids-1,feature_types-1,genome-1,n_cells-1,gene_ids-10,feature_types-10,...,genome-7,n_cells-7,gene_ids-8,feature_types-8,genome-8,n_cells-8,gene_ids-9,feature_types-9,genome-9,n_cells-9
A1BG,ENSG00000121410,Gene Expression,GRCh38,609.0,ENSG00000121410,Gene Expression,GRCh38,861.0,ENSG00000121410,Gene Expression,...,GRCh38,280.0,ENSG00000121410,Gene Expression,GRCh38,381.0,ENSG00000121410,Gene Expression,GRCh38,602.0
A1BG-AS1,ENSG00000268895,Gene Expression,GRCh38,116.0,ENSG00000268895,Gene Expression,GRCh38,174.0,ENSG00000268895,Gene Expression,...,GRCh38,90.0,ENSG00000268895,Gene Expression,GRCh38,59.0,ENSG00000268895,Gene Expression,GRCh38,116.0
A2M,,,,,,,,,,,...,,,ENSG00000175899,Gene Expression,GRCh38,3.0,,,,
A2M-AS1,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,GRCh38,13.0,ENSG00000245105,Gene Expression,...,GRCh38,20.0,ENSG00000245105,Gene Expression,GRCh38,5.0,ENSG00000245105,Gene Expression,GRCh38,9.0
A2ML1,ENSG00000166535,Gene Expression,GRCh38,53.0,ENSG00000166535,Gene Expression,GRCh38,49.0,ENSG00000166535,Gene Expression,...,GRCh38,99.0,ENSG00000166535,Gene Expression,GRCh38,40.0,ENSG00000166535,Gene Expression,GRCh38,51.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,ENSG00000070476,Gene Expression,GRCh38,466.0,ENSG00000070476,Gene Expression,GRCh38,630.0,ENSG00000070476,Gene Expression,...,GRCh38,446.0,ENSG00000070476,Gene Expression,GRCh38,206.0,ENSG00000070476,Gene Expression,GRCh38,269.0
ZYG11A,ENSG00000203995,Gene Expression,GRCh38,99.0,ENSG00000203995,Gene Expression,GRCh38,156.0,ENSG00000203995,Gene Expression,...,GRCh38,121.0,ENSG00000203995,Gene Expression,GRCh38,44.0,ENSG00000203995,Gene Expression,GRCh38,74.0
ZYG11B,ENSG00000162378,Gene Expression,GRCh38,1648.0,ENSG00000162378,Gene Expression,GRCh38,2233.0,ENSG00000162378,Gene Expression,...,GRCh38,2235.0,ENSG00000162378,Gene Expression,GRCh38,1442.0,ENSG00000162378,Gene Expression,GRCh38,1862.0
ZYX,ENSG00000159840,Gene Expression,GRCh38,2152.0,ENSG00000159840,Gene Expression,GRCh38,3590.0,ENSG00000159840,Gene Expression,...,GRCh38,2223.0,ENSG00000159840,Gene Expression,GRCh38,2277.0,ENSG00000159840,Gene Expression,GRCh38,3658.0


In [57]:
# subsetting to only cleaned up genes
adata = adata[:, list(cleaned_up_genes.index)].copy()

In [58]:
# also getting rid of soupy genes calc in notebook S3 (based on 1 soupy sample)
gene_stats = pd.read_csv('./gene_filter_cells_by_sample_Pla_Camb10123930.csv', index_col=0)
gene_stats

Unnamed: 0,Empty_droplet_sum,Cell_sum,Soup_probability,Selected
RTN4RL1,1.0,0.0,1.000000,False
TEX2,341.0,181.0,0.653257,False
PRRG1,3682.0,2230.0,0.622801,True
HHATL,1.0,0.0,1.000000,False
AC011287.2,6.0,4.0,0.600000,True
...,...,...,...,...
UBLCP1,2175.0,1221.0,0.640459,False
AL117336.1,1.0,0.0,1.000000,False
A2M-AS1,10.0,5.0,0.666667,False
KRTAP19-3,2.0,0.0,1.000000,False


In [59]:
np.unique(gene_stats['Selected'], return_counts=True)

(array([False,  True]), array([15040,  9715]))

In [60]:
genes2keep = list(gene_stats[gene_stats['Selected'] == True].index)
genes2keep

['PRRG1',
 'AC011287.2',
 'NIPAL1',
 'PI16',
 'PKD2L2',
 'GATD3A',
 'OSBPL7',
 'SLC25A17',
 'SCAI',
 'LINC01748',
 'CHGB',
 'BSG',
 'AL645940.1',
 'C12orf56',
 'ITFG2',
 'EN2',
 'ELOVL1',
 'AC097478.3',
 'GCOM1',
 'DDR2',
 'TH',
 'AC103764.1',
 'AL050343.2',
 'OTULINL',
 'GPSM3',
 'KLHL7-DT',
 'AC096582.3',
 'TDRD12',
 'TMEM98',
 'AC012157.1',
 'GAS2L3',
 'ZNF552',
 'AL136962.1',
 'PRSS57',
 'TOMM22',
 'ABCG8',
 'PMS1',
 'ANKS3',
 'IFT172',
 'ALDH1A1',
 'FBXO17',
 'AL022476.1',
 'FAM241B',
 'AC245060.5',
 'FAAP24',
 'HOXA-AS3',
 'AC126773.4',
 'ASAP2',
 'AC009065.8',
 'AC005899.8',
 'HSPBAP1',
 'SELENOT',
 'SHANK2-AS1',
 'RRM1-AS1',
 'NALCN-AS1',
 'HEG1',
 'AC010680.5',
 'XPC',
 'RTN2',
 'AC106786.2',
 'TRPM2-AS',
 'AHNAK',
 'PDGFRL',
 'LINC01591',
 'SCN5A',
 'DDX24',
 'FUCA2',
 'AC011921.1',
 'TAB3-AS2',
 'AMT',
 'PICK1',
 'AP000894.4',
 'PCDHGA5',
 'ZNF213-AS1',
 'INE2',
 'RIT2',
 'TBC1D32',
 'OR2C3',
 'EEA1',
 'RSBN1',
 'CUL7',
 'CPEB1',
 'KCNQ1OT1',
 'MIR181A2HG',
 'PTCHD1',
 'SMPX

In [61]:
adata

AnnData object with n_obs × n_vars = 92045 × 23015
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'ge

In [62]:
# subsetting to only cleaned up genes
adata = adata[:, list(set(genes2keep) & set(adata.var_names))].copy()

In [63]:
# additionally: excluding genes that are common b/w mix population (annotated in notebook S4) and empty droplets
# these genes are derived on a per sample basis in notebook S5
#genes_common_bw_mix_and_empty_droplets = list(pd.read_csv('./20210729_common_genes_bw_mix_population_and_empty_droplets.csv',
#                                                    index_col=0)['0'])
#len(genes_common_bw_mix_and_empty_droplets)


In [64]:
#len(set(genes_common_bw_mix_and_empty_droplets))

In [65]:
# I kind of doubt that excuding these 564 genes with drastically change smth but let's try
#len(set(adata.var_names) & set(genes_common_bw_mix_and_empty_droplets))

In [66]:
#genes2exclude = set(adata.var_names) & set(genes_common_bw_mix_and_empty_droplets)

In [67]:
#len(list(set(adata.var_names) - genes2exclude))

In [68]:
#adata = adata[:, list(set(adata.var_names) - genes2exclude)].copy()

In [69]:
adata

AnnData object with n_obs × n_vars = 92045 × 9555
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gen

## Taking only data for organoids without stimulation

In [70]:
np.unique(adata.obs['activation_status'], return_counts=True)

(array(['cytokines', 'no_cytokines'], dtype=object), array([43369, 48676]))

In [71]:
adata = adata[adata.obs['activation_status'] == 'no_cytokines'].copy()

In [72]:
# going to correct by donor or by donor+sample here
np.unique(adata.obs['donor'], return_counts=True)

(array(['1', '2', '3', '4', '5', '6'], dtype=object),
 array([ 2710, 13143,  5623, 16892,  8269,  2039]))

In [73]:
np.unique(adata.obs['sample'], return_counts=True)

(array(['6044STDY8640561', '6044STDY8640563', '6044STDY8640565',
        'Pla_Camb10123928', 'Pla_Camb10123929', 'Pla_Camb10123930',
        'Pla_Camb10123932', 'Pla_Camb10123934'], dtype=object),
 array([5586, 8527, 7363, 3404, 6968, 6539, 3403, 6886]))

In [74]:
adata

AnnData object with n_obs × n_vars = 48676 × 9555
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gen

In [75]:
# 01.08.2021: 3) calculating scVI embedding for only VCT --> EVT branch for later use in trajectories, now from updated annotation in notebook S7 (v3)

# reading the prelim_annot_v3 table
prelim_annot_v3 = pd.read_csv('/lustre/scratch117/cellgen/team292/aa22/adata_objects/202105_troph_organoids_exp_1_2/no_cytokine_analysis_prelim_annot_v3_20210801.csv',
                             index_col=0)
prelim_annot_v3

Unnamed: 0_level_0,barcode_sample_copy,prelim_annot_v3
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCACAAAGTGTA-1_6044STDY8640561,AAACCCACAAAGTGTA-1_6044STDY8640561,EVT
AAACCCAGTGCACGCT-1_6044STDY8640561,AAACCCAGTGCACGCT-1_6044STDY8640561,EVT
AAACCCAGTTGGCCTG-1_6044STDY8640561,AAACCCAGTTGGCCTG-1_6044STDY8640561,EVT_p
AAACGAAAGCTCCACG-1_6044STDY8640561,AAACGAAAGCTCCACG-1_6044STDY8640561,EVT_p
AAACGAACAAGTCGTT-1_6044STDY8640561,AAACGAACAAGTCGTT-1_6044STDY8640561,EVT
...,...,...
TTTGTTGGTATCGGTT-1_Pla_Camb10123934,TTTGTTGGTATCGGTT-1_Pla_Camb10123934,VCT
TTTGTTGGTTGTGGCC-1_Pla_Camb10123934,TTTGTTGGTTGTGGCC-1_Pla_Camb10123934,VCT_fusing
TTTGTTGTCAAGAGTA-1_Pla_Camb10123934,TTTGTTGTCAAGAGTA-1_Pla_Camb10123934,SCT
TTTGTTGTCCAATCTT-1_Pla_Camb10123934,TTTGTTGTCCAATCTT-1_Pla_Camb10123934,SCT


In [76]:
adata

AnnData object with n_obs × n_vars = 48676 × 9555
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 'n_cells-6', 'gen

In [77]:
len(set(adata.obs_names) & set(prelim_annot_v3.index))

37480

In [78]:
# subsetting to only barcodes in prelim_annot_v3 = aka without the mix cluster (as annot in S4 notebook)
adata = adata[list(prelim_annot_v3.index),:].copy()

In [79]:
adata.obs['prelim_annot_v3'] = prelim_annot_v3.loc[list(adata.obs_names), 'prelim_annot_v3']

In [80]:
adata.obs['prelim_annot_v3'].value_counts()

EVT             11400
VCT              8627
SCT              5373
VCT_CCC_EVT?     5158
EVT_p            2660
VCT_p            2408
eEVT?            1025
VCT_fusing        829
Name: prelim_annot_v3, dtype: int64

In [81]:
np.unique(adata.obs['prelim_annot_v3'])

array(['EVT', 'EVT_p', 'SCT', 'VCT', 'VCT_CCC_EVT?', 'VCT_fusing',
       'VCT_p', 'eEVT?'], dtype=object)

In [82]:
adata = adata[adata.obs['prelim_annot_v3'].isin(['EVT', 'EVT_p', 
                                                 #'SCT', 
                                                 'VCT', 'VCT_CCC_EVT?', 
                                                 #'VCT_fusing',
                                                 'VCT_p', 'eEVT?'])].copy()

In [83]:
adata

AnnData object with n_obs × n_vars = 31278 × 9555
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot_v3'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6'

In [84]:
# 20.08.2021 v2: also additionally ignoring louvain cluster 9 since it is SCT contaminants (as per notebook S9)

# reading it table with louvain_R labels
louvain_labels = pd.read_csv('/lustre/scratch117/cellgen/team292/aa22/adata_objects/202105_troph_organoids_exp_1_2/20210820_no_cytokine_analysis_VCT_EVT_zoomin_louvain_labels_v2.csv',
                            index_col=0)
louvain_labels


Unnamed: 0_level_0,louvain_R
barcode_sample,Unnamed: 1_level_1
AAACCCACAAAGTGTA-1_6044STDY8640561,2
AAACCCAGTGCACGCT-1_6044STDY8640561,2
AAACCCAGTTGGCCTG-1_6044STDY8640561,4
AAACGAAAGCTCCACG-1_6044STDY8640561,0
AAACGAACAAGTCGTT-1_6044STDY8640561,2
...,...
TTTGTTGAGATGAAGG-1_Pla_Camb10123934,1
TTTGTTGCACTTGAGT-1_Pla_Camb10123934,1
TTTGTTGGTACGATTC-1_Pla_Camb10123934,1
TTTGTTGGTATCGGTT-1_Pla_Camb10123934,1


In [86]:
# firstly, subst to only barcodes in the louvain_labels table (that is after first cleanup)
adata = adata[louvain_labels.index,:].copy()

# then add the louvain_R labels
adata.obs['louvain'] = louvain_labels.loc[adata.obs_names,'louvain_R']
adata.obs['louvain'].value_counts()

0      5585
1      5364
2      5181
3      4516
4      4137
5      3358
7       878
6,0     720
8       297
9       257
10      239
6,1     169
6,2     151
6,3      42
Name: louvain, dtype: int64

In [87]:
adata

AnnData object with n_obs × n_vars = 30894 × 9555
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot_v3', 'louvain'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6',

In [89]:
adata[adata.obs['louvain'] != '9']

View of AnnData object with n_obs × n_vars = 30637 × 9555
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot_v3', 'louvain'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_t

In [90]:
adata = adata[adata.obs['louvain'] != '9'].copy()

In [91]:
adata

AnnData object with n_obs × n_vars = 30637 × 9555
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot_v3', 'louvain'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6',

In [92]:
# do some basic preprocessing
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata.copy()

In [93]:
# creating a joint donor + sample key for the most correct HVG calculation in case of donor + sample correction
adata.obs['donor_sample'] = [i + '_' + j for i,j in zip(adata.obs['donor'], adata.obs['sample'])]

In [94]:
# here we subset to HVGs, accounting for donor batch
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,
    #layer="raw_counts",
    flavor="seurat",
    batch_key="donor_sample"
)

... storing 'prelim_annot_v3' as categorical
... storing 'louvain' as categorical
... storing 'donor_sample' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [95]:
adata

AnnData object with n_obs × n_vars = 30637 × 2000
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot_v3', 'louvain', 'donor_sample'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'f

#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [96]:
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["donor","sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m30637[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m2[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 20.08.2021, running scVI after excluding soupy genes based on sample 30 and only for VCT -> EVT branch populations (acc to annotation done in notebook S7) AND excluding louvain cluster 9 (SCT contaminants) according to analysis in notebook S9, cleanup v2

In [97]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [98]:
models[5]



In [99]:
models[10]



In [100]:
models[15]



In [101]:
models[20]



In [102]:
models[30]



In [103]:
models[40]



In [104]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210820_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_with_LF_gene_filter_only_VCT_EVT_branch_proper_no_SCT_contaminants_v2.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 261/261: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261/261 [07:04<00:00,  1.63s/it, loss=996, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 261/261: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261/261 [07:04<00:00,  1.63s/it, loss=981, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 261/261: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261/261 [07:06<00:00,  1.63s/it, loss=982, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 261/261: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261/261 [07:05<00:00,  1.63s/it, loss=993, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 261/261: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261/261 [07:05<00:00,  1.63s/it, loss=980, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 261/261: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261/261 [07:05<00:00,  1.63s/it, loss=993, v_num=1]


### 20.08.2021, running scVI after excluding soupy genes based on sample 30 and only for VCT -> EVT branch populations (acc to annotation done in notebook S7) AND excluding louvain cluster 11 (SCT contaminants) according to analysis in notebook S8

In [44]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [45]:
models[5]



In [46]:
models[10]



In [47]:
models[15]



In [48]:
models[20]



In [49]:
models[30]



In [50]:
models[40]



In [51]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210820_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_with_LF_gene_filter_only_VCT_EVT_branch_proper_no_SCT_contaminants.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 259/259: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [07:03<00:00,  1.63s/it, loss=996, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 259/259: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [07:03<00:00,  1.64s/it, loss=986, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 259/259: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [07:05<00:00,  1.64s/it, loss=976, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 259/259: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [07:04<00:00,  1.64s/it, loss=985, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 259/259: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [07:03<00:00,  1.64s/it, loss=965, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 259/259: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [07:03<00:00,  1.64s/it, loss=991, v_num=1]


### 01.08.2021, running scVI after excluding soupy genes based on sample 30 and only for VCT -> EVT branch populations (acc to annotation done in notebook S7)

In [58]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [59]:
models[5]



In [60]:
models[10]



In [61]:
models[15]



In [62]:
models[20]



In [63]:
models[30]



In [64]:
models[40]



In [65]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210801_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_with_LF_gene_filter_only_VCT_EVT_branch_proper.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 256/256: 100%|██████████████████████████████████████████████████████████| 256/256 [07:03<00:00,  1.65s/it, loss=994, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 256/256: 100%|██████████████████████████████████████████████████████████| 256/256 [07:03<00:00,  1.65s/it, loss=976, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 256/256: 100%|██████████████████████████████████████████████████████████| 256/256 [07:05<00:00,  1.66s/it, loss=981, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 256/256: 100%|██████████████████████████████████████████████████████████| 256/256 [07:04<00:00,  1.66s/it, loss=980, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 256/256: 100%|██████████████████████████████████████████████████████████| 256/256 [07:04<00:00,  1.66s/it, loss=992, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 256/256: 100%|██████████████████████████████████████████████████████████| 256/256 [07:04<00:00,  1.66s/it, loss=978, v_num=1]


In [30]:
# 01.08.2021: 1) calculating scVI embedding without mix population (acc to prelim_annot_v2 done in S4 notebook)
# 2) calculating scVI embedding for only VCT --> EVT branch for later use in trajectories

# reading the prelim_annot_v2 table
prelim_annot_v2 = pd.read_csv('/lustre/scratch117/cellgen/team292/aa22/adata_objects/202105_troph_organoids_exp_1_2/no_cytokine_analysis_prelim_annot_v2_20210727.csv',
                              index_col=0)


adata.obs['prelim_annot_v2'] = prelim_annot_v2.loc[adata.obs_names,'prelim_annot_v2']

In [32]:
adata.obs['prelim_annot_v2'].value_counts()

EVT           12017
mix           11196
VCT            7060
EVT_1          5692
SCT            5401
VCT_CCC        3081
VCT_p          2736
eEVT?          1012
VCT_fusing      481
Name: prelim_annot_v2, dtype: int64

In [33]:
adata_no_mix = adata[adata.obs['prelim_annot_v2'] != 'mix'].copy()
adata_VCT_EVT_branch = adata[adata.obs['prelim_annot_v2'].isin(['EVT','VCT','EVT_1','VCT_CCC',
                                                                'VCT_p','eEVT?'])].copy()

In [34]:
# checking
adata_no_mix.obs['prelim_annot_v2'].value_counts()

EVT           12017
VCT            7060
EVT_1          5692
SCT            5401
VCT_CCC        3081
VCT_p          2736
eEVT?          1012
VCT_fusing      481
Name: prelim_annot_v2, dtype: int64

In [35]:
# checking
adata_VCT_EVT_branch.obs['prelim_annot_v2'].value_counts()

EVT        12017
VCT         7060
EVT_1       5692
VCT_CCC     3081
VCT_p       2736
eEVT?       1012
Name: prelim_annot_v2, dtype: int64

In [51]:
# currently running for adata_no_mix3
#adata = adata_no_mix.copy()

In [52]:
# currently running for adata_VCT_EVT_branch
adata = adata_VCT_EVT_branch.copy()

In [53]:
# do some basic preprocessing
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata.copy()

In [54]:
# creating a joint donor + sample key for the most correct HVG calculation in case of donor + sample correction
adata.obs['donor_sample'] = [i + '_' + j for i,j in zip(adata.obs['donor'], adata.obs['sample'])]

In [55]:
# here we subset to HVGs, accounting for donor batch
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,
    #layer="raw_counts",
    flavor="seurat",
    batch_key="donor_sample"
)

... storing 'prelim_annot_v2' as categorical
... storing 'donor_sample' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [56]:
adata

AnnData object with n_obs × n_vars = 31598 × 2000
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot_v2', 'donor_sample'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_type

#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [57]:
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["donor","sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m31598[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m2[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 01.08.2021, running scVI after excluding soupy genes based on sample 30 and only for VCT -> EVT branch populations (acc to annotation done in notebook S4)

In [58]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [59]:
models[5]



In [60]:
models[10]



In [61]:
models[15]



In [62]:
models[20]



In [63]:
models[30]



In [64]:
models[40]



In [65]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210801_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_with_LF_gene_filter_only_VCT_EVT_branch.csv')
    

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 253/253: 100%|██████████████████████████████████████████████████████████████████████| 253/253 [07:03<00:00,  1.67s/it, loss=994, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 253/253: 100%|██████████████████████████████████████████████████████████████████████| 253/253 [07:04<00:00,  1.68s/it, loss=988, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 253/253: 100%|████████████████████████████████████████████████████████████████████| 253/253 [07:05<00:00,  1.68s/it, loss=1e+03, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 253/253: 100%|██████████████████████████████████████████████████████████████████████| 253/253 [07:04<00:00,  1.68s/it, loss=970, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 253/253: 100%|██████████████████████████████████████████████████████████████████████| 253/253 [07:05<00:00,  1.68s/it, loss=980, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 253/253: 100%|██████████████████████████████████████████████████████████████████████| 253/253 [07:05<00:00,  1.68s/it, loss=972, v_num=1]


### 01.08.2021, running scVI after excluding soupy genes based on sample 30 and excluding cluster `mix` (acc to annotation done in notebook S4)

In [42]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [43]:
models[5]



In [44]:
models[10]



In [45]:
models[15]



In [46]:
models[20]



In [47]:
models[30]



In [48]:
models[40]



In [49]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210801_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_with_LF_gene_filter_and_excluding_mix_cluster.csv')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 213/213: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [07:01<00:00,  1.98s/it, loss=909, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 213/213: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [07:02<00:00,  1.98s/it, loss=906, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 213/213: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [07:02<00:00,  1.98s/it, loss=899, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 213/213: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [07:02<00:00,  1.98s/it, loss=915, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 213/213: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [07:02<00:00,  1.98s/it, loss=914, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 213/213: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [07:02<00:00,  1.98s/it, loss=920, v_num=1]


### 29.07.2021, running scVI after excluding soupy genes based on sample 30 AND genes common b/w mix population and empty droplets (for more detail see notebooks S4 and S5)

In [33]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [34]:
models[5]



In [35]:
models[10]



In [36]:
models[15]



In [37]:
models[20]



In [38]:
models[30]



In [39]:
models[40]



In [40]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210729_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_with_LF_gene_filter_and_excluding_genes_common_bw_mix_population_and_empty_droplets.csv')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [07:05<00:00,  2.60s/it, loss=634, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [07:05<00:00,  2.59s/it, loss=601, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [07:05<00:00,  2.60s/it, loss=612, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [07:06<00:00,  2.60s/it, loss=611, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [07:06<00:00,  2.60s/it, loss=607, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [07:05<00:00,  2.60s/it, loss=599, v_num=1]


#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [28]:
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["donor","sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m48676[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m2[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 23.07.2021, running scVI after excluding soupy genes based on sample 30

In [29]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [30]:
models[5]



In [31]:
models[10]



In [32]:
models[15]



In [33]:
models[20]



In [34]:
models[30]



In [35]:
models[40]



In [36]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210723_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_with_LF_gene_filter.csv')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


training model for n_latent_value: 5
Epoch 164/164: 100%|███████████████████████████████████████████████████████████████| 164/164 [08:59<00:00,  3.29s/it, loss=699, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


training model for n_latent_value: 10
Epoch 164/164: 100%|███████████████████████████████████████████████████████████████| 164/164 [09:01<00:00,  3.30s/it, loss=663, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


training model for n_latent_value: 15
Epoch 164/164: 100%|███████████████████████████████████████████████████████████████| 164/164 [09:02<00:00,  3.31s/it, loss=675, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


training model for n_latent_value: 20
Epoch 164/164: 100%|███████████████████████████████████████████████████████████████| 164/164 [09:01<00:00,  3.30s/it, loss=674, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


training model for n_latent_value: 30
Epoch 164/164: 100%|███████████████████████████████████████████████████████████████| 164/164 [09:00<00:00,  3.30s/it, loss=669, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


training model for n_latent_value: 40
Epoch 164/164: 100%|███████████████████████████████████████████████████████████████| 164/164 [09:02<00:00,  3.31s/it, loss=660, v_num=1]


In [15]:
# 23.07.2021 addition - excluding some low QC cluster 'mix' and cluster 'SCT_donor_5' (this one coz it's ~80% from donor 5 only)
# to calculate a cleaned up manifold

# reading in adata with prelim annotation
adata_prelim_annot = sc.read(path + 'adata_scVI_analysed_no_cytokines.h5ad')


In [16]:
np.unique(adata_prelim_annot.obs['prelim_annot'], return_counts=True)

(array(['EVT', 'SCT', 'SCT_RP_high', 'SCT_donor_5', 'VCT', 'VCT_CCC',
        'VCT_IFIT', 'VCT_TOM_like', 'VCT_fusing', 'VCT_p', 'eEVT?', 'mix'],
       dtype=object),
 array([10026,  4997,  1709,  2054,  8020,  2037,    35,  7795,   845,
         1508,   171,  9479]))

In [17]:
adata_prelim_annot.obs

Unnamed: 0_level_0,n_genes,time_point,activation_status,media,sample,percent_mito,n_counts,dataset,technique,scrublet_score,...,G2M_score,phase,celltype_predictions_P13,probabilities_P13,celltype_predictions_Roser,probabilities_Roser,louvain,louvain_R,barcode_sample_copy,prelim_annot
barcode_sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGCTAGTTC-1_6044STDY8640561,715,3H,no_cytokines,EVTM,6044STDY8640561,0.016393,1220.0,scRNA-seq,10X,0.025670,...,-0.052240,G1,SCT,0.636900,VCT_CCC,0.960739,5,5,AAACCCAAGCTAGTTC-1_6044STDY8640561,mix
AAACCCACAAAGTGTA-1_6044STDY8640561,3765,3H,no_cytokines,EVTM,6044STDY8640561,0.067086,18931.0,scRNA-seq,10X,0.173047,...,-0.183358,G1,SCT,0.940563,EVT_1,0.947113,4,40,AAACCCACAAAGTGTA-1_6044STDY8640561,SCT
AAACCCAGTGCACGCT-1_6044STDY8640561,2116,3H,no_cytokines,EVTM,6044STDY8640561,0.005407,6288.0,scRNA-seq,10X,0.056231,...,-0.143884,G1,SCT,0.815997,EVT_1,0.667089,8,8,AAACCCAGTGCACGCT-1_6044STDY8640561,EVT
AAACCCAGTTGGCCTG-1_6044STDY8640561,2672,3H,no_cytokines,EVTM,6044STDY8640561,0.111556,10999.0,scRNA-seq,10X,0.054764,...,0.101151,S,VCT_CCC,0.866917,VCT,0.995907,11,11,AAACCCAGTTGGCCTG-1_6044STDY8640561,VCT_p
AAACGAAAGCTCCACG-1_6044STDY8640561,1820,3H,no_cytokines,EVTM,6044STDY8640561,0.109140,6652.0,scRNA-seq,10X,0.044248,...,-0.135884,G1,VCT_CCC,0.920668,VCT,0.994888,1,1,AAACGAAAGCTCCACG-1_6044STDY8640561,VCT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTATCGGTT-1_Pla_Camb10123934,3899,96H,no_cytokines,TOM,Pla_Camb10123934,0.098137,15509.0,scRNA-seq,10X,0.207339,...,-0.069770,G1,VCT_p,0.999891,VCT,0.999992,2,2,TTTGTTGGTATCGGTT-1_Pla_Camb10123934,VCT_TOM_like
TTTGTTGGTTGTGGCC-1_Pla_Camb10123934,3576,96H,no_cytokines,TOM,Pla_Camb10123934,0.086538,17680.0,scRNA-seq,10X,0.159639,...,-0.143474,G1,VCT_fusing,0.852114,SCT,0.623202,4,41,TTTGTTGGTTGTGGCC-1_Pla_Camb10123934,VCT_fusing
TTTGTTGTCAAGAGTA-1_Pla_Camb10123934,416,96H,no_cytokines,TOM,Pla_Camb10123934,0.001206,829.0,scRNA-seq,10X,0.105085,...,-0.113720,G1,SCT,0.903973,VCT,0.991487,6,6,TTTGTTGTCAAGAGTA-1_Pla_Camb10123934,SCT
TTTGTTGTCCAATCTT-1_Pla_Camb10123934,1562,96H,no_cytokines,TOM,Pla_Camb10123934,0.063601,4937.0,scRNA-seq,10X,0.070347,...,-0.151458,G1,SCT,0.909222,VCT_CCC,0.980226,4,41,TTTGTTGTCCAATCTT-1_Pla_Camb10123934,VCT_fusing


In [20]:
# adding prelim_annot to object here
adata.obs['prelim_annot'] = adata_prelim_annot.obs.loc[adata.obs_names,'prelim_annot']

# excluding 'mix' and 'SCT_donor_5'
adata = adata[(adata.obs['prelim_annot'] != 'mix') & (adata.obs['prelim_annot'] != 'SCT_donor_5')].copy()
adata

AnnData object with n_obs × n_vars = 37143 × 23015
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6', 'genome-6', 

In [21]:
# do some basic preprocessing
adata.layers["raw_counts"] = adata.X.copy() # preserve counts
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.raw = adata.copy()

In [22]:
# creating a joint donor + sample key for the most correct HVG calculation in case of donor + sample correction
adata.obs['donor_sample'] = [i + '_' + j for i,j in zip(adata.obs['donor'], adata.obs['sample'])]

In [23]:
# here we subset to HVGs, accounting for donor batch
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=2000,
    subset=True,
    #layer="raw_counts",
    flavor="seurat",
    batch_key="donor_sample"
)

... storing 'donor_sample' as categorical
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [24]:
adata

AnnData object with n_obs × n_vars = 37143 × 2000
    obs: 'n_genes', 'time_point', 'activation_status', 'media', 'sample', 'percent_mito', 'n_counts', 'dataset', 'technique', 'scrublet_score', 'scrublet_cluster_score', 'bh_pval', 'batch', 'is_doublet', 'experiment', 'barcode', 'souporcell_assignment', 'inter_ind_doublet', 'donor', 'prelim_annot', 'donor_sample'
    var: 'gene_ids-0', 'feature_types-0', 'genome-0', 'n_cells-0', 'gene_ids-1', 'feature_types-1', 'genome-1', 'n_cells-1', 'gene_ids-10', 'feature_types-10', 'genome-10', 'n_cells-10', 'gene_ids-11', 'feature_types-11', 'genome-11', 'n_cells-11', 'gene_ids-12', 'feature_types-12', 'genome-12', 'n_cells-12', 'gene_ids-13', 'feature_types-13', 'genome-13', 'n_cells-13', 'gene_ids-2', 'feature_types-2', 'genome-2', 'n_cells-2', 'gene_ids-3', 'feature_types-3', 'genome-3', 'n_cells-3', 'gene_ids-4', 'feature_types-4', 'genome-4', 'n_cells-4', 'gene_ids-5', 'feature_types-5', 'genome-5', 'n_cells-5', 'gene_ids-6', 'feature_types-6

#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [25]:
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["donor","sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m37143[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m2[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### 23.07.2021, running scVI after excluding `mix` and `SCT_donor_5` clusters (code name "cleanup_1")

In [26]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [27]:
models[5]



In [28]:
models[10]



In [29]:
models[15]



In [30]:
models[20]



In [31]:
models[30]



In [32]:
models[40]



In [33]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210723_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_cleanup_1.csv')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


training model for n_latent_value: 5
Epoch 215/215: 100%|██████████████████████████████████████████████████████████████████| 215/215 [17:01<00:00,  4.75s/it, loss=1.1e+03, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


training model for n_latent_value: 10
Epoch 215/215: 100%|█████████████████████████████████████████████████████████████████| 215/215 [17:40<00:00,  4.93s/it, loss=1.09e+03, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


training model for n_latent_value: 15
Epoch 215/215: 100%|██████████████████████████████████████████████████████████████████| 215/215 [21:52<00:00,  6.10s/it, loss=1.1e+03, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


training model for n_latent_value: 20
Epoch 215/215: 100%|█████████████████████████████████████████████████████████████████| 215/215 [16:17<00:00,  4.55s/it, loss=1.08e+03, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


training model for n_latent_value: 30
Epoch 215/215: 100%|█████████████████████████████████████████████████████████████████| 215/215 [23:11<00:00,  6.47s/it, loss=1.11e+03, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


training model for n_latent_value: 40
Epoch 215/215: 100%|██████████████████████████████████████████████████████████████████| 215/215 [19:26<00:00,  5.42s/it, loss=1.1e+03, v_num=1]


### Running scVI after removing cell cycle-associated genes (calculated in M2 notebook) and exclusion of all doublets including those by GEX (marked in M3 notebook)

In [32]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [36]:
models[5]



In [37]:
models[10]



In [38]:
models[15]



In [39]:
models[20]



In [40]:
models[30]



In [41]:
models[40]



In [42]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210720_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion.csv')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [09:12<00:00,  3.37s/it, loss=782, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [09:13<00:00,  3.37s/it, loss=786, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [09:13<00:00,  3.37s/it, loss=771, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [09:13<00:00,  3.38s/it, loss=771, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [09:14<00:00,  3.38s/it, loss=781, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [09:14<00:00,  3.38s/it, loss=766, v_num=1]


## Running scVI also for `donor` + `sample` batch correction

#### Here you can specify your covariates you want to correct for (categorical or continuous)

In [20]:
scvi.data.setup_anndata(
    adata,
    layer="raw_counts",
    categorical_covariate_keys=["donor","sample"],
    #continuous_covariate_keys=[""] # could try and regress n_genes in the future? not for now
)

[34mINFO    [0m No batch_key inputted, assuming all cells are same batch                            
[34mINFO    [0m No label_key inputted, assuming all cells have same label                           
[34mINFO    [0m Using data from adata.layers[1m[[0m[32m"raw_counts"[0m[1m][0m                                          
[34mINFO    [0m Computing library size prior per batch                                              
[34mINFO    [0m Successfully registered anndata object containing [1;36m48676[0m cells, [1;36m2000[0m vars, [1;36m1[0m batches,
         [1;36m1[0m labels, and [1;36m0[0m proteins. Also registered [1;36m2[0m extra categorical covariates and [1;36m0[0m extra
         continuous covariates.                                                              
[34mINFO    [0m Please do not further modify adata until model is trained.                          


### Running scVI after removing cell cycle-associated genes (calculated in M2 notebook) and exclusion of all doublets including those by GEX (marked in M3 notebook)

In [21]:
models = {}

# Valentina said n_latent = 10 worked for her well, let's try a few values
n_latent_values = [5, 10, 15, 20, 30, 40]

for n_latent_value in n_latent_values:
    print('n_latent_value', n_latent_value)
    models[n_latent_value] = scvi.model.SCVI(adata, n_latent = n_latent_value)

n_latent_value 5
n_latent_value 10
n_latent_value 15
n_latent_value 20
n_latent_value 30
n_latent_value 40


In [22]:
models[5]



In [23]:
models[10]



In [24]:
models[15]



In [25]:
models[20]



In [26]:
models[30]



In [27]:
models[40]



In [28]:
adata.obs['_scvi_batch']

barcode_sample
AAACCCAAGCTAGTTC-1_6044STDY8640561     0
AAACCCACAAAGTGTA-1_6044STDY8640561     0
AAACCCAGTGCACGCT-1_6044STDY8640561     0
AAACCCAGTTGGCCTG-1_6044STDY8640561     0
AAACGAAAGCTCCACG-1_6044STDY8640561     0
                                      ..
TTTGTTGGTATCGGTT-1_Pla_Camb10123934    0
TTTGTTGGTTGTGGCC-1_Pla_Camb10123934    0
TTTGTTGTCAAGAGTA-1_Pla_Camb10123934    0
TTTGTTGTCCAATCTT-1_Pla_Camb10123934    0
TTTGTTGTCGCATTAG-1_Pla_Camb10123934    0
Name: _scvi_batch, Length: 48676, dtype: int8

In [29]:
latent_representations = {}

for n_latent_value in n_latent_values:
    print('training model for n_latent_value:', n_latent_value)
    models[n_latent_value].train()
    
    # get latent representation as a dataframe
    latent_representations[n_latent_value] = models[n_latent_value].get_latent_representation()
    
    # add it to the adata object
    adata.obsm["X_scVI_n_latent_" + str(n_latent_value)] = latent_representations[n_latent_value]
    curr_df = pd.DataFrame(adata.obsm["X_scVI_n_latent_" + str(n_latent_value)])
    
    # save the latent representation
    curr_df.to_csv('./20210720_obsm_with_scVI_latent_representation_n_' + str(n_latent_value) + '_NO_CYTONIKE_after_CCG_removal_and_all_doublet_exclusion_by_donor_sample.csv')

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 5
Epoch 164/164: 100%|██████████████████████████████████████████████████████████████████████| 164/164 [16:36<00:00,  6.08s/it, loss=957, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 10
Epoch 164/164: 100%|██████████████████████████████████████████████████████████| 164/164 [18:56<00:00,  6.93s/it, loss=906, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 15
Epoch 164/164: 100%|██████████████████████████████████████████████████████████| 164/164 [18:48<00:00,  6.88s/it, loss=921, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 20
Epoch 164/164: 100%|██████████████████████████████████████████████████████████| 164/164 [18:48<00:00,  6.88s/it, loss=912, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 30
Epoch 164/164: 100%|██████████████████████████████████████████████████████████| 164/164 [18:48<00:00,  6.88s/it, loss=912, v_num=1]


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


training model for n_latent_value: 40
Epoch 164/164: 100%|██████████████████████████████████████████████████████████| 164/164 [18:49<00:00,  6.89s/it, loss=896, v_num=1]
