In [None]:
# *** scArches integration of new datasets into the reference
# (uses the integration run selected in the original tests on the smaller ref data subset)

In [2]:
import scanpy as sc
import pandas as pd
import pickle
import scarches as sca
import datetime
import os
import argparse
import glob
import anndata as ann
import numpy as np

import sys  
sys.path.insert(0, '/lustre/groups/ml01/workspace/karin.hrovatin/code/diabetes_analysis/')
import helper as h

Using TensorFlow backend.


In [2]:
# Locations of the input data, the reference model and the output directory.
# All of them live below one shared scRNA data root.
data_root='/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/'
path_ref=data_root+'ref_combined/preprocessed/'
path_model=data_root+'ref_combined/scArches/'
path_out=data_root+'combined/scArches/integrate_add_ref/'
# Unique ID2 passed to the helper functions that read/write h5ad files
UID2='scArches_addToRef'

Several scArches models with different parameters were initially tested on the ref subset. We manually copied the selected one into a directory called ref_run, because the run directories themselves are named by timestamp and would therefore never have a comparable name across re-runs.

In [4]:
# Load parameters of ref model
task_name_ref='ref_run'
# NOTE: unpickling can execute arbitrary code - only load params files written by our own runs.
# The context manager guarantees that the file handle is closed after reading.
with open(path_model+task_name_ref+'/params.pkl','rb') as params_file:
    params=pickle.load(params_file)
print('Old params:',params)
# The pickled task_name still holds the original timestamp-based run name;
# replace it with the name of the manually copied directory.
params['task_name']=task_name_ref
print('Corrected old params:',params)

Old params: {'z_dimension': 15, 'architecture': [128, 128, 128], 'task_name': 'run_scArches1601891506.923574', 'x_dimension': 2000, 'beta': 0.0, 'alpha': 0.99, 'loss_fn': 'sse', 'n_epochs': 150, 'batch_size': 128, 'subset_beta': False, 'hvg_n': '2000'}
Corrected old params: {'z_dimension': 15, 'architecture': [128, 128, 128], 'task_name': 'ref_run', 'x_dimension': 2000, 'beta': 0.0, 'alpha': 0.99, 'loss_fn': 'sse', 'n_epochs': 150, 'batch_size': 128, 'subset_beta': False, 'hvg_n': '2000'}


In [5]:
# Derive the parameters of the new network from the reference parameters:
# a shallow copy with a fresh timestamp-based task name and the scArches version added.
params_new={
    **params,
    'task_name': 'run_scArches'+str(datetime.datetime.now().timestamp()),
    'sca_version': 'scArches v2',
}
# Early stopping is currently not used:
#params_new['early_stop']=early_stop
print('New params',params_new)

New params {'z_dimension': 15, 'architecture': [128, 128, 128], 'task_name': 'run_scArches1603270547.313737', 'x_dimension': 2000, 'beta': 0.0, 'alpha': 0.99, 'loss_fn': 'sse', 'n_epochs': 150, 'batch_size': 128, 'subset_beta': False, 'hvg_n': '2000', 'sca_version': 'scArches v2'}


In [6]:
# *** Reference adata
# Load the normalised reference data with the project helper
adata_ref=h.open_h5ad(file=path_ref+'data_normalised.h5ad',unique_id2=UID2)

# Give the generic HVG flag column the 'highly_variable_<n>' style name that hvg_col is built from
# (pandas ignores the mapping if there is no 'highly_variable' column)
adata_ref.var.rename(columns={'highly_variable': 'highly_variable_2000'}, inplace=True)
# params['hvg_n'] is stored as a string, so it can be concatenated directly
hvg_col='highly_variable_'+params['hvg_n']
# Select HVG (boolean mask over genes)
adata_ref=adata_ref[:,adata_ref.var[hvg_col]]

# Ensure that adata.raw.X (used by scArches in nb) has the same genes as adata.X
adata_ref.raw=adata_ref.raw[:,adata_ref.raw.var_names.isin(adata_ref.var_names)].to_adata()
# Rename size factors so that scArches finds them
# The rename below is unconditional; the message only reports that an existing
# 'size_factors' column is being moved to 'size_factors_old'.
if 'size_factors' in adata_ref.obs.columns:
    print('Size factors are already present - renaming them to size_factors_old')
adata_ref.obs.rename(columns={'size_factors': 'size_factors_old', 'size_factors_sample': 'size_factors'}, inplace=True)

In [7]:
# *** New data
# (study name, path to the preprocessed h5ad file) for every dataset added to the reference
data=[('NOD','/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/GSE144471/data_normlisedForIntegration.h5ad'),
      ('NOD_elimination','/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/GSE117770/data_normlisedForIntegration.h5ad'),
      ('spikein_drug','/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/GSE142465/data_normlisedForIntegration.h5ad'),
      ('embryo','/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/GSE132188/rev7/data_normlisedForIntegration.h5ad'),
      ('VSG','/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/VSG_PF_WT_cohort/rev7/nonref/data_normlisedForIntegration.h5ad'),
      ('STZ','/lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/islet_glpest_lickert/rev7/nonref/data_normlisedForIntegration.h5ad')]

# One processed AnnData per study, in the same order as `data`
adatas=[]
for study,path_pp in data:
    print(study)
    #Load data
    adata_pp=h.open_h5ad(file=path_pp,unique_id2=UID2)
    print('Preprocessed data shape:',adata_pp.shape)  
    # Extract raw data, subset it to QC cells and integration genes and normalise raw
    adata=adata_pp.raw.to_adata()
    # Keep the cells of the preprocessed object and the genes of the (HVG-subset) reference
    adata=adata[adata_pp.obs_names,adata_ref.var_names]
    adata.obs=adata_pp.obs.copy()
    # Normalise raw data with previously computed size factors
    # Keep the un-normalised values of the subset in .raw before X is modified
    adata.raw=adata.copy()
    adata.X /= adata.obs['size_factors_sample'].values[:,None] # [:,None] makes the size factors a column so that each cell (row) is divided by its own factor
    sc.pp.log1p(adata)
    # Store X as a plain numpy array
    # NOTE(review): presumably X is not an ndarray after the steps above - confirm
    adata.X = np.asarray(adata.X)
    adatas.append(adata)
    print('Integration data shape:',adata.shape) 

# Combine datasets    
# The study names from `data` become the categories of the new 'study' obs column
adata_query = ann.AnnData.concatenate( *adatas,  batch_key = 'study', batch_categories = [d[0] for d in data ]).copy()

NOD
Preprocessed data shape: (2690, 15034)
Integration data shape: (2690, 2000)
NOD_elimination
Preprocessed data shape: (54329, 18696)
Integration data shape: (54329, 2000)
spikein_drug
Preprocessed data shape: (33331, 17709)
Integration data shape: (33331, 2000)
embryo
Preprocessed data shape: (37561, 17631)
Integration data shape: (37561, 2000)
VSG
Preprocessed data shape: (57157, 19958)
Integration data shape: (57157, 2000)
STZ
Preprocessed data shape: (41150, 17846)
Integration data shape: (41150, 2000)


In [8]:
# Rename size factors so that scArches finds them
# Same logic as for the reference: the per-sample size factors must end up in 'size_factors',
# and a pre-existing 'size_factors' column is moved to 'size_factors_old'.
# The rename is keyed on 'size_factors_sample' so that re-running this cell after the
# rename has already happened is a no-op instead of moving the new column away again.
if 'size_factors_sample' in adata_query.obs.columns:
    if 'size_factors' in adata_query.obs.columns:
        print('Size factors are already present - renaming them to size_factors_old')
    adata_query.obs.rename(columns={'size_factors': 'size_factors_old', 'size_factors_sample': 'size_factors'}, inplace=True)

In [9]:
# Show a summary of both inputs, each preceded by its label on a separate line
print('Reference adata',adata_ref,'Query adata',adata_query,sep='\n')

Reference adata
AnnData object with n_obs × n_vars = 73487 × 2000
    obs: 'file', 'n_counts', 'n_genes', 'mt_frac', 'doublet_score', 'size_factors_study', 'pre_cell_type', 'S_score', 'G2M_score', 'phase', 'phase_cyclone', 's_cyclone', 'g2m_cyclone', 'g1_cyclone', 'x_score', 'y_score', 'sex', 'age', 'design', 'strain', 'tissue', 'technique', 'study', 'study_sample', 'size_factors', 'cell_type', 'cell_type_multiplet', 'cell_subtype', 'cell_subtype_multiplet', 'true_type'
    var: 'n_cells-Fltp_2y', 'n_cells-Fltp_P16', 'n_cells-Fltp_adult', 'n_cells-STZ', 'n_cells-VSG', 'highly_variable_2000', 'highly_variable_5000', 'highly_variable_2000_beta', 'highly_variable_2000_Fltp_P16', 'highly_variable_2000_beta_Fltp_P16', 'highly_variable_2000_Fltp_adult', 'highly_variable_2000_Fltp_2y', 'highly_variable_2000_VSG', 'highly_variable_2000_STZ', 'highly_variable_2000_beta_Fltp_adult', 'highly_variable_2000_beta_Fltp_2y', 'highly_variable_2000_beta_VSG', 'highly_variable_2000_beta_STZ', 'highly_var

In [10]:
# *** Restore model
# Hyperparameters are taken unchanged from the stored reference params
ref_hyperparams={key: params[key] for key in
                 ('task_name','x_dimension','z_dimension','architecture','alpha','beta','loss_fn')}
# Genes and conditions (samples) come from the reference data itself
network = sca.models.scArches(
    gene_names=adata_ref.var_names.tolist(),
    conditions=adata_ref.obs['study_sample'].unique().tolist(),
    model_path=path_model,
    **ref_hyperparams
)









Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

scArches' network has been successfully constructed!




Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
scArches' network has been successfully compiled!


In [11]:
# With retrain=False the network is not retrained, so the epoch and batch-size
# settings are not really needed; nothing is saved either.
ref_train_kwargs=dict(
    condition_key='study_sample',
    retrain=False,
    n_epochs=params['n_epochs'],
    batch_size=params['batch_size'],
    save=False,
)
network.train(adata_ref, **ref_train_kwargs)



scArches' network has been successfully compiled!
cvae's weights has been successfully restored!


In [12]:
# *** Modify scArches model with new studies
# Build the new network from the restored reference network and the query samples
query_conditions=adata_query.obs['study_sample'].unique()
network_new = sca.operate(
    network,
    new_task_name=params_new['task_name'],
    new_conditions=query_conditions,
    version=params_new['sca_version'],
    # Passing new_network_kwargs={'model_path':path_out} does not work, so model_path is set below
    # Mo's suggestion (not used): new_network_kwargs={'use_batchnorm':params_new['use_batchnorm']}
)
# Save the new model in its own sub-directory of the output path
network_new.model_path='{}{}{}'.format(path_out,network_new.task_name,os.sep)

# Train on the query data and save the resulting model
query_train_kwargs=dict(
    condition_key='study_sample',
    retrain=True,
    n_epochs=params_new['n_epochs'],
    batch_size=params_new['batch_size'],
    save=True,
    #early_stop_limit=params_new['early_stop']
)
network_new.train(adata_query, **query_train_kwargs)

scArches' network has been successfully constructed!
scArches' network has been successfully compiled!


 |████████████████████| 100.0%  - loss: 373.6227 - mmd_loss: 0.0000 - recon_loss: 373.6227 - val_loss: 94.3628 - val_mmd_loss: 0.0000 - val_recon_loss: 94.3628

scArches has been successfully saved in /lustre/groups/ml01/workspace/karin.hrovatin/data/pancreas/scRNA/combined/scArches/integrate_add_ref/run_scArches1603270547.313737/.


In [13]:
# *** Save integration results

In [14]:
# Save params
# Written next to the saved model; the context manager makes sure the file is
# flushed and closed as soon as the dump is finished.
with open( path_out+params_new['task_name']+"/params.pkl", "wb" ) as params_file:
    pickle.dump(params_new, params_file)

In [15]:
# Get latent representation
# Combined data for latent representation
adata = ann.AnnData.concatenate( adata_query,adata_ref, batch_categories =['nonref','ref']).copy()
# Make sure that obs_names match ref: concatenate appends '-<batch category>' to every
# obs name. Strip only that trailing suffix - a plain str.replace would also remove
# '-ref'/'-nonref' occurring anywhere else inside a cell name.
batch_suffixes=('-nonref','-ref')
adata.obs_names=[
    next((name[:-len(suffix)] for suffix in batch_suffixes if name.endswith(suffix)), name)
    for name in adata.obs_names
]
# Embed query and reference cells together, using the sample as condition
latent_adata = network_new.get_latent(adata, 'study_sample')
print('Latent adata:\n',latent_adata.shape)

Latent adata:
 (299705, 15)


In [16]:
# Neighbour graph on the latent space (n_pcs=0), followed by the UMAP embedding
sc.pp.neighbors(adata=latent_adata, n_pcs=0)
sc.tl.umap(adata=latent_adata)

In [17]:
# Store the latent data in the directory of the new run
latent_file=path_out+params_new['task_name']+'/latent.h5ad'
h.save_h5ad(adata=latent_adata, file=latent_file, unique_id2=UID2)

... storing 'file' as categorical
... storing 'study' as categorical
... storing 'study_sample' as categorical
... storing 'reference' as categorical
... storing 'pre_cell_type' as categorical
... storing 'phase' as categorical
... storing 'phase_cyclone' as categorical
... storing 'sex' as categorical
... storing 'age' as categorical
... storing 'design' as categorical
... storing 'strain' as categorical
... storing 'tissue' as categorical
... storing 'technique' as categorical
... storing 'cell_type' as categorical
... storing 'cell_type_multiplet' as categorical
... storing 'cell_subtype' as categorical
... storing 'cell_subtype_multiplet' as categorical
... storing 'true_type' as categorical
