In [216]:
import os
import scanpy as sc
from os.path import join, exists
from os import listdir
import anndata
import scipy
import numpy as np
import sys

from utils import *

# convert counts into float32
# Convenience helpers that report the size of an object as a formatted string
def print_size_in_MB(x):
    """Return the size of ``x`` as a string such as ``'1.23 MB'``.

    The byte count is taken from ``x.__sizeof__()`` and divided by 1e6
    (decimal megabytes); the number is rendered with the ``'{:.3}'`` format
    spec. Despite the name, nothing is printed: the string is returned.
    """
    megabytes = x.__sizeof__() / 1e6
    return '{:.3} MB'.format(megabytes)

def print_size_in_MB_sparse_matrix(a):
    """Return the memory used by a sparse matrix's buffers, e.g. ``'1.23 MB'``.

    The previous implementation divided ``a.data.size`` (the number of stored
    elements, not bytes) by 1024**2 and ignored the index arrays, so the
    reported figure was not a memory size. This version sums ``nbytes`` over
    the array buffers the matrix exposes and divides by 1e6, the same divisor
    ``print_size_in_MB`` uses.

    Parameters
    ----------
    a : object
        Any object exposing some of the attributes ``data``, ``indices``,
        ``indptr``, ``row``, ``col`` or ``offsets`` as arrays with an
        ``nbytes`` attribute. Attributes that are missing are skipped.

    Returns
    -------
    str
        The total formatted with ``'{:.3} MB'``.
    """
    # Example input:
    # a = scipy.sparse.csr_matrix(np.random.randint(10, size=(40, 3)))
    buffer_names = ('data', 'indices', 'indptr', 'row', 'col', 'offsets')
    total_bytes = 0
    for buffer_name in buffer_names:
        buffer = getattr(a, buffer_name, None)
        buffer_bytes = getattr(buffer, 'nbytes', None)
        if buffer_bytes is not None:
            total_bytes += buffer_bytes
    size = total_bytes / 1e6
    return '{:.3} MB'.format(size)

import warnings
# Install a global "ignore" filter: warnings raised after this point are
# suppressed. NOTE(review): this also hides deprecation and numerical warnings
# from the libraries imported above -- consider narrowing it by category.
warnings.filterwarnings("ignore")

### Prepare input for general run

In [233]:
scib_methods_path = 'scib_config.methods.yml'

# Parse the methods file into ``methods_data``: a dict that maps each method
# name to the list of (whitespace-stripped) lines that follow its header.
methods_data = {}
curr_method = None
method_parms = []
# Use a context manager so the file handle is closed once parsing is done.
with open(scib_methods_path) as methods_file:
    for e in methods_file:
        # A method header is a line indented by exactly two spaces.
        if e.startswith('  ') and not e.startswith('   '):
            if curr_method is not None:
                methods_data[curr_method] = method_parms
            # Drop the last character of the stripped header
            # (presumably the trailing ':' -- confirm against the file).
            curr_method = e.strip()[:-1]
            method_parms = []
        else:
            # Every other line is collected under the current method; lines
            # seen before the first header are discarded when it is reached.
            method_parms.append(e.strip())

# Store the last method. Skip this when no header was found at all, so that
# stray lines are not stored under a ``None`` key.
if curr_method is not None:
    methods_data[curr_method] = method_parms

In [234]:
# Show the method names parsed from scib_config.methods.yml. The dict built by
# the parsing cell is `methods_data`; the bare name `methods` is not defined in
# any visible cell and would only work through leftover kernel state.
methods_data.keys()

dict_keys(['seurat', 'trvae', 'harmony', 'scvi', 'fastmnn', 'mnn', 'scanorama', 'saucie', 'trvaep', 'bbknn', 'conos', 'combat', 'liger', 'scgen', 'desc'])

In [235]:
# Integration methods to include in the generated config.
# Currently disabled: 'saucie', 'scgen'.
methods_query = [
    'scanorama',
    'seurat',
    'combat',
    'bbknn',
]

In [237]:


# Build the scIB config: fill the template with the selected methods and the data scenarios


scib_template_path = 'scib_config.template.yml'
# Read the whole template into a single string. The context manager closes the
# file handle; a missing template raises FileNotFoundError here (the previous
# bare `exists(...)` call discarded its result and checked nothing).
with open(scib_template_path) as template_file:
    lines = template_file.read()

import pandas as pd
from os.path import abspath
input_dir = '/storage/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/data/integration_march_2021/input'
batches = ['dataset', 'batch.merged']
labels = ['cell.type']

# One row per (input file, batch key, label key) combination.
query_rows = []
# listdir() returns entries in arbitrary order; sort them so the generated
# config lists the scenarios in the same order on every run.
for f in sorted(listdir(input_dir)):
    # Keep only .h5ad files whose name contains neither 'all' nor 'part'.
    if 'all' in f or 'part' in f or not f.endswith('.h5ad'):
        continue
    p = abspath(join(input_dir, f))

    for batch in batches:
        for label in labels:
            query_rows.append([p, batch, label])
queries = pd.DataFrame(query_rows, columns=['file', 'batch_key', 'label_key'])

dataset_template = '''
  $query_name:
    batch_key: $batch_key
    label_key: $label_key
    assay: expression
    organism: human
    file: $path
'''

# Render one template entry per row of `queries` and concatenate them.
rendered_queries = []
for _, row in queries.iterrows():
    # Entry name: file name without '.h5ad', then batch key, then label key.
    file_stem = os.path.basename(row['file'].replace('.h5ad', ''))
    query_name = file_stem + '_' + row['batch_key'] + '_' + row['label_key']

    entry = dataset_template.replace('$query_name', query_name)
    entry = entry.replace('$batch_key', row['batch_key'])
    entry = entry.replace('$label_key', row['label_key'])
    entry = entry.replace('$path', row['file'])
    rendered_queries.append(entry)
out_queries = ''.join(rendered_queries)

# Render each selected method as a two-space-indented header followed by its
# parameter lines indented by four spaces.
method_blocks = []
for method_name in methods_query:
    param_lines = '\n    '.join(methods_data[method_name])
    method_blocks.append('  ' + method_name + ':\n    ' + param_lines + '\n')
out_methods = ''.join(method_blocks)

# Substitute both placeholders of the template text.
out = (
    lines
    .replace('$METHODS_QUERY', out_methods)
    .replace('$DATA_SCENARIOS_QUERY', out_queries)
)

config_output_path = '../../scib/config.yaml'
# Write the rendered config followed by a final newline. The context manager
# closes the file even if the write fails, and `write` emits the string in one
# call (`writelines` on a str iterates over it character by character).
with open(config_output_path, 'w') as writer:
    writer.write(out + '\n')

overwrite = False
# Print, for each subsample size, the suffix and the absolute path of the
# corresponding input file. `None` is handled below (empty suffix, '_all' in
# the file name) but is not in the list of sizes at the moment.
for n_sample_per_batch in (500, 1000):
    if n_sample_per_batch is None:
        code_n_cells = ''
        code_output = '_all'
    else:
        code_n_cells = '_' + str(n_sample_per_batch)
        code_output = '_' + str(n_sample_per_batch)

    print(code_n_cells)
    print('# of cells (input argument)', n_sample_per_batch)

    output_path = '../../data/integration_march_2021/input/input%s_cells.h5ad' % code_output
    abspath_scib = os.path.abspath(output_path)
    print(abspath_scib)

_500
# of cells (input argument) 500
/mnt/znas/icb_zstore01/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/data/integration_march_2021/input/input_500_cells.h5ad
_1000
# of cells (input argument) 1000
/mnt/znas/icb_zstore01/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/data/integration_march_2021/input/input_1000_cells.h5ad


### Check queries for routines that should run on a GPU (e.g. scgen)

In [214]:
p = '/storage/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/scib/snakemake_dryrun.out'
# Collect the lines of the dry-run output that start with 'conda' (after
# stripping) and mention both 'scgen' and 'runIntegration'. The context manager
# closes the file handle once the lines have been read.
with open(p) as dryrun_file:
    queries = [
        q for q in dryrun_file
        if q.strip().startswith('conda') and 'scgen' in q and 'runIntegration' in q
    ]
for q in queries:
    print(q)

        conda run -n scIB-python python scripts/runIntegration.py -i /storage/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/scIB_run/input_all_cells_batch.merged_cell.type/prepare/unscaled/hvg/adata_pre.h5ad -o /storage/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/scIB_run/input_all_cells_batch.merged_cell.type/integration/unscaled/hvg/scgen.h5ad 	      -b batch.merged --method scgen -v 2000 -c cell.type 	      

        conda run -n scIB-python python scripts/runIntegration.py -i /storage/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/scIB_run/input_1000_cells_batch.merged_cell.type/prepare/unscaled/hvg/adata_pre.h5ad -o /storage/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/scIB_run/input_1000_cells_batch.merged_cell.type/integration/unscaled/hvg/scgen.h5ad 	      -b batch.merged --method scgen -v 2000 -c cell.type 	      

        conda run -n scIB-python python script