### Scran normalization step (per sample)
- Each sample is processed independently: one scran-normalized output file is generated per input sample.

In [3]:
import os
import scanpy as sc
from os.path import join
from os import listdir
import anndata
import numpy as np
import scipy
import os

In [6]:
# convert counts into float32
# Convenience method for computing the size of objects
def print_size_in_MB(x):
    """Print the size reported by ``x.__sizeof__()`` in megabytes.

    The byte count is divided by 1e6 and printed with three significant
    digits, followed by the unit `` MB``.
    """
    size_mb = x.__sizeof__() / 1e6
    print('%s MB' % format(size_mb, '.3'))

In [8]:
# Root directory of the input data; input files are read from
# <datadir>/<dataset>/<filename>, and the md5sum manifests live directly in it.
datadir = '/storage/groups/ml01/datasets/projects/20210318_retinal_data_integration_ignacio.ibarra_malte.luecken'
# Root directory for the preprocessed output; results are written to
# <outdir>/<dataset>/<filename>.
outdir = '/mnt/znas/icb_zstore01/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/data/integration_march_2021/scran'

In [36]:
# Names of all entries directly inside the data directory.
filenames = os.listdir(datadir)

# Collect the stripped, non-empty lines of both md5sum manifests. Each line is
# expected to look like '<md5>  <relative/path>' (two spaces as separator).
# The manifests are opened with `with` so the file handles are closed again,
# and blank lines are skipped because they cannot be unpacked below.
filenames_md5 = []
for manifest in ['md5sum_chen_5_fix5.txt', 'md5sum.txt']:
    with open(os.path.join(datadir, manifest)) as handle:
        filenames_md5.extend(line.strip() for line in handle if line.strip())

# Check that every file listed in the manifests exists below `datadir`, and
# gather the listed relative paths in `files`.
files = set()
for qi in filenames_md5:
    # maxsplit=1: only the first double space separates checksum from path,
    # so a path that itself contains a double space still unpacks cleanly.
    md5, fi = qi.split('  ', 1)
    found = os.path.exists(os.path.join(datadir, fi))
    if not found:
        print('not found', fi)
    files.add(fi)

In [38]:
# Group the file names listed in the md5 manifests by dataset.
filenames_by_dataset = {}
for f in filenames_md5:
    # Take the text after the last space, then its last two '/'-separated
    # components: '<dataset>/<filename>'.
    dataset, filename = f.rsplit(' ', 1)[-1].rsplit('/', 2)[-2:]
    filenames_by_dataset.setdefault(dataset, []).append(filename)

## Scran normalization

In [39]:
# Display the dataset names collected from the md5 manifests.
filenames_by_dataset.keys()

dict_keys(['Chen_a_fix5', 'Wong', 'Scheetz', 'Roska', 'Chen_c', 'Hafler', 'Chen_a', 'Sanes', 'Hackney', 'Chen_b'])

In [41]:
import multiprocessing
from multiprocessing import Process
from multiprocessing import Manager

def execute_preprocessing(input_path, output_path):
    """Run the preprocessing script on one input file.

    Executes ``python ../../scib/scripts/preprocessing_remove_empty.py -i
    <input_path> -o <output_path>`` as a child process and waits for it to
    finish. The script path is relative to the current working directory.

    The command is passed as an argument list (no shell), so paths that
    contain spaces or shell metacharacters are handed to the script unchanged.
    Failures are reported on stdout instead of raising, so one failing file
    does not abort a batch: both a failure to launch the process and a
    non-zero exit status print ``something went wrong...`` plus a detail line.

    Parameters
    ----------
    input_path : str
        Path passed to the script's ``-i`` option.
    output_path : str
        Path passed to the script's ``-o`` option.

    Returns
    -------
    None
    """
    # Local import so the function does not depend on another cell's imports.
    import subprocess

    print('')
    path_preprocessing = '../../scib/scripts/preprocessing_remove_empty.py'
    # Human-readable form of the command, printed for the log only.
    cmd = 'python %s -i %s -o %s' % (path_preprocessing, input_path, output_path)
    argv = ['python', path_preprocessing, '-i', input_path, '-o', output_path]

    try:
        print(cmd)
        completed = subprocess.run(argv)
    except Exception as err:
        # e.g. the `python` executable could not be found or started
        print('something went wrong...')
        print(err)
        return

    # A failing script does not raise by itself; inspect the exit status.
    if completed.returncode != 0:
        print('something went wrong...')
        print('command exited with status %i' % completed.returncode)

def run(function, input_list, n_cores, log_each=None, log=False):
    """Call ``function`` once per entry of ``input_list`` in child processes.

    The entries are handled in consecutive batches of at most ``n_cores``.
    Every entry of a batch gets its own ``multiprocessing.Process``; all
    processes of a batch are joined before the next batch is started.

    Parameters
    ----------
    function : callable
        Target executed in each child process.
    input_list : sequence
        One entry per call. A list or tuple entry is unpacked as the
        positional arguments of ``function``; any other entry is passed as a
        single argument. An empty sequence is allowed and starts no process.
    n_cores : int
        Maximum number of processes alive at the same time (1 to 20).
    log_each : int, optional
        If given and non-zero, print a progress line each time the number of
        joined processes is a multiple of ``log_each``.
    log : bool, optional
        If True, print the arguments of every call before it is started.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If ``n_cores`` is not between 1 and 20.
    """
    print(('run function %s with n_cores = %i' % (function, n_cores)))
    print(function)

    assert 1 <= n_cores <= 20

    # Process(args=...) needs a sequence of positional arguments, so wrap
    # every entry that is not already a list/tuple into a one-element list.
    jobs = [args if isinstance(args, (list, tuple)) else [args]
            for args in input_list]

    n_done = 0
    # Step through the jobs in batches of n_cores; no empty trailing batch.
    for start in range(0, len(jobs), n_cores):
        processes = []
        for next_args in jobs[start:start + n_cores]:
            if log:
                print(next_args)
            process = Process(target=function, args=next_args)
            process.start()
            processes.append(process)

        # Wait for the whole batch before launching the next one.
        for process in processes:
            process.join()

            n_done += 1
            if log_each and n_done % log_each == 0:
                print('Done %i so far' % n_done)
    print('done...')


In [42]:
from os.path import join

# Build the list of pending jobs: one [input_file, output_file] pair for every
# listed file whose output does not exist yet. Re-running the cell therefore
# only schedules the files that are still missing.
arguments = []

for dataset in filenames_by_dataset:
    next_outdir = join(outdir, dataset)
    # makedirs(exist_ok=True) also creates `outdir` itself when it is missing
    # and does not fail if the directory already exists.
    os.makedirs(next_outdir, exist_ok=True)

    for filename in filenames_by_dataset[dataset]:
        input_file = join(datadir, dataset, filename)
        output_file = join(next_outdir, filename)

        # Skip files that already have an output.
        if os.path.exists(output_file):
            continue

        arguments.append([input_file, output_file])

arguments

In [14]:
# Manual single-file test of the preprocessing script. Keep it commented out once testing is finished.
# !python ../../scib/scripts/preprocessing_remove_empty.py -i /storage/groups/ml01/datasets/projects/20210318_retinal_data_integration_ignacio.ibarra_malte.luecken/Wong/Retina_2B.h5ad -o /mnt/znas/icb_zstore01/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/data/integration_march_2021/scran/Wong/Retina_2B.h5ad

In [44]:
# Number of pending [input_file, output_file] jobs.
print(len(arguments))
# Order the jobs by the on-disk size of their input file, smallest first.
arguments = sorted(arguments, key=lambda job: os.path.getsize(job[0]))

5


In [45]:
# Preprocess every pending [input_file, output_file] pair, with at most 10
# child processes running at the same time.
run(execute_preprocessing, arguments, n_cores=10)

run function <function execute_preprocessing at 0x7f64de357440> with n_cores = 10
<function execute_preprocessing at 0x7f64de357440>

python ../../scib/scripts/preprocessing_remove_empty.py -i /storage/groups/ml01/datasets/projects/20210318_retinal_data_integration_ignacio.ibarra_malte.luecken/Chen_a_fix5/10x_Lobe_D28_13_Nu.h5ad -o /mnt/znas/icb_zstore01/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/data/integration_march_2021/scran/Chen_a_fix5/10x_Lobe_D28_13_Nu.h5ad

python ../../scib/scripts/preprocessing_remove_empty.py -i /storage/groups/ml01/datasets/projects/20210318_retinal_data_integration_ignacio.ibarra_malte.luecken/Chen_a_fix5/10x_Lobe_D27_13_Nu.h5ad -o /mnt/znas/icb_zstore01/groups/ml01/workspace/ignacio.ibarra/theislab/retinal_scRNAseq_integration/data/integration_march_2021/scran/Chen_a_fix5/10x_Lobe_D27_13_Nu.h5ad

python ../../scib/scripts/preprocessing_remove_empty.py -i /storage/groups/ml01/datasets/projects/20210318_retinal_data_integrat

In [46]:
print('done...')

done...
