## Group samples by dataset, using increasing numbers of cells

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings("ignore")
import scIB
import os
import scanpy as sc
from os.path import join
from os import listdir
import anndata
import numpy as np
import scipy

In [3]:
### Use the scran related directory to map all the files we need to put together.
# datadir_orig: directory whose entries are listed and whose md5sum listing files are read in the next cell.
# datadir_scran: directory from which the per-dataset .h5ad files are loaded.
datadir_orig = '/storage/groups/ml01/datasets/projects/20210318_retinal_data_integration_ignacio.ibarra_malte.luecken'
datadir_scran = '../../data/integration_march_2021/scran'
# Display whether each of the two directories exists on this machine.
os.path.exists(datadir_orig), os.path.exists(datadir_scran)

(True, True)

In [4]:

# convert counts into float32
# Convenience method for computing the size of objects
def print_size_in_MB(x):
    """Print the value of ``x.__sizeof__()`` divided by 1e6, followed by ' MB'.

    The number is formatted with three significant digits.
    """
    size_mb = x.__sizeof__() / 1e6
    print(f'{size_mb:.3} MB')

# Names of all entries found in the original data directory.
filenames = os.listdir(datadir_orig)

# Read every line of the two md5sum listings. Each listing is opened in a
# `with` block so its file handle is closed right after reading, and blank
# lines are dropped because they cannot be split into a checksum and a path.
filenames_md5 = []
for md5_listing in ['md5sum_chen_5_fix5.txt', 'md5sum.txt']:
    with open(os.path.join(datadir_orig, md5_listing)) as handle:
        filenames_md5.extend(r.strip() for r in handle if r.strip())

# Collect every listed path and report the ones missing under datadir_scran.
files = set()
for qi in filenames_md5:
    # Lines are expected to look like '<checksum>  <path>' (two spaces).
    # Split only once so a path that itself contains two spaces stays intact.
    md5, fi = qi.split('  ', 1)
    found = os.path.exists(os.path.join(datadir_scran, fi))
    if not found:
        print('not found', fi)
    files.add(fi)


In [5]:
# Group the listed sample files by dataset: the dataset is the parent
# directory of each listed path and the filename is its last component.
filenames_by_dataset = {}
for f in filenames_md5:
    listed_path = f.split(' ')[-1]
    dataset, filename = listed_path.split('/')[-2:]
    filenames_by_dataset.setdefault(dataset, []).append(filename)

In [6]:
# Display the dataset -> sample filenames mapping for a visual check.
filenames_by_dataset

{'Chen_a_fix5': ['10x3_Lobe_D005_13_NeuN.h5ad',
  '10x3_Lobe_D013_13_NeuN.h5ad',
  '10x_Lobe_D27_13_Nu.h5ad',
  '10x_Lobe_D28_13_Nu.h5ad',
  '10x_Lobe_D30_13_NeuN.h5ad'],
 'Wong': ['Retina_3B.h5ad',
  'Retina_2B.h5ad',
  'Retina_1.h5ad',
  'Retina_2A.h5ad',
  'Retina_3A.h5ad'],
 'Scheetz': ['GSM3745992.h5ad',
  'GSM3745996.h5ad',
  'GSM3745993.h5ad',
  'GSM3745997.h5ad',
  'GSM3745995.h5ad',
  'GSM3745994.h5ad'],
 'Roska': ['R-00646_03_Fovea_Retina_Left.h5ad',
  'R-00646_04_Fovea_Retina_Right.h5ad',
  'R-00646_04_Fovea_ChoroidRPE_Left.h5ad',
  'R-00646_04_Periphery_ChoroidRPE_Left.h5ad',
  'R-00646_07_Periphery_ChoroidRPE.h5ad',
  'R-00646_03_Fovea_ChoroidRPE_Right.h5ad',
  'R-00646_04_Periphery_Retina_Left.h5ad',
  'R-00646_01_Periphery_ChoroidRPE_Left.h5ad',
  'R-00646_03_Fovea_Retina_Right.h5ad',
  'R-00646_03_Periphery_Retina_Right.h5ad',
  'R-00646_07_Periphery_RetinaChoroidRPE.h5ad',
  'R-00646_01_Periphery_Retina_Right.h5ad',
  'R-00646_04_Fovea_ChoroidRPE_Right.h5ad',
  'R-0064

The function below loads all sample files of a given dataset (optionally subsampling each file) and returns them concatenated into a single object.

In [24]:
def get_by_dataset(dataset_name, filenames=None, n_sample=None, random_state=None):
    """Load the sample files of one dataset and return them concatenated.

    Parameters
    ----------
    dataset_name : str
        Sub-directory of ``datadir_scran`` that holds the dataset's files.
    filenames : list of str, optional
        Files to load from that sub-directory. When ``None``, every entry of
        the sub-directory is used.
    n_sample : int, optional
        When given, each file is subsampled to at most ``n_sample`` cells
        before concatenation. ``None`` keeps all cells.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the per-file subsampling can be
        made reproducible. ``None`` (the default) keeps it unseeded.

    Returns
    -------
    The result of calling ``concatenate`` on the first loaded object with the
    remaining ones. Every loaded object gets the ``obs`` columns ``dataset``
    and ``filename`` (file name without the ``.h5ad`` suffix) beforehand.

    Raises
    ------
    ValueError
        If there is no file to load.
    """
    if filenames is None:
        filenames = listdir(join(datadir_scran, dataset_name))
    print('# datasets', len(filenames))
    if len(filenames) == 0:
        # Fail early with a clear message instead of an IndexError at the end.
        raise ValueError('no files to load for dataset %r' % (dataset_name,))

    adatas = []
    for i, f in enumerate(filenames):
        if len(adatas) % 20 == 0:
            print('loaded so far', len(adatas))

        p = join(datadir_scran, dataset_name, f)

        # Prefer the Chen_a_fix5 version of a Chen_a file when one exists.
        # The path must actually change: for paths without 'Chen_a' the
        # substitution is a no-op and the (existing) original file would
        # otherwise be reported as a replacement.
        fix5_path = p.replace('Chen_a', 'Chen_a_fix5')
        if fix5_path != p and os.path.exists(fix5_path):
            print('replace this path')
            print(p)
            print('by this path')
            print(fix5_path)
            p = fix5_path

        print(i, n_sample, p)
        ad = sc.read_h5ad(p)

        if n_sample is not None:
            print('subsampling')
            n_keep = min(n_sample, ad.shape[0])
            idx_sample = ad.obs.sample(n_keep, random_state=random_state).index
            # The boolean mask keeps the original cell order. Copy explicitly
            # so the obs columns below are set on a real object, not a view.
            ad = ad[ad.obs.index.isin(idx_sample), :].copy()

        ad.obs['dataset'] = dataset_name
        ad.obs['filename'] = f.replace('.h5ad', '')
        adatas.append(ad)

    # concatenate is called with its default join; join='outer' is not enabled.
    return adatas[0].concatenate(adatas[1:])


In [25]:
from os.path import exists

# Seed for the per-batch subsampling, so re-running the cell yields the same cells.
SUBSAMPLE_SEED = 0

for n_sample in [250, 500, 1000, None]:
    # Only the 500-cell subsample and the full data (None) are produced.
    if n_sample not in (500, None):
        continue

    subsampling_code = '' if n_sample is None else '_' + str(n_sample)
    outdir = '../../data/integration_march_2021/input/bydataset%s' % subsampling_code
    # makedirs also creates missing parent directories and tolerates an existing one.
    os.makedirs(outdir, exist_ok=True)

    for dataset in filenames_by_dataset:

        # Chen_a_fix5 is not exported as its own dataset here.
        if dataset == 'Chen_a_fix5':
            continue

        print(dataset)

        next_filename = '%s%s.h5ad' % (dataset, subsampling_code)
        path_by_dataset = join(outdir, next_filename)

        # Outputs act as a cache: skip datasets that were already written.
        if exists(path_by_dataset):
            continue

        ad = get_by_dataset(dataset, filenames=filenames_by_dataset[dataset])
        if n_sample is not None:
            # A fresh generator per dataset keeps the selection independent of
            # which datasets were skipped above.
            rng = np.random.RandomState(SUBSAMPLE_SEED)
            sampled_obs = ad.obs.groupby('batch').apply(
                lambda x: x.sample(min(n_sample, len(x)), random_state=rng))
            # Take the innermost index level (the cell names) by position, so
            # this also works when the obs index carries a name.
            sel_idx = sampled_obs.index.get_level_values(-1)
            ad = ad[ad.obs.index.isin(sel_idx), :]

        print(ad.shape)
        ad.write(path_by_dataset, compression='lzf')
        print(dataset, 'done...')

Wong
Scheetz
Roska
Chen_c
Hafler
Chen_a
# datasets 40
loaded so far 0
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D006_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D006_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D007_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D007_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D010_Nu.h5ad
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D010_Nu.h5ad
../../data/integration_march_2021/scran/Chen_a/10x_Lobe_D026_13_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x_Lobe_D026_13_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x_Lobe_D28_13_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x_Lobe_D28_13_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D005_NeuN.h5ad
../../data/integration_march_2021/scran/Chen_a/10x3_Lobe_19_D005_NeuN.h5ad
../../data/integration_march_2021/scran/

KeyboardInterrupt: 

### The following snippet merges the recounted Chen_a_fix5 samples with the old Chen_a samples that were not recounted

In [None]:
# Guard cell: stop execution here so that a "Run All" never reaches the next cell.
print('obsolete')
# Raise explicitly rather than `assert False`: assert statements are removed
# when Python runs with -O / PYTHONOPTIMIZE, which would disable the guard.
raise AssertionError('obsolete: the following cell must not be executed')

In [None]:
import scanpy as sc
for n_sample in [250, 500, 1000, None]:
    # Only the 500-cell subsample and the full data (None) are handled.
    if n_sample not in (500, None):
        continue

    subsampling_code = '' if n_sample is None else '_' + str(n_sample)
    outdir = '../../data/integration_march_2021/input/bydataset%s' % subsampling_code

    # 'merged.txt' marks a directory whose Chen_a files were already merged.
    log_path = os.path.join(outdir, 'merged.txt')
    if os.path.exists(log_path):
        print(n_sample, 'skip...')
        continue

    path_chen_fix = os.path.join(outdir, 'Chen_a_fix5%s.h5ad' % subsampling_code)
    path_chen_old = os.path.join(outdir, 'Chen_a%s.h5ad' % subsampling_code)

    print(os.path.exists(path_chen_fix), path_chen_fix)
    print(os.path.exists(path_chen_old), path_chen_old)

    ad_chen_fix = sc.read_h5ad(path_chen_fix)
    ad_chen_old = sc.read_h5ad(path_chen_old)
    print(ad_chen_fix.shape, ad_chen_old.shape)

    # Keep only the old samples whose filename is absent from the fix5 object.
    fixed_filenames = set(ad_chen_fix.obs['filename'])
    ad_chen_others = ad_chen_old[~ad_chen_old.obs['filename'].isin(fixed_filenames)]
    print(ad_chen_others.shape, ad_chen_fix.shape)

    ad_chen = ad_chen_fix.concatenate(ad_chen_others)
    # The merged object overwrites the Chen_a_fix5 file that was read above.
    ad_chen.write(path_chen_fix)
    print(ad_chen.obs['filename'].value_counts())

    # Write the marker last, and through `with` so the handle is always closed.
    with open(log_path, 'w') as writer:
        writer.write('chen new samples and old samples have been merged\n')

500 skip...
True ../../data/integration_march_2021/input/bydataset/Chen_a_fix5.h5ad
True ../../data/integration_march_2021/input/bydataset/Chen_a.h5ad
(198265, 28685) (1338345, 14560)
(1143648, 14560) (198265, 28685)
