**In this notebook we filter each benchmark dataset down to several different numbers of highly variable genes (HVGs) and save the resulting files for clustering.**

In [30]:
import os
import sys

import anndata
import h5py
import loompy as lp
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc


### Save h5 and loom files for each number of HVGs, for use by the clustering methods

In [72]:
# Raw loom files for the benchmark datasets (one entry per dataset).
looms = ['/home/tchari/counts/allen_bivi/loom/processed_allen_A08_raw.loom',
        '/home/tchari/counts/allen_bivi/loom/processed_allen_B02H01_raw.loom',
        '/home/tchari/counts/scMix/cl3/loom/processed_cl3_raw.loom',
        '/home/tchari/counts/scMix/cl5/loom/processed_cl5_raw.loom']

# Short dataset tags used as output file-name prefixes; same order as `looms`.
# NOTE(review): the first loom file is named "A08" but its tag is "allen_b08" --
# confirm whether this is intentional before renaming, since the files already
# written to ./hvg_objs use this tag.
short = ['allen_b08','allen_b02h01','cl3','cl5']


#Set number of HVGs to try (based on standard procedure)
hvgs = [300, 1000, 2000, 4000]

# Create the output directory. exist_ok=True keeps this cell re-runnable:
# a plain `mkdir` errors out once the directory already exists.
os.makedirs('./hvg_objs', exist_ok=True)


# Datasets flagged as nuclear data: their HVGs are selected from the unspliced
# counts, while every other dataset uses the spliced counts.
nuclear_looms = {'/home/tchari/counts/allen_bivi/loom/processed_allen_B02H01_raw.loom'}

for l, s in zip(looms, short):
    # Pull everything we need into memory. The context manager closes the loom
    # file even if one of the reads raises.
    with lp.connect(l) as ds:
        S = ds.layers['spliced'][:, :]      # rows are genes (indexed by the gene mask below)
        U = ds.layers['unspliced'][:, :]
        bars = ds.ca['barcode']
        subclass = ds.ca['subclass_label']
        g_names = ds.ra['gene_name']

    if l in nuclear_looms:
        X = U.T  # nuclear data
        print('nuclear')
    else:
        X = S.T

    # Normalisation and log-transform do not depend on the number of HVGs, so
    # they are done once per dataset rather than once per HVG setting.
    # X is copied so that any in-place normalisation cannot write through to
    # the S/U count matrices that are saved below.
    adata = anndata.AnnData(X=X.copy())
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Integer-encode the subclass labels: each label maps to its position in
    # the sorted list of unique labels. Also independent of the HVG count.
    _, ys = np.unique(subclass, return_inverse=True)

    for h in hvgs:
        # inplace=False returns the per-gene statistics instead of mutating
        # `adata`, so repeated calls with different `h` stay independent.
        hvg_stats = sc.pp.highly_variable_genes(adata, n_top_genes=h, inplace=False)
        hvg_mask = np.asarray(hvg_stats['highly_variable'], dtype=bool)

        S_sub = S[hvg_mask, :]
        U_sub = U[hvg_mask, :]
        g_names_sub = g_names[hvg_mask]

        # NOTE(review): X is always the spliced counts, including for the
        # nuclear dataset whose HVGs were chosen from unspliced counts --
        # confirm downstream methods read the layers rather than X.
        retAdata = anndata.AnnData(
            X=S_sub.T,
            layers={
                'spliced': S_sub.T,
                'unspliced': U_sub.T
            },
            obs=pd.DataFrame({'barcode': bars, 'subclass_label': subclass},
                             index=bars),
            var=pd.DataFrame({'gene_name': g_names_sub}, index=g_names_sub)
        )

        out_prefix = './hvg_objs/' + s + '_' + str(h) + 'hvgs'
        retAdata.write_loom(out_prefix + '.loom')

        #h5 files for scMDC
        # X1 = unspliced, X2 = spliced (both cells x genes), Y = integer labels.
        with h5py.File(out_prefix + '.h5', 'w') as hf:
            hf.create_dataset('X1', data=U_sub.T)
            hf.create_dataset('X2', data=S_sub.T)
            hf.create_dataset('Y', data=ys)


mkdir: cannot create directory ‘./hvg_objs’: File exists


In [73]:
# List the files in ./hvg_objs with human-readable sizes.
!ls -lh ./hvg_objs

total 2.2G
-rw-rw-r--. 1 tchari tchari 117M Feb  8 18:50 allen_b02h01_1000hvgs.h5
-rw-rw-r--. 1 tchari tchari  14M Feb  8 18:53 allen_b02h01_1000hvgs.loom
-rw-rw-r--. 1 tchari tchari 233M Feb  8 18:50 allen_b02h01_2000hvgs.h5
-rw-rw-r--. 1 tchari tchari  24M Feb  8 18:50 allen_b02h01_2000hvgs.loom
-rw-rw-r--. 1 tchari tchari  35M Feb  8 18:50 allen_b02h01_300hvgs.h5
-rw-rw-r--. 1 tchari tchari 5.2M Feb  8 18:50 allen_b02h01_300hvgs.loom
-rw-rw-r--. 1 tchari tchari 466M Feb  8 18:51 allen_b02h01_4000hvgs.h5
-rw-rw-r--. 1 tchari tchari  44M Feb  8 18:51 allen_b02h01_4000hvgs.loom
-rw-rw-r--. 1 tchari tchari  44M Feb  8 18:49 allen_b08_1000hvgs.h5
-rw-rw-r--. 1 tchari tchari  13M Feb  8 18:49 allen_b08_1000hvgs.loom
-rw-rw-r--. 1 tchari tchari  87M Feb  8 18:49 allen_b08_2000hvgs.h5
-rw-rw-r--. 1 tchari tchari  23M Feb  8 18:49 allen_b08_2000hvgs.loom
-rw-rw-r--. 1 tchari tchari  14M Feb  8 18:49 allen_b08_300hvgs.h5
-rw-rw-r--. 1 tchari tchari 4.1M Feb  8 18:49 allen_b08_30

In [75]:
# !rm ./hvg_objs/meK_looms.tar.gz
# !rm ./hvg_objs/meK_h5s.tar.gz

In [61]:
# test = lp.connect('./hvg_objs/cl3_300hvgs.loom')
# test.ca.keys()
# test.close()

['barcode', 'obs_names', 'subclass_label']

In [66]:
# hf = h5py.File('./hvg_objs/cl5_2000hvgs.h5')
# test = np.array(hf['Y'])
# testX = np.array(hf['X1'])
# hf.close()

In [77]:
# Bundle every .loom file in ./hvg_objs into one gzip-compressed archive
# (meK_looms.tar.gz); -v prints each path as it is added.
!tar -cvzf ./hvg_objs/meK_looms.tar.gz ./hvg_objs/*.loom

./hvg_objs/allen_b02h01_1000hvgs.loom
./hvg_objs/allen_b02h01_2000hvgs.loom
./hvg_objs/allen_b02h01_300hvgs.loom
./hvg_objs/allen_b02h01_4000hvgs.loom
./hvg_objs/allen_b08_1000hvgs.loom
./hvg_objs/allen_b08_2000hvgs.loom
./hvg_objs/allen_b08_300hvgs.loom
./hvg_objs/allen_b08_4000hvgs.loom
./hvg_objs/cl3_1000hvgs.loom
./hvg_objs/cl3_2000hvgs.loom
./hvg_objs/cl3_300hvgs.loom
./hvg_objs/cl3_4000hvgs.loom
./hvg_objs/cl5_1000hvgs.loom
./hvg_objs/cl5_2000hvgs.loom
./hvg_objs/cl5_300hvgs.loom
./hvg_objs/cl5_4000hvgs.loom


In [78]:
# Bundle every .h5 file in ./hvg_objs into one gzip-compressed archive
# (meK_h5s.tar.gz); -v prints each path as it is added.
!tar -cvzf ./hvg_objs/meK_h5s.tar.gz ./hvg_objs/*.h5

./hvg_objs/allen_b02h01_1000hvgs.h5
./hvg_objs/allen_b02h01_2000hvgs.h5
./hvg_objs/allen_b02h01_300hvgs.h5
./hvg_objs/allen_b02h01_4000hvgs.h5
./hvg_objs/allen_b08_1000hvgs.h5
./hvg_objs/allen_b08_2000hvgs.h5
./hvg_objs/allen_b08_300hvgs.h5
./hvg_objs/allen_b08_4000hvgs.h5
./hvg_objs/cl3_1000hvgs.h5
./hvg_objs/cl3_2000hvgs.h5
./hvg_objs/cl3_300hvgs.h5
./hvg_objs/cl3_4000hvgs.h5
./hvg_objs/cl5_1000hvgs.h5
./hvg_objs/cl5_2000hvgs.h5
./hvg_objs/cl5_300hvgs.h5
./hvg_objs/cl5_4000hvgs.h5
