In [2]:
import scanpy as sc
import os 
# Move the working directory two levels up; every "./data/..." path below is
# resolved relative to the resulting directory.
# NOTE(review): presumably this is also what makes the local `scnet` package
# importable on the next line — confirm. The path is relative, so re-running
# this cell moves up two more levels.
os.chdir("../../")
import scnet as sn

In [17]:
def hvg_batch(adata, batch_key=None, target_genes=2000, flavor='cell_ranger', n_bins=20, adataOut=False):
    """
    Select highly variable genes (HVGs) that are shared across batches.

    ``sc.pp.highly_variable_genes`` is run with ``batch_key`` and genes are
    then collected tier by tier: first the genes that are highly variable in
    all batches, then those in all but one batch, and so on, until
    ``target_genes`` genes have been gathered. The full-intersection tier and
    the last, only partially used tier are ranked by ``dispersions_norm``
    (highest first).

    Parameters
    ----------
    adata
        Annotated data object. ``adata.obs[batch_key]`` must be categorical
        (its ``.cat.categories`` are counted).
    batch_key
        Name of the ``adata.obs`` column holding the batch labels.
    target_genes
        Number of genes to select.
    flavor, n_bins
        Passed through to ``sc.pp.highly_variable_genes``.
    adataOut
        If False, work on a copy of ``adata`` and return the gene names.
        If True, ``adata`` itself is passed to scanpy (so the HVG columns are
        added to ``adata.var``) and a copy restricted to the selected genes is
        returned.

    Returns
    -------
    list of gene names if ``adataOut`` is False, otherwise a new object
    ``adata[:, hvg].copy()``.

    Notes
    -----
    If fewer than ``target_genes`` genes exist in total, all available genes
    are returned and a notice is printed, instead of searching further
    (non-existent) tiers indefinitely.
    """

    adata_hvg = adata if adataOut else adata.copy()

    n_batches = len(adata_hvg.obs[batch_key].cat.categories)

    # Batch-aware HVG call requesting target_genes genes; the columns
    # 'dispersions_norm' and 'highly_variable_nbatches' read below come from it.
    sc.pp.highly_variable_genes(adata_hvg,
                                flavor=flavor,
                                n_top_genes=target_genes,
                                n_bins=n_bins,
                                batch_key=batch_key)

    dispersions = adata_hvg.var['dispersions_norm']
    hvg_batch_counts = adata_hvg.var['highly_variable_nbatches']

    # Genes that are highly variable in every batch, best dispersion first.
    full_intersect = dispersions[hvg_batch_counts > n_batches - 1].sort_values(ascending=False)

    if len(full_intersect) > target_genes:
        hvg = full_intersect.index[:target_genes]

    else:
        enough = False
        print(f'Using {len(full_intersect)} HVGs from full intersect set')
        hvg = full_intersect.index[:]
        not_n_batches = 1

        # not_n_batches == n_batches is the last possible tier (genes that are
        # highly variable in zero batches); beyond it nothing can match, so
        # the loop must stop there even if the target was not reached.
        while not enough and not_n_batches <= n_batches:
            target_genes_diff = target_genes - len(hvg)

            tmp_dispersions = dispersions[hvg_batch_counts == (n_batches - not_n_batches)]

            if len(tmp_dispersions) < target_genes_diff:
                # The whole tier fits: take all of it and move to the next one.
                print(f'Using {len(tmp_dispersions)} HVGs from n_batch-{not_n_batches} set')
                hvg = hvg.append(tmp_dispersions.index)
                not_n_batches += 1

            else:
                # Tier is large enough: take only its best-ranked genes.
                print(f'Using {target_genes_diff} HVGs from n_batch-{not_n_batches} set')
                ranked = tmp_dispersions.sort_values(ascending=False)
                hvg = hvg.append(ranked.index[:target_genes_diff])
                enough = True

        if not enough:
            print(f'Only {len(hvg)} genes available, fewer than the requested {target_genes}')

    print(f'Using {len(hvg)} HVGs')

    if not adataOut:
        return hvg.tolist()
    return adata_hvg[:, hvg].copy()



In [10]:
# Load the pancreas count data (path relative to the current working directory).
# NOTE(review): a later cell writes `adata` back to this same path, so what is
# loaded here depends on whether that cell has already been run.
adata_count = sc.read("./data/pancreas/pancreas_count.h5ad")
adata_count

AnnData object with n_obs × n_vars = 15921 × 15369 
    obs: 'batch', 'study', 'cell_type'

In [11]:
# Work on a copy so that `adata_count` is not modified by the steps applied to `adata`.
adata = adata_count.copy()

In [12]:
# Per-cell normalisation followed by log1p; both calls act on `adata` directly
# (their return values are not used).
# NOTE(review): the recorded outputs further down show X maxima of 1453667.0 for
# `adata` / `adata_hvg`, and those objects are saved under *_count.h5ad names,
# which does not match this cell having been applied to them — check the
# execution order before relying on a top-to-bottom run.
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# normalize_total — confirm before switching.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)

In [13]:
# Select 7000 batch-aware HVGs using 'study' as the batch column.
# With adataOut=True, hvg_batch annotates `adata` itself (the HVG columns are
# added to adata.var) and returns a copy restricted to the selected genes.
adata_hvg = hvg_batch(adata, batch_key="study", target_genes=7000, adataOut=True)
adata_hvg

Using 1209 HVGs from full intersect set
Using 2016 HVGs from n_batch-1 set
Using 3115 HVGs from n_batch-2 set
Using 660 HVGs from n_batch-3 set
Using 7000 HVGs


AnnData object with n_obs × n_vars = 15921 × 7000 
    obs: 'batch', 'study', 'cell_type', 'n_counts'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [17]:
# Separate copy of `adata` that will receive the normalised values.
adata_normalized = adata.copy()

In [18]:
# Normalise with the project's scnet helper; the result replaces `adata_normalized`.
# NOTE(review): sn.tl.normalize is not shown in this notebook. Presumably
# n_top_genes=-1 is a sentinel controlling gene selection — confirm against scnet.
# The recorded output lists 7000 genes.
adata_normalized = sn.tl.normalize(adata_normalized,
                                   filter_min_counts=False,
                                   logtrans_input=True,
                                   size_factors=True,
                                   normalize_input=False,
                                   n_top_genes=-1,
                                   )
adata_normalized

AnnData object with n_obs × n_vars = 15921 × 7000 
    obs: 'batch', 'study', 'cell_type', 'n_counts', 'size_factors'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [19]:
# Restrict `adata` to the genes present in `adata_normalized`.
# This rebinds `adata` to a view of the previous object (see the recorded
# "View of AnnData object" output), not to an independent copy.
adata = adata[:, adata_normalized.var_names]
adata

View of AnnData object with n_obs × n_vars = 15921 × 7000 
    obs: 'batch', 'study', 'cell_type'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [20]:
# Value range of adata.X as a quick check of what kind of values it holds.
adata.X.min(), adata.X.max()

(0.0, 1453667.0)

In [22]:
# Value range of adata_normalized.X after the scnet normalisation.
adata_normalized.X.min(), adata_normalized.X.max()

(0.0, 8.5599375)

In [21]:
# Save the gene-subset `adata` as the "count" file of the pancreas_hvg_normal dataset.
adata.write_h5ad("./data/pancreas_hvg_normal/pancreas_hvg_normal_count.h5ad")

In [23]:
# Save the normalised counterpart next to the count file.
adata_normalized.write_h5ad("./data/pancreas_hvg_normal/pancreas_hvg_normal_normalized.h5ad")

In [24]:
# Value range of adata_hvg.X before it is saved as the "count" file.
adata_hvg.X.min(), adata_hvg.X.max()

(0.0, 1453667.0)

In [25]:
# Save the HVG subset returned by hvg_batch as the "count" file of the pancreas_hvg_br dataset.
adata_hvg.write_h5ad("./data/pancreas_hvg_br/pancreas_hvg_br_count.h5ad")

In [26]:
# Separate copy of `adata_hvg` that will receive the normalised values.
adata_hvg_normalized = adata_hvg.copy()

In [27]:
# Same scnet normalisation settings as for `adata_normalized`, except that
# n_top_genes is 7000 here (the number of genes already in `adata_hvg`).
# NOTE(review): sn.tl.normalize is not shown in this notebook — confirm how it
# interprets n_top_genes.
adata_hvg_normalized = sn.tl.normalize(adata_hvg_normalized,
                                       filter_min_counts=False,
                                       logtrans_input=True,
                                       size_factors=True,
                                       normalize_input=False,
                                       n_top_genes=7000,
                                       )
adata_hvg_normalized

AnnData object with n_obs × n_vars = 15921 × 7000 
    obs: 'batch', 'study', 'cell_type', 'n_counts', 'size_factors'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'

In [28]:
# Value range of adata_hvg_normalized.X after the scnet normalisation.
adata_hvg_normalized.X.min(), adata_hvg_normalized.X.max()

(0.0, 8.168483)

In [30]:
# Save the normalised HVG subset next to its count file.
adata_hvg_normalized.write_h5ad("./data/pancreas_hvg_br/pancreas_hvg_br_normalized.h5ad")

In [36]:
# Re-read the pancreas count file into `adata`, replacing the object used above.
# NOTE(review): the recorded output shows 15681 cells here versus 15921 when the
# same path was first loaded; another cell in this notebook writes `adata` back
# to this path, so the file on disk has changed between the two reads.
adata = sc.read("./data/pancreas/pancreas_count.h5ad")
adata

AnnData object with n_obs × n_vars = 15681 × 15369 
    obs: 'batch', 'study', 'cell_type'

In [37]:
# Number of cells per study.
adata.obs['study'].value_counts()

Pancreas inDrop         8391
Pancreas SS2            2961
Pancreas CelSeq2        2426
Pancreas CelSeq         1271
Pancreas Fluidigm C1     632
Name: study, dtype: int64

In [38]:
# Number of cells per cell type.
adata.obs['cell_type'].value_counts()

Pancreas Beta           5085
Pancreas Alpha          4704
Pancreas Ductal         2104
Pancreas Delta          1041
Pancreas Endothelial     836
Pancreas Acinar          713
Pancreas Gamma           637
Pancreas Stellate        561
Name: cell_type, dtype: int64

In [31]:
# Drop the cells labelled 'Pancreas Unkonwn'. The spelling is kept exactly as
# written because it has to match the label stored in the data — TODO confirm.
# .copy() makes the subset an independent AnnData, so the .obs assignments below
# no longer hit a view (which produced the implicit
# "Trying to set attribute `.obs` of view, making a copy." conversions).
adata = adata[~adata.obs['cell_type'].isin(['Pancreas Unkonwn'])].copy()

# Merge the ER-stress beta cells into the regular beta label. After this rename
# no cell carries 'Pancreas Beta(ER stress)' any more, so a subsequent filter on
# that label would not remove anything and is not needed.
adata.obs['cell_type'] = adata.obs['cell_type'].replace("Pancreas Beta(ER stress)", "Pancreas Beta")

# Unify the capitalisation of the CelSeq study name.
adata.obs['study'] = adata.obs['study'].replace("Pancreas Celseq", "Pancreas CelSeq")

Trying to set attribute `.obs` of view, making a copy.
Trying to set attribute `.obs` of view, making a copy.


In [34]:
# Show the summary of `adata` after the label clean-up.
adata

AnnData object with n_obs × n_vars = 15681 × 15369 
    obs: 'batch', 'study', 'cell_type'

In [35]:
# Write the cleaned `adata` back to disk.
# NOTE(review): this is the same path the notebook reads its input from, so the
# original file is overwritten and earlier cells will load the filtered data on
# the next run — consider writing to a new file name instead.
adata.write_h5ad("./data/pancreas/pancreas_count.h5ad")

... storing 'study' as categorical
