In [1]:
import json
import sys
import os
import numpy as np
import skimage
from scipy import ndimage
from scipy import special
from scipy import sparse
import anndata
import nibabel as nib
import pandas as pd
import datashader as ds
import time
from sklearn.decomposition import PCA
import collections
import pickle as pkl
import zarr
import h5py
import anndata as ad

import ants
import nibabel as nib

# Put the personal utility directory first on the import path; the local
# modules imported below (powerplots, imageu, regu, basicu) presumably live
# there -- TODO confirm.
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine with this exact layout.
sys.path.insert(0, '/home/fangming/projects/myutils')
# Star import: `sns` is used on the next line but is not imported explicitly
# in this cell, so it presumably comes from this module -- confirm.
from __init__plots import *
sns.set_style('white')

# Each local helper module is reloaded right after it is imported, so edits to
# its source take effect when this cell is re-run without restarting the kernel.
import importlib
import powerplots
importlib.reload(powerplots)
import imageu
importlib.reload(imageu)
import regu
importlib.reload(regu)
import basicu
importlib.reload(basicu)


# sys.path.insert(0, '/home/fangming/projects/dredfish/packages/dredFISH')
# Same approach for the PySpots package directory (also an absolute path).
sys.path.insert(0, '/home/fangming/projects/dredfish/packages/PySpots')
# Second star import: it can silently shadow any name defined above.
from MERFISH_Objects.FISHData import *


In [2]:
# sns.boxplot(scrna.values.flatten())
def group_sum(mat, groups, group_order=None):
    """
    Sum the rows of `mat` within each group.

    Parameters
    ----------
    mat : 2-D matrix (cell-by-feature)
        Dense or scipy-sparse; it is used as the right operand of a sparse
        `.dot`.
    groups : sequence of length ``mat.shape[0]``
        Group label of every cell (row of `mat`).
    group_order : sequence, optional
        Labels that define the rows of the result and their order. When
        omitted (or empty) ``np.unique(groups)`` is used.

    Returns
    -------
    (sums, group_order)
        `sums` has exactly ``len(group_order)`` rows; row ``i`` is the sum of
        the rows of `mat` whose label is ``group_order[i]``. A label listed in
        `group_order` that has no cells yields an all-zero row.

    Raises
    ------
    AssertionError
        If `groups` does not have one label per row of `mat`.

    Notes
    -----
    Relies on ``basicu.get_index_from_array(group_order, groups)`` returning,
    for each label in `groups`, its integer position in `group_order`
    (assumption from how the result is used here -- TODO confirm). How labels
    missing from `group_order` are handled depends on that helper.

    TODO: this could be sped up by exploiting the label structure.
    """
    m, _ = mat.shape  # unpacking also enforces that `mat` is 2-D
    assert m == len(groups)
    if group_order is None or len(group_order) == 0:
        group_order = np.unique(groups)

    group_idx = basicu.get_index_from_array(group_order, groups)
    # Group-by-cell indicator matrix. The shape is given explicitly: without it
    # scipy infers the row count from the largest index present, which drops
    # trailing groups of `group_order` that have no cells and leaves the result
    # misaligned with the returned `group_order`.
    groupmat = sparse.csc_matrix(
        (np.ones(m, dtype=int), (group_idx, np.arange(m))),
        shape=(len(group_order), m),
    )

    return groupmat.dot(mat), group_order

def group_mean(mat, groups, group_order=None):
    """
    Average the rows of `mat` within each group.

    Parameters
    ----------
    mat : 2-D matrix (cell-by-feature)
        Dense or scipy-sparse.
    groups : sequence of length ``mat.shape[0]``
        Group label of every cell (row of `mat`).
    group_order : sequence, optional
        Labels that define the rows of the result and their order. When
        omitted (or empty) ``np.unique(groups)`` is used.

    Returns
    -------
    (means, group_order)
        `means` has exactly ``len(group_order)`` rows; row ``i`` is the mean of
        the rows of `mat` whose label is ``group_order[i]``.
        For dense `mat` the result is an ``np.matrix`` (the type this function
        has always returned) and a group without cells gives a row of NaN.
        For sparse `mat` the result is a scipy sparse matrix and a group
        without cells gives an empty (all-zero) row.

    Raises
    ------
    AssertionError
        If `groups` does not have one label per row of `mat`.

    Notes
    -----
    Relies on ``basicu.get_index_from_array(group_order, groups)`` returning,
    for each label in `groups`, its integer position in `group_order`
    (assumption from how the result is used here -- TODO confirm).
    """
    m, _ = mat.shape  # unpacking also enforces that `mat` is 2-D
    assert m == len(groups)
    if group_order is None or len(group_order) == 0:
        group_order = np.unique(groups)

    group_idx = basicu.get_index_from_array(group_order, groups)
    n_groups = len(group_order)
    # Group-by-cell indicator matrix with an explicit shape, so that groups
    # without cells still get a row and the output lines up with `group_order`.
    indicator = sparse.csc_matrix(
        (np.ones(m), (group_idx, np.arange(m))),
        shape=(n_groups, m),
    )
    counts = np.asarray(indicator.sum(axis=1)).ravel()  # cells per group

    # Sum first (sparse product), then divide by the group sizes. This avoids
    # materialising a dense groups-by-cells weight matrix.
    sums = indicator.dot(mat)

    if sparse.issparse(sums):
        inv_counts = np.zeros(n_groups)
        nonempty = counts > 0
        inv_counts[nonempty] = 1.0 / counts[nonempty]
        return sparse.diags(inv_counts).dot(sums), group_order

    # 0/0 for empty groups is intentional here and yields NaN rows.
    with np.errstate(divide='ignore', invalid='ignore'):
        means = np.asarray(sums) / counts[:, None]
    # np.matrix is kept as the dense return type for backward compatibility.
    return np.asmatrix(means), group_order

def IQR_normalize_vector(v):
    """
    Center `v` on its median and scale it by its interquartile range.

    Returns a new object; `v` itself is not modified. A zero interquartile
    range is not special-cased (the division is carried out as is).
    """
    centered = v.copy() - np.percentile(v, 50)
    # Quartiles are taken on the centered values.
    q25, q75 = np.percentile(centered, [25, 75])
    return centered / (q75 - q25)

def IQR_normalize_matrix_bycol(mat):
    """
    Apply `IQR_normalize_vector` to every column of `mat`.

    Parameters
    ----------
    mat : 2-D numpy array
        Must support ``mat[:, i]`` column indexing.

    Returns
    -------
    A new floating-point array of the same shape; `mat` is left untouched.
    Floating-point input keeps its dtype, any other dtype is converted to
    float.
    """
    # Work on a floating-point copy: writing the normalized (fractional) values
    # back into an integer array would silently truncate them.
    if np.issubdtype(mat.dtype, np.floating):
        out = mat.copy()
    else:
        out = mat.astype(float)

    # Scale each column (bit) on its own.
    for i in range(out.shape[1]):
        out[:, i] = IQR_normalize_vector(out[:, i])
    return out

def zscore(v, **kwargs):
    """
    Standardize `v`: subtract the mean and divide by the standard deviation.

    Parameters
    ----------
    v : array-like
    **kwargs
        Forwarded unchanged to both ``np.mean`` and ``np.std`` (e.g. ``axis``).

    Returns
    -------
    ``(v - mean) / std``. A zero standard deviation is not special-cased.
    """
    if type(v) is np.ndarray:
        # Keep the reduced axis so the statistics broadcast against `v` for any
        # `axis`. Without it, axis=1 on a 2-D array either fails to broadcast
        # or (square input) silently normalizes along the wrong axis.
        # Only done for plain ndarrays: other array-likes (np.matrix, pandas
        # objects) use their own reductions, which may not accept `keepdims`.
        kwargs.setdefault('keepdims', True)
    return (v - np.mean(v, **kwargs)) / np.std(v, **kwargs)

def zscore_matrix_bycol(mat):
    """
    Z-score every column of `mat` by delegating to `zscore` with ``axis=0``.
    """
    # axis=0: mean and std are computed down the rows, one value per column.
    standardized = zscore(mat, axis=0)
    return standardized

# Load data

In [3]:
# Project root and its data / results / figures sub-directories. Every path
# keeps its trailing slash because file names are appended to it by plain
# string formatting.
prj_dir = '/home/fangming/projects/dredfish/'
dat_dir, res_dir, fig_dir = (
    f'{prj_dir}{sub}/' for sub in ('data', 'results', 'figures')
)

In [4]:
# Allen count matrix, read as an AnnData object. Judging by how it is used in
# the following cells, rows are cells and `scrna.var.index` holds gene names.
f = f'{dat_dir}rna/scrna_ss_ctxhippo_a_exon_count_matrix_v2.h5ad'
scrna = ad.read_h5ad(f)
print(scrna.shape)

# DPNMF matrix: the first CSV column becomes the index (used as gene names in
# the following cells); the remaining columns are the DPNMF components.
# Note that `f` is reused here and now points at the CSV.
f = f'{dat_dir}dpnmf.csv' 
pmat = pd.read_csv(f, index_col=0)
print(pmat.shape)

(73347, 45768)
(9711, 24)


In [5]:
# Restrict the DPNMF matrix to genes that are also present in the Allen data
# and record where those genes sit among the Allen columns.

# Drop rows of pmat (genes) whose values sum to zero or less across columns.
# NOTE: pmat is overwritten in place here and again at the end of the cell.
pmat = pmat[pmat.sum(axis=1) > 0]
# Genes shared by both data sets, sorted and unique (np.intersect1d).
# Recorded run: 5818 genes in pmat, 5576 of them also in the Allen data.
genes = np.intersect1d(pmat.index.values, scrna.var.index.values)
# Presumably the position of each shared gene within scrna.var.index, judging
# by how genes_idx is used to select columns later -- TODO confirm against basicu.
genes_idx = basicu.get_index_from_array(scrna.var.index.values, genes)
# Sanity print: number of unique genes in pmat, in the Allen data, and shared.
print(np.unique(pmat.index.values).shape, np.unique(scrna.var.index.values).shape, genes.shape)

# Reorder/subset pmat rows to follow `genes`, the same order genes_idx was built from.
pmat = pmat.loc[genes]

(5818,) (45768,) (5576,)


In [6]:
%%time
# Cell-by-gene matrix of the Allen data.
Xcell = scrna.X

# Library-size normalization.
# NOTE(review): the original comment said "each cluster has the same total
# counts", but the input is the per-cell matrix, so this presumably equalizes
# totals per cell -- confirm against basicu.sparse_libsize_norm.
Xc_norm = basicu.sparse_libsize_norm(Xcell)

# Projection: keep only the shared genes (columns genes_idx), then multiply by
# the gene-by-component matrix pmat to get a cell-by-component matrix.
Xcr_norm = Xc_norm[:,genes_idx].dot(pmat)

# # Normalization by features (disabled).
# # NOTE(review): the original comment said "IQR norm" but the line below
# # calls the z-score helper.
# Xcp = zscore_matrix_bycol(Xcr_norm)

CPU times: user 1min 55s, sys: 4min 43s, total: 6min 39s
Wall time: 7min 2s


In [7]:
# Display the AnnData summary (dimensions and obs column names) of the loaded data.
scrna

AnnData object with n_obs × n_vars = 73347 × 45768
    obs: 'donor_sex_id', 'donor_sex_label', 'donor_sex_color', 'region_id', 'region_label', 'region_color', 'platform_label', 'cluster_order', 'cluster_label', 'cluster_color', 'subclass_order', 'subclass_label', 'subclass_color', 'neighborhood_id', 'neighborhood_label', 'neighborhood_color', 'class_order', 'class_label', 'class_color', 'exp_component_name', 'external_donor_name_label', 'full_genotype_label', 'facs_population_plan_label', 'injection_roi_label', 'injection_materials_label', 'injection_method_label', 'injection_type_label', 'full_genotype_id', 'full_genotype_color', 'external_donor_name_id', 'external_donor_name_color', 'facs_population_plan_id', 'facs_population_plan_color', 'injection_materials_id', 'injection_materials_color', 'injection_method_id', 'injection_method_color', 'injection_roi_id', 'injection_roi_color', 'injection_type_id', 'injection_type_color', 'cell_type_accession_label', 'cell_type_alias_label', 'ce

In [8]:
# Eyeball the projected values; numpy abbreviates the repr of large arrays.
Xcr_norm

array([[ 375181.10002363,   65590.69645607,  178763.04222405, ...,
          80849.83572292,  233802.39185572, 1573859.12275589],
       [ 386782.56370842,   78100.50263834,  228846.04216266, ...,
          72493.76508677,  261229.68187928, 1691416.93384945],
       [ 330038.05532682,   57793.23883879,  224242.24100351, ...,
          75187.10413492,  257466.14829719, 1537874.9802959 ],
       ...,
       [  93455.60415649,  321611.71696424,  371433.87145185, ...,
          73337.14973783,  184700.66510534, 1626045.15356207],
       [ 213186.26533389,   67800.55113566,  152313.2962544 , ...,
         110288.91963696,  256774.27808619, 1273398.44875634],
       [ 169172.3426789 ,   95462.94227123,  322866.29774165, ...,
          92563.20447397,  140770.35049963, 1250851.52884841]])

In [9]:
# Expect (number of cells, number of DPNMF components).
Xcr_norm.shape

(73347, 24)

In [10]:
# save as an anndata
adata = ad.AnnData(
    X=Xcr_norm,
    obs=scrna.obs,
)
adata

AnnData object with n_obs × n_vars = 73347 × 24
    obs: 'donor_sex_id', 'donor_sex_label', 'donor_sex_color', 'region_id', 'region_label', 'region_color', 'platform_label', 'cluster_order', 'cluster_label', 'cluster_color', 'subclass_order', 'subclass_label', 'subclass_color', 'neighborhood_id', 'neighborhood_label', 'neighborhood_color', 'class_order', 'class_label', 'class_color', 'exp_component_name', 'external_donor_name_label', 'full_genotype_label', 'facs_population_plan_label', 'injection_roi_label', 'injection_materials_label', 'injection_method_label', 'injection_type_label', 'full_genotype_id', 'full_genotype_color', 'external_donor_name_id', 'external_donor_name_color', 'facs_population_plan_id', 'facs_population_plan_color', 'injection_materials_id', 'injection_materials_color', 'injection_method_id', 'injection_method_color', 'injection_roi_id', 'injection_roi_color', 'injection_type_id', 'injection_type_color', 'cell_type_accession_label', 'cell_type_alias_label', 'cell_

In [11]:
# output = f'{dat_dir}rna/scrna_ss_ctxhippo_a_exon_DPNMF_matrix.h5ad'
# adata.write(output)
# !chmod 444 $output