In [None]:
# Reload imported modules automatically before each cell executes, so edits
# to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render inline figures in 'retina' format.
%config InlineBackend.figure_format='retina'

In [None]:
import os

from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.spatial

import anndata
import scanpy as sc
import umap

import torch
from scmg.model.contrastive_embedding import CellEmbedder, embed_adata, decode_cell_state_embedding

from scmg.preprocessing.data_standardization import GeneNameMapper
# NOTE(review): gene_name_mapper is not referenced by any other cell visible
# in this notebook — confirm it is still needed.
gene_name_mapper = GeneNameMapper()


In [None]:
from sklearn.metrics.pairwise import pairwise_distances

def get_neighbor_mask(adata, ct, radius=2, min_weight=0.2, n_jobs=32):
    """Flag cells of *other* types that sit close to cell type `ct` in the embedding.

    For every cell in `adata`, the Euclidean distance to each cell of type
    `ct` is computed in ``adata.obsm['X_scmg']``. Each distance ``d`` is turned
    into a weight ``exp(-(d / radius) ** 2)``, and the weights are averaged
    over all `ct` cells. A cell counts as a neighbor when that mean weight
    exceeds `min_weight` and the cell itself is not labelled `ct`.

    Parameters
    ----------
    adata
        Object exposing ``obs['cell_type']``, ``obsm['X_scmg']`` and boolean
        row indexing (used here as an AnnData).
    ct
        Cell-type label to compare against ``adata.obs['cell_type']``.
    radius
        Length scale of the kernel, in embedding-distance units.
    min_weight
        Mean kernel weight a cell must exceed to be flagged.
    n_jobs
        Passed through to ``pairwise_distances``.

    Returns
    -------
    Boolean mask with one entry per row of `adata`; True marks a neighbor.

    Notes
    -----
    The full (n_cells x n_ct_cells) distance matrix is materialised, so
    memory grows with the size of the `ct` population.
    """
    ct_mask = adata.obs['cell_type'] == ct

    dist_mtx = pairwise_distances(adata.obsm['X_scmg'], adata[ct_mask].obsm['X_scmg'],
                                  metric='euclidean', n_jobs=n_jobs)

    # Mean kernel weight of each cell with respect to the `ct` population.
    weights = np.exp(-(dist_mtx / radius) ** 2).mean(axis=1)

    # Cells of type `ct` are excluded: they are the foreground, not neighbors.
    return np.logical_and(weights > min_weight, ~ct_mask)

In [None]:
# Load the reference cell AnnData from disk.
# NOTE(review): the file name suggests X holds measured counts — confirm.
adata = sc.read_h5ad('../ref_cell_adata_measured_count.h5ad')
# Cast the expression matrix to float32 before normalising.
adata.X = adata.X.astype(np.float32)
# Rescale every cell to a total of 1e4; no log transform is applied in this cell.
sc.pp.normalize_total(adata, target_sum=1e4)

adata

In [None]:
# Container for per-cell-type statistics: one row per distinct cell type,
# one column per gene of `adata`.
cell_types = np.unique(adata.obs['cell_type'])
n_cell_types, n_genes = len(cell_types), adata.shape[1]

ct_obs = pd.DataFrame({'cell_type': cell_types}).set_index('cell_type')
adata_ct = anndata.AnnData(
    X=np.zeros((n_cell_types, n_genes), dtype=np.float32),
    obs=ct_obs,
    var=adata.var.copy(),
)

# Per-cell-type counters, filled in by the differential-expression loop.
for count_col in ('cell_count', 'neighbor_cell_count'):
    adata_ct.obs[count_col] = 0

# Every statistic gets its own zero-initialised float32 layer.
for layer_name in (
    'foreground_mean',
    'background_mean',
    'neighbor_mean',
    'foreground_exp_frac',
    'background_exp_frac',
    'neighbor_exp_frac',
    'pval',
):
    adata_ct.layers[layer_name] = np.zeros((n_cell_types, n_genes), dtype=np.float32)

adata_ct

In [None]:
from scipy.stats import ttest_ind
import statsmodels.stats.multitest

# Per-cell-type differential expression against the whole dataset.
#
# For each cell type this fills, per gene: mean expression and expressed
# fraction in the foreground (cells of that type), the background (all cells)
# and the neighbors (see get_neighbor_mask), plus a one-sided Welch t-test
# p-value for "foreground > background". Note that the background is the full
# dataset, so it includes the foreground cells themselves.

# Background summaries do not depend on the cell type; compute them once
# instead of re-scanning every cell on each loop iteration.
background_mean = adata.X.mean(axis=0)
background_exp_frac = (adata.X > 0).mean(axis=0)

# Counts are collected here and written to `obs` as whole columns afterwards.
# Chained assignment (`obs[col].iloc[i] = ...`) is unreliable: it may write to
# a temporary and is a silent no-op under pandas Copy-on-Write.
cell_counts = np.zeros(adata_ct.shape[0], dtype=np.int64)
neighbor_cell_counts = np.zeros(adata_ct.shape[0], dtype=np.int64)

for i in tqdm(range(adata_ct.shape[0])):
    ct = adata_ct.obs.index[i]

    adata_foreground = adata[adata.obs['cell_type'] == ct].copy()
    foreground_mean = adata_foreground.X.mean(axis=0)
    adata_ct.X[i] = foreground_mean

    neighbor_mask = get_neighbor_mask(adata, ct)
    adata_neighbor = adata[neighbor_mask].copy()
    cell_counts[i] = adata_foreground.shape[0]
    neighbor_cell_counts[i] = adata_neighbor.shape[0]

    adata_ct.layers['foreground_mean'][i] = foreground_mean
    adata_ct.layers['background_mean'][i] = background_mean
    adata_ct.layers['foreground_exp_frac'][i] = (adata_foreground.X > 0).mean(axis=0)
    adata_ct.layers['background_exp_frac'][i] = background_exp_frac

    # With no neighbors the neighbor layers keep their zero initialisation.
    if adata_neighbor.shape[0] > 0:
        adata_ct.layers['neighbor_mean'][i] = adata_neighbor.X.mean(axis=0)
        adata_ct.layers['neighbor_exp_frac'][i] = (adata_neighbor.X > 0).mean(axis=0)

    # One-sided Welch t-test (unequal variances), one gene at a time.
    adata_ct.layers['pval'][i] = [
        ttest_ind(adata_foreground.X[:, j], adata.X[:, j],
                  alternative='greater', equal_var=False).pvalue
        for j in range(adata_ct.shape[1])
    ]

adata_ct.obs['cell_count'] = cell_counts
adata_ct.obs['neighbor_cell_count'] = neighbor_cell_counts

# Undefined tests (NaN p-values) are treated as non-significant.
adata_ct.layers['pval'] = np.nan_to_num(adata_ct.layers['pval'], nan=1)
# Benjamini-Hochberg correction over all (cell type, gene) tests jointly.
adata_ct.layers['pval_adj'] = statsmodels.stats.multitest.multipletests(
    adata_ct.layers['pval'].reshape(-1), method='fdr_bh'
    )[1].reshape(adata_ct.layers['pval'].shape)

adata_ct.write_h5ad('adata_cell_type_DE_scores.h5ad')

In [None]:
# Flatten the (cell type x gene) statistics into one long table with a row per
# (cell type, gene) pair. Layers are flattened row-major, so cell-type columns
# are repeated per gene and gene columns are tiled per cell type.
n_cell_types, n_genes = adata_ct.shape
de_layers = adata_ct.layers

all_gene_de_df = pd.DataFrame({
    'cell_type' : adata_ct.obs.index.repeat(n_genes),
    'neighbor_cell_count' : adata_ct.obs['neighbor_cell_count'].repeat(n_genes),
    'gene' : np.tile(adata_ct.var.index, n_cell_types),
    'human_gene_name' : np.tile(adata_ct.var['human_gene_name'], n_cell_types),
    'foreground_mean' : de_layers['foreground_mean'].ravel(),
    'background_mean' : de_layers['background_mean'].ravel(),
    'neighbor_mean' : de_layers['neighbor_mean'].ravel(),
    # Fold change of foreground over background; 1e-9 guards against division by zero.
    'fc'  : (de_layers['foreground_mean'] / (de_layers['background_mean'] + 1e-9)).ravel(),
    'foreground_exp_frac' : de_layers['foreground_exp_frac'].ravel(),
    'background_exp_frac' : de_layers['background_exp_frac'].ravel(),
    'neighbor_exp_frac' : de_layers['neighbor_exp_frac'].ravel(),
    'pval' : de_layers['pval'].ravel(),
    'pval_adj' : de_layers['pval_adj'].ravel(),
})

all_gene_de_df.to_parquet('cell_type_DE_scores.parquet', index=False)
all_gene_de_df

In [None]:
# Read the DE score table back from the parquet file, replacing the in-memory
# `all_gene_de_df`; the cells below work from this on-disk copy.
all_gene_de_df = pd.read_parquet('cell_type_DE_scores.parquet')
all_gene_de_df

In [None]:
# Marker genes must satisfy all four criteria at once.
is_significant = all_gene_de_df['pval_adj'] < 0.01          # adjusted p-value
is_enriched = all_gene_de_df['fc'] > 5                      # fold change over background
is_expressed = all_gene_de_df['foreground_exp_frac'] > 0.2  # seen in >20% of foreground cells
is_specific = (
    all_gene_de_df['foreground_exp_frac'] > 3 * all_gene_de_df['background_exp_frac']
)

marker_gene_df = (
    all_gene_de_df[is_significant & is_enriched & is_expressed & is_specific]
    .copy()
    .sort_values('fc', ascending=False)
)
marker_gene_df

In [None]:
# First 30 marker rows for the 'Epiblast' cell type, in the table's current order.
marker_gene_df.loc[marker_gene_df['cell_type'].eq('Epiblast')].head(30)

In [None]:
# All marker rows whose human gene name is IKZF1.
marker_gene_df.loc[marker_gene_df['human_gene_name'].eq('IKZF1')]

In [None]:
# Top 20 rows for POU5F1 by fold change, taken from the full (unfiltered)
# table rather than the marker subset.
all_gene_de_df[all_gene_de_df['human_gene_name'] == 'POU5F1'].sort_values('fc', ascending=False)[:20]

In [None]:
# Annotate every cell with the number of marker genes found for its cell type
# and with the number of genes it expresses, then show both on the UMAP.
marker_gene_count_map = marker_gene_df['cell_type'].value_counts().to_dict()
# Map through object dtype: if `cell_type` is categorical (not verifiable from
# this cell), a one-to-one mapping would yield a Categorical and `fillna(0)`
# would fail because 0 is not one of its categories. Cell types without any
# marker gene get 0.
adata.obs['marker_gene_count'] = (
    adata.obs['cell_type'].astype(object).map(marker_gene_count_map).fillna(0)
)
adata.obs['n_exp_genes'] = np.sum(adata.X > 0, axis=1)

sc.pl.umap(adata, color=['n_exp_genes'], vmax=6000)
sc.pl.umap(adata, color=['marker_gene_count'], vmax=500)

In [None]:
# Distribution of the number of marker genes per cell type.
markers_per_cell_type = marker_gene_df['cell_type'].value_counts()
markers_per_cell_type.hist(bins=30)