# Run memento for inference comparison

Power analysis for differential mean (DM), differential variability (DV), and differential coexpression (DC) testing.

In [1]:
# Load IPython's autoreload extension so the %autoreload magic is available.
%load_ext autoreload

In [2]:
# Autoreload mode 2: re-import modules before each cell runs, so edits to
# local packages are picked up without restarting the kernel.
%autoreload 2

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import scipy as sp
import itertools
import numpy as np
import scipy.stats as stats
from scipy.integrate import dblquad
import seaborn as sns
from statsmodels.stats.multitest import fdrcorrection
import imp
import random
import statsmodels.api as sm
# Raise pandas display limits: print up to 999 rows and up to 100 characters per cell.
pd.options.display.max_rows = 999
pd.set_option('display.max_colwidth', 100)
import pickle as pkl
import time
import string
from sklearn.datasets import make_spd_matrix


  import imp


In [4]:
import matplotlib as mpl
import matplotlib.pylab as pylab

# Use font type 42 for both vector backends (PDF and PostScript).
# NOTE(review): type 42 is presumably chosen so text stays editable in exported figures — confirm.
for backend_key in ('pdf.fonttype', 'ps.fonttype'):
    mpl.rcParams[backend_key] = 42

# Relative font sizes: 'medium' for legend/axis labels/titles, 'small' for tick labels.
params = {
    **dict.fromkeys(
        ('legend.fontsize', 'axes.labelsize', 'axes.titlesize', 'figure.titlesize'),
        'medium',
    ),
    **dict.fromkeys(('xtick.labelsize', 'ytick.labelsize'), 'small'),
}
pylab.rcParams.update(params)


In [5]:
# import sys
# sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.9-py3.8.egg')
# import memento
# import memento.simulate as simulate

In [6]:
import sys
# Make the local checkout of the memento repository importable.
# NOTE(review): absolute, machine-specific path — consider making this configurable.
sys.path.append('/home/ssm-user/Github/memento')

# These aliased imports bind only `rna`, `hg` and `util` in the notebook namespace;
# they do not bind a top-level `memento` name.
import memento.model.rna as rna
import memento.estimator.hypergeometric as hg
import memento.util as util

In [7]:
# Root directory holding the simulated inference datasets; results are written back under it.
data_path = '/data_volume/memento/simulation/inference/'

### Run memento for DE (differential mean, DM)

In [62]:
# Differential-mean test with memento on one simulated dataset.
# Reads the AnnData and the per-gene dispersions for `trial`, estimates per-group
# means with standard errors, fits the quasi-GLM with `condition == 'stim'` as the
# treatment and one-hot encoded `group` (plus a constant) as covariates, and writes
# the result table with an added FDR column to CSV.

trial = 0  # simulation index; selects both input files and names the output file
q = 0.1    # value stored in adata.obs['q'] and handed to memento through q_column
data_path = '/data_volume/memento/simulation/inference/'

adata = sc.read(data_path + 'means/anndata_{}.h5ad'.format(trial))
dispersions = pd.read_csv(data_path + 'means/dispersions_{}.csv'.format(trial), index_col=0)
# Gene ids are used below to index adata columns, so force them to strings.
dispersions['gene'] = dispersions['gene'].astype(str)

adata.obs['q'] = q
adata.X = adata.X.astype(float)

rna.MementoRNA.setup_anndata(
    adata=adata,
    q_column='q',
    label_columns=['group', 'condition'],
    num_bins=30,
    trim_percent=1,
    shrinkage=0.5)

# adata = adata[:, adata.X.mean(axis=0).A1 > 0.02]
# Restrict to the genes listed in the dispersions table (setup above ran on all genes).
adata = adata[:, dispersions['gene']]
model = rna.MementoRNA(adata=adata)

model.compute_estimate(
    estimand='mean',
    get_se=True,
    n_jobs=30,
)

# One row per memento group. The group names are split on '^' and fields 1 and 2
# are taken as `group` and `condition`.
# NOTE(review): assumes names look like '<prefix>^<group>^<condition>' — confirm against memento.
df = pd.DataFrame(index=adata.uns['memento']['groups'])
name_parts = df.index.str.split('^')
df['group'] = name_parts.str[1]
df['condition'] = name_parts.str[2]

# Design matrices: constant + dummy-coded group as covariates, stim indicator as treatment.
cov_df = pd.get_dummies(df[['group']], drop_first=True).astype(float)
stim_df = (df[['condition']] == 'stim').astype(float)
cov_df = sm.add_constant(cov_df)

glm_result = model.differential_mean(
    covariates=cov_df,
    treatments=stim_df,
    dispersions=pd.Series(dispersions['dispersion'].tolist(), index=dispersions['gene']),
    family='quasiGLM',
    verbose=2,
    n_jobs=5)

# fdrcorrection returns (rejected, corrected p-values); only the corrected p-values are kept.
_, glm_result['fdr'] = fdrcorrection(glm_result['pval'])
glm_result.to_csv(data_path + 'means/memento_{}.csv'.format(trial))


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  78 tasks      | elapsed:    0.1s
[Parallel(n_jobs=5)]: Done 11766 tasks      | elapsed:    6.7s
[Parallel(n_jobs=5)]: Done 15395 out of 15395 | elapsed:    8.1s finished


### Run memento for DV

In [10]:
# dv_sim_adata = sc.read(data_path + 'dv.h5ad')
# dv_sim_adata.obs['q'] = 0.07

# memento.setup_memento(dv_sim_adata, q_column='q', filter_mean_thresh=0.07,trim_percent=1, shrinkage=0)
# dv_sim_adata.obs['memento_size_factor'] = dv_sim_adata.X.sum(axis=1).A1

# memento.create_groups(dv_sim_adata, label_columns=['ct_real'])
# memento.compute_1d_moments(dv_sim_adata, filter_genes=True)

# meta_df = memento.get_groups(dv_sim_adata)
# meta_df = pd.get_dummies(meta_df, prefix='', prefix_sep='', drop_first=False)

# treatment = meta_df[['A']]
# covariate = pd.DataFrame(np.ones((treatment.shape[0], 1)), columns=['intercept'])

# memento.ht_1d_moments(
#     dv_sim_adata, 
#     treatment=treatment,
#     covariate=covariate,
#     num_boot=5000, 
#     verbose=1,
#     num_cpus=40,
#     resampling='bootstrap',
#     approx=False)

# memento_dv_result = memento.get_1d_ht_result(dv_sim_adata)
# memento_dv_result.to_csv(data_path + 'memento_dv.csv', index=False)
# dv_sim_adata.write(data_path + 'dv_filtered.h5ad')
# dv_sim_norm_adata = dv_sim_adata.copy().copy()
# sc.pp.normalize_total(dv_sim_norm_adata)
# sc.pp.log1p(dv_sim_norm_adata)
# sc.pp.scale(dv_sim_norm_adata)
# dv_sim_norm_adata.write(data_path + 'dv_filtered_norm.h5ad')

### Run memento for DC

In [10]:
dc_sim_adata = sc.read(data_path + 'dc.h5ad')
dc_sim_adata.obs['q'] = 0.07

memento.setup_memento(dc_sim_adata, q_column='q', filter_mean_thresh=0.1,trim_percent=1, shrinkage=0)
dc_sim_adata.obs['memento_size_factor'] = dc_sim_adata.X.sum(axis=1).A1
dc_sim_adata.obs['ct'] = [0.0 if x == 'A' else 1.0 for x in dc_sim_adata.obs['ct_real']]
memento.create_groups(dc_sim_adata, label_columns=['ct'])
memento.compute_1d_moments(dc_sim_adata, filter_genes=True)
candidate_pairs = list(itertools.combinations(np.arange(800).astype(str),2))
pairs = [(a,b) for a,b in candidate_pairs if a in dc_sim_adata.var.index and b in dc_sim_adata.var.index]
memento.compute_2d_moments(dc_sim_adata, gene_pairs=pairs)


meta_df = memento.get_groups(dc_sim_adata)
meta_df = pd.get_dummies(meta_df, prefix='', prefix_sep='', drop_first=False)

treatment = meta_df[['ct']]
covariate = pd.DataFrame(np.ones((treatment.shape[0], 1)), columns=['intercept'])

%env PYTHONWARNINGS=ignore::RuntimeWarning

memento.ht_2d_moments(
    dc_sim_adata, 
    treatment=treatment,
    covariate=covariate,
    num_boot=10000, 
    verbose=1,
    num_cpus=40,
    resampling='bootstrap',
    approx=False)

memento_dc_result = memento.get_2d_ht_result(dc_sim_adata)
memento_dc_result.to_csv(data_path + 'memento_dc.csv', index=False)
dc_sim_adata.write(data_path + 'dc_filtered.h5ad')
dc_sim_norm_adata = dc_sim_adata.copy().copy()
sc.pp.normalize_total(dc_sim_norm_adata)
sc.pp.log1p(dc_sim_norm_adata)
# sc.pp.scale(dc_sim_norm_adata)
dc_sim_norm_adata.write(data_path + 'dc_filtered_norm.h5ad')

  df_sub[k].cat.remove_unused_categories(inplace=True)




[Parallel(n_jobs=40)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 120 tasks      | elapsed:    7.5s
[Parallel(n_jobs=40)]: Done 370 tasks      | elapsed:   10.9s
[Parallel(n_jobs=40)]: Done 720 tasks      | elapsed:   16.7s
[Parallel(n_jobs=40)]: Done 1170 tasks      | elapsed:   23.9s
[Parallel(n_jobs=40)]: Done 1720 tasks      | elapsed:   35.3s
[Parallel(n_jobs=40)]: Done 2370 tasks      | elapsed:   46.8s
[Parallel(n_jobs=40)]: Done 3120 tasks      | elapsed:   57.1s
[Parallel(n_jobs=40)]: Done 3970 tasks      | elapsed:  1.2min
[Parallel(n_jobs=40)]: Done 4920 tasks      | elapsed:  1.4min
[Parallel(n_jobs=40)]: Done 5970 tasks      | elapsed:  1.7min
[Parallel(n_jobs=40)]: Done 6441 out of 6441 | elapsed:  1.8min finished
... storing 'memento_group' as categorical


In [22]:
# Preview the first rows of the DC result table (gene pair, treatment, correlation coefficient, SE, p-value).
memento_dc_result.head(20)

Unnamed: 0,gene_1,gene_2,tx,corr_coef,corr_se,corr_pval
0,3,14,ct,0.277872,0.187676,0.117188
1,3,17,ct,-0.145232,0.132907,0.248875
2,3,20,ct,-0.382769,0.152638,0.019798
3,3,32,ct,0.281984,0.153706,0.062794
4,3,49,ct,-0.418318,0.154505,0.006699
5,3,62,ct,-0.247087,0.11145,0.035496
6,3,65,ct,0.007696,0.079006,0.920108
7,3,113,ct,-0.265293,0.139124,0.055694
8,3,121,ct,-0.029174,0.154846,0.843816
9,3,130,ct,0.259545,0.119634,0.031897


In [32]:
# dc_sim_adata = sc.read(data_path + 'dc.h5ad')
# dc_sim_adata.obs['q'] = 0.07

# memento.setup_memento(dc_sim_adata, q_column='q', filter_mean_thresh=0.07,trim_percent=1, shrinkage=0)
# dc_sim_adata.obs['memento_size_factor'] = dc_sim_adata.X.sum(axis=1).A1
# memento.create_groups(dc_sim_adata, label_columns=['ct_real'])
# memento.compute_1d_moments(dc_sim_adata, filter_genes=True)

# with open(data_path+'dc_null_indices.pkl', 'rb') as f:
#     null_idxs = pkl.load(f)
    
# null_idxs = [(a,b) for a,b in zip(null_idxs[0], null_idxs[1])]
# null_idxs_set = set(null_idxs)
# sampled_null_idxs = random.sample(null_idxs, 2500)
# sampled_non_null_idxs = [(a,b) for a,b in itertools.combinations(np.arange(500).astype(str),2) if (a,b) not in null_idxs_set]

# pairs = sampled_null_idxs + sampled_non_null_idxs
# pairs = [(a,b) for a,b in pairs if a in dc_sim_adata.var.index and b in dc_sim_adata.var.index]
# memento.compute_2d_moments(dc_sim_adata, gene_pairs=pairs)

# meta_df = memento.get_groups(dc_sim_adata)
# meta_df = pd.get_dummies(meta_df, prefix='', prefix_sep='', drop_first=False)

# treatment = meta_df[['A']]
# covariate = pd.DataFrame(np.ones((treatment.shape[0], 1)), columns=['intercept'])

# %env PYTHONWARNINGS=ignore::RuntimeWarning

# memento.ht_2d_moments(
#     dc_sim_adata, 
#     treatment=treatment,
#     covariate=covariate,
#     num_boot=10000, 
#     verbose=1,
#     num_cpus=40,
#     resampling='bootstrap',
#     approx=False)

# memento_dc_result = memento.get_2d_ht_result(dc_sim_adata)
# memento_dc_result.to_csv(data_path + 'memento_dc.csv', index=False)
# dc_sim_adata.write(data_path + 'dc_filtered.h5ad')
# dc_sim_norm_adata = dc_sim_adata.copy().copy()
# sc.pp.normalize_total(dc_sim_norm_adata)
# sc.pp.log1p(dc_sim_norm_adata)
# sc.pp.scale(dc_sim_norm_adata)
# dc_sim_norm_adata.write(data_path + 'dc_filtered_norm.h5ad')