# Generate simulation data for comparison

In [51]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as io
import scipy.stats as stats
from pybedtools import BedTool
import pickle as pkl
import scipy as sp

%matplotlib inline

In [2]:
import sys
# Extend the module search path with a locally built memento egg and the misc-seq
# helper directory. Both are absolute, machine-specific paths.
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.6-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
# NOTE(review): `encode` is not referenced by any cell visible here — confirm it is still needed.
import encode
import memento
from memento import simulate

In [3]:
# Base directory (absolute, machine-specific) that every read/write path below is built from.
data_path = '/data_volume/memento/simulation/'

### Extract real parameters

In [9]:
# Load the filtered interferon dataset; the path steps out of data_path into the sibling demux/ directory.
ifn_adata = sc.read(data_path + '../demux/interferon_filtered.h5ad')

### Isolate subset of CD14+ Monocytes for method comparisons

In [35]:
# Display the per-cell `ind` labels (presumably individual/donor IDs — TODO confirm) to pick values for the subset below.
ifn_adata.obs.ind

index
AAACATACATTTCC-1    1016
AAACATACCAGAAA-1    1256
AAACATACCATGCA-1    1488
AAACATACCTCGCT-1    1256
AAACATACCTGGTA-1    1039
                    ... 
TTTGCATGCCTGAA-1    1244
TTTGCATGCCTGTC-1    1256
TTTGCATGCTAAGC-1     107
TTTGCATGGGACGA-1    1488
TTTGCATGTCTTAC-1    1016
Name: ind, Length: 24495, dtype: category
Categories (8, object): ['101', '107', '1015', '1016', '1039', '1244', '1256', '1488']

In [47]:
# Masks shared by both conditions: CD14+ monocytes whose `ind` label is 1015 or 1488.
monocyte_mask = ifn_adata.obs.cell == 'CD14+ Monocytes'
individual_mask = ifn_adata.obs.ind.isin(['1015', '1488'])

# Draw 1000 cells from the control condition, then 1000 from the stimulated condition.
ctrl_cells = ifn_adata[monocyte_mask & (ifn_adata.obs.stim == 'ctrl') & individual_mask].copy()
ctrl_subset = sc.pp.subsample(ctrl_cells, n_obs=1000, copy=True)
stim_cells = ifn_adata[monocyte_mask & (ifn_adata.obs.stim == 'stim') & individual_mask].copy()
stim_subset = sc.pp.subsample(stim_cells, n_obs=1000, copy=True)

# Stack the two conditions into a single AnnData object.
subset = sc.AnnData.concatenate(ctrl_subset, stim_subset)

# Keep only genes whose mean count over the pooled cells exceeds 2.5,
# then replace the matrix with a dense integer array.
gene_mean = subset.X.mean(axis=0)
keep_gene = (gene_mean > 2.5).A1
subset = subset[:, keep_gene].copy()
subset.X = subset.X.toarray().astype(int)
print(subset)

AnnData object with n_obs × n_vars = 2000 × 125
    obs: 'tsne1', 'tsne2', 'ind', 'stim', 'cluster', 'cell', 'multiplets', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'cell_type', 'batch'
    var: 'gene_ids', 'mt', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    obsm: 'X_tsne'


In [57]:
# Encode the stimulation condition as 0/1 (1 where stim == 'stim').
subset.obs['stim_indicator'] = subset.obs.stim.eq('stim').astype(int)

# Write the cell-level and gene-level annotation tables next to the count matrices.
for table, csv_name in ((subset.obs, 'acc_fpr/metadata.csv'), (subset.var, 'acc_fpr/gene_info.csv')):
    table.to_csv(data_path + csv_name)

In [58]:
# Write one capture-sampled count matrix per subsampling rate.
for subsample_rate in [0.8, 0.6, 0.4, 0.2, 0.1]:

    # Pass the loop's rate as q. Previously q was fixed at 0.1, so every output
    # file was drawn at the same rate regardless of the rate in its file name.
    # Element [1] of the returned pair is the captured count matrix.
    X_subsample = simulate.capture_sampling(subset.X, q=subsample_rate, process='hyper')[1]

    io.mmwrite(data_path + 'acc_fpr/subsampled_{}.mtx'.format(subsample_rate), X_subsample)
    

### Power calculation

1000 genes in total: 300 are differentially expressed (DE) and 300 are differentially variable (DV).

In [8]:
# Cells labelled 'CD4 T cells - ctrl' are the real data from which simulation parameters are extracted.
adata = ifn_adata[ifn_adata.obs.cell_type == 'CD4 T cells - ctrl']
# data = adata.X.copy()
# relative_data = data.toarray()/data.sum(axis=1)

# NOTE(review): n_cells is not referenced by the visible cells below, which pass explicit cell counts — confirm it is still needed.
n_cells = 10000
# q is passed to extract_parameters here and to simulate_two_datasets in later cells
# (presumably the capture efficiency — TODO confirm against memento.simulate).
q=0.07
# x_param[0] and x_param[1] are later log-transformed as per-gene means and variances;
# z_param and good_idx are not used in the visible cells.
x_param, z_param, Nc, good_idx = simulate.extract_parameters(adata.X, q=q)

### Simulation code

In [9]:
def _count_scale_moments(log_means, log_variances, Nc):
    """Turn log-scale per-gene moments into the means/variances passed to the simulator.

    The mean is scaled by ``Nc.mean()``. The variance expression has the form
    ``E[x^2] * E[N^2] - E[x]^2 * E[N]^2``, i.e. the variance of a product of two
    independent quantities with the given moments.
    """
    rel_means = np.exp(log_means)
    rel_variances = np.exp(log_variances)
    means = rel_means*Nc.mean()
    variances = (rel_variances + rel_means**2)*(Nc**2).mean() - rel_means**2*Nc.mean()**2
    return means, variances


def simulate_two_datasets(x_param, Nc, n_cells, q, diff='mean'):
    """Simulate two groups of cells that differ in one chosen property.

    Parameters
    ----------
    x_param : sequence
        ``x_param[0]`` and ``x_param[1]`` are log-transformed and used as the
        per-gene means and variances of group 1.
    Nc : array-like
        Forwarded to ``simulate.simulate_transcriptomes``; its first and second
        moments rescale the means and variances.
    n_cells : int
        Number of cells requested for each of the two groups.
    q : float
        Forwarded to ``simulate.capture_sampling``.
    diff : {'null', 'mean', 'variability', 'correlation'}
        Which property differs in group 2:
        ``'null'`` nothing; ``'mean'`` adds 0.5 to the log-mean of the first 500
        genes; ``'variability'`` adds 0.5 to the log-variance of the first 500
        genes; ``'correlation'`` adds 0.5 to up to 150 entries of the correlation
        matrix among the first 100 genes.

    Returns
    -------
    (anndata, change_indices)
        ``anndata`` holds the captured counts as a CSR matrix with obs columns
        ``ct_real`` ('A' for the first ``n_cells`` rows, 'B' for the rest),
        ``ct_shuffled`` (random 'A'/'B' labels) and ``BatchInfo`` (constant 1).
        ``change_indices`` is the (rows, cols) tuple of altered correlation
        entries when ``diff == 'correlation'``, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``diff`` is not one of the supported modes. Previously an unknown
        mode failed later with an unbound ``norm_cov_1``.
    """
    valid_diffs = ('null', 'mean', 'variability', 'correlation')
    if diff not in valid_diffs:
        raise ValueError(
            "diff must be one of {}, got {!r}".format(valid_diffs, diff))

    log_means_1, log_variances_1 = np.log(x_param[0]), np.log(x_param[1])
    log_means_2, log_variances_2 = log_means_1.copy(), log_variances_1.copy()

    # Defaults shared by every mode except 'correlation'.
    norm_cov_1, norm_cov_2 = 'indep', 'indep'
    change_indices = None

    if diff == 'mean':
        log_means_2[:500] += 0.5
    elif diff == 'variability':
        log_variances_2[:500] += 0.5
    elif diff == 'correlation':
        # NOTE(review): make_spd_matrix is not imported in the visible cells
        # (presumably sklearn.datasets.make_spd_matrix) — confirm the import
        # exists before running this mode.
        norm_cov_1 = make_spd_matrix(log_means_1.shape[0])
        std_outer = np.outer(np.sqrt(np.diag(norm_cov_1)), np.sqrt(np.diag(norm_cov_1)))
        norm_corr_1 = norm_cov_1/std_outer
        norm_corr_subset = norm_corr_1[:100, :100].copy()

        # Take the first 150 (row, col) positions, in row-major order, whose
        # correlation is below 0.5 and raise them by 0.5.
        # NOTE(review): positions are not mirrored, so (i, j) can change without
        # (j, i) and the resulting matrix may be asymmetric — confirm intended.
        change_indices = np.where(norm_corr_subset < 0.5)
        change_indices = (change_indices[0][:150], change_indices[1][:150])
        norm_corr_subset[change_indices] += 0.5

        norm_corr_2 = norm_corr_1.copy()
        norm_corr_2[:100, :100] = norm_corr_subset
        # Convert back to a covariance using group 1's standard deviations.
        norm_cov_2 = norm_corr_2 * std_outer

    means_1, variances_1 = _count_scale_moments(log_means_1, log_variances_1, Nc)
    data_1 = simulate.simulate_transcriptomes(
        n_cells=n_cells,
        means=means_1,
        variances=variances_1,
        Nc=Nc,
        norm_cov=norm_cov_1)

    means_2, variances_2 = _count_scale_moments(log_means_2, log_variances_2, Nc)
    data_2 = simulate.simulate_transcriptomes(
        n_cells=n_cells,
        means=means_2,
        variances=variances_2,
        Nc=Nc,
        norm_cov=norm_cov_2)

    # Stack group 1 on top of group 2, then apply capture sampling to the
    # combined matrix; only the second returned element is kept.
    true_data = np.vstack([data_1, data_2])
    _, hyper_captured = simulate.capture_sampling(true_data, q=q, process='hyper')

    anndata = sc.AnnData(sp.sparse.csr_matrix(hyper_captured))
    # assumes each simulated group has exactly n_cells rows — TODO confirm
    anndata.obs['ct_real'] = ['A'] * n_cells + ['B'] * n_cells
    anndata.obs['ct_shuffled'] = np.random.choice(['A', 'B'], anndata.shape[0])
    anndata.obs['BatchInfo'] = 1

    return anndata, change_indices

In [12]:
# NOTE(review): n_cells_list is not used by the loop below.
n_cells_list = np.logspace(3, 5.5, 10).astype(int)

# One mean-shift simulation per group size, each written under runtime/.
for num_cells in (250, 500, 2500):
    a, _ = simulate_two_datasets(x_param=x_param, Nc=Nc, n_cells=num_cells, q=q, diff='mean')
    a.write(data_path + 'runtime/{}.h5ad'.format(num_cells))
    print('finished ', num_cells)

... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


finished  250


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


finished  500


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


finished  2500


In [13]:
# Generate 100 replicate mean-shift datasets with 5000 cells per group.
for i in range(100):

    # Print a progress marker on every tenth replicate, skipping the first.
    if i > 0 and i % 10 == 0:
        print(i)

    a, _ = simulate_two_datasets(x_param=x_param, Nc=Nc, n_cells=5000, q=q, diff='mean')
    a.write(data_path + 'runtime/5000_{}.h5ad'.format(i))

... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


10


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


20


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


30


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


40


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


50


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


60


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


70


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


80


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical


90


... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
... storing 'ct_real' as categorical
... storing 'ct_shuffled' as categorical
