# Generate simulation data for comparison

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.io as io
import scipy.stats as stats
from pybedtools import BedTool
import pickle as pkl
import scipy as sp

%matplotlib inline

In [2]:
import sys
# Extend the import search path with two locally checked-out locations: the
# memento 0.0.9 egg and the misc-seq helper directory.
# NOTE(review): both are absolute paths under /home/ssm-user — adjust them
# when running this notebook on another machine.
sys.path.append('/home/ssm-user/Github/scrna-parameter-estimation/dist/memento-0.0.9-py3.8.egg')
sys.path.append('/home/ssm-user/Github/misc-seq/miscseq/')
import encode  # presumably resolved from the misc-seq directory added above — confirm
import memento
from memento import simulate  # simulate.capture_sampling is used by the downsampling loop below

In [3]:
# Base directory for the inputs and outputs of this notebook. Paths are built
# by plain string concatenation (data_path + '...'), so the trailing slash is
# required.
data_path = '/data_volume/memento/simulation/'

In [4]:
# Load interferon_filtered.h5ad from the 'demux' directory that sits next to
# the simulation directory (hence the '../' hop from data_path).
ifn_adata = sc.read(data_path + '../demux/interferon_filtered.h5ad')

### Create a downsampled dataset of CD14+ Monocytes for method comparisons

In [5]:
# Build a balanced CD14+ monocyte dataset: 2,500 randomly chosen control
# cells plus 2,500 randomly chosen stimulated cells, restricted to genes
# whose mean count over the sampled cells exceeds 2.5.
monocyte_mask = ifn_adata.obs.cell == 'CD14+ Monocytes'

ctrl_subset = sc.pp.subsample(
    ifn_adata[monocyte_mask & (ifn_adata.obs.stim == 'ctrl')].copy(),
    n_obs=2500,
    copy=True,
)
stim_subset = sc.pp.subsample(
    ifn_adata[monocyte_mask & (ifn_adata.obs.stim == 'stim')].copy(),
    n_obs=2500,
    copy=True,
)

# Stack the control cells on top of the stimulated cells.
subset = ctrl_subset.concatenate(stim_subset)

# Per-gene mean over all sampled cells, flattened to 1-D so it can be used
# as a boolean column selector.
gene_means = np.asarray(subset.X.mean(axis=0)).ravel()
subset = subset[:, gene_means > 2.5].copy()

# Replace the sparse matrix with a dense integer array.
subset.X = subset.X.toarray().astype(int)
print(subset)

  df_sub[k].cat.remove_unused_categories(inplace=True)


AnnData object with n_obs × n_vars = 5000 × 127
    obs: 'tsne1', 'tsne2', 'ind', 'stim', 'cluster', 'cell', 'multiplets', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'cell_type', 'batch'
    var: 'gene_ids', 'mt', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    obsm: 'X_tsne'


  df_sub[k].cat.remove_unused_categories(inplace=True)


In [6]:
# Encode the condition as a 0/1 column (1 = 'stim', 0 = anything else).
is_stim = subset.obs.stim == 'stim'
subset.obs['stim_indicator'] = is_stim.astype(int)

# Export the cell-level and gene-level tables as CSV files.
for table, relative_path in (
    (subset.obs, 'acc_fpr/metadata.csv'),
    (subset.var, 'acc_fpr/gene_info.csv'),
):
    table.to_csv(data_path + relative_path)

# Keep an independent copy of the count matrix as the downsampling source.
original_data = subset.X.copy()

In [11]:
# For each capture rate, write 10 replicate downsampled datasets: the raw
# downsampled counts and a log1p-transformed, scaled version of them.
# A single seeded generator is shared by every draw, so the whole sequence of
# replicates is reproducible when this cell is run from the top.
gen = np.random.Generator(np.random.PCG64(42343))
for downsample_rate in [0.9, 0.7, 0.5, 0.3, 0.1]:

    print(downsample_rate)

    for replicate in range(10):

        # Every draw starts from the untouched `original_data`; element [1]
        # of capture_sampling's return value is used as the count matrix.
        X_subsample = simulate.capture_sampling(original_data, q=downsample_rate, process='hyper', gen=gen)[1]

        # Write from a copy instead of assigning into `subset.X`, so that
        # `subset` still holds the original counts after this cell has run
        # (re-running the cell that defines `original_data` stays safe).
        downsampled = subset.copy()
        downsampled.X = X_subsample
        downsampled.write(data_path + 'acc_fpr/downsampled/downsampled_{}_{}.h5ad'.format(downsample_rate, replicate))

        # The "_norm" companion file: log1p followed by per-gene scaling.
        # No library-size normalisation is applied in this cell.
        subset_norm = downsampled.copy()
        sc.pp.log1p(subset_norm)
        sc.pp.scale(subset_norm)
        subset_norm.write(data_path + 'acc_fpr/downsampled/downsampled_{}_{}_norm.h5ad'.format(downsample_rate, replicate))
    

0.9
0.7
0.5
0.3
0.1


### Create a subsampled dataset for method comparisons

In [12]:
# Load interferon.h5ad from the demux directory.
# NOTE(review): this path is hard-coded rather than derived from data_path
# like the earlier load of interferon_filtered.h5ad — keep the two in sync if
# the data volume moves.
adata = sc.read('/data_volume/memento/demux/interferon.h5ad')

In [13]:
# Keep only the CD14+ monocytes. Boolean indexing on an AnnData yields a view
# of the parent object; the following cell rewrites `adata.var.index`, so take
# an explicit copy here to make `adata` a standalone object before mutating it.
adata = adata[(adata.obs.cell == 'CD14+ Monocytes')].copy()

In [14]:
# Rewrite gene names so that every '.' becomes '-'.
adata.var.index = adata.var.index.map(lambda gene_name: gene_name.replace('.', '-'))

In [17]:
# Write 10 replicate datasets, each a random draw of `n_cells` cells from the
# CD14+ monocyte data.
n_cells = 250
for replicate in range(10):

    # copy=True already hands back a new AnnData, so the two trailing .copy()
    # calls the original made were redundant and have been dropped.
    # NOTE(review): random_state=None presumably leaves each draw unseeded, so
    # replicates differ from one another but are not reproducible across
    # runs — confirm this is intended.
    adata_sample = sc.pp.subsample(adata, n_obs=n_cells, copy=True, random_state=None)

    # NOTE(review): the file name is spelled 'subampled' and ends in '_norm'
    # although no normalisation happens in this cell. It is kept byte-for-byte
    # because other code may read these files by this exact name.
    adata_sample.write(data_path + 'acc_fpr/subsampled/subampled_{}_{}_norm.h5ad'.format(n_cells,replicate))

... storing 'ind' as categorical
... storing 'stim' as categorical
... storing 'cluster' as categorical
... storing 'cell' as categorical
... storing 'multiplets' as categorical
... storing 'ind' as categorical
... storing 'stim' as categorical
... storing 'cluster' as categorical
... storing 'cell' as categorical
... storing 'multiplets' as categorical
... storing 'ind' as categorical
... storing 'stim' as categorical
... storing 'cluster' as categorical
... storing 'cell' as categorical
... storing 'multiplets' as categorical
... storing 'ind' as categorical
... storing 'stim' as categorical
... storing 'cluster' as categorical
... storing 'cell' as categorical
... storing 'multiplets' as categorical
... storing 'ind' as categorical
... storing 'stim' as categorical
... storing 'cluster' as categorical
... storing 'cell' as categorical
... storing 'multiplets' as categorical
... storing 'ind' as categorical
... storing 'stim' as categorical
... storing 'cluster' as categorical
... st