# Preprocess Perturb-seq data for benchmarking

In [22]:
%load_ext autoreload

In [23]:
%autoreload 2

In [12]:
import scanpy as sc
import scipy.stats as stats
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import string
import random
import logging
import itertools
import statsmodels.api as sm
import seaborn as sns
import random

In [13]:
import sys
# Put a local memento checkout on the import path (presumably so that the
# `memento` package can be imported from source rather than an installed copy).
# NOTE(review): this is an absolute, machine-specific path — adjust it when
# running the notebook on a different host.
sys.path.append('/home/ssm-user/Github/memento')

In [14]:
import memento.model.rna as rna
import memento.estimator.hypergeometric as hg

In [15]:
# Root directory for the files this notebook reads and writes. File paths are
# built by plain string concatenation, so the trailing slash is required.
data_path = '/data_volume/memento/'

In [16]:
# Root-logger configuration: timestamp, process id, level and message at INFO
# verbosity.
LOG_FORMAT = "%(asctime)s %(process)-7s %(levelname)-8s %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

# Route messages from the `warnings` module through the logging system as well.
logging.captureWarnings(True)

### Generate guide pairs

In [17]:
# Load the Perturb-seq AnnData file.
adata = sc.read(data_path + 'tfko140/tfko.sng.guides.full.ct.h5ad')

# Constant per-cell 'q' value; it is handed to memento through `q_column='q'`.
# NOTE(review): 0.15 is presumably the assumed capture efficiency — confirm.
adata.obs['q'] = 0.15

# Split the '.'-delimited `guide1_cov` string once and reuse the pieces instead
# of re-splitting the whole column for every derived field.
guide_parts = adata.obs['guide1_cov'].str.split('.')

# 'guide' is the first two fields re-joined with '.'; 'target' is the first field.
adata.obs['guide'] = guide_parts.str[0] + '.' + guide_parts.str[1]
adata.obs['target'] = guide_parts.str[0]

# Save the annotated object. As the cell output shows, writing stores the new
# 'guide' and 'target' columns as categoricals.
adata.write(data_path + 'tfko140/tfko.sng.guides.full.ct.details.h5ad')

Only considering the two last: ['.ct', '.h5ad'].
Only considering the two last: ['.ct', '.h5ad'].


... storing 'guide' as categorical
... storing 'target' as categorical


In [18]:
# Unique (target, guide) combinations observed across cells.
target_guide_table = adata.obs[['target', 'guide']].drop_duplicates()

# target -> list of its guides.
guides_by_target = target_guide_table.groupby('target')['guide'].apply(list)
mapping = dict(guides_by_target)

# Distinct donors and targets, in order of first appearance in `adata.obs`.
donor_column = adata.obs['donor']
target_column = adata.obs['target']
donors = donor_column.drop_duplicates().tolist()
targets = target_column.drop_duplicates().tolist()

In [19]:
# Seed and sample size for the random choice of target pairs. With a fixed seed
# the same pairs are selected on every Restart & Run All, so the CSV written
# from `selected_pairs` is reproducible.
PAIR_SEED = 0
N_SELECTED_PAIRS = 100

# All unordered pairs of distinct targets.
pairs = list(itertools.combinations(targets, 2))

# Shuffle with a dedicated, seeded generator rather than the unseeded global
# `random` state, then keep the first N_SELECTED_PAIRS pairs.
random.Random(PAIR_SEED).shuffle(pairs)
selected_pairs = pairs[:N_SELECTED_PAIRS]

In [27]:
# Output columns: the first guide of each target, then the second guide of each.
guide_pair_columns = ['target1_guide1', 'target2_guide1', 'target1_guide2', 'target2_guide2']

# One row per selected target pair.
# assumes every target has at least two guides in `mapping`; the [1] lookups
# raise IndexError otherwise — TODO confirm for all targets.
guide_pair_rows = [
    (mapping[first][0], mapping[second][0], mapping[first][1], mapping[second][1])
    for first, second in selected_pairs
]

guide_pairs = pd.DataFrame(guide_pair_rows, columns=guide_pair_columns)
guide_pairs.to_csv(data_path + 'tfko140/benchmarking/guide_pairs_to_test.csv', index=False)

### Generate pseudobulks

memento's `sum` estimator can be used to build the pseudobulks.

In [21]:
# Prepare `adata` for memento: 'q' names the obs column holding the per-cell q
# value, and cells are labelled by the 'donor' and 'guide' columns.
# Per the log output, this step creates the groups and computes cell sizes.
rna.MementoRNA.setup_anndata(
    adata=adata,
    q_column='q',
    label_columns=['donor', 'guide'])

2023-06-29 17:36:19 2614606 INFO     setup_anndata: creating groups
2023-06-29 17:36:43 2614606 INFO     setup_anndata: computing cell sizes


In [22]:
# Build the memento RNA model on the AnnData object.
# NOTE(review): presumably requires `setup_anndata` to have been run on `adata` first — confirm.
model = rna.MementoRNA(adata=adata)

In [23]:
# Run the 'sum' estimator. No gene list is passed, so (per the log output) all
# genes are used.
model.compute_estimate(estimator='sum')

2023-06-29 17:38:18 2614606 INFO     compute_estimate: gene_list is None, using all genes...


In [26]:
# Write the transposed 'sum' estimates to CSV as the pseudobulk table.
# NOTE(review): presumably one axis is genes and the other the donor/guide
# groups — confirm the orientation after the transpose.
model.estimates['sum'].T.to_csv(data_path + 'tfko140/pseudobulks.csv')