In [1]:
import pandas as pd
import scanpy as sc
# Configure scanpy's figure defaults for this notebook: 100 dpi, no axes frame.
sc.set_figure_params(dpi=100, frameon=False)
# Print the versions of scanpy and its main dependencies (recorded in the cell output).
sc.logging.print_header()

scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 pynndescent==0.5.2


In [2]:
import os
# Move the working directory one level up; presumably so that the relative
# `datasets/...` paths and the `compert` package resolve -- confirm.
# NOTE(review): the path is relative to the current directory, so re-running
# this cell moves up one more level each time.
os.chdir('./../')
# NOTE(review): `rank_genes_groups_by_cov` is not used in the cells visible here.
from compert.helper import rank_genes_groups_by_cov

Using backend: pytorch


In [3]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/scanpy deprecation and chained-assignment
# warnings raised by the cells below.
warnings.filterwarnings('ignore')

In [4]:
# Configuration: which LINCS file to preprocess and whether to load it at all.
full = True
load_adata = True
adata_in = 'datasets/lincs_full.h5ad' if full else 'datasets/lincs.h5ad'
adata = sc.read(adata_in) if load_adata else None

# Output path = input path with '_pp' inserted before the extension.
# os.path.splitext strips only the final extension, so dots elsewhere in the
# path (e.g. './datasets/...' or 'data.v2/...') are preserved; the previous
# split('.')/join approach silently removed them.
adata_out = os.path.splitext(adata_in)[0] + '_pp.h5ad'
adata_out

'datasets/lincs_full_pp.h5ad'

In [5]:
# Perturbation name. '/' is replaced by '|' as a literal (non-regex) substitution,
# so the result does not depend on pandas' default for `regex`.
adata.obs['condition'] = adata.obs['pert_iname'].str.replace('/', '|', regex=False)

# Covariate (cell line) and dose columns under the names used by the rest of the notebook.
adata.obs['cell_type'] = adata.obs['cell_id']
adata.obs['dose_val'] = adata.obs['pert_dose']

# Composite key '<cell_type>_<condition>_<dose_val>'. It is built from the already
# sanitised `condition`, so it needs no separate '/' replacement.
adata.obs['cov_drug_dose_name'] = (
    adata.obs.cell_type.astype(str)
    + '_' + adata.obs.condition.astype(str)
    + '_' + adata.obs.dose_val.astype(str)
)

# 1 for DMSO observations (used as the control condition), 0 otherwise.
adata.obs['control'] = (adata.obs['condition'] == 'DMSO').astype(int)

In [6]:
# Number of observations for every (condition, cell_type) pair.
pd.crosstab(index=adata.obs['condition'], columns=adata.obs['cell_type'])

cell_type,A375,A549,A673,AGS,ASC,ASC.C,BT20,CD34,CL34,CORL23,...,SW620,SW948,T3M10,THP1,TYKNU,U266,U937,VCAP,WSUDLCL2,YAPC
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(+)-3-(1-propyl-piperidin-3-yl)-phenol,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"1,2,3,4,5,6-hexabromocyclohexane",5,5,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"1,2,3,4-tetrahydroisoquinoline",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"1,2-dichlorobenzene",3,6,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,12,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zonisamide,23,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9,0,18
zopiclone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zosuquidar,20,6,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,10,0,17
zoxazolamine,6,5,0,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,10,0,0


In [7]:
# Observations per condition, and the conditions that have more than 5 of them.
drug_abundance = adata.obs['condition'].value_counts()
has_enough_obs = drug_abundance > 5
suff_drug_abundance = drug_abundance[has_enough_obs].index

In [8]:
# Drop conditions with an insufficient number of observations.
keep_obs = adata.obs['condition'].isin(suff_drug_abundance)
adata = adata[keep_obs].copy()
adata

AnnData object with n_obs × n_vars = 1023036 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'cydata_pull'

Compute the differentially expressed (DE) genes manually, per condition, so that every covariate/dose combination of a given condition shares the same set of DE genes.

In [9]:
%%time
from tqdm.notebook import tqdm
import numpy as np 

de_genes = {}
de_genes_quick = {}

adata_df = adata.to_df()
adata_df['condition'] = adata.obs.condition
dmso = adata_df[adata_df.condition == "DMSO"].mean()

for cond, df in tqdm(adata_df.groupby('condition')): 
    if cond != 'DMSO':
        drug_mean = df.mean()
        de_50_idx = np.argsort(abs(drug_mean-dmso))[-50:]
        de_genes_quick[cond] = drug_mean.index[de_50_idx].values

if full: 
    de_genes = de_genes_quick

else:
    sc.tl.rank_genes_groups(
        adata,
        groupby='condition', 
        reference='DMSO',
        rankby_abs=True,
        n_genes=50
    )
    for cond in tqdm(np.unique(adata.obs['condition'])):
        if cond != 'DMSO':
            df = sc.get.rank_genes_groups_df(adata, group=cond)  # this takes a while
            de_genes[cond] = df['names'][:50].values


  0%|          | 0/17990 [00:00<?, ?it/s]

CPU times: user 41.5 s, sys: 877 ms, total: 42.4 s
Wall time: 42.4 s


Mapping the `cov_drug_dose_name` keys back to drug names (needed to build `rank_genes_groups_cov`) might cause problems when the drug name contains '_'.

In [10]:
def extract_drug(cond):
    """Return the drug part of a '<cell_type>_<drug>_<dose>' key.

    Everything up to the first '_' and everything after the last '_' is
    discarded, so underscores inside the drug name are kept. Keys with fewer
    than three '_'-separated fields yield an empty string.
    """
    _cell_type, _, remainder = cond.partition('_')
    drug, _, _dose = remainder.rpartition('_')
    return drug

# How many '_'-separated fields each composite key has (3 is the unambiguous case).
adata.obs['cov_drug_dose_name'].str.split('_').str.len().value_counts()

3    1022382
4        654
Name: cov_drug_dose_name, dtype: int64

In [11]:
# Map every non-DMSO '<cell_type>_<drug>_<dose>' key to the DE genes of its drug.
# The drug is read from the `condition` column of the same row rather than
# parsed back out of the composite string, so the mapping stays correct even if
# a cell type or a drug name contains '_'.
cov_and_drug = adata.obs[['cov_drug_dose_name', 'condition']].drop_duplicates()
adata.uns['rank_genes_groups_cov'] = {
    cov: de_genes_quick[drug]
    for cov, drug in zip(cov_and_drug['cov_drug_dose_name'], cov_and_drug['condition'])
    if drug != 'DMSO'
}

In [12]:
# Every observation starts in 'train'; 'ood' and 'test' are carved out below.
adata.obs['split'] = 'train'

# take ood from top occurring perturbations to avoid losing data on low occ ones
# (ranks 1..49 by abundance; rank 0 is skipped -- presumably the DMSO control, confirm)
top_conditions = adata.obs.condition.value_counts().index[1:50]
ood_idx = sc.pp.subsample(
    adata[adata.obs.condition.isin(top_conditions)],
    .1,
    copy=True
).obs.index
# Single .loc[rows, column] assignment instead of chained `obs['split'].loc[...] = ...`,
# which can end up writing to a temporary copy and leave `adata.obs` unchanged.
adata.obs.loc[ood_idx, 'split'] = 'ood'

# take test from a random subsampling of the rest
# NOTE(review): no explicit random_state is passed to sc.pp.subsample; confirm the
# library default gives the reproducibility you need.
test_idx = sc.pp.subsample(
    adata[adata.obs.split != 'ood'],
    .16,
    copy=True
).obs.index
adata.obs.loc[test_idx, 'split'] = 'test'

In [13]:
# Number of observations per (split, condition) pair.
pd.crosstab(index=adata.obs['split'], columns=adata.obs['condition'])

condition,(+)-3-(1-propyl-piperidin-3-yl)-phenol,"(+|-)-7-hydroxy-2-(N,N-di-n-propylamino)tetralin","1,2,3,4,5,6-hexabromocyclohexane","1,2,3,4-tetrahydroisoquinoline","1,2-dichlorobenzene","1,2-propylene-glycol",1-benzylimidazole,1-methylisoquinoline,1-monopalmitin,1-phenylbiguanide,...,ziprasidone,zofenopril-calcium,zolantidine,zolmitriptan,zolpidem,zonisamide,zopiclone,zosuquidar,zoxazolamine,zuclopenthixol
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ood,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
test,5,0,8,4,13,5,9,6,7,3,...,37,23,12,25,30,38,2,31,13,15
train,13,18,29,14,58,22,40,12,27,25,...,152,103,52,87,175,185,10,151,79,68


In [14]:
# Remove the scanpy ranking result (only present when the non-`full` path ran)
# before saving -- it is too large. An explicit membership test replaces the
# bare `except:`, which would also have hidden unrelated errors.
if 'rank_genes_groups' in adata.uns:
    del adata.uns['rank_genes_groups']  # too large
else:
    print('All good.')

All good.


In [15]:
# code compatibility
from scipy import sparse
adata.X = sparse.csr_matrix(adata.X)

In [16]:
# Write to the path derived from the input file (`adata_out`) rather than a
# hard-coded name, so running with `full = False` does not overwrite the
# preprocessed full dataset.
sc.write(adata_out, adata)

... storing 'condition' as categorical
... storing 'cov_drug_dose_name' as categorical
... storing 'split' as categorical


In [17]:
# Progress marker printed once the cells above, up to and including the write, have run.
print('all done.')

all done.


### Check that `adata.uns['rank_genes_groups_cov']` has all non-DMSO entries of `adata.obs.cov_drug_dose_name` as keys

In [18]:
# Report every non-DMSO composite key that has no DE-gene entry.
# Nothing is printed when the mapping is complete. An explicit membership test
# replaces the bare `except:`, so a genuinely missing 'rank_genes_groups_cov'
# raises instead of being reported as thousands of missing keys.
de_gene_map = adata.uns['rank_genes_groups_cov']
for i, k in enumerate(adata.obs.cov_drug_dose_name.unique()):
    if k not in de_gene_map and 'DMSO' not in k:
        print(f"{i}: {k}")

### Checking the same for the stored adata object

In [19]:
# Reload the preprocessed file from the path derived from the input file
# (`adata_out`) instead of a hard-coded name, so the check below inspects the
# file that belongs to the current `full` setting.
adata_2 = sc.read(adata_out)

In [20]:
# Same completeness check on the object read back from disk: report every
# non-DMSO composite key without a DE-gene entry. An explicit membership test
# replaces the bare `except:` so unrelated errors are not swallowed.
de_gene_map_2 = adata_2.uns['rank_genes_groups_cov']
for i, k in enumerate(adata_2.obs.cov_drug_dose_name.unique()):
    if k not in de_gene_map_2 and 'DMSO' not in k:
        print(f"{i}: {k}")

In [21]:
# Keys present in the in-memory mapping but missing from the reloaded one.
set(adata.uns['rank_genes_groups_cov']).difference(adata_2.uns['rank_genes_groups_cov'])

set()

In [22]:
# Keys present in the reloaded mapping but missing from the in-memory one.
set(adata_2.uns['rank_genes_groups_cov']).difference(adata.uns['rank_genes_groups_cov'])

set()