**Requires**
* `'sciplex3_matched_genes_lincs.h5ad'`
* `'sciplex3_lincs_genes.h5ad'`
* `'trapnell_final_V7.h5ad'`

**Output**
* `'trapnell_cpa.h5ad'`
* `'trapnell_cpa_subset.h5ad'`

## Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rdkit 
import scanpy as sc

from rdkit import Chem
from compert.paths import DATA_DIR, PROJECT_DIR

sc.set_figure_params(dpi=100, frameon=False)
sc.logging.print_header()

scanpy==1.9.0.dev41+g58f4904c anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 pynndescent==0.5.2


In [2]:
%load_ext autoreload
%autoreload 2

## Load data

In [3]:
# adata_cpa = sc.read(DATA_DIR/'sciplex3_old_reproduced.h5ad')
adata_cpa = sc.read(PROJECT_DIR/'datasets'/'sciplex3_lincs_genes.h5ad')
adata_cpi = sc.read(PROJECT_DIR/'datasets'/'trapnell_final_V7.h5ad')

Determine output directory

In [34]:
adata_out = PROJECT_DIR / 'datasets' / 'trapnell_cpa_lincs_genes.h5ad'
adata_out_subset = PROJECT_DIR / 'datasets' / 'trapnell_cpa_subset_lincs_genes.h5ad'

Overview over adata files

In [5]:
# adata_cpa

In [6]:
# adata_cpi

## Check and align `.obs_names`

In [7]:
# adata_cpa.obs_names

In [8]:
# adata_cpi.obs_names

In [9]:
def del_extensions(string, num):
    return '-'.join(string.split('-')[:-num])


In [10]:
for name in adata_cpi.obs_names: 
    print(del_extensions(name, 1))
    break

F05_F10_RT_BC_353_Lig_BC_93-0


Delte merging atefacts from adata obs names

In [11]:
# adata_cpi
new_index = adata_cpi.obs_names.to_series().apply(lambda s: del_extensions(s, 1))
adata_cpi.obs.set_index(new_index, inplace=True)

In [12]:
# adata_cpa
new_index = adata_cpa.obs_names.to_series().apply(lambda s: del_extensions(s, 2))
adata_cpa.obs.set_index(new_index, inplace=True)

In [13]:
# adata_cpa.obs_names

Check that `obs_names` are identical, if identical output is `0`

In [14]:
((adata_cpi.obs_names == adata_cpa.obs_names)-1).sum()

0

## Check columns in `adata_{cpi,cpa}.obs`

In [15]:
cond_cpa = adata_cpa.obs.columns.isin(adata_cpi.obs.columns)
cond_cpi = adata_cpi.obs.columns.isin(adata_cpa.obs.columns)

In [16]:
print(f'Shared columns:\n\n\t {adata_cpa.obs.columns[cond_cpa].values}\n\n')
print(f'Unique columns for CPI compared to CPA:\n\n\t {adata_cpa.obs.columns[~cond_cpa].values}')

Shared columns:

	 ['cell_type' 'dose' 'dose_character' 'dose_pattern' 'g1s_score'
 'g2m_score' 'pathway' 'pathway_level_1' 'pathway_level_2' 'product_dose'
 'product_name' 'proliferation_index' 'replicate' 'size_factor' 'target'
 'vehicle' 'n_counts' 'condition' 'control' 'split']


Unique columns for CPI compared to CPA:

	 ['batch' 'dose_val' 'drug_dose_name' 'cov_drug_dose_name' 'cov_drug'
 'split_all' 'ct_dose' 'split1' 'split2' 'split3' 'split4' 'split5'
 'split6' 'split7' 'split8' 'split9' 'split10' 'split11' 'split12'
 'split13' 'split14' 'split15' 'split16' 'split17' 'split18' 'split19'
 'split20' 'split21' 'split22' 'split23' 'split24' 'split25' 'split26'
 'split27' 'split28']


In [17]:
print(f'Shared columns:\n\n\t {adata_cpi.obs.columns[cond_cpi].values}\n\n')
print(f'Unique columns for CPA compared to CPI:\n\n\t {adata_cpi.obs.columns[~cond_cpi].values}')

Shared columns:

	 ['cell_type' 'dose' 'dose_character' 'dose_pattern' 'g1s_score'
 'g2m_score' 'pathway' 'pathway_level_1' 'pathway_level_2' 'product_dose'
 'product_name' 'proliferation_index' 'replicate' 'size_factor' 'target'
 'vehicle' 'n_counts' 'condition' 'split' 'control']


Unique columns for CPA compared to CPI:

	 ['log_counts' 'n_genes' 'pert_iname' 'SMILES' 'DrugBank_ID' 'Age' 'Gender'
 'Disease' 'Histology' 'mutRate' 'split_train_test'
 'split_OOD_max_dose_A549' 'split_OOD_max_dose_MCF7'
 'split_OOD_max_dose_K562' 'split_OOD_max_dose' 'split_OOD_mean_dose_A549'
 'split_OOD_mean_dose_MCF7' 'split_OOD_mean_dose_K562'
 'split_OOD_mean_dose' 'n_counts_hvg' 'n_genes_hvg' 'sum_gene_exp'
 'mol_weight' 'ATC_level_1' 'ATC_level_1_abb' 'target_gene_names'
 'target_names' 'logP' 'logS' 'water_solubility' 'molecular_formula'
 'polar_surface_area' 'refractivity' 'polarizability'
 'rotatable_bond_count' 'h_bond_acceptor_count' 'h_bond_donor_count'
 'pKa_strongest_acidic' 'pKa_stronges

## Add SMILES from CPI to CPA 
This is possible since the index of the two adatas are identical (section above)

In [18]:
# Stats on SMILES comlun
adata_cpi.obs.SMILES.value_counts()

                                                                                                 6464
Cl.Cl.C1[C@@H](NC2CCNCC2)[C@@H]1C1=CC=CC=C1 |r,c:14,16,t:12|                                     1868
COC(=O)C1=CC=C2N(CCCC3=CC=CC=C3)C(NC(=O)C3=CC=CC=C3)=NC2=C1 |c:14,16,25,27,29,32,t:4,6,12,23|    1868
CCS(=O)(=O)N1CC(CC#N)(C1)N1C=C(C=N1)C1=NC=NC2=C1C=CN2 |c:14,16,21,23,26,t:19|                    1862
CCN(CC)C(=O)C(\C#N)=C\C1=CC(=C(O)C(O)=C1)[N+]([O-])=O |c:17,t:11,13|                             1853
                                                                                                 ... 
O=C(NC1CC1)NC1=CNN=C1C1=NC2=CC(CN3CCOCC3)=CC=C2N1 |c:11,26,28,t:8,14,16|                          910
CC1CCCC2(C)OC2CC(OC(=O)CC(O)C(C)(C)C(=O)C(C)C1O)\C(C)=C\C3=CSC(=N3)C                              757
Cl.CN1CCC(C(O)C1)C2=C3OC(=CC(=O)C3=C(O)C=C2O)C4=C(Cl)C=CC=C4                                      693
CC1CCCC2OC2CC(OC(=O)CC(O)C(C)(C)C(=O)C(C)C1O)\C(C)=C\C3=CSC(=N3)C                 

Add SMILES

In [19]:
adata_cpa.obs['SMILES'] = adata_cpi.obs.SMILES

## Check that SMILES match `obs.condition` data

Print some stats on the `condition` columns

In [20]:
print(f'We have {len(list(adata_cpa.obs.condition.value_counts().index))} drug names in adata_cpa: \n\n\t{list(adata_cpa.obs.condition.value_counts().index)}\n\n')
print(f'We have {len(list(adata_cpi.obs.condition.value_counts().index))} drug names in adata_cpi: \n\n\t{list(adata_cpi.obs.condition.value_counts().index)}')

We have 188 drug names in adata_cpa: 

	['control', 'ENMD-2076', 'BRD4770', 'GSK-LSD1', 'Baricitinib', 'Entacapone', 'RG108', 'WP1066', 'Curcumin', 'Capecitabine', 'Mesna', 'Tubastatin', 'Tranylcypromine', 'Busulfan', 'Cerdulatinib', 'Tofacitinib', 'PD98059', 'AICAR', 'SRT2104', 'Tacedinaline', 'Valproic', 'Triamcinolone', 'CEP-33779', 'Clevudine', 'Anacardic', 'MK-0752', 'Filgotinib', 'Altretamine', 'NVP-BSK805', 'GSK', 'UNC0631', 'Tazemetostat', 'WHI-P154', 'Sirtinol', 'PJ34', 'Sodium', 'Daphnetin', 'Ofloxacin', 'AG-490', 'Meprednisone', 'S3I-201', 'IOX2', 'JNJ-26854165', 'Motesanib', 'Zileuton', 'MC1568', 'Streptozotocin', 'INO-1001', 'Fasudil', 'Ramelteon', 'Entinostat', 'Selisistat', 'Aminoglutethimide', 'S-Ruxolitinib', 'Costunolide', 'A-366', 'Andarine', 'Fluorouracil', 'Quercetin', 'Fulvestrant', 'TGX-221', 'UNC1999', 'Tie2', 'Ki8751', 'UNC0379', 'Resminostat', 'AC480', 'AG-14361', 'PD173074', 'Divalproex', 'EED226', 'Maraviroc', 'Ki16425', 'CUDC-101', 'Carmofur', 'SL-327', 'Th

Check that assigned SMILES match the condition,  
it should be just one smiles string per condition

### Check for nans

In [21]:
cond = adata_cpa.obs.condition=='nan'
nan_conditions = adata_cpi[adata_cpa[cond].obs_names].obs.condition.value_counts().index.values
print(nan_conditions)
for condition in nan_conditions: 
    cond = adata_cpa.obs.condition == condition
    print(f'\nFor {condition}, we have {cond.sum()} entries in adata_cpa')

[], Categories (0, object): []


Reassign nan conditions

In [22]:
cond = adata_cpa.obs.condition=='nan'
print(f'Before reassignment: #nan values in adata_cpa: {cond.sum()}')
adata_cpa.obs.condition = list(adata_cpa.obs.condition)
adata_cpa.obs.loc[cond, 'condition'] = adata_cpi.obs.condition[cond]
print(f'After reassignment: #nan values in adata_cpa: {(adata_cpa.obs.condition=="nan").sum()}')
adata_cpa.obs.condition.astype('category')
adata_cpa.obs.condition

Before reassignment: #nan values in adata_cpa: 0
After reassignment: #nan values in adata_cpa: 0


index
F05_F10_RT_BC_353_Lig_BC_93-0            WP1066
H02_E09_RT_BC_302_Lig_BC_102-1       Gandotinib
B04_E09_RT_BC_325_Lig_BC_104-1       BMS-536924
F05_F10_RT_BC_193_Lig_BC_130           GSK-LSD1
A01_F10_RT_BC_381_Lig_BC_47             KW-2449
                                      ...      
F05_E09_RT_BC_261_Lig_BC_62-0            Aurora
F10_E09_RT_BC_317_Lig_BC_213-0        Anacardic
D04_F10_RT_BC_54_Lig_BC_70-1       Temsirolimus
C10_E09_RT_BC_25_Lig_BC_294-1           ABT-737
F06_F10_RT_BC_137_Lig_BC_154-1    S-Ruxolitinib
Name: condition, Length: 290888, dtype: object

Check for specific conditions that have been nans in other trapnell datasets

In [23]:
for condition in ['SRT1720', 'Alvespimycin', 'YM155']: 
    cond = adata_cpa.obs.condition == condition
    print(f'\nFor {condition}, we have {cond.sum()} entries in adata_cpa')


For SRT1720, we have 1512 entries in adata_cpa

For Alvespimycin, we have 930 entries in adata_cpa

For YM155, we have 394 entries in adata_cpa


Control has an empty string

### Take care of `control` SMILES

In [24]:
counts = adata_cpa[adata_cpa.obs.condition=='control'].obs.SMILES.value_counts()
list(counts.index[counts>0])

['']

Add DMSO SMILES:`CS(C)=O`

In [25]:
adata_cpa.obs["SMILES"] = adata_cpa.obs["SMILES"].cat.rename_categories({"": "CS(C)=O"})

In [26]:
adata_cpa.obs.loc[adata_cpi.obs.condition=='control', 'SMILES'].value_counts()

CS(C)=O                                                                                                                6464
Cl.O=S(=O)(N1CCCNCC1)C1=C2C=CN=CC2=CC=C1 |c:11,13,15,18,20|                                                               0
Cl.CCS(=O)(=O)N1CCN(CC1)C1=CC=C(NC2=NC(NC3CC3)=C(C=N2)C(N)=O)C=C1 |c:24,26,32,t:12,14,17|                                 0
Cl.CN1CCC2=C(C1)C1=C(C=CC=C1)N2CC1=CC=C(C=C1)C(=O)NO |c:4,10,12,20,22,t:8,18|                                             0
Cl.CN1CCC(C(O)C1)C2=C3OC(=CC(=O)C3=C(O)C=C2O)C4=C(Cl)C=CC=C4                                                              0
                                                                                                                       ... 
CN1C(=O)C(=C2N(C(=O)N(C3CC3)C(=O)C2=C1NC4=CC=C(I)C=C4F)C5=CC(=CC=C5)NC(C)=O)C                                             0
CN1C=C(C2=CC=CC=C12)C1=C(C(=O)NC1=O)C1=CN(C2CCN(CC3=NC=CC=C3)CC2)C2=CC=CC=C12 |c:2,6,30,32,40,t:4,8,12,20,28,38,42|       0
CN1C=C(\

### Check double assigned condition

In [27]:
for pert, df in adata_cpa.obs.groupby('condition'):
    n_smiles = (df.SMILES.value_counts()!=0).sum()
    print(f"{pert}: {n_smiles}") if n_smiles > 1 else None

ENMD-2076: 2


In [28]:
condition = 'ENMD-2076'
cond_cpa = adata_cpa.obs.condition==condition
cond_cpi = adata_cpi.obs.condition==condition
assert np.where(cond_cpa==cond_cpi, 0, 1).sum()==0
counts = adata_cpi[cond_cpi].obs.SMILES.value_counts()
[print(f'For {condition} in adata_cpi, we have this SMILES string: \n\n\t{smiles}\n\n\n')for smiles in list(counts.index[counts>0])];

For ENMD-2076 in adata_cpi, we have this SMILES string: 

	CN1CCN(CC1)C1=CC(NC2=NNC(C)=C2)=NC(\C=C\C2=CC=CC=C2)=N1 |c:16,18,25,27,29,t:8,12,23|



For ENMD-2076 in adata_cpi, we have this SMILES string: 

	O[C@H]([C@@H](O)C(O)=O)C(O)=O.CN1CCN(CC1)C1=NC(\C=C\C2=CC=CC=C2)=NC(NC2=NNC(C)=C2)=C1 |r,c:24,26,28,36,38,t:17,22,32|





Reassign condition two second SMILES from above cell

In [29]:
smiles = list(counts.index[counts>0])[1]
adata_cpa.obs.condition = list(adata_cpa.obs.condition)
adata_cpa.obs.loc[adata_cpa.obs.SMILES==smiles, 'condition'] = 'ENMD-2076_TartaricAcid'
adata_cpa.obs.condition = adata_cpa.obs.condition.astype('category')

Check that condition align with SMILES

If everything is correct there should be no output

In [30]:
for pert, df in adata_cpa.obs.groupby('condition'):
    n_smiles = (df.SMILES.value_counts()!=0).sum()
    print(f"{pert}: {n_smiles}") if n_smiles > 1 else None

## Make SMILES canonical

In [31]:
print(f'rdkit version: {rdkit.__version__}\n')

adata_cpa.obs.SMILES = adata_cpa.obs.SMILES.apply(Chem.CanonSmiles)

rdkit version: 2021.03.2



## Rename weird drug `(+)-JQ1`
This had a different name in the old Sciplex dataset, where it was called `JQ1`. We rename it for consistency.

In [32]:
adata_cpa.obs["condition"] = adata_cpa.obs["condition"].cat.rename_categories({"(+)-JQ1": "JQ1"})

## Create subset `adata_cpa_subset` from `adata_cpa`

In [33]:
adatas = []

for perturbation in np.unique(adata_cpa.obs.condition): 
    tmp = adata_cpa[adata_cpa.obs.condition == perturbation].copy()
    tmp = sc.pp.subsample(tmp, n_obs=40, copy=True)
    adatas.append(tmp)

adata_cpa_subset = adatas[0].concatenate(adatas[1:])
adata_cpa_subset.uns = adata_cpa.uns.copy()

adata_cpa_subset

AnnData object with n_obs × n_vars = 7560 × 977
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val', 'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug', 'control', 'split', 'split_all', 'ct_dose', 'split1', 'split2', 'split3', 'split4', 'split5', 'split6', 'split7', 'split8', 'split9', 'split10', 'split11', 'split12', 'split13', 'split14', 'split15', 'split16', 'split17', 'split18', 'split19', 'split20', 'split21', 'split22', 'split23', 'split24', 'split25', 'split26', 'split27', 'split28', 'SMILES'
    var: 'id', 'gene_id', 'in_lincs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1'
    uns: 'hvg', 'rank_genes_groups_cov', 'splits'

## Safe both adata objects

In [35]:
adata_cpa.write(adata_out)
adata_cpa_subset.write(adata_out_subset)

... storing 'dose_character' as categorical
... storing 'dose_pattern' as categorical
... storing 'pathway' as categorical
... storing 'pathway_level_1' as categorical
... storing 'pathway_level_2' as categorical
... storing 'product_dose' as categorical
... storing 'product_name' as categorical
... storing 'target' as categorical
... storing 'condition' as categorical
... storing 'drug_dose_name' as categorical
... storing 'cov_drug_dose_name' as categorical
... storing 'cov_drug' as categorical
... storing 'split' as categorical
... storing 'split_all' as categorical
... storing 'ct_dose' as categorical
... storing 'split1' as categorical
... storing 'split2' as categorical
... storing 'split3' as categorical
... storing 'split4' as categorical
... storing 'split5' as categorical
... storing 'split6' as categorical
... storing 'split7' as categorical
... storing 'split8' as categorical
... storing 'split9' as categorical
... storing 'split10' as categorical
... storing 'split11' as c

### Loading the result for `adata_out`

In [36]:
adata = sc.read(adata_out_subset)
adata.obs.dose.value_counts()

10.0       2085
100.0      1927
1000.0     1845
10000.0    1663
0.0          40
Name: dose, dtype: int64

In [37]:
adata

AnnData object with n_obs × n_vars = 7560 × 977
    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'dose_val', 'condition', 'drug_dose_name', 'cov_drug_dose_name', 'cov_drug', 'control', 'split', 'split_all', 'ct_dose', 'split1', 'split2', 'split3', 'split4', 'split5', 'split6', 'split7', 'split8', 'split9', 'split10', 'split11', 'split12', 'split13', 'split14', 'split15', 'split16', 'split17', 'split18', 'split19', 'split20', 'split21', 'split22', 'split23', 'split24', 'split25', 'split26', 'split27', 'split28', 'SMILES'
    var: 'id', 'gene_id', 'in_lincs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1'
    uns: 'hvg', 'rank_genes_groups_cov', 'splits'

In [38]:
adata_out

PosixPath('/storage/groups/ml01/projects/2021_chemicalCPA_leon.hetzel/datasets/trapnell_cpa_lincs_genes.h5ad')