In [1]:
import os 
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
# Figure defaults for scanpy plots: 80 dpi and no frame around the axes.
sc.set_figure_params(dpi=80, frameon=False)
# Print the versions of scanpy and its core dependencies (recorded in the cell output).
sc.logging.print_header()
# Show the working directory; the relative '../datasets/...' paths used below are resolved against it.
os.getcwd()

scanpy==1.8.0.dev78+gc488909a anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0


'/home/icb/leon.hetzel/git/CPA_graphs/notebooks'

In [2]:
%load_ext autoreload
%autoreload 2

### Loading LINCS and reference data

In [3]:
# Toggle between the full LINCS dataset and the smaller default one.
full = False

# Input file and the matching output file (same name with a `_smiles` suffix).
adata_in, adata_out = (
    ('../datasets/lincs_full.h5ad', '../datasets/lincs_full_smiles.h5ad')
    if full
    else ('../datasets/lincs.h5ad', '../datasets/lincs_smiles.h5ad')
)
adata = sc.read(adata_in)


Checking number of drugs for LINCS

In [4]:
# Distinct perturbation ids found in the LINCS observations.
pert_id_unique = pd.Series(np.unique(adata.obs['pert_id']))
n_perturbations = len(pert_id_unique)
print(f"# of unique perturbations: {n_perturbations}")

# of unique perturbations: 1120


Loading reference dataframe and restricting to `'pert_id'` and `'canonical_smiles'`

In [5]:
# Read the tab-separated perturbation info table.
pert_info = pd.read_csv('../datasets/GSE92742_Broad_LINCS_pert_info.txt', sep="\t")
# Keep only perturbations that occur in LINCS, and only the id / SMILES columns.
is_lincs_pert = pert_info['pert_id'].isin(pert_id_unique)
reference_df = pert_info.loc[is_lincs_pert, ['pert_id', 'canonical_smiles']]
# Frequency of each SMILES entry; placeholder values show up at the top of the output.
reference_df['canonical_smiles'].value_counts()

-666                                                                                                                                                                             6
restricted                                                                                                                                                                       2
CS(=O)(=O)CCNCc1ccc(o1)-c1ccc2ncnc(Nc3ccc(OCc4cccc(F)c4)c(Cl)c3)c2c1                                                                                                             2
Oc1ccc2c(c1)oc3c2c(=O)oc4cc(O)ccc34                                                                                                                                              1
CCOC(=O)CCSc1nnc(S)s1                                                                                                                                                            1
                                                                                                         

In [6]:
# True for every LINCS perturbation id that has no row in the reference table.
cond = ~pert_id_unique.isin(reference_df['pert_id'])
n_total, n_missing = len(pert_id_unique), cond.sum()
print(f"From {n_total} total drugs, {n_missing} were not part of the reference dataframe.")

From 1120 total drugs, 132 were not part of the reference dataframe.


Adding `'canonical_smiles'` column to `adata.obs` via `pd.merge`

In [7]:
# Left-join the SMILES strings onto the observations by perturbation id.
# - `on='pert_id'` pins the join key instead of relying on whichever columns
#   the two frames happen to share.
# - `validate='many_to_one'` raises if the reference table lists a `pert_id`
#   more than once, which would otherwise duplicate observation rows.
# - Dropping an existing 'canonical_smiles' column keeps the cell re-runnable.
# The observation index is moved into an 'index' column for the merge and
# restored afterwards, because `merge` does not preserve the left index.
adata.obs = (
    adata.obs
    .drop(columns='canonical_smiles', errors='ignore')
    .reset_index()
    .merge(reference_df, how="left", on='pert_id', validate='many_to_one')
    .set_index('index')
)

Removing invalid SMILES strings 

In [8]:
# Cast to str so that missing values become the literal 'nan' and can be
# matched together with the other placeholder entries.
smiles_as_str = adata.obs['canonical_smiles'].astype('str')
adata.obs['canonical_smiles'] = smiles_as_str
invalid_smiles = smiles_as_str.isin(['-666', 'restricted', 'nan'])

n_obs = len(adata)
n_invalid = invalid_smiles.sum()
print(f'Among {n_obs} observations, {100*n_invalid/n_obs:.2f}% ({n_invalid}) do not have a valid SMILES string')

# Keep only the observations with a usable SMILES entry.
adata = adata[~invalid_smiles]

Among 199620 observations, 11.54% (23033) do not have a valid SMILES string


Remove invalid `'pert_dose'` value: `-666`

In [9]:
# -666 is the placeholder for an invalid dose; flag those observations.
cond = adata.obs['pert_dose'].isin([-666])
n_invalid_dose = cond.sum()
adata = adata[~cond]
print(f"A total of {n_invalid_dose} observations have invalid dose values")

A total of 0 observations have invalid dose values


In [10]:
# Remove drugs with too few observations.
# A drug (identified by its SMILES string) is kept only if it occurs in at
# least `min_replicates` observations.
min_replicates = 6

smiles_counts = adata.obs.canonical_smiles.value_counts()
# True for drugs that fall below the replicate threshold.
drugs_validation = smiles_counts < min_replicates
valid_drugs = drugs_validation.index[~drugs_validation]

# Observation-level mask: True if the observation's drug has enough replicates.
cond = adata.obs.canonical_smiles.isin(valid_drugs)
print(f"A total of {(~cond).sum()} observation belong to drugs which do not have enough replicates")
adata = adata[cond]

A total of 0 observation belong to drugs which do not have enough replicates


Checking that SMILES are valid according to `rdkit` 

In [11]:
from rdkit import Chem

def check_smiles(smiles):
    """Return whether `smiles` can be parsed and sanitized by rdkit.

    Parameters
    ----------
    smiles
        SMILES string passed to `Chem.MolFromSmiles`.

    Returns
    -------
    bool
        `False` if rdkit cannot parse the string (prints 'invalid SMILES') or
        if `Chem.SanitizeMol` raises on the parsed molecule (prints
        'invalid chemistry'); `True` otherwise.
    """
    # Parse without sanitization first so that syntax problems and chemistry
    # problems are reported separately.
    m = Chem.MolFromSmiles(smiles, sanitize=False)
    if m is None:
        print('invalid SMILES')
        return False
    try:
        Chem.SanitizeMol(m)
    except Exception:
        # Catch `Exception` rather than using a bare `except`, so that
        # KeyboardInterrupt / SystemExit still propagate.
        print('invalid chemistry')
        return False
    return True

def remove_invalid_smiles(dataframe, smiles_key: str = 'SMILES', return_condition: bool = False):
    """Filter `dataframe` down to rows whose SMILES pass `check_smiles`.

    Each distinct SMILES string is validated once and the verdict is then
    looked up for every row.

    Parameters
    ----------
    dataframe
        DataFrame holding a SMILES column.
    smiles_key
        Name of the SMILES column.
    return_condition
        If True, return the row-wise boolean mask instead of the filtered frame.

    Returns
    -------
    The boolean mask (one entry per row) when `return_condition` is True,
    otherwise a copy of `dataframe` restricted to the valid rows.
    """
    smiles_column = dataframe[smiles_key]

    # Validate every distinct SMILES string exactly once.
    distinct_smiles = pd.Series(np.unique(smiles_column))
    is_valid = distinct_smiles.apply(check_smiles)
    print(f"A total of {(~is_valid).sum()} have invalid SMILES strings")

    # Broadcast the per-drug verdict back onto the rows.
    verdict_by_smiles = {smiles: ok for smiles, ok in zip(distinct_smiles, is_valid)}
    row_mask = smiles_column.apply(lambda smiles: verdict_by_smiles[smiles])

    if return_condition:
        return row_mask
    return dataframe[row_mask].copy()

# Display the AnnData summary before the rdkit-based SMILES filtering.
adata

View of AnnData object with n_obs × n_vars = 176587 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'rank_genes_groups_cov'

In [12]:
# Row mask of observations whose SMILES string passes the rdkit checks.
cond = remove_invalid_smiles(
    dataframe=adata.obs,
    smiles_key='canonical_smiles',
    return_condition=True,
)
# Restrict the AnnData object to those observations and show the result.
adata = adata[cond]
adata

A total of 0 have invalid SMILES strings


View of AnnData object with n_obs × n_vars = 176587 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'rank_genes_groups_cov'

In [13]:
# Write the filtered AnnData object to the `adata_out` path chosen when loading,
# then display its summary.
adata.write(adata_out)
adata

Trying to set attribute `.obs` of view, copying.
... storing 'pert_id' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'canonical_smiles' as categorical


AnnData object with n_obs × n_vars = 176587 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'canonical_smiles'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'rank_genes_groups_cov'

In [14]:
# Marker that the cells up to and including the export ran to completion.
print('Finished')

Finished


### Add additional drugbank info to `adata.obs`

In [None]:
from os.path import exists

# Location of the DrugBank export used to annotate the LINCS drugs.
drugbank_path = '../datasets/drug_bank/drugbank_all.csv'

# Fail here with a clear error if the file is missing. Only printing a message
# would leave `drugbank_df` undefined and make the following cells fail with
# an unrelated NameError.
if not exists(drugbank_path):
    raise FileNotFoundError(f'Invalid path: {drugbank_path}')
drugbank_df = pd.read_csv(drugbank_path)

In [None]:
from rdkit.Chem import CanonSmiles

# Bring both SMILES collections into rdkit's canonical form so that the same
# molecule written in two different ways still compares equal.
drugs_canonical = pd.Series(np.unique(adata.obs.canonical_smiles)).apply(CanonSmiles)

# NOTE(review): assumes DrugBank rows may lack a SMILES entry - confirm against the CSV.
# `na_action='ignore'` leaves such missing values as NaN instead of passing
# them to rdkit; the result keeps the index of `drugbank_df`, so it can still
# be used to build a row mask for that frame.
db_canonical_smiles = drugbank_df.SMILES.map(CanonSmiles, na_action='ignore')

n_overlap = drugs_canonical.isin(db_canonical_smiles).sum()
print(f'From a total of {len(drugs_canonical)}, {100*n_overlap/len(drugs_canonical):.2f}% ({n_overlap}) is also available in drugbank.')

In [None]:
# DrugBank rows whose canonical SMILES also occurs among the LINCS drugs.
cond = db_canonical_smiles.isin(drugs_canonical)
# Count how often each 'ATC_level_1' value occurs among those rows.
atc_level_1 = drugbank_df.loc[cond, ['ATC_level_1']]
atc_level_1.value_counts()

In [15]:
# Display the observation metadata, which now carries the 'canonical_smiles' column.
adata.obs

Unnamed: 0_level_0,cell_id,det_plate,det_well,lincs_phase,pert_dose,pert_dose_unit,pert_id,pert_iname,pert_mfc_id,pert_time,...,rna_plate,rna_well,batch,condition,cell_type,dose_val,cov_drug_dose_name,control,split,canonical_smiles
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
REP.A001_A375_24H_X1_B22:B13-2-0,A375,REP.A001_A375_24H_X1_B22,B13,2,10.000000,um,BRD-A25234499,aminoglutethimide,BRD-A25234499-001-18-3,24.0,...,,,0,aminoglutethimide,A375,10.000000,A375_aminoglutethimide_10.0,0,train,CCC1(CCC(=O)NC1=O)c1ccc(N)cc1
REP.A001_A375_24H_X1_B22:B14-2-0,A375,REP.A001_A375_24H_X1_B22,B14,2,3.333330,um,BRD-A25234499,aminoglutethimide,BRD-A25234499-001-18-3,24.0,...,,,0,aminoglutethimide,A375,3.333330,A375_aminoglutethimide_3.33333,0,train,CCC1(CCC(=O)NC1=O)c1ccc(N)cc1
REP.A001_A375_24H_X1_B22:B15-2-0,A375,REP.A001_A375_24H_X1_B22,B15,2,1.111110,um,BRD-A25234499,aminoglutethimide,BRD-A25234499-001-18-3,24.0,...,,,0,aminoglutethimide,A375,1.111110,A375_aminoglutethimide_1.11111,0,train,CCC1(CCC(=O)NC1=O)c1ccc(N)cc1
REP.A001_A375_24H_X1_B22:B16-2-0,A375,REP.A001_A375_24H_X1_B22,B16,2,0.370370,um,BRD-A25234499,aminoglutethimide,BRD-A25234499-001-18-3,24.0,...,,,0,aminoglutethimide,A375,0.370370,A375_aminoglutethimide_0.37037,0,train,CCC1(CCC(=O)NC1=O)c1ccc(N)cc1
REP.A001_A375_24H_X1_B22:B17-2-0,A375,REP.A001_A375_24H_X1_B22,B17,2,0.123457,um,BRD-A25234499,aminoglutethimide,BRD-A25234499-001-18-3,24.0,...,,,0,aminoglutethimide,A375,0.123457,A375_aminoglutethimide_0.123457,0,train,CCC1(CCC(=O)NC1=O)c1ccc(N)cc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PCLB003_PC3_24H_X3_B13:A04-1-1,PC3,,,1,1.000000,-666,DMSO,DMSO,,24.0,...,PCLB003_PC3_24H_X3,A04,1,DMSO,PC3,1.000000,PC3_DMSO_1.0,1,test,CS(=O)C
PCLB003_PC3_24H_X3_B13:A05-1-1,PC3,,,1,1.000000,-666,DMSO,DMSO,,24.0,...,PCLB003_PC3_24H_X3,A05,1,DMSO,PC3,1.000000,PC3_DMSO_1.0,1,train,CS(=O)C
PCLB003_PC3_24H_X3_B13:A06-1-1,PC3,,,1,1.000000,-666,DMSO,DMSO,,24.0,...,PCLB003_PC3_24H_X3,A06,1,DMSO,PC3,1.000000,PC3_DMSO_1.0,1,train,CS(=O)C
PCLB003_PC3_24H_X3_B13:B04-1-1,PC3,,,1,1.000000,-666,DMSO,DMSO,,24.0,...,PCLB003_PC3_24H_X3,B04,1,DMSO,PC3,1.000000,PC3_DMSO_1.0,1,test,CS(=O)C
