In [1]:
import os 
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
# Set scanpy's figure parameters: 80 dpi and no frame around the axes.
sc.set_figure_params(dpi=80, frameon=False)
# Print the versions of scanpy and its main dependencies (recorded in the cell output).
sc.logging.print_header()
# Show the working directory; the relative '../datasets/...' paths used below resolve against it.
os.getcwd()

scanpy==1.8.0.dev78+gc488909a anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.1 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0


'/home/icb/leon.hetzel/git/CPA_graphs/notebooks'

In [2]:
# Load IPython's autoreload extension and use mode 2 (reload all modules before
# executing each cell), so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

### Loading LINCS and reference data

In [3]:
# Switch between the `lincs_full` and `lincs` input files.
full = True
# Load the AnnData object selected by the flag above.
adata = sc.read('../datasets/lincs_full.h5ad' if full else '../datasets/lincs.h5ad')
# Matching output path for the SMILES-annotated object written at the end of the notebook.
adata_out = '../datasets/lincs_full_smiles.h5ad' if full else '../datasets/lincs_smiles.h5ad'


Checking number of drugs for LINCS

In [4]:
# Sorted, de-duplicated perturbation IDs found in the LINCS observations.
pert_id_unique = pd.Series(np.unique(adata.obs['pert_id']))
print(f"# of unique perturbations: {len(pert_id_unique)}")

# of unique perturbations: 21304


Loading reference dataframe and restricting to `'pert_id'` and `'canonical_smiles'`

In [5]:
# Read the tab-separated perturbation info table, keep only the rows whose
# `pert_id` occurs in LINCS, and keep only the two columns needed for the SMILES lookup.
reference_df = (
    pd.read_csv('../datasets/GSE92742_Broad_LINCS_pert_info.txt', sep="\t")
    .loc[
        lambda df: df['pert_id'].isin(pert_id_unique),
        ['pert_id', 'canonical_smiles'],
    ]
)
# How often each SMILES string occurs; surfaces placeholder values and duplicates.
reference_df['canonical_smiles'].value_counts()

-666                                                                                                                                                  63
restricted                                                                                                                                            14
CCC1=C[C@@H]2C[N@](C1)Cc1c([nH]c3ccccc13)[C@@](C2)(C(=O)OC)c1cc2c(cc1OC)N(C)[C@@H]1[C@]22CCN3CC=C[C@@](CC)([C@@H]23)[C@@H](OC(C)=O)[C@]1(O)C(=O)OC     2
CN(\N=C\c1cnc2ccc(Br)cn12)S(=O)(=O)c1cc(ccc1C)[N+]([O-])=O                                                                                             2
CS(=O)(=O)CCNCc1ccc(o1)-c1ccc2ncnc(Nc3ccc(OCc4cccc(F)c4)c(Cl)c3)c2c1                                                                                   2
                                                                                                                                                      ..
CC(C)=CCC\C(C)=C\CC\C(C)=C\COP(O)(=O)OP(O)(O)=O                                   

In [6]:
# True for every LINCS perturbation ID that has no row in the reference table.
cond = ~pert_id_unique.isin(reference_df['pert_id'])
print(f"From {len(pert_id_unique)} total drugs, {cond.sum()} were not part of the reference dataframe.")

From 21304 total drugs, 890 were not part of the reference dataframe.


Adding the `'canonical_smiles'` column to `adata.obs` via `pd.merge`

In [7]:
# Attach `canonical_smiles` to every observation by looking up its `pert_id`.
# - drop(..., errors='ignore') removes a `canonical_smiles` column left over from an
#   earlier run of this cell, so re-running it does not create `_x` / `_y` duplicates.
# - on='pert_id' makes the join key explicit instead of joining on whichever
#   columns the two frames happen to share.
# - validate='many_to_one' raises if a `pert_id` appears more than once in
#   `reference_df`, which would otherwise duplicate observations.
# - how='left' keeps every observation; unmatched ones get NaN as SMILES.
# reset_index() / set_index('index') carries the observation names through the merge
# (this assumes the obs index is unnamed, so reset_index() yields an 'index' column).
adata.obs = (
    adata.obs
    .drop(columns='canonical_smiles', errors='ignore')
    .reset_index()
    .merge(reference_df, on='pert_id', how='left', validate='many_to_one')
    .set_index('index')
)

Removing invalid SMILES strings 

In [8]:
# Cast to plain strings so missing values become the literal 'nan' and can be
# matched together with the other placeholder values below.
adata.obs['canonical_smiles'] = adata.obs['canonical_smiles'].astype('str')
# Observations whose SMILES entry is a placeholder rather than a molecule.
invalid_smiles = adata.obs['canonical_smiles'].isin(['-666', 'restricted', 'nan'])
# Report before filtering, because the message uses the unfiltered observation count.
print(f'Among {len(adata)} observations, {100*invalid_smiles.sum()/len(adata):.2f}% ({invalid_smiles.sum()}) do not have a valid SMILES string')
adata = adata[~invalid_smiles]

Among 1034271 observations, 13.51% (139764) do not have a valid SMILES string


Remove invalid `'pert_dose'` value: `-666`

In [9]:
# Observations whose dose is the placeholder value -666.
cond = adata.obs['pert_dose'].isin([-666])
print(f"A total of {cond.sum()} observations have invalid dose values")
# Keep only the observations with a real dose value.
adata = adata[~cond]

A total of 42592 observations have invalid dose values


In [10]:
# Per-SMILES flag: True when the drug has fewer than 5 observations.
drugs_validation = adata.obs['canonical_smiles'].value_counts().lt(5)
# SMILES strings with at least 5 observations.
valid_drugs = drugs_validation[~drugs_validation].index
# Observation-level mask for the drugs that are kept.
cond = adata.obs['canonical_smiles'].isin(valid_drugs)
print(f"A total of {(~cond).sum()} observation belong to drugs which do not have enough replicates")
adata = adata[cond]

A total of 3808 observation belong to drugs which do not have enough replicates


Checking that SMILES are valid according to `rdkit` 

In [11]:
from rdkit import Chem

def check_smiles(smiles):
    """Return whether `smiles` can be parsed and sanitized by RDKit.

    The string is first parsed with sanitization disabled, then sanitized in a
    separate step, so that parse failures and sanitization failures are
    reported with different messages.

    Parameters
    ----------
    smiles
        SMILES string to check.

    Returns
    -------
    bool
        True if parsing and sanitization both succeed. False otherwise; in that
        case 'invalid SMILES' (parse failure) or 'invalid chemistry'
        (sanitization failure) is printed.
    """
    m = Chem.MolFromSmiles(smiles, sanitize=False)
    if m is None:
        print('invalid SMILES')
        return False
    try:
        Chem.SanitizeMol(m)
    except Exception:
        # Catch only regular exceptions: a bare `except:` would also swallow
        # KeyboardInterrupt / SystemExit and make a long validation run
        # impossible to interrupt.
        print('invalid chemistry')
        return False
    return True

def remove_invalid_smiles(dataframe, smiles_key: str = 'SMILES', return_condition: bool = False):
    """Filter a dataframe down to the rows whose SMILES pass `check_smiles`.

    Each distinct SMILES string is validated once and the result is broadcast
    back to the rows, so the cost scales with the number of unique drugs rather
    than the number of rows.

    Parameters
    ----------
    dataframe
        Dataframe with a column of SMILES strings.
    smiles_key
        Name of the SMILES column.
    return_condition
        If True, return the boolean row mask instead of the filtered frame.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The boolean mask (True = valid SMILES) aligned with `dataframe` when
        `return_condition` is True, otherwise a copy of the rows that passed.
    """
    unique_drugs = pd.Series(np.unique(dataframe[smiles_key]))
    # astype(bool) is a no-op for non-empty input; for empty input `apply`
    # does not yield a boolean dtype, which breaks `~` and boolean indexing below.
    valid_drugs = unique_drugs.apply(check_smiles).astype(bool)
    print(f"A total of {(~valid_drugs).sum()} have invalid SMILES strings")
    _validation_map = dict(zip(unique_drugs, valid_drugs))
    # Broadcast the per-drug verdict back to every row.
    cond = dataframe[smiles_key].apply(lambda x: _validation_map[x]).astype(bool)
    if return_condition:
        return cond
    return dataframe[cond].copy()

# Display the current AnnData summary (the RDKit-based filter defined above has not been applied yet).
adata

View of AnnData object with n_obs × n_vars = 848107 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'canonical_smiles'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'cydata_pull'

In [12]:
# Validate every distinct SMILES string with RDKit and get a per-observation boolean mask
# (return_condition=True returns the mask instead of a filtered dataframe).
cond = remove_invalid_smiles(adata.obs, smiles_key='canonical_smiles', return_condition=True)
# Keep only observations whose SMILES passed the check.
adata = adata[cond]
# Display the summary after filtering.
adata 

A total of 0 have invalid SMILES strings


View of AnnData object with n_obs × n_vars = 848107 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'canonical_smiles'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'cydata_pull'

In [13]:
# Write the filtered, SMILES-annotated AnnData to the output path chosen in the loading cell.
# NOTE(review): `adata` is still a view here; the recorded output shows anndata copying it
# while storing string columns as categoricals during the write.
adata.write(adata_out)
# Display the summary of the object after writing.
adata

Trying to set attribute `.obs` of view, copying.
... storing 'pert_id' as categorical
Trying to set attribute `.obs` of view, copying.
... storing 'canonical_smiles' as categorical


AnnData object with n_obs × n_vars = 848107 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'canonical_smiles'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'cydata_pull'

In [14]:
# Marker that the notebook ran through to the end.
print('Finished')

Finished
