In [2]:
import os 
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
sc.set_figure_params(dpi=80, frameon=False)
sc.logging.print_header()
os.getcwd()

scanpy==1.8.1 anndata==0.7.6 umap==0.5.1 numpy==1.19.2 scipy==1.6.2 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.12.2 python-igraph==0.9.1 louvain==0.7.0 pynndescent==0.5.2


'/mnt/home/icb/leon.hetzel/git/CPA_graphs/preprocessing'

In [3]:
%load_ext autoreload
%autoreload 2

### Loading LINCS and reference data

In [4]:
full = False
load_adata = True 
if full:
    adata_in = '../datasets/lincs_full_pp.h5ad'
    adata_out = '../datasets/lincs_full_smiles.h5ad' 
else: 
    adata_in = '../datasets/lincs_pp.h5ad'
    adata_out = '../datasets/lincs_smiles.h5ad'  
adata = sc.read(adata_in) if load_adata else None


Checking number of drugs for LINCS

In [5]:
pert_id_unique = pd.Series(np.unique(adata.obs.pert_id))
print(f"# of unique perturbations: {len(pert_id_unique)}")

# of unique perturbations: 1120


Loading reference dataframe and restricting to `'pert_id'` and `'canonical_smiles'`

In [6]:
reference_df = pd.read_csv('../datasets/GSE92742_Broad_LINCS_pert_info.txt', delimiter = "\t")
reference_df = reference_df.loc[reference_df.pert_id.isin(pert_id_unique), ['pert_id', 'canonical_smiles']]
reference_df.canonical_smiles.value_counts()

-666                                                                                                                                         6
CS(=O)(=O)CCNCc1ccc(o1)-c1ccc2ncnc(Nc3ccc(OCc4cccc(F)c4)c(Cl)c3)c2c1                                                                         2
restricted                                                                                                                                   2
Nc1ccccc1NC(=O)c1ccc(CNc2nccc(n2)-c2cccnc2)cc1                                                                                               1
CC(=O)OCCN1C(=O)c2c(C1=O)c3cc(ccc3nc2C)S(=O)(=O)N4CCOCC4                                                                                     1
                                                                                                                                            ..
NCC[C@H](O)C(=O)N[C@@H]1C[C@H](N)[C@@H](O[C@H]2O[C@H](CN)[C@@H](O)[C@H](O)[C@H]2O)[C@H](O)[C@H]1O[C@H]1O[C@H](CO)[C@@H](O)[C@H](N)[C@H]1O    1

In [7]:
cond = ~pert_id_unique.isin(reference_df.pert_id)
print(f"From {len(pert_id_unique)} total drugs, {cond.sum()} were not part of the reference dataframe.")

From 1120 total drugs, 132 were not part of the reference dataframe.


Adding `'canoncical_smiles'` column to `adata.obs` via `pd.merge`

In [8]:
adata.obs = adata.obs.reset_index().merge(reference_df, how="left").set_index('index')

Removing invalid SMILES strings 

In [9]:
adata.obs.canonical_smiles = adata.obs.canonical_smiles.astype('str')
invalid_smiles = adata.obs.canonical_smiles.isin(['-666', 'restricted', 'nan'])
print(f'Among {len(adata)} observations, {100*invalid_smiles.sum()/len(adata):.2f}% ({invalid_smiles.sum()}) do not have a valid SMILES string')
adata = adata[~invalid_smiles]

Among 199620 observations, 11.54% (23033) do not have a valid SMILES string


Remove invalid `'pert_dose'` value: `-666`

In [10]:
cond = adata.obs.pert_dose.isin([-666])
adata = adata[~cond]
print(f"A total of {cond.sum()} observations have invalid dose values")

A total of 0 observations have invalid dose values


In [11]:
drugs_validation = adata.obs.canonical_smiles.value_counts() < 6
valid_drugs = drugs_validation.index[~drugs_validation]
cond = adata.obs.canonical_smiles.isin(valid_drugs)
print(f"A total of {(~cond).sum()} observation belong to drugs which do not have enough replicates")
adata = adata[cond]

A total of 0 observation belong to drugs which do not have enough replicates


Checking that SMILES are valid according to `rdkit` 

In [12]:
from rdkit import Chem

def check_smiles(smiles):
    m = Chem.MolFromSmiles(smiles,sanitize=False)
    if m is None:
        print('invalid SMILES')
        return False
    else:
        try:
            Chem.SanitizeMol(m)
        except:
            print('invalid chemistry')
            return False
    return True

def remove_invalid_smiles(dataframe, smiles_key: str = 'SMILES', return_condition: bool = False):
    unique_drugs = pd.Series(np.unique(dataframe[smiles_key]))
    valid_drugs = unique_drugs.apply(check_smiles)
    print(f"A total of {(~valid_drugs).sum()} have invalid SMILES strings")
    _validation_map = dict(zip(unique_drugs, valid_drugs))
    cond = dataframe[smiles_key].apply(lambda x: _validation_map[x])
    if return_condition: 
        return cond
    dataframe = dataframe[cond].copy()
    return dataframe

adata

View of AnnData object with n_obs × n_vars = 176587 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'cov_drug_name', 'eval_category', 'canonical_smiles'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'rank_genes_groups_cov'

In [13]:
cond = remove_invalid_smiles(adata.obs, smiles_key='canonical_smiles', return_condition=True)
adata = adata[cond]

A total of 0 have invalid SMILES strings


### Add additional drugbank info to `adata.obs`

In [14]:
from os.path import exists

drugbank_path = '../datasets/drug_bank/drugbank_all.csv'
if exists(drugbank_path): 
    drugbank_df = pd.read_csv(drugbank_path)
else: 
    print(f'Invalid path: {drugbank_path}')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [15]:
from rdkit.Chem import CanonSmiles

drugs_canonical = pd.Series(np.unique(adata.obs.canonical_smiles)).apply(CanonSmiles)
db_canonical_smiles = drugbank_df.SMILES.apply(CanonSmiles)
n_overlap = drugs_canonical.isin(db_canonical_smiles).sum()
print(f'From a total of {len(drugs_canonical)}, {100*n_overlap/len(drugs_canonical):.2f}% ({n_overlap}) is also available in drugbank.')

From a total of 979, 20.43% (200) is also available in drugbank.




In [16]:
cond = db_canonical_smiles.isin(drugs_canonical)
drugbank_df.loc[cond, ['ATC_level_1']].value_counts()

ATC_level_1                               
an                                            68
ANTINEOPLASTIC AND IMMUNOMODULATING AGENTS    42
NERVOUS SYSTEM                                22
CARDIOVASCULAR SYSTEM                         16
ALIMENTARY TRACT AND METABOLISM               13
GENITO URINARY SYSTEM AND SEX HORMONES         8
RESPIRATORY SYSTEM                             6
SENSORY ORGANS                                 6
DERMATOLOGICALS                                5
INSECTICIDES AND REPELLENTS                    4
MUSCULO-SKELETAL SYSTEM                        4
ANTIINFECTIVES FOR SYSTEMIC USE                2
BLOOD AND BLOOD FORMING ORGANS                 2
VARIOUS                                        2
dtype: int64

### Add `train`, `test`, `ood` split for full lincs dataset (if not already part in `adata.obs`)

In [17]:
from sklearn.model_selection import train_test_split

if 'split' not in list(adata.obs):
    print("Addig 'split' to 'adata.obs'.")
    unique_drugs = np.unique(adata.obs.canonical_smiles)
    drugs_train, drugs_tmp = train_test_split(unique_drugs, test_size=0.2)
    drugs_val, drugs_test = train_test_split(drugs_tmp, test_size=0.5)

    adata.obs['split'] = 'train'
    adata.obs.loc[adata.obs.canonical_smiles.isin(drugs_val), 'split'] = 'test'
    adata.obs.loc[adata.obs.canonical_smiles.isin(drugs_test), 'split'] = 'ood'

### Check that `.obs.split=='test'` has sufficient samples for `pert_id` and `cell_id`

In [18]:
adata.obs.split.value_counts()

train    144579
test      27592
ood        4416
Name: split, dtype: int64

In [19]:
cond_test = adata.obs.split.isin(['test'])
adata.obs.loc[cond_test, 'cell_id'].value_counts()

MCF7        4091
PC3         3330
VCAP        2244
HT29        2201
A375        2130
            ... 
H1299         22
TYKNU         21
COV644        20
WSUDLCL2      20
HS27A          2
Name: cell_id, Length: 82, dtype: int64

In [20]:
adata.obs.loc[cond_test, 'pert_id'].value_counts()

DMSO             7465
BRD-A19037878     465
BRD-K49328571     325
BRD-K21680192     190
BRD-K59369769     172
                 ... 
BRD-K75527158       1
BRD-K70504303       1
BRD-K08111712       1
BRD-K92977333       1
BRD-K80686274       1
Name: pert_id, Length: 975, dtype: int64

In [21]:
pert_count_treshold = 5
cov_count_treshold = 20

pert_id_neg = adata.obs.loc[cond_test, 'pert_id'].value_counts() < pert_count_treshold
print(f"pert_id: {pert_id_neg.sum()}/{len(pert_id_neg)} converted back to 'train' due to insufficient # of samples.")

cov_id_neg = adata.obs.loc[cond_test, 'cell_id'].value_counts() < cov_count_treshold
print(f"cell_id: {cov_id_neg.sum()}/{len(cov_id_neg)} converted back to 'train' due to insufficient # of samples.")

cond = cond_test & adata.obs.pert_id.isin(pert_id_neg.index[pert_id_neg])
cond |= cond_test & adata.obs.cell_id.isin(cov_id_neg.index[cov_id_neg])

pert_id: 179/975 converted back to 'train' due to insufficient # of samples.
cell_id: 1/82 converted back to 'train' due to insufficient # of samples.


In [22]:
adata.obs['split1'] = adata.obs.split.copy()
adata.obs.loc[cond, 'split1'] = 'train'
print(f"split['test']: {cond.sum()}/{len(cond)} samples are converted back to 'train'.")

Trying to set attribute `.obs` of view, copying.


split['test']: 506/176587 samples are converted back to 'train'.


In [23]:
adata.obs.split1.value_counts()

train    145085
test      27086
ood        4416
Name: split1, dtype: int64

In [24]:
adata.write(adata_out)
adata

... storing 'pert_id' as categorical
... storing 'canonical_smiles' as categorical


AnnData object with n_obs × n_vars = 176587 × 978
    obs: 'cell_id', 'det_plate', 'det_well', 'lincs_phase', 'pert_dose', 'pert_dose_unit', 'pert_id', 'pert_iname', 'pert_mfc_id', 'pert_time', 'pert_time_unit', 'pert_type', 'rna_plate', 'rna_well', 'batch', 'condition', 'cell_type', 'dose_val', 'cov_drug_dose_name', 'control', 'split', 'cov_drug_name', 'eval_category', 'canonical_smiles', 'split1'
    var: 'pr_gene_title', 'pr_is_lm', 'pr_is_bing'
    uns: 'rank_genes_groups_cov'

In [28]:
adata.obs['drug_dose_name']

KeyError: 'drug_dose_name'

### Loading the result for `adata_out`

In [24]:
adata = sc.read(adata_out)

### Check that `adata.uns[rank_genes_groups_cov]` has all entries in `adata.obs.cov_drug_dose_name` as keys

In [26]:
for i, k in enumerate(adata.obs.cov_drug_dose_name.unique()):
    try: 
        adata.uns['rank_genes_groups_cov'][k]
    except: 
        print(f"{i}: {k}") if 'DMSO' not in k else None