In [5]:
import os 
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
# Set scanpy's figure parameters: 80 dpi and frames switched off.
sc.set_figure_params(dpi=80, frameon=False)
# Print scanpy's logging header (presumably version information -- confirm against the scanpy docs).
sc.logging.print_header()
# Last expression of the cell: shows the current working directory, which is
# what the relative '../datasets/...' paths used in this notebook resolve against.
os.getcwd()

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edited
# modules are picked up without restarting the kernel.
# NOTE(review): re-running this cell prints the "already loaded" message shown
# in the cell output; `%reload_ext autoreload` would avoid it -- confirm before changing.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Loading LINCS and reference data

In [None]:
# Switch between the full LINCS file (`full = True`) and the default one.
full = False

# Input and output paths share one stem; only the `_full` part depends on the flag.
dataset_stem = 'lincs_full' if full else 'lincs'
adata = sc.read(f'../datasets/{dataset_stem}.h5ad')
adata_out = f'../datasets/{dataset_stem}_smiles.h5ad'


Checking number of drugs for LINCS

In [None]:
# Distinct perturbation IDs found in the observations, as a pandas Series
# (np.unique returns them sorted).
unique_ids = np.unique(adata.obs['pert_id'])
pert_id_unique = pd.Series(unique_ids)

n_perturbations = len(pert_id_unique)
print(f"# of unique perturbations: {n_perturbations}")

# of unique perturbations: 1120


Loading reference dataframe and restricting to `'pert_id'` and `'canonical_smiles'`

In [None]:
# Read the tab-separated perturbation metadata table.
reference_df = pd.read_csv('../datasets/GSE92742_Broad_LINCS_pert_info.txt', sep="\t")

# Keep only rows whose pert_id occurs in `pert_id_unique`, and only the two
# columns needed for the later merge.
is_known_pert = reference_df['pert_id'].isin(pert_id_unique)
reference_df = reference_df.loc[is_known_pert, ['pert_id', 'canonical_smiles']]

# Cell output: how often each SMILES value (including placeholders) occurs.
reference_df['canonical_smiles'].value_counts()

-666                                                                                         6
CS(=O)(=O)CCNCc1ccc(o1)-c1ccc2ncnc(Nc3ccc(OCc4cccc(F)c4)c(Cl)c3)c2c1                         2
restricted                                                                                   2
CC(/C=C/C1=C(C)CCCC1(C)C)=C\C=C\C(C)=C/C(O)=O                                                1
CC(C)NC(=O)N(C)C[C@@H]1OCCCC[C@H](C)Oc2ccc(NC(=O)C3CCCCC3)cc2C(=O)N(C[C@@H]1C)[C@@H](C)CO    1
                                                                                            ..
CCN(CC)CCCCNc1ncc2cc(-c3cc(OC)cc(OC)c3)c(NC(=O)NC(C)(C)C)nc2n1                               1
Cc1ccc(C)n1-c1cccc(c1)-c1nn[nH]n1                                                            1
CCOC(=O)C1=C(C)N=c2s\c(=C/c3cc(Br)c(O)c(Br)c3)c(=O)n2C1c1ccc2OCOc2c1                         1
COc1ccccc1NC(=O)N1CCCCN2[C@@H](CO)[C@H]([C@H]2C1)c1ccc(cc1)-c1cccc(c1)C#N                    1
Clc1nssc1=Nc2nncs2                                

In [None]:
# True for every perturbation ID that has no row in the reference table.
cond = ~pert_id_unique.isin(reference_df['pert_id'])

n_total = len(pert_id_unique)
n_missing = cond.sum()
print(f"From {n_total} total drugs, {n_missing} were not part of the reference dataframe.")

From 1120 total drugs, 132 were not part of the reference dataframe.


Adding `'canonical_smiles'` column to `adata.obs` via `pd.merge`

In [None]:
# Attach the SMILES string of each perturbation to the observation table.
# - `on="pert_id"` makes the join key explicit instead of relying on whichever
#   column names the two frames happen to share.
# - `validate="many_to_one"` raises a MergeError if `reference_df` holds the
#   same pert_id more than once, which would otherwise duplicate observations.
# - Dropping a pre-existing 'canonical_smiles' column keeps the cell idempotent
#   when it is re-run (no `_x`/`_y` suffixed columns).
# reset_index()/set_index('index') carries the observation index through the
# merge; this relies on the index being unnamed so that it becomes 'index'.
adata.obs = (
    adata.obs
    .drop(columns="canonical_smiles", errors="ignore")
    .reset_index()
    .merge(reference_df, how="left", on="pert_id", validate="many_to_one")
    .set_index('index')
)

Removing invalid SMILES strings 

In [None]:
# Cast to str so that missing values become the literal string 'nan' and can
# be matched together with the other placeholder values below.
adata.obs['canonical_smiles'] = adata.obs['canonical_smiles'].astype('str')

placeholder_smiles = ['-666', 'restricted', 'nan']
invalid_smiles = adata.obs['canonical_smiles'].isin(placeholder_smiles)

n_obs = len(adata)
n_invalid = invalid_smiles.sum()
print(f'Among {n_obs} observations, {100*n_invalid/n_obs:.2f}% ({n_invalid}) do not have a valid SMILES string')

# Keep only observations whose SMILES entry is not a placeholder.
adata = adata[~invalid_smiles]

Among 199620 observations, 11.54% (23033) do not have a valid SMILES string


Remove invalid `'pert_dose'` value: `-666`

In [None]:
# Mark observations whose dose equals the placeholder value -666.
cond = adata.obs['pert_dose'].isin([-666])
n_invalid_dose = cond.sum()

# Drop the marked observations, then report how many there were.
adata = adata[~cond]
print(f"A total of {n_invalid_dose} observations have invalid dose values")

A total of 0 observations have invalid dose values


In [None]:
# Number of observations per SMILES string.
smiles_counts = adata.obs['canonical_smiles'].value_counts()

# True for drugs with fewer than 5 observations; the others count as valid.
drugs_validation = smiles_counts < 5
valid_drugs = smiles_counts.index[~drugs_validation]

# Keep only observations that belong to a valid drug.
cond = adata.obs['canonical_smiles'].isin(valid_drugs)
n_dropped = (~cond).sum()
print(f"A total of {n_dropped} observation belong to drugs which do not have enough replicates")
adata = adata[cond]

A total of 0 observation belong to drugs which do not have enough replicates


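Checking that SMILES are valid according to `rdkit` (**TODO**: no `rdkit` validation is performed in the cells below; they only write the filtered `adata` to `adata_out`)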

In [None]:
# Write the filtered `adata` to the output path chosen in the loading cell.
adata.write(adata_out)
# Last expression of the cell: display `adata` as the cell output.
adata

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim, so `adata` is
# written to the same `adata_out` path a second time -- consider removing it.
adata.write(adata_out)
# Last expression of the cell: display `adata` as the cell output.
adata