**Requirements** 
* Following this [paper](https://arxiv.org/pdf/1904.01561.pdf), the features are computed with the [descriptastorus](https://github.com/bp-kelley/descriptastorus) package
* Install via: `pip install git+https://github.com/bp-kelley/descriptastorus`

## General imports

In [1]:
import sys 
sys.path.insert(0, "/")  # this depends on the notebook depth and must be adapted per notebook
from compert.paths import DATA_DIR, EMBEDDING_DIR

In [2]:
import numpy as np
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

## Load SMILES list

In [3]:
# Dataset identifier; it is used below to build both the input `.smiles` file name
# and the output parquet file name.
dataset_name = "lincs_trapnell"

In [4]:
import pandas as pd

# Read the per-dataset SMILES table and keep only its `smiles` column as an array.
smiles_path = EMBEDDING_DIR / f'{dataset_name}.smiles'
smiles_df = pd.read_csv(smiles_path)
smiles_column = smiles_df['smiles']
smiles_list = smiles_column.values

In [5]:
# Sanity check: report how many SMILES strings were loaded from the file.
print(f'Number of smiles strings: {len(smiles_list)}')

Number of smiles strings: 17869


In [6]:
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator

# Build the RDKit2D descriptor generator and list every column it produces
# together with the column's numpy type.
generator = MakeGenerator(("RDKit2D",))
descriptor_columns = generator.GetColumns()
for name, numpy_type in descriptor_columns:
    print(f"{name}({numpy_type.__name__})")



RDKit2D_calculated(bool)
BalabanJ(float64)
BertzCT(float64)
Chi0(float64)
Chi0n(float64)
Chi0v(float64)
Chi1(float64)
Chi1n(float64)
Chi1v(float64)
Chi2n(float64)
Chi2v(float64)
Chi3n(float64)
Chi3v(float64)
Chi4n(float64)
Chi4v(float64)
EState_VSA1(float64)
EState_VSA10(float64)
EState_VSA11(float64)
EState_VSA2(float64)
EState_VSA3(float64)
EState_VSA4(float64)
EState_VSA5(float64)
EState_VSA6(float64)
EState_VSA7(float64)
EState_VSA8(float64)
EState_VSA9(float64)
ExactMolWt(float64)
FpDensityMorgan1(float64)
FpDensityMorgan2(float64)
FpDensityMorgan3(float64)
FractionCSP3(float64)
HallKierAlpha(float64)
HeavyAtomCount(float64)
HeavyAtomMolWt(float64)
Ipc(float64)
Kappa1(float64)
Kappa2(float64)
Kappa3(float64)
LabuteASA(float64)
MaxAbsEStateIndex(float64)
MaxAbsPartialCharge(float64)
MaxEStateIndex(float64)
MaxPartialCharge(float64)
MinAbsEStateIndex(float64)
MinAbsPartialCharge(float64)
MinEStateIndex(float64)
MinPartialCharge(float64)
MolLogP(float64)
MolMR(float64)
MolWt(float64)

In [7]:
# Number of joblib workers used to compute the descriptors.
n_jobs = 16

# One delayed `generator.process` call per SMILES string; tqdm wraps the input
# iterable so progress is reported as the tasks are consumed.
process_smiles = delayed(generator.process)
descriptor_tasks = (
    process_smiles(smiles)
    for smiles in tqdm(smiles_list, position=0, leave=True)
)
data = Parallel(n_jobs=n_jobs)(descriptor_tasks)

  0%|          | 0/17869 [00:00<?, ?it/s]

In [8]:
# Stack the per-molecule descriptor rows into a single array and show its shape.
# NOTE(review): assumes every `generator.process` call returned a full row — TODO confirm
# that no entry of `data` is None.
embedding = np.array(data)
embedding.shape

(17869, 201)

## Check `nans` and `infs`

Check for `nans`

In [9]:
# Positions of NaN entries: `drug_idx` holds the row of each NaN,
# `feature_idx` the matching column.
nan_mask = np.isnan(embedding)
drug_idx, feature_idx = np.nonzero(nan_mask)
print(f'drug_idx:\n {drug_idx}')
print(f'feature_idx:\n {feature_idx}')

drug_idx:
 [   93    93    93    93   206   206   206   206   866   866   866   866
  2607  2607  2607  2607  2614  2614  2614  2614  5336  5336  5336  5336
  5602  5602  5602  5602  5685  5685  5685  5685  5774  5774  5774  5774
  5782  5782  5782  5782  5805  5805  5805  5805  5807  5807  5807  5807
  6036  6036  6036  6036 17136 17136 17136 17136]
feature_idx:
 [40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46
 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46 40 42 44 46
 40 42 44 46 40 42 44 46]


Check for `infs` and add to idx lists

In [10]:
# Locate the NaN entries again inside this cell instead of extending the `drug_idx` /
# `feature_idx` arrays left over from the previous cell: re-running this cell then
# yields the same index lists rather than appending the inf positions a second time.
nan_drug_idx, nan_feature_idx = np.where(np.isnan(embedding))
drug_idx_infs, feature_idx_infs = np.where(np.isinf(embedding))

# Combined (row, column) positions of all invalid entries: NaNs first, then infs.
drug_idx = np.concatenate((nan_drug_idx, drug_idx_infs))
feature_idx = np.concatenate((nan_feature_idx, feature_idx_infs))

Features that have these invalid values:

In [11]:
# Look up the generator's (name, type) entries for each distinct affected column.
invalid_feature_ids = np.unique(feature_idx)
generator_columns = np.array(generator.GetColumns())
generator_columns[invalid_feature_ids]

array([['MaxAbsPartialCharge', <class 'numpy.float64'>],
       ['MaxPartialCharge', <class 'numpy.float64'>],
       ['MinAbsPartialCharge', <class 'numpy.float64'>],
       ['MinPartialCharge', <class 'numpy.float64'>]], dtype=object)

Set values to `0`

In [12]:
# Display the current values at the collected (row, column) positions.
invalid_positions = (drug_idx, feature_idx)
embedding[invalid_positions]

array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, inf, inf, inf, inf, inf, inf])

In [13]:
# Overwrite every collected NaN/inf position with 0 (in place).
embedding[(drug_idx, feature_idx)] = 0

## Save

In [14]:
import pandas as pd

# Wrap the feature matrix in a DataFrame indexed by SMILES string; column `latent_i`
# is the i-th column of `embedding`.
df = pd.DataFrame(
    data=embedding,
    index=smiles_list,
    columns=[f'latent_{i}' for i in range(embedding.shape[1])],
)

# Drop first feature from generator (RDKit2D_calculated)
df = df.drop(columns=['latent_0'])

# Drop (near-)constant columns, i.e. those whose standard deviation is <= threshold
threshold = 0.01
# Select the columns by label from a single std computation. Rebuilding the names
# from positions (`latent_{idx+1}`) is only correct while the remaining columns are
# numbered contiguously from `latent_1`.
column_std = df.std()
columns = column_std[column_std <= threshold].index.tolist()
print(f'Deleting columns with std<={threshold}: {columns}')
df = df.drop(columns=columns)

Deleting columns with std<=0.01: ['latent_90', 'latent_103', 'latent_152', 'latent_164', 'latent_187', 'latent_196']


Check that correct columns were deleted: 

In [15]:
# Positions of any remaining columns with std <= threshold (expected: none).
low_std_mask = (df.std() <= threshold).to_numpy()
np.where(low_std_mask)

(array([], dtype=int64),)

### Normalise dataframe

In [16]:
# Standardise each column: subtract the column mean, then divide by the column std.
column_means = df.mean()
column_stds = df.std()
normalized_df = df.sub(column_means, axis='columns').div(column_stds, axis='columns')

In [17]:
# Preview the first five rows of the normalised features.
normalized_df.head(n=5)

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_190,latent_191,latent_192,latent_193,latent_194,latent_195,latent_197,latent_198,latent_199,latent_200
C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1,0.987011,-0.770585,-0.997189,-1.132568,-0.931373,-1.050912,-1.233309,-1.125313,-1.354848,-1.217915,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-0.311624
Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH]n1,-0.283302,0.697392,0.00033,0.001786,0.06259,0.113854,-0.020283,-0.053318,-0.087372,-0.134349,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-0.708922
Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4)cc[nH]c3=O)nc12,-0.535098,0.909597,0.1011,0.04287,0.10379,0.229804,0.08047,0.045557,-0.049695,-0.098408,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-1.298021
Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1,-3.751746,-1.807921,-1.731786,-1.388457,-1.179856,-1.629691,-1.306154,-1.423073,-1.218564,-1.359103,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,1.725542
O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1,-0.457417,-0.326019,-0.838592,-0.796089,-0.86832,-0.678709,-0.703193,-0.831348,-0.833087,-0.991385,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,1.084554


Set up the destination folder (it is created if it does not exist yet)

In [18]:
# Output file name: <model>_embedding_<dataset>.parquet
model_name = 'rdkit2D'
fname = f'{model_name}_embedding_{dataset_name}.parquet'

# Target folder under EMBEDDING_DIR; create it, including parents, if it is missing.
# NOTE(review): assumes EMBEDDING_DIR is a pathlib.Path — TODO confirm.
directory = EMBEDDING_DIR.joinpath('rdkit', 'data', 'embeddings')
directory.mkdir(parents=True, exist_ok=True)

Save normalised version

In [20]:
# Write the normalised features to the parquet file inside the destination folder.
output_path = directory / fname
normalized_df.to_parquet(output_path)

Check that it worked

In [21]:
# Read the file back and display it to confirm the save round-trips.
# Note that this rebinds `df` to the normalised frame loaded from disk.
saved_path = directory / fname
df = pd.read_parquet(saved_path)
df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_190,latent_191,latent_192,latent_193,latent_194,latent_195,latent_197,latent_198,latent_199,latent_200
C[C@H](NC(=O)/C(C#N)=C/c1cccc(Br)n1)c1ccccc1,0.987011,-0.770585,-0.997189,-1.132568,-0.931373,-1.050912,-1.233309,-1.125313,-1.354848,-1.217915,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-0.311624
Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH]n1,-0.283302,0.697392,0.000330,0.001786,0.062590,0.113854,-0.020283,-0.053318,-0.087372,-0.134349,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-0.708922
Cc1cc(N2CCOCC2)cc2[nH]c(-c3c(NCC(O)c4cccc(Cl)c4)cc[nH]c3=O)nc12,-0.535098,0.909597,0.101100,0.042870,0.103790,0.229804,0.080470,0.045557,-0.049695,-0.098408,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-1.298021
Cl.Cl.c1ccc([C@@H]2C[C@H]2NC2CCNCC2)cc1,-3.751746,-1.807921,-1.731786,-1.388457,-1.179856,-1.629691,-1.306154,-1.423073,-1.218564,-1.359103,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,1.725542
O=C(c1ccc(/C=C/c2n[nH]c3ccccc23)cc1)N1CCNCC1,-0.457417,-0.326019,-0.838592,-0.796089,-0.868320,-0.678709,-0.703193,-0.831348,-0.833087,-0.991385,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,1.084554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCCC(=O)Nc1ccc2c(c1)C(=O)N(C)C[C@H](OC)[C@@H](C)CN(Cc1ccc(-c3ccccn3)cc1)[C@@H](C)CO2,-0.251372,0.704331,0.913219,1.083533,1.016621,0.944881,0.999382,0.839497,0.844928,0.609324,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-0.874405
Cc1cc(CS(=O)(=O)c2ccccc2)cc(OCc2ccc(CN3CCC[C@@H]3CO)cc2)c1,-0.704724,0.343352,0.044782,0.079949,0.151452,0.106783,0.123477,0.451369,0.086728,0.550145,...,-0.163202,-0.455579,12.591213,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-0.381994
CN(C)CCOc1ccc(/C(=C(\CCCl)c2ccccc2)c2ccccc2)cc1,0.715589,-0.332284,-0.352032,-0.237123,-0.176995,-0.271070,-0.283603,-0.267059,-0.466131,-0.515176,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-1.432809
CC1(C)C=Cc2c(ccc3c2[N+]([O-])=C2C3=C[C@@]34NC(=O)[C@]5(CCCN5C3=O)C[C@H]4C2(C)C)O1,-0.450599,0.715467,0.025983,0.138125,0.068538,0.018375,0.249687,0.103773,1.040614,0.795995,...,-0.163202,-0.455579,-0.075138,-0.050245,-0.069133,-0.156047,-0.157586,-0.145213,-0.464139,-0.450464
