In [30]:
import sys 
# NOTE(review): "home/icb/alessandro.palma/" has no leading slash, so it is a relative
# path (resolved against the current working directory), whereas the data paths used
# later in this notebook are absolute ('/home/icb/...') — confirm this is intended.
sys.path.insert(0, "home/icb/alessandro.palma/")  # this depends on the notebook depth and must be adapted per notebook
import numpy as np
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import pandas as pd 
import os
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

In [31]:
# Identifier of the dataset being featurised (reused when naming the output file)
dataset_name = 'cellpainting'
# Folder holding the pre-computed data splits
data_dir = '/home/icb/alessandro.palma/data/metadata_processed/integrated/200k/splits/'

# Read the training and ood splits (together they contain the whole set of drugs in the dataset)
train_csv = os.path.join(data_dir, 'datasplit-train.csv')
ood_csv = os.path.join(data_dir, 'datasplit-ood.csv')
training_set = pd.read_csv(train_csv)
ood_set = pd.read_csv(ood_csv)

In [32]:
# Keep one row per distinct SMILES string within each split
smiles_key = ['SMILES']
training_set_unique = training_set.drop_duplicates(subset=smiles_key)
ood_set_unique = ood_set.drop_duplicates(subset=smiles_key)

In [33]:
# Stack the de-duplicated training and ood splits into a single table.
# Each split was only de-duplicated on its own, so a SMILES present in both splits would
# still appear twice after concatenation; drop such repeats so that every molecule is
# featurised once and the SMILES-indexed table built later has unique row labels.
dataset = (
    pd.concat([training_set_unique, ood_set_unique], ignore_index=True)
    .drop_duplicates(subset=['SMILES'])
    .reset_index(drop=True)
)
# SMILES strings as a NumPy array, in table order
smiles = dataset.SMILES.to_numpy()

In [34]:
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator



In [35]:
# Create the generator for rdkit features.
# MakeGenerator receives a tuple of descriptor names; only "RDKit2D" is requested here.
generator = MakeGenerator(("RDKit2D",))

In [36]:
# Print every column produced by the generator as "<column name>(<dtype name>)"
for column_name, column_dtype in generator.GetColumns():
    print(f"{column_name}({column_dtype.__name__})")

RDKit2D_calculated(bool)
BalabanJ(float64)
BertzCT(float64)
Chi0(float64)
Chi0n(float64)
Chi0v(float64)
Chi1(float64)
Chi1n(float64)
Chi1v(float64)
Chi2n(float64)
Chi2v(float64)
Chi3n(float64)
Chi3v(float64)
Chi4n(float64)
Chi4v(float64)
EState_VSA1(float64)
EState_VSA10(float64)
EState_VSA11(float64)
EState_VSA2(float64)
EState_VSA3(float64)
EState_VSA4(float64)
EState_VSA5(float64)
EState_VSA6(float64)
EState_VSA7(float64)
EState_VSA8(float64)
EState_VSA9(float64)
ExactMolWt(float64)
FpDensityMorgan1(float64)
FpDensityMorgan2(float64)
FpDensityMorgan3(float64)
FractionCSP3(float64)
HallKierAlpha(float64)
HeavyAtomCount(float64)
HeavyAtomMolWt(float64)
Ipc(float64)
Kappa1(float64)
Kappa2(float64)
Kappa3(float64)
LabuteASA(float64)
MaxAbsEStateIndex(float64)
MaxAbsPartialCharge(float64)
MaxEStateIndex(float64)
MaxPartialCharge(float64)
MinAbsEStateIndex(float64)
MinAbsPartialCharge(float64)
MinEStateIndex(float64)
MinPartialCharge(float64)
MolLogP(float64)
MolMR(float64)
MolWt(float64)

In [37]:
# Number of parallel workers used for featurisation
n_jobs = 16
# Wrap the generator call once, then run it over every SMILES string in parallel
featurize = delayed(generator.process)
data = Parallel(n_jobs=n_jobs)(featurize(smile) for smile in smiles)

In [38]:
# Convert the per-SMILES results into a single NumPy array.
# The cells below index it with two axes (drug index, feature index).
embedding = np.array(data)

Check for the presence of NaN or infinite values

In [39]:
# Locate NaN entries: first axis -> drug position, second axis -> feature position
nan_mask = np.isnan(embedding)
drug_idx, feature_idx = np.where(nan_mask)
print(f'drug_idx:\n {drug_idx}')
print(f'feature_idx:\n {feature_idx}')

drug_idx:
 [956 956 956 956]
feature_idx:
 [40 42 44 46]


In [40]:
# Locate infinite entries, using the same (drug, feature) layout as the NaN check
inf_mask = np.isinf(embedding)
drug_idx_infs, feature_idx_infs = np.where(inf_mask)
print(f'drug_idx:\n {drug_idx_infs}')
print(f'feature_idx:\n {feature_idx_infs}')

drug_idx:
 []
feature_idx:
 []


In [41]:
# Append the inf positions to the NaN positions so both are handled together below
drug_idx, feature_idx = (
    np.concatenate(index_pair)
    for index_pair in ((drug_idx, drug_idx_infs), (feature_idx, feature_idx_infs))
)

In [42]:
# Look up the (name, dtype) entries of the features that contain flagged values
columns_info = np.array(generator.GetColumns())
affected_features = np.unique(feature_idx)
columns_info[affected_features]

array([['MaxAbsPartialCharge', <class 'numpy.float64'>],
       ['MaxPartialCharge', <class 'numpy.float64'>],
       ['MinAbsPartialCharge', <class 'numpy.float64'>],
       ['MinPartialCharge', <class 'numpy.float64'>]], dtype=object)

In [46]:
# Check if indeed the content is NaN.
# drug_idx / feature_idx hold the positions flagged by the NaN and inf checks above, so
# this shows NaN/inf only if run before the cell that overwrites those entries with 0.
embedding[drug_idx, feature_idx] 

array([0., 0., 0., 0.])

In [47]:
# Turn values to 0: overwrite every flagged NaN/inf entry of `embedding` in place
embedding[drug_idx, feature_idx] = 0

## Normalize and save

In [48]:
# Convert to a DataFrame: one row per SMILES string, one 'latent_<i>' column per generator output
df = pd.DataFrame(data=embedding, index=smiles, columns=[f'latent_{i}' for i in range(embedding.shape[1])])

# Drop first feature from generator (RDKit2D_calculated)
df = df.drop(columns=['latent_0'])

# Drop near-constant columns (standard deviation at or below `threshold`).
# The column labels are taken directly from the boolean mask, so they do not have to be
# rebuilt from positions, and the same list is used for both the message and the drop.
threshold = 0.01
low_variance = df.std() <= threshold
columns = df.columns[low_variance.to_numpy()].tolist()
print(f'Deleting columns with std<={threshold}: {columns}')
df = df.drop(columns=columns)

Deleting columns with std<=0.01: ['latent_90', 'latent_103', 'latent_146', 'latent_152', 'latent_164', 'latent_165', 'latent_176', 'latent_187']


In [49]:
# Control if the right columns were removed: the result should be an empty index array,
# i.e. no remaining column has a standard deviation at or below `threshold`
np.where(df.std() <= threshold)

(array([], dtype=int64),)

In [50]:
# Standardise every feature column: subtract its mean, then divide by its standard deviation
feature_mean = df.mean()
feature_std = df.std()
normalized_df = (df - feature_mean) / feature_std

In [51]:
# Display the normalized feature table (rows are indexed by SMILES string)
normalized_df

Unnamed: 0,latent_1,latent_2,latent_3,latent_4,latent_5,latent_6,latent_7,latent_8,latent_9,latent_10,...,latent_191,latent_192,latent_193,latent_194,latent_195,latent_196,latent_197,latent_198,latent_199,latent_200
O=C(COC(=O)c1nn(-c2ccccc2)c(=O)c2ccccc12)NC1CCS(=O)(=O)C1,-0.750603,1.491529,0.812802,0.516192,0.545281,0.803450,0.554524,0.980037,0.527846,1.114139,...,-0.364122,8.337882,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,-0.236757
C1COCCOCCOc2ccccc2OCCOCCOCCOc2ccccc2OCCOCCO1,-1.219044,-0.338241,1.734738,1.954208,1.770416,2.094202,2.024009,1.648676,0.911624,0.465929,...,-0.364122,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,-0.704320
[O-][N+](=O)c1ccc(cc1)-c1nc(c([nH]1)-c1ccncc1)-c1ccc(F)cc1,-0.061954,0.720068,0.131708,-0.090877,-0.265579,0.259995,-0.055359,-0.320068,-0.136217,-0.470610,...,-0.364122,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,-1.186060
FC(F)(F)c1cccc(Nc2ncccc2C(=O)OC2OC(=O)c3ccccc23)c1,-0.748739,0.855141,0.662634,0.208365,0.032333,0.641852,0.196621,-0.081494,0.101213,-0.258400,...,-0.364122,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,-0.053327
C[C@@H](O)C#Cc1ccc2c(O[C@@H](CN(C)Cc3ccc(cc3)-c3ccccc3)[C@H](C)CN([C@@H](C)CO)S2(=O)=O)c1,-0.694797,1.927810,2.268255,2.328360,2.349394,2.139945,2.212934,2.398928,2.227727,2.469015,...,2.407138,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,-1.143721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCOC(=O)Oc1c(OC)cc(cc1OC)C(=O)O[C@@H]1C[C@@H]2CN3CCc4c([nH]c5cc(OC)ccc45)[C@H]3C[C@@H]2[C@@H]([C@H]1OC)C(=O)OC,-1.505758,2.659614,3.382892,3.482543,3.291958,3.451726,3.282325,2.840049,3.056922,2.383355,...,-0.364122,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,-2.469115
COc1ccc2C[C@@H]3N(C)CCc4cc(OC)c(OC)c(Oc5cc6[C@H](Cc7ccc(Oc1c2)cc7)N(C)CCc6cc5OC)c34,-1.008273,2.975093,2.923506,3.351293,3.161291,3.186378,3.205266,2.767090,3.145208,2.462263,...,-0.364122,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,-2.208142
CC(C)(C)c1cc(C=C(C#N)C#N)cc(c1O)C(C)(C)C,4.571837,-0.933831,-0.373833,-0.190103,-0.364364,-0.878848,-0.568812,-0.806205,0.555687,0.147800,...,-0.364122,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,0.869170
Cc1cc(cc(C)c1CC1=NCCN1)C(C)(C)C,1.129811,-1.444612,-1.029162,-0.533155,-0.705891,-1.233662,-0.630698,-0.864798,0.070180,-0.286136,...,-0.364122,-0.118302,-0.038918,-0.132369,-0.234364,-0.01375,-0.302587,-0.179321,-0.245922,1.290882


In [52]:
# Output naming: <model>_embedding_<dataset>.csv inside the rdkit embedding folder
model_name = 'rdkit2D'
fname = f'{model_name}_embedding_{dataset_name}.csv'
embedding_dir = '/home/icb/alessandro.palma/data/metadata_processed/original/embeddings/rdkit'

# makedirs also creates missing parent folders and, with exist_ok=True, does nothing if
# the folder already exists (no separate existence check needed)
os.makedirs(embedding_dir, exist_ok=True)

In [53]:
# Write the normalized features to CSV (the SMILES index is written as the first column).
# to_csv returns None when given a path, so its result is not assigned: binding it to
# `df` would replace the un-normalized feature table with None.
normalized_df.to_csv(os.path.join(embedding_dir, fname))

In [54]:
# List the distinct SMILES strings used as the index of the saved table (np.unique returns them sorted)
np.unique(normalized_df.index)

array(['BrC1=CN2CCS(=O)(=O)N=C2C(Br)=C1', 'Brc1c(Br)c(Br)c2[nH]nnc2c1Br',
       'Brc1c(CSc2nc3ccccc3s2)nc2ncccn12', ...,
       'c1nc(cs1)-c1nc2ccccc2[nH]1', 'c1nn(-c2ccccc2)c2ncn3cnnc3c12',
       'c1nn(nc1-c1nn[nH]n1)-c1ccccc1'], dtype=object)