## ISMD (Inversed Synthesizable Molecular Design) Tutorial

This tutorial proceeds as follows:

1. initial setup and data preparation
2. descriptor preparation for forward model (likelihood)
3. forward model (likelihood) preparation
4. proposal model preparation
5. a complete ismd run

### 1.1 import packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

import xenonpy
import onmt
from xenonpy.descriptor import Fingerprints
from xenonpy.inverse.iqspr import GaussianLogLikelihood
from xenonpy.contrib.ismd import ReactionDescriptor
from xenonpy.contrib.ismd import ReactantPool
from xenonpy.contrib.ismd import Reactor
from xenonpy.contrib.ismd import ISMD

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy

### 1.2 load data

In [2]:
# Ground-truth table (reactant index, reactant, product, XLogP, TPSA);
# only the first 1000 rows are kept for this tutorial.
ground_truth_path = "/home/qiz/data/lab_database/ismd_data/STEREO_id_reactant_product_xlogp_tpsa.csv"
data = pd.read_csv(ground_truth_path).iloc[:1000]
data.head()

Unnamed: 0,reactant_index,reactant,product,XLogP,TPSA
0,12163.22445,CCS(=O)(=O)Cl.OCCBr,CCS(=O)(=O)OCCBr,0.8,51.8
1,863.20896,CC(C)CS(=O)(=O)Cl.OCCCl,CC(C)CS(=O)(=O)OCCCl,1.6,51.8
2,249087.0,O=[N+]([O-])c1cccc2cnc(Cl)cc12,Nc1cccc2cnc(Cl)cc12,2.4,38.9
3,153658.2344,Cc1cc2c([N+](=O)[O-])cccc2c[n+]1[O-].O=P(Cl)(C...,Cc1cc2c([N+](=O)[O-])cccc2c(Cl)n1,3.3,58.7
4,297070.0,CCCCC[C@H](O)C=CC1C=CC(=O)C1CC=CCCCC(=O)O,CCCCC[C@H](O)C=CC1CCC(=O)C1CC=CCCCC(=O)O,3.8,74.6


In [3]:
# reactant pool: table of candidate reactants (the output below shows `id` and `SMILES` columns)
reactant_pool_path = "/home/qiz/data/lab_database/ismd_data/STEREO_pool_df.csv"

reactant_pool = pd.read_csv(reactant_pool_path)

# show the first five rows of the reactant pool (DataFrame.head() defaults to n=5)
reactant_pool.head()

Unnamed: 0,id,SMILES
0,0,O=C(Cl)Oc1ccc(Cc2ccc(C(F)(F)F)cc2)cc1
1,1,CCc1cc(C2CCN(C(=O)OC(C)(C)C)CC2)ccc1Nc1ncc(C(F...
2,2,CC(NC(=O)OCc1ccccc1)C(C)NC(=O)c1ccccc1O
3,3,OC[C@H]1NCC[C@@H]1O
4,4,C#CCCN1C(=O)c2ccccc2C1=O


In [4]:
# Sparse similarity matrix of the reactant pool, converted to CSR format after loading.
sim_matrix_path = "/home/qiz/data/lab_database/ismd_data/ZINC_sim_sparse.npz"
reactant_pool_sim = scipy.sparse.load_npz(sim_matrix_path).tocsr()

# Column indices of the non-zero entries in row 0, i.e. the pool molecules
# stored as similar to the first molecule in the reactant pool.
similar_to_first = reactant_pool_sim[0, :].nonzero()[1]
print(similar_to_first.tolist())

[0, 9850, 11897, 23561, 25594, 28947, 30750, 31361, 44204, 46017, 76945, 118108, 145556, 145734, 164311, 186671, 205326, 207174, 209595, 215653, 218310, 222491, 224002, 232232, 233447, 252758, 274284, 278177, 288659, 291331, 294003, 294172, 300867, 306289, 307663, 331897, 334538, 335455, 343644, 360531, 364663, 365676, 376086, 378821, 412563, 442160, 443411, 452943, 460860, 479253, 487849, 491373, 499241, 500259, 523929, 525478, 528040, 559770, 567735, 568783, 582833, 584542, 586316, 588491, 595094, 599275, 601808, 603887, 617189]


### 2.1 descriptor
Data is transformed through the following steps:

index of reactant -> smiles of reactant -> smiles of product -> fingerprint of product

In [5]:
# Example samples: the reactant-index entries of the first 100 ground-truth rows,
# wrapped in a fresh single-column DataFrame.
samples = pd.DataFrame({"reactant_index": data["reactant_index"][:100].tolist()})
print(samples)

   reactant_index
0     12163.22445
1       863.20896
2          249087
3    153658.23440
4          297070
..            ...
95  499625.479808
96   41132.117367
97    9818.621851
98    39851.29982
99    582137.1976

[100 rows x 1 columns]


### 2.1.1 index of reactant -> smiles of reactant
Obtain the SMILES from the ReactantPool module via the reactant index.

Note: the ReactantPool is also used as the proposal model in step 4.

In [6]:
# Wrap the pool table and its similarity matrix in a ReactantPool.
# NOTE(review): splitter='.' presumably matches the '.' that separates individual
# reactants in the index / SMILES strings shown above — confirm against ReactantPool.
pool_obj = ReactantPool(pool_data=reactant_pool, similarity_matrix=reactant_pool_sim, splitter='.')

In [7]:
# Map every reactant-index entry in `samples` to its reactant SMILES string
# (the output below is a list of SMILES).
pool_obj.index2reactant(samples)

['CCS(=O)(=O)Cl.OCCBr',
 'CC(C)CS(=O)(=O)Cl.OCCCl',
 'O=[N+]([O-])c1cccc2cnc(Cl)cc12',
 'Cc1cc2c([N+](=O)[O-])cccc2c[n+]1[O-].O=P(Cl)(Cl)Cl',
 'CCCCC[C@H](O)C=CC1C=CC(=O)C1CC=CCCCC(=O)O',
 'CC(=O)OCC1=C(C(=O)O)N2C(=O)[C@@H](NC(=O)C(OC(C)=O)c3ccccc3)[C@H]2SC1',
 'COc1cccc(C2(CC(Cl)(Cl)Cl)CO2)c1.ClC(Cl)(Cl)CC1(c2ccc(Br)cc2)CO1',
 'COc1cc2ccccc2cc1C(=O)O.O=S(Cl)Cl',
 'CCN(CC)CC.O.O=C(Cl)Oc1ccccc1',
 'CCOC(N)=O.Cc1ccc(N=C=O)cc1N=C=O',
 'CCO.CN1c2ccccc2CC(=O)c2cc(Br)ccc21',
 'CCOC(=O)CC1(O)Cc2ccccc2N(C)c2ccc(SCC)cc21',
 'CCOC(=O)C=C1Cc2ccccc2N(C)c2ccc(SCC)cc21',
 'CCSc1ccc2c(c1)C(CC(=O)Oc1ccc([N+](=O)[O-])cc1)=Cc1ccccc1N2C.CNC',
 'CCSc1ccc2c(c1)C(CC(=O)N(C)C)=Cc1ccccc1N2C',
 'CCSc1ccc2c(c1)C(CCN(C)C)=Cc1ccccc1N2C.O.[OH-]',
 'Nc1cc2c(cc1Br)nc1n2CCC1.[C-]#N',
 'Cl.Nc1ccc2nc3n(c2c1)CCC3',
 'CCOC(=O)c1ccc2c(c1)nc1n2CCC1',
 'CC(C)(C)NN.O=C1CCCCC1.[C-]#N',
 'CC(C)(C)NN.O=C1CCCCC1.[C-]#N',
 'CC1CC(C)(C)CC(C#N)(NNC(C)(C)C)C1',
 'C#N.CC(=NNC(C)(C)C)C1CC1',
 'CC(C)(C)NNC(C)(C#N)C1CC1',
 'CC(C)(C)NNC(

### 2.1.2 smiles of reactant -> fingerprint of product

In [8]:
# Molecular transformer (smiles of reactant -> smiles of product), built from a
# single saved model file.
reactor_path = "/home/qiz/data/lab_database/models/STEREO_mixed_augm_model_average_20.pt"
ChemicalReactor = Reactor()
ChemicalReactor.BuildReactor(
    model_list=[reactor_path],
    max_length=100,
    n_best=1,
    gpu=0,
)

In [9]:
# Fingerprint descriptor (smiles of product -> fingerprint of product),
# configured with the ECFP and MACCS featurizers on SMILES input.
RDKit_FPs = Fingerprints(
    featurizers=['ECFP', 'MACCS'],
    input_type='smiles',
)

In [10]:
# Reaction descriptor (index of reactant -> fingerprint of product):
# chains the reactant pool lookup, the reactor and the fingerprint descriptor.
RD = ReactionDescriptor(
    descriptor_calculator=RDKit_FPs,
    reactor=ChemicalReactor,
    reactant_pool=pool_obj,
)

In [11]:
# Run the whole pipeline (index of reactant -> fingerprint of product) on the samples.
# NOTE(review): when `samples` is displayed further down it carries extra columns
# (reactant_SMILES, product, validate), so this call appears to annotate its input
# in place — confirm against ReactionDescriptor.transform.
sample_FPs = RD.transform(samples)
sample_FPs.head()

RDKit ERROR: [20:39:21] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 13
RDKit ERROR: 
RDKit ERROR: [20:39:21] Can't kekulize mol.  Unkekulized atoms: 6 13 14 15 22 23 24
RDKit ERROR: 


Unnamed: 0,maccs:0,maccs:1,maccs:2,maccs:3,maccs:4,maccs:5,maccs:6,maccs:7,maccs:8,maccs:9,...,ecfp3:2038,ecfp3:2039,ecfp3:2040,ecfp3:2041,ecfp3:2042,ecfp3:2043,ecfp3:2044,ecfp3:2045,ecfp3:2046,ecfp3:2047
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Inspect `samples` after the transform; the output lists reactant_SMILES, product
# and validate columns next to reactant_index.
samples

Unnamed: 0,reactant_index,reactant_SMILES,product,validate
0,12163.22445,CCS(=O)(=O)Cl.OCCBr,CCS(=O)(=O)OCCBr,True
1,863.20896,CC(C)CS(=O)(=O)Cl.OCCCl,CC(C)CS(=O)(=O)OCCCl,True
2,249087,O=[N+]([O-])c1cccc2cnc(Cl)cc12,Nc1cccc2cnc(Cl)cc12,True
3,153658.23440,Cc1cc2c([N+](=O)[O-])cccc2c[n+]1[O-].O=P(Cl)(C...,Cc1cc2c([N+](=O)[O-])cccc2c(Cl)n1,True
4,297070,CCCCC[C@H](O)C=CC1C=CC(=O)C1CC=CCCCC(=O)O,CCCCC[C@H](O)/C=C/C1C=CC(=O)C1CC=CCCCC(=O)O,True
...,...,...,...,...
95,499625.479808,CCCOS(=O)(=O)c1ccc(C)cc1.CN(C)c1nc(O)cc(O)n1,CCCOc1cc(O)nc(N(C)C)n1,True
96,41132.117367,CC(C)(C)[O-].CNc1nc(=O)c2cccnc2n1Cc1ccc(F)cc1,Cn1c(=O)c2cccnc2n(Cc2ccc(F)cc2)c1=O,True
97,9818.621851,CC#N.O=C1CCCc2ccccc21,N#CC=C1CCCc2ccccc21,True
98,39851.29982,CCOC(=O)CCl.COc1ccc(C=O)cc1,CCOC(=O)C1OC1c1ccc(OC)cc1,True


In [13]:
# Preview of the fingerprint table (same frame that the transform cell displayed).
sample_FPs.head()

Unnamed: 0,maccs:0,maccs:1,maccs:2,maccs:3,maccs:4,maccs:5,maccs:6,maccs:7,maccs:8,maccs:9,...,ecfp3:2038,ecfp3:2039,ecfp3:2040,ecfp3:2041,ecfp3:2042,ecfp3:2043,ecfp3:2044,ecfp3:2045,ecfp3:2046,ecfp3:2047
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Row 62 of the descriptor's `df` table; its `validate` flag is False in the output below.
RD.df.iloc[62]

reactant_index                                          12454.138210
reactant_SMILES    C1COCCN1.Cc1cc2c(cc1Cl)c(-c1ccccc1)o[n+]2C(C)(C)C
product                  Cc1cc2c(cc1N1CCOCC1)c(-c1ccccc1)on2C(C)(C)C
validate                                                       False
Name: 62, dtype: object

### 3 Log-likelihood calculator

Compute the log-likelihood given the samples(index of reactant)

In [15]:
# Properties of interest and the target interval requested for each one.
target_range = {
    'XLogP': (-2, 2),
    'TPSA': (0, 25),
}
prop = ['XLogP', 'TPSA']

# Gaussian log-likelihood calculator on top of the reaction descriptor,
# configured with the target region of the properties.
likelihood_calculator = GaussianLogLikelihood(descriptor=RD, targets=target_range)

In [16]:
%%time

# train forward models inside ismd: the ground-truth rows are the inputs and the
# `prop` columns (XLogP, TPSA) are the regression targets
likelihood_calculator.fit(data, data[prop])

RDKit ERROR: [20:39:37] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 13
RDKit ERROR: 
RDKit ERROR: [20:39:37] Can't kekulize mol.  Unkekulized atoms: 6 13 14 15 22 23 24
RDKit ERROR: 
RDKit ERROR: [20:39:37] Can't kekulize mol.  Unkekulized atoms: 9 10 11 13 14 15 16 18 19
RDKit ERROR: 
RDKit ERROR: [20:39:37] Can't kekulize mol.  Unkekulized atoms: 9 10 11 13 14 15 16 18 19
RDKit ERROR: 
RDKit ERROR: [20:39:37] SMILES Parse Error: unclosed ring for input: 'CC(=O)O[C@H]1[C@]2(O)C[C@]3(O)SS[C@]4(CO)C(=O)N(C)C(=O)N3[C@@H]2C[C@@]13OC(C)C(C)(C)C3=O'
RDKit ERROR: [20:39:37] SMILES Parse Error: unclosed ring for input: 'COc1cccc(C23CCc4[nH]nc(O)c4C2)c1'


CPU times: user 44.3 s, sys: 256 ms, total: 44.6 s
Wall time: 15.7 s


In [17]:
# Display `samples` again after fitting (same columns as the earlier display).
samples

Unnamed: 0,reactant_index,reactant_SMILES,product,validate
0,12163.22445,CCS(=O)(=O)Cl.OCCBr,CCS(=O)(=O)OCCBr,True
1,863.20896,CC(C)CS(=O)(=O)Cl.OCCCl,CC(C)CS(=O)(=O)OCCCl,True
2,249087,O=[N+]([O-])c1cccc2cnc(Cl)cc12,Nc1cccc2cnc(Cl)cc12,True
3,153658.23440,Cc1cc2c([N+](=O)[O-])cccc2c[n+]1[O-].O=P(Cl)(C...,Cc1cc2c([N+](=O)[O-])cccc2c(Cl)n1,True
4,297070,CCCCC[C@H](O)C=CC1C=CC(=O)C1CC=CCCCC(=O)O,CCCCC[C@H](O)/C=C/C1C=CC(=O)C1CC=CCCCC(=O)O,True
...,...,...,...,...
95,499625.479808,CCCOS(=O)(=O)c1ccc(C)cc1.CN(C)c1nc(O)cc(O)n1,CCCOc1cc(O)nc(N(C)C)n1,True
96,41132.117367,CC(C)(C)[O-].CNc1nc(=O)c2cccnc2n1Cc1ccc(F)cc1,Cn1c(=O)c2cccnc2n(Cc2ccc(F)cc2)c1=O,True
97,9818.621851,CC#N.O=C1CCCc2ccccc21,N#CC=C1CCCc2ccccc21,True
98,39851.29982,CCOC(=O)CCl.COc1ccc(C=O)cc1,CCOC(=O)C1OC1c1ccc(OC)cc1,True


In [18]:
# Predicted mean and standard deviation of each property for every sample;
# print the first five rows.
property_prediction = likelihood_calculator.predict(samples)
print(property_prediction.head(n=5))

RDKit ERROR: [20:39:43] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 13
RDKit ERROR: 
RDKit ERROR: [20:39:43] Can't kekulize mol.  Unkekulized atoms: 6 13 14 15 22 23 24
RDKit ERROR: 


   XLogP: mean  XLogP: std  TPSA: mean  TPSA: std
0     0.768960    0.640034   52.334120  10.346404
1     1.869018    0.651625   50.718846  10.548338
2     2.878999    0.654422   39.950279  10.582209
3     3.515118    0.659330   52.648232  10.688316
4     3.488992    0.609510   50.955043  10.184122


In [19]:
# Row 62 (flagged validate=False in RD.df) has NaN for every predicted value.
property_prediction.iloc[62]

XLogP: mean   NaN
XLogP: std    NaN
TPSA: mean    NaN
TPSA: std     NaN
Name: 62, dtype: float64

In [20]:
# Per-property log-likelihood of every sample for the target region;
# print the first five rows.
likelihood_prediction = likelihood_calculator(samples, **target_range)
print(likelihood_prediction.head(n=5))

RDKit ERROR: [20:39:44] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 13
RDKit ERROR: 
RDKit ERROR: [20:39:44] Can't kekulize mol.  Unkekulized atoms: 6 13 14 15 22 23 24
RDKit ERROR: 


      XLogP      TPSA
0 -0.027600 -5.491427
1 -0.545324 -4.909014
2 -2.412301 -2.541083
3 -4.529888 -5.330103
4 -4.922046 -5.219903


In [21]:
# Row 62 (the sample with NaN predictions) shows -1000.0 for both properties.
likelihood_prediction.iloc[62]

XLogP   -1000.0
TPSA    -1000.0
Name: 62, dtype: float64

### 4 proposal model

New samples are proposed from the given reactant pool: each sample (index of reactant) is modified by randomly changing one reactant to a similar one.

In [22]:
# Draw a proposal from the current samples, then print the current samples
# followed by the proposed ones for comparison.
new_samples = pool_obj.proposal(samples)
for frame in (samples, new_samples):
    print(frame)

   reactant_index                                    reactant_SMILES  \
0     12163.22445                                CCS(=O)(=O)Cl.OCCBr   
1       863.20896                            CC(C)CS(=O)(=O)Cl.OCCCl   
2          249087                     O=[N+]([O-])c1cccc2cnc(Cl)cc12   
3    153658.23440  Cc1cc2c([N+](=O)[O-])cccc2c[n+]1[O-].O=P(Cl)(C...   
4          297070          CCCCC[C@H](O)C=CC1C=CC(=O)C1CC=CCCCC(=O)O   
..            ...                                                ...   
95  499625.479808       CCCOS(=O)(=O)c1ccc(C)cc1.CN(C)c1nc(O)cc(O)n1   
96   41132.117367      CC(C)(C)[O-].CNc1nc(=O)c2cccnc2n1Cc1ccc(F)cc1   
97    9818.621851                              CC#N.O=C1CCCc2ccccc21   
98    39851.29982                        CCOC(=O)CCl.COc1ccc(C=O)cc1   
99    582137.1976       CCOC(=O)C1OC1c1ccc(OC)cc1.NCCC1=CCCc2ccccc21   

                                        product  validate  
0                              CCS(=O)(=O)OCCBr      True  
1              

### 5 complete run of ismd

In [23]:
# set up initial reactants
# Seed NumPy's global RNG before drawing: without this the initial samples differ on
# every Restart & Run All. The value is the same one the main ISMD loop uses.
np.random.seed(201906)

# Candidates: reactant-index entries of the ground-truth rows whose XLogP is above 4,
# i.e. outside the XLogP target range, selected with a boolean mask.
cans = data.loc[data['XLogP'] > 4, 'reactant_index'].tolist()

# Draw 10 starting points. np.random.choice samples with replacement by default,
# so the same candidate can appear more than once.
sample_index = np.random.choice(cans, 10)
init_samples = pd.DataFrame({"reactant_index": sample_index})
print(init_samples)

      reactant_index
0      207094.539309
1        15047.87024
2       36295.150865
3      163405.328890
4             391266
5  1504.32715.212539
6             132260
7      195000.115955
8        12295.36295
9       191348.19854


In [24]:
# set up annealing schedule: four linear segments given as (start, stop, number of
# points), concatenated in order
segment_specs = [(0.01, 0.2, 20), (0.21, 0.4, 10), (0.4, 1, 10), (1, 1, 10)]
beta = np.hstack([np.linspace(start, stop, count) for start, stop, count in segment_specs])
print('Number of steps: %i' % len(beta))
print(beta)

Number of steps: 50
[0.01       0.02       0.03       0.04       0.05       0.06
 0.07       0.08       0.09       0.1        0.11       0.12
 0.13       0.14       0.15       0.16       0.17       0.18
 0.19       0.2        0.21       0.23111111 0.25222222 0.27333333
 0.29444444 0.31555556 0.33666667 0.35777778 0.37888889 0.4
 0.4        0.46666667 0.53333333 0.6        0.66666667 0.73333333
 0.8        0.86666667 0.93333333 1.         1.         1.
 1.         1.         1.         1.         1.         1.
 1.         1.        ]


In [25]:
# Random preview of 10 rows of `samples`; no random_state is passed, so the rows
# shown can differ between runs.
samples.sample(n=10)

Unnamed: 0,reactant_index,reactant_SMILES,product,validate
12,446589,CCOC(=O)C=C1Cc2ccccc2N(C)c2ccc(SCC)cc21,CCOC(=O)CC1Cc2ccccc2N(C)c2ccc(SCC)cc21,True
74,289800.287411.440417,CC(=O)C(=NO)C(C)C.CC(=O)CC(C)C.O=NCl,CC(=O)C(Cl)(N=O)C(C)C,True
77,382410.27212.440417,CCCCCCC(=NO)C(C)=O.CCCCCCCC(C)=O.O=NCl,CCCCCCc1nc(C)c(CCCCCC)[n+]([O-])c1C,True
84,167825.362152,Clc1ccc2c(Cl)ncnc2c1.NCCCCCCCCCN,NCCCCCCCCCNc1ncnc2cc(Cl)ccc12,True
32,10899.50482,Nc1ccc(C(=O)O)cc1[N+](=O)[O-].O=C(Cl)C1CC1,O=C(O)c1ccc(NC(=O)C2CC2)c([N+](=O)[O-])c1,True
25,447861,CC(C)(C)NNC1(C#N)CCCCCCC1,CC(C)(C)NNC1(CN)CCCCCCC1,True
46,12454.57249.38938.199909,C1COCCN1.CCCCCC.CO.NC(=S)Cc1ccccn1,S=C(Cc1ccccn1)N1CCOCC1,True
43,14777.31804,Nc1ccc(C(=O)O)c(O)c1.O=C(Cl)C1CCCC1,O=C(O)c1ccc(NC(=O)C2CCCC2)cc1O,True
51,21872.381555.27282,C=O.CC(C)C(C(N)=S)c1ccccn1.CNc1ccccc1,CC(C)C(C(=S)NCN(C)c1ccccc1)c1ccccn1,True
99,582137.1976,CCOC(=O)C1OC1c1ccc(OC)cc1.NCCC1=CCCc2ccccc21,CCOC(=O)C1OC1c1ccc(OC)cc1,True


In [26]:
# Combine the likelihood estimator and the reactant-pool modifier into the ISMD sampler.
ismd = ISMD(estimator=likelihood_calculator, modifier=pool_obj)
np.random.seed(201906) # fix the random seed

# Containers for what the sampler yields at every step.
ismd_samples = []
ismd_loglike = []
ismd_prob = []
ismd_freq = []

# Main ISMD loop: with yield_lpf=True each iteration yields the samples together
# with their log-likelihoods, probabilities and frequencies. `step` starts at 0 so
# it is defined even if the sampler yields nothing.
step = 0
for step, (s, ll, p, freq) in enumerate(ismd(init_samples, beta, yield_lpf=True), start=1):
    print(step)
    ismd_samples.append(s)
    ismd_loglike.append(ll)
    ismd_prob.append(p)
    ismd_freq.append(freq)

# Bundle all recorded outputs together with the annealing schedule.
ismd_results = dict(
    samples=ismd_samples,
    loglike=ismd_loglike,
    prob=ismd_prob,
    freq=ismd_freq,
    beta=beta,
)


RDKit ERROR: [20:39:46] Can't kekulize mol.  Unkekulized atoms: 9 10 11 13 14 15 16 18 19
RDKit ERROR: 


1
2
3


RDKit ERROR: [20:39:49] Can't kekulize mol.  Unkekulized atoms: 5 6 7 8 10 18 24 26 27
RDKit ERROR: 


4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


In [27]:
# Tabulate the recorded outputs (one row per step, one column per recorded quantity)
# and preview the first five rows.
ismd_result_df = pd.DataFrame(data=ismd_results)
ismd_result_df.head(n=5)

Unnamed: 0,samples,loglike,prob,freq,beta
0,reactant_index ...,XLogP TPSA 0 -14.305697 -...,"[0.11145919716902213, 0.11661211757127608, 2.9...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0.01
1,reactant_index ...,XLogP TPSA 0 -7.942146 -10.56178...,"[0.10609689830518268, 0.10756918659286474, 0.1...","[1, 1, 1, 1, 1, 1, 1, 1, 2]",0.02
2,reactant_index ...,XLogP TPSA 0 -13.401643 -1.03740...,"[0.10787539637064425, 0.09183688979615266, 0.0...","[1, 1, 1, 1, 1, 1, 2, 1, 1]",0.03
3,reactant_index ...,XLogP TPSA 0 -4.303701 ...,"[0.1540829353539327, 0.1497046612893601, 3.582...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",0.04
4,reactant_index ...,XLogP TPSA 0 -2.439217 -25.60360...,"[0.04311414521397201, 0.16007459621323933, 0.1...","[1, 1, 1, 1, 2, 1, 1, 1, 1]",0.05


In [28]:
# Samples recorded at the first step (row 0, beta = 0.01 in the table above).
ismd_result_df['samples'][0]

Unnamed: 0,reactant_index,reactant_SMILES,product,validate
0,12295.36295,CC1CCC(=O)C(CC(=O)c2ccccc2)C1.Nc1ccc(O)c(C(=O)...,CC1CCc2c(cc(-c3ccccc3)n2-c2ccc(O)c(C(=O)O)c2)C1,True
1,132260,CCCCCC(C)(O)C(C)(C)c1cc(OC)cc(OC)c1,CCCCCC(C)(O)C(C)(C)c1cc(OC)cc(OC)c1,True
2,1504.32715.212539,CC(=O)C1CCCCC1=O.CC(=O)O.CCCCCC(C)C(C)c1cc(O)c...,CCCCCC(C)C(C)c1cc(O)c2c3c(c(C)c2c1)CCCC3,False
3,15047.87024,ICCCCCI.NC1CCC2(CCc3ccccc3C2)CC1,c1ccc2c(c1)CCC1(CCC(N3CCCCC3)CC1)C2,True
4,163405.328890,Cc1ccc(S(=O)(=O)NCCNC(C)C)cc1.O=C(O)c1ccccc1C(...,Cc1ccc(S(=O)(=O)NCCN(C(=O)c2ccccc2C(=O)c2ccccc...,True
5,191348.19854,COc1c(CCC(=O)O)cccc1C(=O)c1ccc(Cl)cc1.O=S(Cl)Cl,COc1c(CCC(=O)Cl)cccc1C(=O)c1ccc(Cl)cc1,True
6,195000.115955,CCCCCC(O)/C=C/I.COc1ccc(CC(Cl)(c2ccccc2)c2cccc...,CCCCCC(/C=C/I)OC(Cc1ccc(OC)cc1)(c1ccccc1)c1ccccc1,True
7,207094.539309,O=C(O)CC(=O)O.O=CCc1cccc(C(=O)c2ccc(Cl)cc2)c1O,O=C(O)C=CCc1cccc(C(=O)c2ccc(Cl)cc2)c1O,True
8,36295.150865,Nc1ccc(O)c(C(=O)O)c1.O=C(CC1CCCCC1=O)c1ccc(Br)cc1,O=C(O)c1cc(-n2c(-c3ccc(Br)cc3)cc3c2CCCC3)ccc1O,True
9,391266,CCCCCC(CC[C@H]1CCC(=O)[C@@H]1CCCCCCC(=O)OCC)OC...,CCCCCC(O)CC[C@H]1CCC(=O)[C@@H]1CCCCCCC(=O)OCC,True


In [29]:
# Log-likelihoods recorded at the first step; row 2 shows -1000.0 for both properties.
ismd_result_df['loglike'][0]

Unnamed: 0,XLogP,TPSA
0,-14.305697,-10.018233
1,-17.731557,-2.072912
2,-1000.0,-1000.0
3,-10.422158,-0.469796
4,-8.613701,-20.029711
5,-9.541776,-5.302802
6,-47.169153,-0.332215
7,-8.514424,-11.767595
8,-21.199962,-10.752987
9,-18.133373,-9.874892


In [30]:
# Probabilities recorded at the first step; the third entry (the -1000.0 row) is ~2.9e-10.
ismd_result_df['prob'][0]

array([1.11459197e-01, 1.16612118e-01, 2.92997394e-10, 1.27482403e-01,
       1.06747236e-01, 1.22541788e-01, 8.84010819e-02, 1.16056563e-01,
       1.03272217e-01, 1.07427396e-01])