In [6]:
import gc
import json
import os
import re
import sys
from collections import defaultdict
from itertools import product

import numpy as np
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw, MolStandardize
from rdkit.Chem.Draw import IPythonConsole, MolDrawing, DrawingOptions

#from Ipython.display import clear_output
#IPythonConsole.ipython_useSVG = True

from datacat4ml.DeepCoy.DeepCoy import DenseGGNNChemModel
from datacat4ml.DeepCoy.data.prepare_data import read_file, preprocess
from datacat4ml.DeepCoy.evaluation.select_and_evaluate_decoys import select_and_evaluate_decoys
from datacat4ml.const import SPLIT_DATA_DIR

In [2]:
# GPU toggle read by the DeepCoy setup cell: True sets CUDA_VISIBLE_DEVICES to "0",
# False sets it to "-1".
use_gpu = True

# Preprocess actives data

The `preprocess` function parses the SMILES strings into graphs.

In [3]:
# Actives file, given relative to the notebook's working directory.
data_path = './P38-alpha_actives.smi'
# Load the raw entries from the actives file.
raw_data = read_file(data_path)
# Signature noted by the original author: preprocess(raw_data, dataset, name, save_dir='').
# Parses the SMILES as graphs; the expected output file is 'molecules_P38-alpha_actives.json',
# which is the name the DeepCoy config refers to as train_file / valid_file.
preprocess(raw_data, "zinc", "P38-alpha_actives")

Finished reading: 10 / 10
Parsing smiles as graphs.
Processed: 10 / 10
Saving data.
Length raw data: 	10
Length processed data: 	10


# Load DeepCoy model and generate decoys.

The settings below generate 100 candidate decoys for each active molecule.

In [8]:
# Choose which CUDA device is visible to the model: "0" when use_gpu is set, "-1" otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "0" if use_gpu else "-1"

# Generation settings. They are serialised with json.dumps below instead of being
# hand-written as a backslash-continued string, so the JSON is always well formed
# and carries no stray indentation whitespace.
deepcoy_config = {
    "generation": True,
    "batch_size": 1,
    "number_of_generation_per_valid": 100,  # candidate decoys generated per active
    "train_file": "molecules_P38-alpha_actives.json",
    "valid_file": "molecules_P38-alpha_actives.json",
    "output_name": "P38-alpha_example_decoys.smi",
    "use_subgraph_freqs": False,
}

# Arguments for DeepCoy
# NOTE: defaultdict(None) has no default factory, so looking up a missing key still
# raises KeyError exactly like a plain dict; it is kept as-is for compatibility.
args = defaultdict(None)
args['--dataset'] = 'zinc'
args['--config'] = json.dumps(deepcoy_config)
args['--freeze-graph-model'] = False
# NOTE(review): the original author flagged this checkpoint path with "?Yu" - confirm the location.
args['--restore'] = os.path.join(
    SPLIT_DATA_DIR, 'fsmol_alike', 'MHDsFold', 'decoys', 'deepcoy_models',
    'DeepCoy_DUDE_model_e09.pickle',
)

In [10]:
# Set up the model from `args` and generate molecules.
# With "generation": true in the config, the recorded run restores the saved weights
# and writes the generated SMILES rather than training from scratch.
model = DenseGGNNChemModel(args)
model.train()
# Free up some memory: remove the name entirely (instead of rebinding it to an
# empty string, which leaves a misleading `model` str behind) and run a collection pass.
del model
gc.collect()

Run 2025-07-10-13-05-54_2109547 starting with following parameters:
{"task_sample_ratios": {}, "use_edge_bias": true, "clamp_gradient_norm": 1.0, "out_layer_dropout_keep_prob": 1.0, "tie_fwd_bkwd": true, "random_seed": 0, "batch_size": 1, "num_epochs": 10, "epoch_to_generate": 10, "number_of_generation_per_valid": 100, "maximum_distance": 50, "use_argmax_generation": false, "residual_connection_on": true, "residual_connections": {"2": [0], "4": [0, 2], "6": [0, 2, 4], "8": [0, 2, 4, 6], "10": [0, 2, 4, 6, 8], "12": [0, 2, 4, 6, 8, 10], "14": [0, 2, 4, 6, 8, 10, 12]}, "num_timesteps": 7, "hidden_size": 100, "encoding_size": 8, "kl_trade_off_lambda": 0.3, "learning_rate": 0.001, "graph_state_dropout_keep_prob": 1, "compensate_num": 0, "train_file": "molecules_P38-alpha_actives.json", "valid_file": "molecules_P38-alpha_actives.json", "try_different_starting": true, "num_different_starting": 1, "generation": true, "use_graph": true, "label_one_hot": false, "multi_bfs_path": false, "bfs_pat

  cell = tf.nn.rnn_cell.GRUCell(new_h_dim) # modified by Yu


Restoring weights from file /storage/homefs/yc24j783/datacat4ml/datacat4ml/Data/data_prep/data_split/fsmol_alike/MHDsFold/decoys/deepcoy_models/DeepCoy_DUDE_model_e09.pickle.
Generated mol 0
Generated mol 100
Generated mol 200
Generated mol 300
Generated mol 400
Generated mol 500
Generated mol 600
Generated mol 700
Generated mol 800
Generated mol 900
Generation done
Number of generated SMILES: 1000


# Assess generated decoys

Now we need to select the final set of decoys from the candidate decoys.
We will select 20 decoys per active.

In [11]:
# Name of the property set to use, and how many decoys to keep for each active.
chosen_properties = "ALL"
num_decoys_per_active = 20
# Twice the final decoy count is passed as num_cand_dec_per_act.
num_candidates_per_active = num_decoys_per_active * 2

results = select_and_evaluate_decoys(
    'P38-alpha_example_decoys.smi',
    file_loc='./',
    output_loc='./',
    dataset=chosen_properties,
    num_cand_dec_per_act=num_candidates_per_active,
    num_dec_per_act=num_decoys_per_active,
)

Processing:  P38-alpha_example_decoys.smi
Processed smiles: 0
Done calculating properties
Processed smiles: 0
Done calculating properties
Processed smiles: 0
Done calculating properties
Processed smiles: 0
Done calculating properties
Processed smiles: 0
Done calculating properties
Processed smiles: 0
Done calculating properties
Unable to assess ML model prediction. Check there are sufficient active molecules if these metrics are desired.


In [12]:
# Display the list returned by select_and_evaluate_decoys; the meaning of each
# position is given in the markdown list that follows this cell.
results

['P38-alpha_example_decoys.smi',
 'ALL',
 10,
 10,
 1000,
 993,
 -1,
 -1,
 0.07578833333333335,
 0.4352395680379868,
 0.2101596861694829,
 0.24299065420560748]

The following results are calculated and contained in results:

- File name - Name of input file
- Chosen properties - Name of the property set chosen
- Number of actives in input file
- Number of actives after applying the minimum size filter
- Number of candidate decoys
- Number of unique candidate decoys
- AUC ROC - 1NN - Performance as measured by AUC ROC of the 1-nearest neighbour (1NN) algorithm in 10-fold cross-validation using all of the chosen properties
- AUC ROC - RF - Performance as measured by AUC ROC of the random forest (RF) algorithm in 10-fold cross-validation using all of the chosen properties
- DOE score - Deviation from Optimal Embedding score, a measure of property matching
- LADS score - Latent Active in Decoy Set score
- Average Doppelganger score - A measure of the structural similarity between actives and decoys
- Maximum Doppelganger score - A measure of the structural similarity between actives and decoys