# PMO benchmark evaluation

The goal is to test all functionalities of the PMO benchmark.

1. Get a random subset of 100 molecules, get all metrics necessary for reporting the results. 
2. Sample from a pre-trained Hyformer and do the same. 
3. Come up with a script:
    a. Sample 1000 molecules
    b. Evaluate
    c. Jointly fine-tune: first, on generated, then on generated (upsampled) + ZINC with NANs for properties (= -1)
    d. Iterate 10x
    e. print results
4. Frozen + predictor vs joint

In [1]:
# Imports

import os
import torch

from hyformer.configs.dataset import DatasetConfig
from hyformer.configs.tokenizer import TokenizerConfig
from hyformer.configs.model import ModelConfig
from hyformer.configs.trainer import TrainerConfig

from hyformer.utils.datasets.auto import AutoDataset
from hyformer.utils.tokenizers.auto import AutoTokenizer
from hyformer.models.auto import AutoModel
from hyformer.trainers.trainer_fixed import Trainer

from hyformer.utils.runtime import set_seed

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
2025-03-26 13:52:37.145342: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/aih/adam.izdebski/miniconda3/envs/hyformer/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [57]:
# Additional imports

from typing import List, Union

from tqdm.auto import tqdm
from tdc import Oracle as TDCOracle
from rdkit import Chem

import numpy as np 

from hyformer.utils.chemistry import is_valid

import pandas as pd

from hyformer.utils.datasets.sequence import SequenceDataset

from hyformer.utils.transforms.enumerator import SmilesEnumerator


# Constants

AVAILABLE_ORACLES = [
    "Albuterol_Similarity",
    "Amlodipine_MPO",
    "Celecoxib_Rediscovery",
    "Deco_Hop",
    "DRD2", 
    "Fexofenadine_MPO",
    "GSK3B",
    "Isomers_C7H8N2O2",
    "Isomers_C7H8N2O3",
    "Isomers_C9H10N2O2PF2Cl",
    "JNK3",
    "Median 1",
    "Median 2",
    "Mestranol_Similarity", 
    "Osimertinib_MPO",
    "Perindopril_MPO",
    "QED",
    "Ranolazine_MPO",
    "Scaffold_Hop",
    "Sitagliptin_MPO",
    "Thiothixene_Rediscovery", 
    "Troglitazone_Rediscovery",
    "Valsartan_Smarts",
    "Zaleplon_MPO"
    ]

GUACAMOL_ORACLES = [oracle_name for oracle_name in AVAILABLE_ORACLES if oracle_name.endswith('MPO')]

# Classes 

class Oracle:

    def __init__(self, name: str, dtype: str = None) -> None:
        if name not in AVAILABLE_ORACLES:
            raise ValueError(f"Oracle {name} not available")
        self.oracle = TDCOracle(name) if name in AVAILABLE_ORACLES else None
        self.dtype = dtype

    def _to_dtype(self, outputs: List[float]) -> Union[torch.Tensor, np.array, List[float]]:
        if self.dtype == 'pt':
            return torch.tensor(outputs)
        elif self.dtype == 'np':    
            return np.array(outputs)
        else:
            return outputs

    def __call__(self, molecules: Union[List[str], str]) -> float:

        if isinstance(molecules, str):
            molecules = [molecules]

        _values = []
        for molecule in molecules:
        
            try:
                _is_valid = is_valid(molecule)
            except:
                _is_valid = False
        
            _value = self.oracle(molecule) if _is_valid else np.nan
            _values.append(_value)
        
        return self._to_dtype(_values)
    
    def __str__(self) -> str:
        return f"Oracle {self.oracle.name}"
    
    def __repr__(self) -> str:
        return self.__str__()
        
            
class MoleculeBuffer:

    def __init__(self, max_oracle_calls, freq_log, initial_mol_buffer: dict = None):
        self.max_oracle_calls = max_oracle_calls
        self.freq_log = freq_log
        self.mol_buffer = {} if initial_mol_buffer is None else initial_mol_buffer
    
    def __str__(self) -> str:
        return f"Buffer with {len(self.mol_buffer)} molecules"
    
    def __repr__(self) -> str:
        return self.__str__()
    
    def __len__(self) -> int:
        return len(self.mol_buffer)
    
    def get_data(self, top_k: int = None):
        # get smiles and properties from the buffer that are not nan
        self.sort_buffer()
        smiles = []
        properties = []
        for molecule, (prop, _) in self.mol_buffer.items():
            if prop == prop:
                smiles.append(molecule)
                properties.append(prop)
        if top_k is not None and top_k < len(smiles):
            smiles = smiles[:top_k]
            properties = properties[:top_k]
        return smiles, properties
    
    def add(self, molecules: Union[str, List[str]], properties: Union[float, List[float]]):
        
        if isinstance(molecules, str):
            molecules = [molecules]
            properties = [properties]
        
        assert len(molecules) == len(properties), "Molecules and properties must have the same length"
        assert len(self.mol_buffer) + len(molecules) <= self.max_oracle_calls, "Buffer full"

        for molecule, prop in zip(molecules, properties):
            if molecule not in self.mol_buffer:
                self.mol_buffer[molecule] = [prop, len(self.mol_buffer) + 1]
        
        self.sort_buffer()
    
    def sort_buffer(self):
        self.mol_buffer = dict(sorted(self.mol_buffer.items(), key=lambda kv: kv[1][0], reverse=True))
    
    def calculate_top_auc(self, k: int):
        return self._calculate_top_auc(self.mol_buffer, k, True, self.freq_log, self.max_oracle_calls)

    def save(self, path):

        idx = []
        molecules = []
        properties = []

        for _, (_molecule, (_prop, _idx)) in enumerate(self.mol_buffer.items()):
            idx.append(_idx)
            molecules.append(_molecule)
            properties.append(_prop)

        _df = pd.DataFrame({
            'idx': idx,
            'molecule': molecules,
            'property': properties
        })
        _df.to_csv(path, index=False)

    @staticmethod
    def _calculate_top_auc(buffer, top_n, finish, freq_log, max_oracle_calls):

        # filter out properties that are equal to NaNs
        _buffer = buffer.copy()
        _buffer = {k: v for k, v in buffer.items() if v[0] == v[0]}

        # calculate the top auc
        sum = 0
        prev = 0
        called = 0
        ordered_results = list(sorted(_buffer.items(), key=lambda kv: kv[1][1], reverse=False))
        for idx in range(freq_log, min(len(_buffer), max_oracle_calls), freq_log):
            temp_result = ordered_results[:idx]
            temp_result = list(sorted(temp_result, key=lambda kv: kv[1][0], reverse=True))[:top_n]
            top_n_now = np.mean([item[1][0] for item in temp_result])
            sum += freq_log * (top_n_now + prev) / 2
            prev = top_n_now
            called = idx
        temp_result = list(sorted(ordered_results, key=lambda kv: kv[1][0], reverse=True))[:top_n]
        top_n_now = np.mean([item[1][0] for item in temp_result])
        sum += (len(_buffer) - called) * (top_n_now + prev) / 2
        if finish and len(_buffer) < max_oracle_calls:
            sum += (max_oracle_calls - len(_buffer)) * top_n_now
        return sum / max_oracle_calls


## Experiment 1: unconditional generation

In [21]:
# hyperparameters // hparams search on mean AUC top-10 calculated across 3 random seeds and 7 MPOs

ORACLE_NAME = 'Zaleplon_MPO'
MAX_ORACLE_CALLS = 10000
FREQ_LOG = 100
SEED = 0
TOP_K = 10
PATH_RESULTS = '/home/aih/adam.izdebski/project/hyformer-interface/hyformer/results.csv'

PATH_TO_TOKENIZER_CONFIG = 'configs/tokenizers/smiles_separate_task_token/config.json'
PATH_TO_MODEL_CONFIG = 'configs/models/hyformer_prediction_deep/config.json'
PATH_TO_TRAINER_CONFIG = 'configs/trainers/pmo/config.json'

PATH_TO_MODEL_CKPT = '/lustre/groups/aih/hyformer/icml25/results/pretrain/guacamol/hyformer/llama_backbone/pretrain_generation_mlm_physchem/ckpt.pt'

In [22]:
# Set logging

import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [23]:
# Set device

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [24]:
# set seed

set_seed(SEED)

In [52]:
# Tokenizer
tokenizer_config = TokenizerConfig.from_config_file(PATH_TO_TOKENIZER_CONFIG)
tokenizer = AutoTokenizer.from_config(tokenizer_config)

# Model

model_config = ModelConfig.from_config_file(PATH_TO_MODEL_CONFIG)
model = AutoModel.from_config(model_config, downstream_task="regression", num_tasks=1)
model.load_pretrained(PATH_TO_MODEL_CKPT)
model.to(device)
model.eval()

HyformerForDownstreamPredictionDeep(
  (token_embedding): Embedding(596, 256)
  (layers): ModuleList(
    (0-7): 8 x TransformerLayer(
      (attention_layer): Attention(
        (qkv): Linear(in_features=256, out_features=768, bias=False)
        (out): Linear(in_features=256, out_features=256, bias=False)
        (relative_embedding): RotaryPositionalEmbedding()
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=256, out_features=1024, bias=False)
        (w3): Linear(in_features=256, out_features=1024, bias=False)
        (w2): Linear(in_features=1024, out_features=256, bias=False)
      )
      (attention_layer_normalization): RMSNorm()
      (feed_forward_normalization): RMSNorm()
    )
  )
  (layer_norm): RMSNorm()
  (lm_head): Linear(in_features=256, out_features=596, bias=False)
  (mlm_head): Linear(in_features=256, out_features=596, bias=False)
  (physchem_head): RegressionHead(
    (net): Sequential(
      (0): Linear(in_features=256, out_features=256

In [53]:
# Freeze weights // try to have only layer 7 and 8 unfreezed 

for name, param in model.named_parameters():
    if name.startswith('prediction_head') or name.startswith('layers.7') or name.startswith('layers.6'):
        param.requires_grad = True
    else:
        param.requires_grad = False
print("Freezing all weights except for the predictor head...", flush=True)

Freezing all weights except for the predictor head...


In [54]:
for name, param in model.named_parameters():
    print(name, param.requires_grad)

token_embedding.weight False
layers.0.attention_layer.qkv.weight False
layers.0.attention_layer.out.weight False
layers.0.feed_forward.w1.weight False
layers.0.feed_forward.w3.weight False
layers.0.feed_forward.w2.weight False
layers.0.attention_layer_normalization.weight False
layers.0.feed_forward_normalization.weight False
layers.1.attention_layer.qkv.weight False
layers.1.attention_layer.out.weight False
layers.1.feed_forward.w1.weight False
layers.1.feed_forward.w3.weight False
layers.1.feed_forward.w2.weight False
layers.1.attention_layer_normalization.weight False
layers.1.feed_forward_normalization.weight False
layers.2.attention_layer.qkv.weight False
layers.2.attention_layer.out.weight False
layers.2.feed_forward.w1.weight False
layers.2.feed_forward.w3.weight False
layers.2.feed_forward.w2.weight False
layers.2.attention_layer_normalization.weight False
layers.2.feed_forward_normalization.weight False
layers.3.attention_layer.qkv.weight False
layers.3.attention_layer.out.wei

In [55]:
# Trainer config

trainer_config = TrainerConfig.from_config_file(PATH_TO_TRAINER_CONFIG)
trainer_config.tasks = {'prediction': 0.5, 'generation': 0.5}

In [58]:
# Set number of optimization steps
num_optimization_steps = MAX_ORACLE_CALLS // FREQ_LOG
num_samples_per_optimization_step = MAX_ORACLE_CALLS // num_optimization_steps
batch_size = 256
temperature = 1.0
top_k = 25

print(f"Number of optimization steps: {num_optimization_steps}")
print(f"Number of samples per optimization step: {num_samples_per_optimization_step}")

# Initialize the oracle
oracle = Oracle(name=ORACLE_NAME)
print(oracle)

# Initialize the augmentation
data_transform = SmilesEnumerator(enumeration_probability=0.75)

# Initialize the buffer
mol_buffer = MoleculeBuffer(MAX_ORACLE_CALLS, FREQ_LOG)

# Run the optimization loop
_filter = False
_smiles_cache = []
for step, optimization_step in tqdm(enumerate(range(num_optimization_steps))):

    # Sample molecules and properties
    smiles = []
    targets = []
    for _ in tqdm(range(20)):
        _smiles = model.generate(tokenizer=tokenizer, batch_size=batch_size, temperature=temperature, top_k=top_k, device=device).detach().cpu()
        _smiles = tokenizer.decode(_smiles)
        inputs = tokenizer(_smiles, task='prediction')
        inputs.to(device)
        _targets = model.predict(**inputs).detach().cpu()
        smiles.extend(_smiles)
        targets.extend(_targets.numpy()) 
    
    # Convert to numpy
    targets = np.array(targets).reshape(-1, 1)

    # filter
    if _filter:
        sorted_indices = np.argsort(-targets, axis=0)
        sorted_indices = sorted_indices.flatten()
        sorted_smiles = [smiles[idx] for idx in sorted_indices]
        sorted_targets = [targets[idx] for idx in sorted_indices]
        smiles = sorted_smiles[:num_samples_per_optimization_step]
    else:
        smiles = smiles[:num_samples_per_optimization_step]

    # Remove duplicates
    smiles = list(set(smiles))    
    smiles = [smile for smile in smiles if smile not in _smiles_cache]
    _smiles_cache.extend(smiles)

    # Call the oracle
    properties = oracle(smiles)
    
    # Add to the buffer
    mol_buffer.add(smiles, properties)

    # Calculate the top AUC
    top_auc = mol_buffer.calculate_top_auc(TOP_K)
    print(f"Step {step}, AUC Top-{TOP_K}: {top_auc.item():.3f}")

    # Save the buffer
    mol_buffer.save(PATH_RESULTS)

    # Update the model 
    data, target = mol_buffer.get_data(top_k=100)
    print(f"Number of data points: {len(data)}")
    print(f"MAX: {max(target)}")
    dataset = SequenceDataset(data=data, target=np.array(target, dtype=np.float32).reshape(-1, 1), data_transform=data_transform)
    trainer_config.correct_for_num_train_examples(num_train_examples=len(dataset))
    model = AutoModel.from_config(model_config, downstream_task="regression", num_tasks=1)
    model.load_pretrained(PATH_TO_MODEL_CKPT)
    model.to(device)
    trainer = Trainer(config=trainer_config, model=model, tokenizer=tokenizer, device=device, train_dataset=dataset, val_dataset=None)
    trainer.train()
    model = trainer.model
    model.eval()
    _filter = True


Number of optimization steps: 100
Number of samples per optimization step: 100
Oracle zaleplon_mpo


100%|██████████| 20/20 [01:44<00:00,  5.20s/it]


Step 0, AUC Top-10: 0.000
Number of data points: 93
MAX: 0.0023516068153027645


100%|██████████| 20/20 [01:43<00:00,  5.15s/it]


Step 1, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 2, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:44<00:00,  5.21s/it]


Step 3, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 4, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 5, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 6, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 7, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.17s/it]


Step 8, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 9, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 10, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 11, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 12, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 13, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 14, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 15, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 16, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 17, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.16s/it]


Step 18, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 19, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 20, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 21, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 22, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 23, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.16s/it]


Step 24, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 25, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 26, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 27, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 28, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 29, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 30, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 31, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 32, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 33, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 34, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 35, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 36, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 37, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 38, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 39, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 40, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 41, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 42, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 43, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 44, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 45, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 46, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 47, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 48, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 49, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 50, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 51, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 52, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.15s/it]


Step 53, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.16s/it]


Step 54, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 55, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 56, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 57, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 58, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 59, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 60, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 61, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 62, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 63, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:46<00:00,  5.33s/it]


Step 64, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 65, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.18s/it]


Step 66, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 67, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 68, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 69, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 70, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 71, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 72, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 73, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.15s/it]


Step 74, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 75, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 76, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 77, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:46<00:00,  5.34s/it]


Step 78, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 79, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 80, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.19s/it]


Step 81, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 82, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 83, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 84, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.14s/it]


Step 85, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 86, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.16s/it]


Step 87, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]


Step 88, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 89, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 90, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 91, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 92, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 93, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 94, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 95, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:43<00:00,  5.15s/it]


Step 96, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 97, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.12s/it]


Step 98, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100%|██████████| 20/20 [01:42<00:00,  5.13s/it]


Step 99, AUC Top-10: 0.006
Number of data points: 100
MAX: 0.056770659761769295


100it [2:55:57, 105.58s/it]


In [13]:
from tdc.generation import MolGen
data = MolGen(name = 'ZINC')
split = data.get_split()


Downloading...
100%|██████████| 11.8M/11.8M [00:01<00:00, 9.84MiB/s]
Loading...
Done!


In [49]:
oracle = Oracle(name=ORACLE_NAME)
new_properties = oracle(split['train'].sample(1000)['smiles'].tolist())

In [50]:
max(new_properties)

0.06354327489573058

In [48]:
set_seed(42)

In [51]:
len(split['train'])

174618