In [1]:
import scipy.stats as ss

# Compatibility shim: if this SciPy build has no `gibrat` attribute but does
# have `gilbrat`, expose the latter under the name `gibrat` as well.
# NOTE(review): presumably needed by a library imported later that looks up
# `scipy.stats.gibrat` -- confirm which dependency requires it.
try:
    ss.gibrat
except AttributeError:
    if hasattr(ss, "gilbrat"):
        setattr(ss, "gibrat", ss.gilbrat)


In [2]:
import pandas as pd
import numpy as np
import random
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
from rdkit import Chem
import sys
from deepchem.feat.smiles_tokenizer import SmilesTokenizer
from minGPT.pipeline import minGPT
# minGPT including: data_preprocessing, load_model, train, generate, evaluate

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/Users/yuezhou/opt/anaconda3/envs/env_polygen_mingpt/lib/python3.8/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


## Data preprocessing

In [3]:
# Create the pipeline object that is reused by every later cell
# (model loading, training, generation, evaluation).
pipeline = minGPT()
# Start from the pipeline's default data configuration and override only the
# input file and the block size.
data_config = pipeline.get_default_data_config()
data_config.file_path = "minGPT/htp_md.csv"  # relative to the notebook's working directory
data_config.block_size = 64  # NOTE(review): presumably the maximum token sequence length -- confirm

# Show the effective configuration before running the preprocessing step.
print(data_config)
# Produces the train/test datasets; `train_dataset` is queried later for the
# vocabulary size and block size when the model is configured.
train_dataset, test_dataset = pipeline.data_preprocessing(data_config)

input_col: mol_smiles
length: 5
block_size: 64
train_test_split: (0.8, 0.2)
task: conditional
file_path: minGPT/htp_md.csv



In [3]:
## Model initialization

In [4]:
# Model: take the default model configuration, select the 'gpt-nano' preset,
# and size the model from the training dataset so both agree.
model_config = pipeline.get_default_model_config()
model_config.model_type = 'gpt-nano'
# Vocabulary size and block size are read back from the dataset rather than
# hard-coded here.
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
# Hands the configuration to the pipeline; no return value is captured here.
pipeline.load_model(model_config)

number of parameters: 0.12M


## Training configuration

In [5]:
# Train: fetch the default training configuration and print it *before* any
# override below, so the printed values are the pipeline defaults.
train_config = pipeline.get_default_train_config()
print("--------Training configuration--------")
print(train_config)


# Show the configured device setting on its own line.
print(train_config.device)
# Overrides applied after the defaults were printed.
train_config.max_iters = 10000
train_config.ckpt_path = "./minGPT/ckpts/"
# Checkpoint to load as a pre-trained starting point. This line is active;
# comment it out to keep the default (the printed defaults show `pretrain: None`).
# NOTE(review): this path is under "./ckpts/" while `ckpt_path` above and the
# generation cells use "./minGPT/ckpts/" -- confirm which location is intended.
train_config.pretrain = "./ckpts/10000.pt"

# Progress-logging hook that is registered on the training configuration.
def batch_end_callback(trainer):
    """Print timing and loss figures on every 100th training iteration.

    Args:
        trainer: Object exposing ``iter_num``, ``iter_dt``, ``loss`` and
            ``loss_val``; both loss attributes must provide ``.item()``.

    Returns:
        None. The only effect is a line written to standard output.
    """
    # Guard clause: stay silent unless the iteration count is a multiple of 100.
    if trainer.iter_num % 100 != 0:
        return
    # iter_dt is scaled by 1000 and labelled "ms" in the message.
    report = f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}, val loss {trainer.loss_val.item():.5f}"
    print(report)

# Register the logging hook on the training configuration.
# NOTE(review): presumably invoked by the trainer after each batch -- confirm in the pipeline.
train_config.call_back = batch_end_callback
# Training is disabled in this notebook run; uncomment the following line to start training.
# loss = pipeline.train(train_config)

--------Training configuration--------
device: auto
num_workers: 0
max_iters: None
batch_size: 64
learning_rate: 0.0005
betas: (0.9, 0.95)
weight_decay: 0.1
grad_norm_clip: 1.0
model: None
call_back: None
pretrain: None

auto


## Generating with model

In [None]:
# generate_config = pipeline.get_default_generate_config()
# generate_config.ckpt_path = "./minGPT/ckpts/10000.pt"
# assert generate_config.task == data_config.task
# print(generate_config)

# results = pipeline.generate(generate_config)

ckpts_path: None
num_samples: 100
temperature: 1.0
task: conditional
ckpt_path: ./minGPT/ckpts/10000.pt



## Evaluate model
Calculate the scores for
**uniqueness, novelty, validity, synthesizability, similarity, diversity**, respectively

In [None]:
# print(pipeline.evaluate())

(0.95, 0.5800000000000001, 0.89, 0.9565217391304348, 0.25568710931368277, 0.704277235149268)


1. Uniqueness:
Measures how many of the generated molecules are not duplicates of each other.

    Uniqueness = 1 - (number of duplicated molecules / total number of generated molecules)

    A higher value (closer to 1) means the model is generating more diverse, non-repetitive molecules.

2. Novelty: 
Measures how many of the generated molecules are not seen in the training set.

    For each generated SMILES, check if it also appears in the training set (check_novelty).

    1) If it appears in the training set -> label as “In the original data set”

    2) Otherwise -> label as “novel”

    Novelty = 1 - (number of molecules found in the training set / total number of generated molecules)

3. Validity:
Measures how many generated SMILES correspond to chemically valid and structurally correct molecules.

    Use RDKit (validate_mol) to check:

    1) Whether the SMILES can be parsed by Chem.MolFromSmiles().

    2) Whether the terminal atoms (Cu, Au) have exactly one bond.

    3) Whether the molecule contains exactly one [Cu] and one [Au] atom (two endpoints). Use has_two_ends() to explicitly label molecules with both endpoints.

    Count how many satisfy both conditions:

    1) validity == 'ok'

    2) has_two_ends == True

    Validity = number of valid molecules with two ends / total number of generated molecules

4. Synthesizability:
Estimates how feasible it would be to chemically synthesize the generated molecules.

    Compute the Synthetic Accessibility (SA) score for each valid molecule using calculateScore(mol), which estimates synthesis difficulty based on molecular rarity.

    The SA score ranges from 1 (easy to synthesize) to 10 (hard to synthesize). Define “synthesizable” molecules as those with SA < 5.

5. Similarity:
Measures how similar the generated molecules are to those in the training set.

    Compute Morgan fingerprints (radius = 2, 2048 bits) for both training and generated molecules.

    For each generated molecule: Compute Tanimoto similarity with all molecules in the training set. Take the average of these similarity scores.

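    Finally, take the overall mean of all generated molecules’ similarity values:

    Similarity = mean over all generated molecules of (average Tanimoto similarity to the training-set molecules)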

6. Diversity:
Quantifies how structurally different the generated molecules are from each other.

    Compute pairwise Tanimoto similarity among all generated molecules.

    Higher diversity means the generated set spans a wider range of chemical space.

    Diversity = 1 − average pairwise Tanimoto similarity among generated molecules

In [None]:
# results[:10]

['O=C([Au])OCCN(CCOC(=O)O)CO[Cu]',
 'CN(CCO[Cu])CCNCCOC(=O)[Au]',
 'O=C([Au])OCCCNCCCCO[Cu]',
 'CC(CNCCN(C)CCO[Cu])OC(=O)[Au]',
 'O=C([Au])NCCNCCNCCN[Cu]',
 'N#CCN(CCC#O)CC(OC(=O)[Au])O[Cu]',
 'CN(CC(=O)N(C)CCO[Cu])CCOC(=O)[Au]',
 'CCN(CC)CC(C(CO[Cu])OC(=O)[Au])OC',
 'CN(C)CCCN(CCN[Cu])C(=O)CNC(=O)[Au]',
 'CCCN(CCO[Cu])CCOC(=O)[Au]']

In [6]:
# Generate samples conditioned on the higher target property value.
generate_config = pipeline.get_default_generate_config()
# NOTE(review): an earlier printed generate config lists both `ckpts_path`
# (None) and `ckpt_path`; confirm which attribute pipeline.generate reads.
generate_config.ckpt_path = "./minGPT/ckpts/10000.pt"

# NOTE(review): the scale/units of the target value are not shown in this notebook.
generate_config.target_property_value = 9   # high-conductivity target
generate_config.num_samples = 100

# NOTE(review): no random seed is set in this notebook, so the generated
# samples may differ between runs -- confirm whether reproducibility is needed.
high_results = pipeline.generate(generate_config)
# Display only the first ten generated entries.
high_results[:10]


['COC(=O)CN(CCNC(=O)[Au])CCO[Cu]',
 'C=CCN(CCCO[Cu])CCOC(=O)[Au]',
 'O=C([Au])OCCCCNCCCCO[Cu]',
 'COCC(CNCCCOC(=O)[Au])O[Cu]',
 'O=C(COCCO[Cu])NCCOC(=O)[Au]',
 'O=C(CCN[Cu])NCCOCCOC(=O)[Au]',
 'C=CCCCCN(C)CC(CO[Cu])OC(=O)[Au]',
 'O=C([Au])OCCNCCO[Cu]',
 'CC(C)C(COC)NCC(CNC(=O)[Au])O[Cu]',
 'CC(CCO[Cu])N(C)CCCCCOC(=O)[Au]']

In [7]:
# Generate samples conditioned on the lower target property value, using a
# separate config object but the same checkpoint file and sample count.
generate_config_low = pipeline.get_default_generate_config()
# NOTE(review): an earlier printed generate config lists both `ckpts_path`
# (None) and `ckpt_path`; confirm which attribute pipeline.generate reads.
generate_config_low.ckpt_path = "./minGPT/ckpts/10000.pt"

# NOTE(review): the scale/units of the target value are not shown in this notebook.
generate_config_low.target_property_value = 8   # low-conductivity target
generate_config_low.num_samples = 100

# NOTE(review): no random seed is set in this notebook, so the generated
# samples may differ between runs -- confirm whether reproducibility is needed.
low_results = pipeline.generate(generate_config_low)
# Display only the first ten generated entries.
low_results[:10]


['COC(CO[Cu])N(C)CC(C)O[Cu])OC(=O)[Au]',
 'NC(CN[Cu])NC(=O)[Au]',
 'C#CC(C)(COC(=O)[Au])C(C)C(C)O[Cu]',
 'CC(F)CC(N[Cu])C(=O)NCCCOC(=O)[Au]',
 'CNC(=O)C(CCO[Cu])NC(=O)[Au]',
 'CC(CNC(=O)[Au])N(C)CO[Cu]',
 'C#CCN(CCCOC(=O)[Au])CCC(C)O[Cu]',
 'CC(NC(=O)[Au])C(=O)NCCNCCO[Cu]',
 'NC(=O)C(NC(=O)[Au])NCC(=O)O)O[Cu]',
 'CCCC(C)(O[Cu])C(C)COC(=O)[Au]']

In [8]:
import json


def _write_json(path, payload):
    """Serialize ``payload`` as JSON into the file at ``path``, overwriting it."""
    with open(path, "w") as handle:
        json.dump(payload, handle)


# Persist the high-conductivity samples first, then the low-conductivity ones.
_write_json("generated_high.json", high_results)
_write_json("generated_low.json", low_results)

print("Saved!")


Saved!
