In [1]:
import os
import numpy as np
import pandas as pd
import lmdb
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import pickle
import glob

In [5]:
def get_structure(mol, n_confs, seed=-1):
    """Return a hydrogen-added copy of ``mol`` carrying its lowest-energy conformer.

    Up to ``n_confs`` conformers are embedded with RDKit, each is optimised
    with MMFF, and the one with the lowest reported energy is attached to a
    fresh copy of the molecule.

    Args:
        mol: RDKit molecule (explicit hydrogens are added here).
        n_confs: Number of conformers to request from the embedder.
        seed: Random seed forwarded to the embedder. ``-1`` (the default) or
            ``None`` leaves the embedding unseeded, which matches the
            behaviour before this parameter existed.

    Returns:
        A new RDKit molecule with explicit hydrogens and exactly one conformer.

    Raises:
        ValueError: If no conformer could be embedded.
    """
    seed = -1 if seed is None else int(seed)

    mol = Chem.AddHs(mol)
    # Independent copy that will receive only the selected conformer; drop any
    # conformers the input already carried so the result holds exactly one.
    new_mol = Chem.Mol(mol)
    new_mol.RemoveAllConformers()

    conf_ids = list(
        AllChem.EmbedMultipleConfs(
            mol,
            numConfs=n_confs,
            randomSeed=seed,
            useExpTorsionAnglePrefs=True,
            useBasicKnowledge=True,
            numThreads=0,
        )
    )
    if not conf_ids:
        raise ValueError(
            "conformer embedding failed for {}".format(Chem.MolToSmiles(mol))
        )

    # One (status, energy) pair per conformer, in conformer order.
    # NOTE(review): if MMFF cannot be set up for the molecule RDKit is expected
    # to report the same placeholder energy for every conformer, in which case
    # the first conformer wins the tie - confirm against the RDKit docs.
    results = AllChem.MMFFOptimizeMoleculeConfs(
        mol, maxIters=2000, nonBondedThresh=100.0, numThreads=0
    )
    energies = [energy for _status, energy in results]

    # Look the winner up by conformer ID rather than by list position.
    best_id, _ = min(zip(conf_ids, energies), key=lambda pair: pair[1])
    new_mol.AddConformer(mol.GetConformer(best_id))

    return new_mol


def smi2coords(smi, seed):
    """Build a pickled atoms/coordinates record for one SMILES string.

    Args:
        smi: SMILES string to convert.
        seed: Random seed forwarded to conformer generation.

    Returns:
        ``bytes``: a pickle (highest protocol) of a dict with keys ``'atoms'``
        (element symbols, hydrogens included), ``'coordinates'`` (a one-element
        list holding a float32 array of atom positions) and ``'smi'``.

    Raises:
        ValueError: If the SMILES cannot be parsed or no conformer can be
            embedded.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError("could not parse SMILES: {!r}".format(smi))

    new_mol = get_structure(mol, 10, seed=seed)
    atoms = [atom.GetSymbol() for atom in new_mol.GetAtoms()]
    coordinates = new_mol.GetConformer().GetPositions()

    assert len(atoms) == len(coordinates), "coordinates shape is not align with {}".format(smi)
    # Kept as a list of arrays so the record layout is unchanged.
    coordinate_list = [coordinates.astype(np.float32)]
    return pickle.dumps({'atoms': atoms, 'coordinates': coordinate_list, 'smi': smi}, protocol=-1)

def write_lmdb(smiles_list, job_name, seed=42, outpath='./results'):
    """Write one record per SMILES into ``<outpath>/<job_name>.lmdb``.

    Any existing file at that path is removed first. Records are stored under
    the ASCII-encoded position of the SMILES in ``smiles_list`` ("0", "1", ...)
    and hold the bytes returned by ``smi2coords``.

    Args:
        smiles_list: Iterable of SMILES strings.
        job_name: Base name of the output file (``.lmdb`` is appended).
        seed: Seed passed to ``smi2coords`` for every molecule.
        outpath: Output directory; created if it does not exist.

    Raises:
        OSError: If an existing output file cannot be removed (other than it
            simply not existing).
        Exception: Anything raised by ``smi2coords``; in that case the
            transaction is aborted and nothing is committed.
    """
    os.makedirs(outpath, exist_ok=True)
    output_name = os.path.join(outpath, '{}.lmdb'.format(job_name))

    # Start from a clean file. Only "file is not there" is ignored; other
    # failures (permissions, path is a directory) must surface, otherwise
    # stale records from an earlier run would silently remain.
    try:
        os.remove(output_name)
    except FileNotFoundError:
        pass

    env_new = lmdb.open(
        output_name,
        subdir=False,
        readonly=False,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=1,
        map_size=int(100e9),
    )
    try:
        # The transaction context manager commits on success and aborts if
        # the body raises, so a failure never leaves a half-written commit.
        with env_new.begin(write=True) as txn_write:
            # Wrapping the sequence (not the enumerate) lets tqdm show a total
            # whenever smiles_list has a length.
            for i, smiles in enumerate(tqdm(smiles_list)):
                inner_output = smi2coords(smiles, seed=seed)
                txn_write.put(f"{i}".encode("ascii"), inner_output)
    finally:
        env_new.close()

In [3]:
# --- Run configuration: adjust these values for your own data ---------------
seed = 42                          # seed handed to write_lmdb
job_name = '19_data'               # custom job name
data_path = './Uni_mol/19data'     # data directory (HTE datasets: '../HTE_pred/data/')

# Checkpoint location (alternative: '../ckp/mol_pre_no_h_220816.pt')
weight_path = '../mol_pre_no_h_220816.pt'

only_polar = 0                     # "no h" setting, per the original note
dict_name = 'dict.txt'
batch_size = 16

In [7]:
# Load the reaction table, then write one LMDB file per SMILES column.
df_19 = pd.read_csv('data/19_science/19_science_sorted.csv')
results_path = data_path  # save path; change if results should go elsewhere

smi_columns = ['Catalyst_smi', 'Imine_smi', 'Thiol_smi']
for col in smi_columns:
    # Each column gets its own database, named after the column.
    job_name = f'19_data{col}'
    # Distinct values only, so every SMILES is processed once per column.
    smi_list = df_19[col].unique()
    print(len(smi_list))

    write_lmdb(
        smiles_list=smi_list,
        job_name=job_name,
        seed=seed,
        outpath=data_path,
    )

43


26it [00:14,  1.70it/s][04:02:57] UFFTYPER: Unrecognized atom type: S_6+6 (10)
[04:02:57] UFFTYPER: Unrecognized atom type: S_6+6 (34)
43it [00:25,  1.68it/s]


5


5it [00:00, 22.79it/s]


5


5it [00:00, 49.28it/s]
