This notebook generates a dataset based on the CrossDock 100k subset.
- The goal is to use the full protein from the original CrossDock dataset instead of the version with the 12 Å cutoff.
- Pockets that are not from the PDBBind set are removed.

You need to have both the CrossDock subset and the full CrossDock set downloaded to run this notebook.

In [1]:
import os
from glob import glob
import torch
from tqdm import tqdm
from collections import defaultdict 
import sys

sys.path.append('../')

from src.tacogfn.eval import docking
from src.tacogfn.utils import molecules

  from .autonotebook import tqdm as notebook_tqdm


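There are 24478 receptors. (Note: the most recent run of the cell below found 44646 `*_rec.pdb` files, so this count appears to be out of date.)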

In [2]:
# Gather the path of every receptor structure (file names ending in
# `_rec.pdb`) in the CrossDock directory, then display how many were found.
receptor_pattern = '../dataset/crossdock/*_rec.pdb'
pocket_paths = glob(receptor_pattern)
len(pocket_paths)

44646

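There are 21231 ligands. (Note: the most recent run of the cell below found 22288 `*_lig.pdb` files, so this count appears to be out of date.)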

In [3]:
# Gather the path of every ligand file (file names ending in `_lig.pdb`)
# in the CrossDock directory, then display how many were found.
ligand_pattern = '../dataset/crossdock/*_lig.pdb'
ligand_paths = glob(ligand_pattern)
len(ligand_paths)

22288

In [4]:
# Derive one identifier per receptor path: drop everything up to the last
# '/', then keep the part of the file name before the first '_rec'.
pocket_ids = [
    path.rpartition('/')[2].partition('_rec')[0]
    for path in pocket_paths
]

In [5]:
# Lookup table from pocket id to the receptor file it was derived from.
# If two paths yield the same id, the later path wins.
pocket_ids_to_paths = {
    pocket_id: path for pocket_id, path in zip(pocket_ids, pocket_paths)
}

We now match ligands to pockets based on filenames and resolve based on docking scores

In [6]:
# Pocket2Mol pdb ids
# NOTE(review): torch.load unpickles the file, so only load a split file
# obtained from a trusted source.
split_by_name = torch.load('../dataset/split_by_name.pt')


def _pocket_id(rec_path):
    """Return the file name of `rec_path` up to (excluding) the first '_rec'."""
    return rec_path.rpartition('/')[2].partition('_rec')[0]


# Element 0 of each split entry is the receptor path the id is taken from.
train_pdb_ids = [_pocket_id(entry[0]) for entry in split_by_name['train']]
# Despite the `val_` prefix, these ids come from the 'test' split.
val_pdb_ids = [_pocket_id(entry[0]) for entry in split_by_name['test']]

# All entries of both splits, and the set of distinct pocket ids they cover.
pocket2mol_split = split_by_name['train'] + split_by_name['test']
pocket2mol_pdb_ids = set(train_pdb_ids).union(val_pdb_ids)

In [7]:
# Distinct pocket ids found among the CrossDock receptor files.
cross_dock_pocket_ids = {pocket_id for pocket_id in pocket_ids}

In [66]:
# Map each pocket id to the ligand path paired with it in the Pocket2Mol
# split. The id is the receptor file name (element 0 of the pair) up to the
# first '_rec'; the ligand path is element 1. When an id occurs in several
# pairs, the last pair wins.
pocket_to_ligands = {
    entry[0].rpartition('/')[2].partition('_rec')[0]: entry[1]
    for entry in pocket2mol_split
}

In [67]:
# Persist the pocket -> ligand mapping so later sections can reload it
# instead of rebuilding it from the split file.
pocket_to_ligands_file = '../dataset/pocket_to_ligands.pt'
torch.save(pocket_to_ligands, pocket_to_ligands_file)

Compute pocket centroids

In [7]:
# Reload the saved pocket -> ligand mapping from disk.
pocket_to_ligands_file = '../dataset/pocket_to_ligands.pt'
pocket_to_ligands = torch.load(pocket_to_ligands_file)

In [23]:
# For every pocket, resolve its ligand path against the crossdocked_pocket10
# directory and store whatever molecules.get_centroid_from_sdf returns for
# that file (presumably the ligand's centroid -- confirm against the helper).
ligand_root = '../dataset/crossdocked_pocket10/'

pocket_to_centroid = {}
for pocket_id, rel_lig_path in pocket_to_ligands.items():
    sdf_path = os.path.join(ligand_root, rel_lig_path)
    pocket_to_centroid[pocket_id] = molecules.get_centroid_from_sdf(sdf_path)

In [25]:
# Persist the pocket -> centroid mapping for reuse in later sections.
pocket_to_centroid_file = '../dataset/pocket_to_centroid.pt'
torch.save(pocket_to_centroid, pocket_to_centroid_file)

In [1]:
import torch

# Reload the saved pocket -> centroid mapping (note the different variable
# name, `pocket2centroid`, used in this section).
pocket_to_centroid_file = '../dataset/pocket_to_centroid.pt'
pocket2centroid = torch.load(pocket_to_centroid_file)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Spot-check a single entry of the reloaded mapping.
example_pocket_id = '1a0g_A'
pocket2centroid[example_pocket_id]

(46.82177272727272, 19.169818181818183, 13.434545454545455)

Compute docking scores between each pocket and its native ligand

In [2]:
# Reload the pocket -> ligand mapping that drives the docking steps below.
pocket_to_ligands_file = '../dataset/pocket_to_ligands.pt'
pocket_to_ligands = torch.load(pocket_to_ligands_file)

In [6]:
# Convert every receptor PDB into a PDBQT file in a sibling directory.
#
# Everything this cell needs is imported here so that it runs on a freshly
# restarted kernel without relying on imports from an earlier session.
import os

from tqdm import tqdm

from src.tacogfn.eval import docking

receptor_pdb_dir = '../dataset/crossdock/'
receptor_pdbqt_dir = '../dataset/crossdock_pdbqt/'

# Make sure the output directory exists before anything is written into it.
os.makedirs(receptor_pdbqt_dir, exist_ok=True)

# Pocket ids whose PDBQT file is still missing after the conversion attempt.
failed_receptor_keys = []

for key in tqdm(pocket_to_ligands):
    rec_path = os.path.join(receptor_pdb_dir, key + '_rec.pdb')
    # Build the output path explicitly (same layout the docking step reads)
    # instead of string-replacing fragments of the input path.
    target_path = os.path.join(receptor_pdbqt_dir, key + '_rec.pdbqt')

    # Skip receptors converted by a previous run.
    if not os.path.exists(target_path):
        docking._prepare_receptor(
            rec_path,
            target_path
        )

    # The preparation tool can fail without stopping the loop (see the
    # tracebacks in this cell's output), so verify that the file was written.
    if not os.path.exists(target_path):
        failed_receptor_keys.append(key)

print(f'{len(failed_receptor_keys)} of {len(pocket_to_ligands)} receptors could not be converted')

  0%|          | 0/15307 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py", line 216, in <module>
    dict=dictionary)    
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 558, in __init__
    version=version, delete_single_nonstd_residues=delete_single_nonstd_residues)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 141, in __init__
    self.addCharges(mol, charges_to_add)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 227, in addCharges
    chargeCalculator.addCharges(mol.allAtoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/MolKit/chargeCalculator.py", line 80, in addCharges
    babel.assignHybridization(atoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 136, in assignHybridization
    self.valence_three()
  File "/home/tsa87/ADFRsuite-1.0/CCSBpc


adding gasteiger charges to peptide
4b3b_A_rec:A:LYS317:HB2 and 4b3b_A_rec:A:LYS317:HB3 have the same coordinates


Traceback (most recent call last):
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py", line 216, in <module>
    dict=dictionary)    
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 558, in __init__
    version=version, delete_single_nonstd_residues=delete_single_nonstd_residues)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 141, in __init__
    self.addCharges(mol, charges_to_add)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 227, in addCharges
    chargeCalculator.addCharges(mol.allAtoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/MolKit/chargeCalculator.py", line 80, in addCharges
    babel.assignHybridization(atoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 137, in assignHybridization
    self.valence_two()
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 266, in val

2dua_A_rec:A:CL 549:CL and 2dua_A_rec:A:CL 549:CL have the same coordinates



Traceback (most recent call last):
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py", line 216, in <module>
    dict=dictionary)    
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 558, in __init__
    version=version, delete_single_nonstd_residues=delete_single_nonstd_residues)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 141, in __init__
    self.addCharges(mol, charges_to_add)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 227, in addCharges
    chargeCalculator.addCharges(mol.allAtoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/MolKit/chargeCalculator.py", line 80, in addCharges
    babel.assignHybridization(atoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 137, in assignHybridization
    self.valence_two()
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 266, in val

4gg1_A_rec:A:NA 604:NA and 4gg1_A_rec:A:NA 604:NA have the same coordinates


Traceback (most recent call last):
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py", line 216, in <module>
    dict=dictionary)    
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 558, in __init__
    version=version, delete_single_nonstd_residues=delete_single_nonstd_residues)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 141, in __init__
    self.addCharges(mol, charges_to_add)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 227, in addCharges
    chargeCalculator.addCharges(mol.allAtoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/MolKit/chargeCalculator.py", line 80, in addCharges
    babel.assignHybridization(atoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 136, in assignHybridization
    self.valence_three()
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 231, in v




Traceback (most recent call last):
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py", line 216, in <module>
    dict=dictionary)    
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 558, in __init__
    version=version, delete_single_nonstd_residues=delete_single_nonstd_residues)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 141, in __init__
    self.addCharges(mol, charges_to_add)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 227, in addCharges
    chargeCalculator.addCharges(mol.allAtoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/MolKit/chargeCalculator.py", line 80, in addCharges
    babel.assignHybridization(atoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 136, in assignHybridization
    self.valence_three()
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 231, in v




Traceback (most recent call last):
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py", line 216, in <module>
    dict=dictionary)    
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 558, in __init__
    version=version, delete_single_nonstd_residues=delete_single_nonstd_residues)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 141, in __init__
    self.addCharges(mol, charges_to_add)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 227, in addCharges
    chargeCalculator.addCharges(mol.allAtoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/MolKit/chargeCalculator.py", line 80, in addCharges
    babel.assignHybridization(atoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 137, in assignHybridization
    self.valence_two()
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 266, in val

4jd5_A_rec:A:NA 604:NA and 4jd5_A_rec:A:NA 604:NA have the same coordinates



Traceback (most recent call last):
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/Utilities24/prepare_receptor4.py", line 216, in <module>
    dict=dictionary)    
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 558, in __init__
    version=version, delete_single_nonstd_residues=delete_single_nonstd_residues)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 141, in __init__
    self.addCharges(mol, charges_to_add)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/AutoDockTools/MoleculePreparation.py", line 227, in addCharges
    chargeCalculator.addCharges(mol.allAtoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/MolKit/chargeCalculator.py", line 80, in addCharges
    babel.assignHybridization(atoms)
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 136, in assignHybridization
    self.valence_three()
  File "/home/tsa87/ADFRsuite-1.0/CCSBpckgs/PyBabel/atomTypes.py", line 231, in v

In [6]:
# Reload the saved pocket -> centroid mapping from disk.
pocket_to_centroid_file = '../dataset/pocket_to_centroid.pt'
pocket_to_centroid = torch.load(pocket_to_centroid_file)

In [9]:
# Score each pocket's ligand against the pocket's prepared receptor (PDBQT)
# and collect the results keyed by pocket id.
# NOTE(review): some receptors and ligands fail to prepare (see the captured
# output of this notebook). What compute_docking_score_from_sdf returns for
# those inputs is not visible here -- check the stored scores before use.
receptor_pdbqt_dir = '../dataset/crossdock_pdbqt/'
ligand_root = '../dataset/crossdocked_pocket10/'

pocket_to_score = {}
for key, rel_lig_path in tqdm(pocket_to_ligands.items()):
    pocket_to_score[key] = docking.compute_docking_score_from_sdf(
        pdb_path=os.path.join(receptor_pdbqt_dir, key + '_rec.pdbqt'),
        sdf_path=os.path.join(ligand_root, rel_lig_path),
        local_search=True,
    )

 14%|█▎        | 2069/15307 [18:56<2:11:02,  1.68it/s][22:38:46] ERROR: CTAB version string invalid at line 4
[22:38:46] ERROR: moving to the beginning of the next molecule
Traceback (most recent call last):
  File "/home/tsa87/anaconda3/envs/tacogfn/bin/mk_prepare_ligand.py", line 286, in <module>
    mol_supplier = parsers[ext](input_molecule_filename, removeHs=False) # input must have explicit H
OSError: File error: Invalid input file /tmp/tmp1h9d5cuc/ligand.sdf
 23%|██▎       | 3584/15307 [32:36<1:43:07,  1.89it/s]atom number 0 has non finite charge, mol name: __4b1g_A_rec_4na0_ar6_lig_min.pdb, charge: nan
atom number 1 has non finite charge, mol name: __4b1g_A_rec_4na0_ar6_lig_min.pdb, charge: nan
atom number 2 has non finite charge, mol name: __4b1g_A_rec_4na0_ar6_lig_min.pdb, charge: nan
atom number 3 has non finite charge, mol name: __4b1g_A_rec_4na0_ar6_lig_min.pdb, charge: nan
atom number 4 has non finite charge, mol name: __4b1g_A_rec_4na0_ar6_lig_min.pdb, charge: nan
atom n

In [10]:
# Persist the pocket -> docking score mapping.
pocket_to_score_file = '../dataset/pocket_to_score.pt'
torch.save(pocket_to_score, pocket_to_score_file)