In [1]:
import os, sys
import glob, subprocess, pickle
import pandas as pd
from tqdm import tqdm

# Protein and ligand structure sources

In [2]:
# 1. Obtain and process co-crystal structures 
# In this work, we used three proteins as examples: CDK2, FXA and HSP90.
# We used the Jupyter notebooks accompanying the work of Ricci-Lopez, Joel, et al.
# "Improving structure-based virtual screening with ensemble docking and machine learning."
# Journal of Chemical Information and Modeling 61.11 (2021): 5362-5376.
# These notebooks can be found at https://github.com/jRicciL/ML-ensemble-docking
# For example, for CDK2, the notebooks in cdk2/1_Download_and_prepare_protein_ensembles/
# were used. We then copied pdb_structures/pdb_prepared/ and pdb_structures/pocket_ligand/
# for further processing.

# 2. Obtain virtual screening ligands
# For fair method comparison, we used the datasets documented in
# 2_Molecular_libraries/datasets/ from Ricci-Lopez, Joel, et al.
# Specifically, the original DUDE and Dekois2 datasets contain
# 3D structures of the ligands, while the CSAR dataset of CDK2 only
# contains SMILES. We therefore generated 3D structures for the CSAR
# ligands using Open Babel.

In [3]:
# Data overview: report how many input files are available for each target.
for pro in ('CDK2', 'FXA', 'HSP90'):
    # glob.glob already returns a list, so the results are used directly.
    l_pro = glob.glob(f'{pro}/pdb_raw/*.pdb')
    l_lig = glob.glob(f'{pro}/cocry_dataset/*.pdb')
    l_dude = glob.glob(f'{pro}/dude_dataset/*.sdf')
    l_dekois = glob.glob(f'{pro}/dekois2_dataset/*.sdf')

    print(pro)
    print(f'    The number of protein is {len(l_pro)}')
    print(f'    The number of cocrystal ligand is {len(l_lig)}')
    print(f'    The number of dude ligand is {len(l_dude)}')
    print(f'    The number of dekois ligand is {len(l_dekois)}')

    # The CSAR set is only looked up (and reported) for CDK2.
    if pro != 'CDK2':
        continue
    l_csar = glob.glob(f'{pro}/csar_dataset/*.sdf')
    print(f'    The number of csar ligand is {len(l_csar)}')

CDK2
    The number of protein is 10
    The number of cocrystal ligand is 10
    The number of dude ligand is 10
    The number of dekois ligand is 10
    The number of csar ligand is 10
FXA
    The number of protein is 10
    The number of cocrystal ligand is 10
    The number of dude ligand is 10
    The number of dekois ligand is 10
HSP90
    The number of protein is 10
    The number of cocrystal ligand is 10
    The number of dude ligand is 10
    The number of dekois ligand is 10


# Protein structure preparation

In [4]:
from process_pdb import process_pdb
import pymol
from pymol import cmd
from pymol_center_of_mass import get_com

In [5]:
# Replace the unnatural amino acids with standard ones and save the
# structures as new PDB files. According to the original notes, process_pdb
# (project helper) also extracts the secondary-structure fragments of the
# proteins -- NOTE(review): confirm against process_pdb.py.

for pdb in ['CDK2', 'FXA', 'HSP90']:
    # os.makedirs(exist_ok=True) keeps this cell re-runnable and portable;
    # shelling out to `mkdir` errors once the folder exists.
    os.makedirs(f'{pdb}/pdb_processed/', exist_ok=True)
    l_pro = sorted(glob.glob(f'{pdb}/pdb_raw/*.pdb'))
    for path in l_pro:
        # Identifier = file name up to the first underscore.
        PDB = os.path.basename(path).split('_')[0]
        process_pdb(path, PDB, f'{pdb}/pdb_processed/')

# We then calculate the center of mass based on the aligned ligands,
# using the PyMOL script from http://www.pymolwiki.org/index.php/center_of_mass
#
# All co-crystal ligands of a target are loaded together and a single center
# of mass is computed over them. Previously center.txt was rewritten once per
# ligand, so it only ever held the center of whichever ligand glob happened
# to return last.
# NOTE(review): assumes the co-crystal ligands share one aligned coordinate
# frame and that get_com('all') covers every loaded object -- confirm.

for pdb in ['CDK2', 'FXA', 'HSP90']:
    l_lig = sorted(glob.glob(f'{pdb}/cocry_dataset/*.pdb'))
    if not l_lig:
        # No ligands: nothing to compute, leave any existing center.txt alone.
        continue
    # Start from an empty PyMOL session so leftover objects cannot bias 'all'.
    cmd.delete('all')
    try:
        for lig_path in l_lig:
            cmd.load(lig_path, lig_path)
        com = get_com('all')
    finally:
        # Always clear the session, even if loading or get_com fails.
        cmd.delete('all')
    with open(f'{pdb}/center.txt', 'w') as center_file:
        center_file.write(f'X,{com[0]}\nY,{com[1]}\nZ,{com[2]}')

In [6]:
%%bash
# Copy the reference protein and its co-crystal ligand into each target folder.
# Entries are target:pdb_id:ligand_id (1fin for CDK2, 1ezq for FXA, 1uyg for HSP90).
for entry in CDK2:1fin:ATP FXA:1ezq:RPR HSP90:1uyg:PU2; do
    IFS=: read -r target pdb_id lig_id <<< "$entry"
    cp "${target}/pdb_processed/${pdb_id}.pdb" "${target}/"
    cp "${target}/cocry_dataset/${pdb_id}_${lig_id}_LIG.pdb" "${target}/"
done

In [7]:
# Reference PDB entry chosen for each target; used below to name the docked poses.
ref_lig_map = dict(
    CDK2='1fin',
    FXA='1ezq',
    HSP90='1uyg',
)

## Ligand preparation

In [8]:
from process_ligand import prelig4swiss, batch_rotatable_bonds

In [9]:
# Dock the ligands from the different datasets to the reference protein of
# each target. This cell only WRITES one smina command per ligand to
# {pdb}/docking.txt; it does not run them. We used smina for docking and
# recommend running the generated commands in parallel from a shell.

for pdb in ['CDK2', 'FXA', 'HSP90']:
    ligs = sorted(glob.glob(f'{pdb}/dekois2_dataset/*.sdf')
                  + glob.glob(f'{pdb}/dude_dataset/*.sdf')
                  + glob.glob(f'{pdb}/csar_dataset/*.sdf')
                  + glob.glob(f'{pdb}/cocry_dataset/*.pdb'))
    # The context manager closes the command file even if an error occurs.
    with open(f'{pdb}/docking.txt', 'w') as dock_txt:
        # Idempotent and portable replacement for shelling out to `mkdir`.
        os.makedirs(f'{pdb}/ligand_docked/', exist_ok=True)
        if ligs:
            # The reference structures do not depend on the ligand, so they
            # are looked up once per target instead of once per ligand.
            # The reference ligand is the top-level PDB whose path contains
            # 'LIG'; the reference protein is the one whose path does not.
            top_level_pdbs = sorted(glob.glob(f'{pdb}/*.pdb'))
            ref_lig = [x for x in top_level_pdbs if 'LIG' in x][0]
            ref_pro = [x for x in top_level_pdbs if 'LIG' not in x][0]
        for lig in ligs:
            # Output name: <dataset prefix>_<ligand file name>_docking.sdf,
            # e.g. dude_dataset/x.sdf -> dude_x_docking.sdf
            dataset = lig.split('/')[-2].split('_')[0]
            file_name = lig.split('/')[-1].replace('.sdf', '_docking.sdf')\
                                          .replace('.pdb', '_docking.sdf')
            out = dataset + '_' + file_name
            # The continuation lines are part of the string literal; their
            # leading spaces end up in the command and are harmless to a shell.
            s = f"smina -r {ref_pro} -l {lig} -o {pdb}/ligand_docked/{out}\
              --autobox_ligand={ref_lig} \
              --scoring=vinardo --factor=100 --num_modes=3\
              --exhaustiveness=16\n"
            dock_txt.write(s)

In [10]:
# Split the docked ligands: write one obabel command per docked file to
# {pdb}/spliting.txt (the commands are not executed here).
# If the ligand comes from the co-crystal set, we keep the top three
# conformations (one output file each, via -m); otherwise we keep only the
# top conformation.

for pdb in ['CDK2', 'FXA', 'HSP90']:
    docked_files = sorted(glob.glob(f'{pdb}/ligand_docked/*_docking.sdf'))
    # The context manager guarantees the command file is closed even if, for
    # example, ref_lig_map has no entry for this target.
    with open(f'{pdb}/spliting.txt', 'w') as f:
        for i in docked_files:
            if 'cocry' in i:
                # obabel -m appends the conformer index after the trailing '_'
                j = i.replace('docking.sdf', f'dock_{ref_lig_map[pdb]}_.sdf')
                s = f'obabel -isdf {i} -l 3 -O {j} -m\n'
            else:
                j = i.replace('docking.sdf', f'dock_{ref_lig_map[pdb]}_1.sdf')
                s = f'obabel -isdf {i} -l 1 -O {j}\n'
            f.write(s)

In [11]:
# Convert every split docked pose into a processed mol2 file.
# According to the original notes, prelig4swiss (project helper) adds
# hydrogens with PyMOL, saves the ligand as mol2 and adds atom numbers --
# NOTE(review): confirm against process_ligand.py.

for pdb in ['CDK2', 'FXA', 'HSP90']:
    l = sorted(glob.glob(f'{pdb}/ligand_docked/*_dock_{ref_lig_map[pdb]}_*.sdf'))
    # Idempotent and portable replacement for shelling out to `mkdir`.
    os.makedirs(f'{pdb}/ligand_processed/', exist_ok=True)
    # Iterating over tqdm closes the bar when the loop finishes, unlike a
    # manually updated bar that is never closed.
    for lig in tqdm(l, leave=True, position=0):
        # Same file name, but in ligand_processed/ and with a .mol2 extension.
        out = lig.replace('ligand_docked', 'ligand_processed').replace('.sdf', '.mol2')
        prelig4swiss(lig, out)

100%|██████████| 60/60 [00:19<00:00, 76.08it/s] 

In [12]:
# Prepare the rotatable-bond information for the processed ligands of each target.
for pdb in ('CDK2', 'FXA', 'HSP90'):
    processed_dir = f'{pdb}/ligand_processed/'
    batch_rotatable_bonds(processed_dir)