In [None]:
import os
os.makedirs("data", exist_ok=True)
!wget "https://raw.githubusercontent.com/FNTwin/summerschool_lab2/main/data/6vhn.pdb" -O "data/6vhn.pdb"
!wget "https://raw.githubusercontent.com/FNTwin/summerschool_lab2/main/data/6vhn_prepared.pdb" -O "data/6vhn_prepared.pdb"
!wget "https://raw.githubusercontent.com/FNTwin/summerschool_lab2/main/data/Enamine_Hinge_Binders_Library_plated_24000cmds_20210316%20(1).sdf" -O "data/Enamine_Hinge_Binders_Library_plated_24000cmds_20210316%20(1).sdf"
!wget "https://raw.githubusercontent.com/FNTwin/summerschool_lab2/main/__init__.py" "__init__.py"
!wget "https://raw.githubusercontent.com/FNTwin/summerschool_lab2/main/workshop_2_utils.py" "workshop_2_utils.py"
!wget "https://raw.githubusercontent.com/FNTwin/summerschool_lab2/main/env.yml" "env.yml"

In [None]:
# Install condacolab and let it set up conda/mamba inside this runtime.
# NOTE(review): condacolab.install() is expected to restart the kernel, so
# this cell should stay on its own - confirm against the condacolab docs.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Re-import after the install step and verify that the conda setup is usable.
import condacolab
condacolab.check()

In [None]:
# Install the dependencies listed in the downloaded env.yml into the base environment.
!mamba env update -n base -f env.yml

In [None]:
# py3Dmol is installed with pip; it is used by the 3D viewer cells below.
!pip install py3Dmol

In [None]:
# Colab-only: allow third-party Jupyter widgets to render.
# NOTE(review): presumably required for the nglview import below - confirm.
from google.colab import output
output.enable_custom_widget_manager()


In [None]:
import nglview as ng
import mdtraj
import numpy as np
import os
from workshop_2_utils import *


# Create the working directories up front; an already existing directory is fine.
for _input_dir in ("sdf_inputs", "smina_inputs"):
    os.makedirs(_input_dir, exist_ok=True)



In [None]:
# Load the downloaded 6vhn structure with mdtraj.
traj = mdtraj.load("data/6vhn.pdb")

def get_protein_ligand_idxs(traj ,resname=None):
    """Return atom indices of the protein and of the ligand.

    Args:
        traj: object exposing ``top.select(query)`` (an mdtraj trajectory
            in this notebook).
        resname: selection query used for the ligand. Despite its name it is
            passed verbatim to ``top.select``; any falsy value falls back to
            ``"not protein"``.

    Returns:
        tuple: ``(protein_selection, ligand_selection)`` exactly as returned
        by ``top.select``.
    """
    protein_idxs = traj.top.select("protein")
    ligand_query = resname or "not protein"
    ligand_idxs = traj.top.select(ligand_query)
    return protein_idxs, ligand_idxs

def save_trimmed_pdb(path, traj, idxs):
    """Write only the atoms listed in ``idxs`` to a PDB file.

    Args:
        path: destination file handed to ``save_pdb``.
        traj: object exposing ``atom_slice``.
        idxs: atom indices to keep.
    """
    trimmed = traj.atom_slice(idxs)
    trimmed.save_pdb(path)


# Everything that is neither protein nor water is treated as the ligand.
receptor, ligand = get_protein_ligand_idxs(traj, "not protein and not water")

# Only the ligand atoms are written out; the receptor export is left disabled.
save_trimmed_pdb("data/ligand.pdb", traj,ligand)
#save_trimmed_pdb("data/receptor.pdb", traj,receptor)



In [None]:
import py3Dmol
# Start from an empty py3Dmol viewer.
view=py3Dmol.view()
# Add the prepared receptor and the raw 6vhn entry as two separate models.
# The context managers close the file handles as soon as the text is read.
with open('data/6vhn_prepared.pdb', 'r') as pdb_file:
    view.addModel(pdb_file.read(),'pdb')
with open('data/6vhn.pdb', 'r') as pdb_file:
    view.addModel(pdb_file.read(),'pdb')
# Zoom so that all loaded structures are in view.
view.zoomTo()
# White background.
view.setBackgroundColor('white')
# {'model': -1} selects the most recently added model, i.e. data/6vhn.pdb.
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
# Render the viewer in the notebook.
view.show()

In [None]:
def read_pdb_with_ob(file):
    """Load every molecule stored in a PDB file through Open Babel.

    Args:
        file (Union[str, os.PathLike]): path of the PDB file to read.

    Returns:
        list: the pybel molecules found in ``file``, in file order.

    Raises:
        ImportError: if the ``openbabel`` package cannot be imported.
    """
    try:
        from openbabel import pybel
    except ImportError:
        raise ImportError("Pybel is required for reading openbabel molecules")

    # pybel.readfile yields molecules lazily; materialise them into a list.
    return list(pybel.readfile(format="pdb", filename=file))

def prepare_ob_mols(ligand, outpath, overwrite=False):
    """Protonate and charge a pybel molecule, then write it as PDBQT.

    The molecule is modified in place: hydrogens are added, 3D coordinates
    are generated when it has none, and Gasteiger charges are computed.

    Args:
        ligand: pybel molecule to prepare.
        outpath: destination PDBQT file.
        overwrite (bool): forwarded to ``pybel.Outputfile``.
    """
    from openbabel import pybel
    out = pybel.Outputfile(format="pdbqt" , filename=outpath,  overwrite=overwrite)
    try:
        ligand.addh()
        # Only build coordinates when the input carries none.
        if not ligand.OBMol.HasNonZeroCoords():
            ligand.make3D()
        ligand.calccharges(model="gasteiger")
        out.write(ligand)
    finally:
        # Close the output even if preparation or writing fails.
        out.close()

# Read the extracted ligand back with Open Babel; the result is a list of molecules.
ligand_mol= read_pdb_with_ob("data/ligand.pdb")


In [None]:
# Display the first molecule read from data/ligand.pdb.
ligand_mol[0]

In [None]:
# Prepare the first ligand molecule and write it as PDBQT, replacing any previous file.
prepare_ob_mols(ligand_mol[0], "smina_inputs/ligand.pdbqt", overwrite=True)

In [None]:
# Preprocessor comes from workshop_2_utils; presumably it converts the
# prepared receptor PDB into the PDBQT file used for docking - confirm there.
prep=Preprocessor()
prep.prepare_receptor("data/6vhn_prepared.pdb", "smina_inputs/receptor.pdbqt")
# Ligand preparation through Preprocessor is left disabled; the ligand PDBQT
# is written by prepare_ob_mols instead.
#prep.prepare_ligand("data/ligand.pdb", "smina_inputs/ligand.pdbqt", in_format="pdb")

In [None]:
# Binding box creation

In [None]:
# Reload the extracted ligand as an mdtraj trajectory. This rebinds `ligand`,
# which previously held the ligand atom indices.
ligand=mdtraj.load("data/ligand.pdb")
def create_box_from_ligand(ligand, padding=5):
    """Build a docking box around the first frame of a ligand trajectory.

    Args:
        ligand: object whose ``xyz[0]`` is an (n_atoms, 3) coordinate array
            in nanometres (an mdtraj trajectory in this notebook).
        padding: total extra length in Angstrom added to every box edge on
            top of the ligand extent. Defaults to 5, the previously
            hard-coded value.

    Returns:
        The object produced by ``Box.from_array(center, size)``, with both
        arguments expressed in Angstrom.
    """
    xyz=ligand.xyz[0]*10 # convert to Angstrom from nm
    upper = xyz.max(axis=0)
    lower = xyz.min(axis=0)
    pocket_center = (upper + lower) / 2
    pocket_size = upper - lower + padding
    return Box.from_array(pocket_center, pocket_size)

# Box that encloses the crystallographic ligand; shown as the cell output.
box=create_box_from_ligand(ligand)
box

In [None]:
from workshop_2_utils import Docking

# Docking helper bound to the prepared receptor and the ligand-derived box.
docker=Docking("smina_inputs/receptor.pdbqt", box)


In [None]:
# Dock the crystallographic ligand back into the receptor and parse the text
# returned by the docking call.
os.makedirs("outputs", exist_ok=True)
text=docker.dock_one("smina_inputs/ligand.pdbqt", "outputs/ligand_out.sdf")
docker.parse_output(text)

In [None]:
view = py3Dmol.view()
# Receptor as a cartoon. The context manager closes the file once it is read.
with open('data/6vhn_prepared.pdb', 'r') as receptor_file:
    view.addModel(receptor_file.read())
view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
# Docked poses as yellow sticks; {'model': -1} targets the model just added.
with open('outputs/ligand_out.sdf', 'r') as poses_file:
    view.addModel(poses_file.read())
view.setStyle({'model': -1}, {"stick" :  {'color': "yellow"}})
view.zoomTo()
view.show()

In [None]:
# datamol is otherwise only imported explicitly in a later cell; importing it
# here keeps this cell runnable top-to-bottom on a fresh kernel regardless of
# what the star import from workshop_2_utils happens to export.
import datamol as dm

# Load the docked poses of the crystallographic ligand into a DataFrame.
poses=dm.read_sdf("outputs/ligand_out.sdf", as_df=True, mol_column="mols", n_jobs=-1)
poses

In [None]:
# Draw the docked poses.
dm.viz.to_image(poses["mols"])

In [None]:
# Now for some real molecules

In [None]:
import datamol as dm
# Load the Enamine hinge-binder library into a DataFrame with the molecules in "mols".
df_mols = dm.read_sdf("data/Enamine_Hinge_Binders_Library_plated_24000cmds_20210316%20(1).sdf", as_df=True, mol_column="mols", n_jobs=-1)
# NOTE(review): presumably writes smina_inputs/mol_0.pdbqt, which the next
# cell docks - confirm in workshop_2_utils.
docker.parse_mol_to_pbdqt(df_mols["mols"][0])



In [None]:
# Dock the first library molecule, parse the docking output and load its poses.
text=docker.dock_one("smina_inputs/mol_0.pdbqt", "outputs/poses_0.sdf")
docker.parse_output(text)
poses=dm.read_sdf("outputs/poses_0.sdf", as_df=True, mol_column="mols", n_jobs=-1)
poses

In [None]:
# Draw the poses obtained for the first library molecule.
dm.viz.to_image(poses["mols"])

In [None]:
# Preview the first rows of the library table.
df_mols.head()

In [None]:
# One fingerprint per molecule, stored in the "fp" column used by the GP model.
df_mols["fp"] = df_mols["mols"].apply(dm.to_fp)
df_mols

In [None]:
# Fresh Docking helper for the batch run below (same receptor and box).
docker=Docking("smina_inputs/receptor.pdbqt", box)

In [None]:
# Dock the first five library molecules, labelled 0-4.
docker.dock_multiple_mols(
        df_mols["mols"].tolist()[:5], list(range(5))
)


In [None]:
# Load the batch poses and order them by minimizedAffinity, lowest first.
poses = dm.read_sdf(
    "smina_outputs/poses.sdf", as_df=True, mol_column="mols", n_jobs=-1
).sort_values("minimizedAffinity")
poses

In [None]:
# Draw the ten first poses of the sorted table.
dm.viz.to_image(poses["mols"].tolist()[:10])

In [None]:

def create_py3d_model(sdf_file):
  """Build a py3Dmol view of the receptor together with docked poses.

  Args:
      sdf_file: path of the SDF file holding the poses; hydrogens are kept.

  Returns:
      The py3Dmol view with the receptor drawn as a cartoon and each pose
      drawn as sticks. The caller is responsible for calling ``show()``.
  """
  molecules=dm.read_sdf(sdf_file, remove_hs=False)
  view = py3Dmol.view()
  # Context manager so the receptor file handle is closed after reading.
  with open('data/6vhn_prepared.pdb', 'r') as receptor_file:
    view.addModel(receptor_file.read())
  view.setStyle({'model': -1}, {"cartoon": {'color': 'spectrum'}})
  for mol in molecules:
    view.addModel(Chem.MolToMolBlock(mol,confId=0), "sdf")
    # {'model': -1} styles only the pose that was just added.
    view.setStyle({'model': -1}, {"stick" :  {}})
  view.zoomTo()
  return view

# Show the receptor together with every pose from the batch docking run.
view=create_py3d_model("smina_outputs/poses.sdf")
view.show()



In [None]:
def get_random_idxs(df, n=10, seed=42):
    """Draw ``n`` random row positions from ``df``.

    A private generator is used, so the global NumPy random state is left
    untouched. Positions are drawn without replacement so the same molecule
    is not selected twice; only when ``n`` exceeds ``len(df)`` does the draw
    fall back to sampling with replacement so that ``n`` values are still
    returned.

    Args:
        df: any sized container (only ``len(df)`` is used).
        n (int): number of positions to draw.
        seed (int): seed making the draw reproducible.

    Returns:
        numpy.ndarray: ``n`` integers in ``[0, len(df))``.
    """
    population = len(df)
    rng = np.random.default_rng(seed)
    return rng.choice(population, size=n, replace=n > population)


# init_df_fields comes from workshop_2_utils.
# NOTE(review): presumably it adds the "idxs", "sampled" and "true_affinity"
# columns that the active-learning helpers below rely on - confirm there.
df = init_df_fields(df_mols)
df.head()

In [None]:
def train_gp(df) -> GaussianProcessRegressor:
    """Fit a Gaussian-process regressor on the molecules docked so far.

    Rows with ``sampled >= 1`` form the training set: their fingerprints
    (``fp``) are the inputs and ``true_affinity`` is the target.

    Args:
        df: DataFrame with ``fp``, ``true_affinity`` and ``sampled`` columns.

    Returns:
        GaussianProcessRegressor: the fitted model (RBF kernel,
        ``random_state=0``).
    """
    from sklearn.gaussian_process.kernels import RBF

    labelled = df["sampled"] >= 1
    features = np.vstack(df["fp"][labelled].tolist())
    targets = np.vstack(df["true_affinity"][labelled].tolist())

    kernel = RBF(length_scale=2.0, length_scale_bounds=(1e-1, 20.0))
    model = GaussianProcessRegressor(kernel=kernel, random_state=0)
    return model.fit(features, targets)

def predict_with_gp(df, gp):
    """Score every molecule with a fitted Gaussian process.

    Writes the predictive mean to ``pred_affinity`` and the predictive
    standard deviation to ``uncertainty``. ``df`` is modified in place and
    also returned.

    Args:
        df: DataFrame with an ``fp`` column of fingerprint vectors.
        gp: fitted model exposing ``predict(X, return_std=True)``.

    Returns:
        The same DataFrame with the two prediction columns set.
    """
    X = np.vstack(df["fp"].tolist())
    mean, std = gp.predict(X, return_std=True)
    # The model is fitted on a column-vector target, so depending on the
    # scikit-learn version the mean may come back with shape (n, 1);
    # flatten it so it always fits a single column.
    df["pred_affinity"] = np.ravel(mean)
    df["uncertainty"] = np.ravel(std)
    return df

def samples_next(df, n: int = 10, sort_by_uncertainty = True) -> List[int]:
    """Pick the next molecules to dock.

    Candidates are ranked either by decreasing ``uncertainty`` or by
    increasing ``pred_affinity``. Rows already marked as docked
    (``sampled >= 1``, the same criterion used to build the training set)
    are excluded so the same molecule is not proposed again.

    Args:
        df: DataFrame with ``idxs`` plus the ranking column; a ``sampled``
            column is honoured when present.
        n (int): maximum number of indices to return.
        sort_by_uncertainty (bool): rank by ``uncertainty`` (highest first)
            when true, otherwise by ``pred_affinity`` (lowest first).

    Returns:
        List[int]: up to ``n`` values from the ``idxs`` column.
    """
    if sort_by_uncertainty:
        name, ascending = "uncertainty", False
    else:
        name, ascending = "pred_affinity", True

    candidates = df
    if "sampled" in df.columns:
        # Negating the comparison keeps rows whose value is missing.
        candidates = df[~(df["sampled"] >= 1)]
    return candidates.sort_values(name, ascending=ascending)["idxs"].tolist()[:n]



#gp=train_gp(df_mols_al)
#df = predict_with_gp(df, gp)
#df.head()


In [None]:
def get_results(output_dir, idxs):
    """Collect the best docking score of each docked molecule.

    For every index the file ``poses_{idx}.sdf`` inside ``output_dir`` is
    read and the lowest ``minimizedAffinity`` among its poses is kept.

    Args:
        output_dir: directory containing the per-molecule pose files.
        idxs: molecule indices identifying the pose files.

    Returns:
        list: one affinity per entry of ``idxs``, in the same order.
    """
    values=[]
    key="minimizedAffinity"
    for idx in idxs:
        poses= dm.read_sdf(os.path.join(output_dir, f"poses_{idx}.sdf"), as_df=True, mol_column="mols", n_jobs=-1, sanitize=False)
        # sort_values keeps the original index labels, so the best pose must
        # be taken by position; a label lookup of 0 would return the first
        # pose in the file whatever its score.
        values.append(poses[key].sort_values().iloc[0])
    return values

def format_df(df, affinities, sampled_idxs, iteration):
    """Record docking results for the molecules sampled in one iteration.

    ``df`` is modified in place and also returned.

    Args:
        df: DataFrame with ``true_affinity`` and ``sampled`` columns.
        affinities: docking scores, aligned with ``sampled_idxs``.
        sampled_idxs: index labels of the rows that were docked.
        iteration: value stored in ``sampled`` for those rows.

    Returns:
        The same DataFrame with the selected rows updated.
    """
    # A single .loc assignment writes into df itself. The chained form
    # df[col][rows] = ... may write to a temporary copy and is a no-op under
    # pandas Copy-on-Write.
    df.loc[sampled_idxs, "true_affinity"] = affinities
    df.loc[sampled_idxs, "sampled"] = iteration
    return df


In [None]:
from copy import deepcopy

# --- Active-learning configuration -----------------------------------------
N_OF_AL_ITERATIONS = 5
# True: pick the next batch by highest GP uncertainty; False: by lowest predicted affinity.
AL_ON_UNCERTAINTY = True
FIRST_LOOP=True # the first iteration draws its batch at random
N_OF_ORACLE_CALLS=3 # molecules docked per iteration (initial random batch included)
SEED = 42

# Docking helper used as the oracle; three poses are requested per molecule.
docker=Docking("smina_inputs/receptor.pdbqt", box, num_poses=3)

# Work on a copy so the initial table `df` is left untouched.
ultimate_df = deepcopy(df)
for iteration in range(N_OF_AL_ITERATIONS):
    if FIRST_LOOP:
        # Flip the flag so the random draw happens only once.
        FIRST_LOOP = not FIRST_LOOP
        sampled_idxs= get_random_idxs(ultimate_df, n=N_OF_ORACLE_CALLS, seed=SEED).tolist()

    # One output directory per iteration
    output_dir = f"al_loop_{iteration}"
    os.makedirs(output_dir,exist_ok=True)

    # Select the molecules by position and dock them
    # (really slow on colab)
    mols_to_dock=ultimate_df["mols"].to_numpy()[sampled_idxs]
    docker.dock_multiple_mols(mols_to_dock, sampled_idxs, output_dir)

    # Read the scores back and record them; "sampled" stores iteration + 1 so
    # that any value >= 1 marks a docked row (the criterion train_gp uses).
    affinities = get_results(output_dir, sampled_idxs)
    ultimate_df = format_df(ultimate_df, affinities, sampled_idxs, iteration + 1)

    # Refit the GP on everything docked so far, rescore the whole library and
    # choose the next batch. The batch chosen in the last iteration is never docked.
    GP = train_gp(ultimate_df)
    ultimate_df = predict_with_gp(ultimate_df, GP)
    sampled_idxs = samples_next(ultimate_df, N_OF_ORACLE_CALLS, AL_ON_UNCERTAINTY)


