In [53]:
import py3Dmol
import pandas as pd
import numpy as np
import plotly.express as px
import requests
import os
import re
import glob
import time

def view_pymol(receptor, ligand, docked, receptor_highlight=None, sticks=False):
  v = py3Dmol.view()
  v.addModel(open(receptor).read())
  if sticks:
    v.setStyle({'cartoon':{},'stick':{'radius':.1}})
  else:
    v.setStyle({'cartoon':{}})
  if receptor_highlight:
    for i in range(receptor_highlight-3, receptor_highlight+3):
      v.setStyle({'model': -1, 'serial': i}, {"cartoon": {'color': 'yellow'}, 'stick':{'radius':.3, 'color':'yellow'}})
  v.addModel(open(ligand).read())
  v.setStyle({'model':1},{'stick':{'colorscheme':'dimgrayCarbon','radius':.125}})
  v.addModelsAsFrames(open(docked).read())
  v.setStyle({'model':2},{'stick':{'colorscheme':'greenCarbon'}})
  v.zoomTo({'model':1})
  v.rotate(90)
  v.animate({'interval':5000})
  return v

from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from Bio.PDB import PDBParser, NeighborSearch

def find_ester_bond(mol):
    for bond in mol.GetBonds():
        atom1, atom2 = bond.GetBeginAtom(), bond.GetEndAtom()
        # Check if the bond is between carbon and oxygen
        if (atom1.GetAtomicNum() == 6 and atom2.GetAtomicNum() == 8) or \
           (atom2.GetAtomicNum() == 6 and atom1.GetAtomicNum() == 8):
            # Identify which atom is carbon and which is oxygen
            c_atom = atom1 if atom1.GetAtomicNum() == 6 else atom2
            o_atom = atom2 if atom1.GetAtomicNum() == 6 else atom1
            
            # Check if the carbon is connected to another oxygen (double-bonded)
            for neighbor in c_atom.GetNeighbors():
                if neighbor.GetAtomicNum() == 8 and neighbor.GetIdx() != o_atom.GetIdx():
                    return bond
    return None

def is_part_of_cyclopropane(atom):
    ring_info = atom.GetOwningMol().GetRingInfo()
    for ring in ring_info.AtomRings():
        if atom.GetIdx() in ring and len(ring) == 3:
            return True
    return False

def calculate_distances(protein_file, ligand_file, residue_number, atom_name):
    # Load the protein structure
    parser = PDBParser()
    structure = parser.get_structure("protein", protein_file)
    
    # Find the specific atom in the protein
    target_atom = None
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.id[1] == residue_number:
                    for atom in residue:
                        if atom.name == atom_name:
                            target_atom = atom
                            break
    
    if target_atom is None:
        raise ValueError(f"Atom {atom_name} not found in residue {residue_number}")
    
    # Load the ligand with all conformations
    suppl = Chem.SDMolSupplier(ligand_file, removeHs=False)
    
    distances = []
    for mol in suppl:
        if mol is None:
            distances.append(None)
            continue
        
        # Find the ester bond in the ligand
        ester_bond = find_ester_bond(mol)
        
        if ester_bond is None:
            distances.append(None)
            continue
        
        # Calculate the midpoint of the ester bond
        conf = mol.GetConformer()
        pos1 = conf.GetAtomPosition(ester_bond.GetBeginAtomIdx())
        pos2 = conf.GetAtomPosition(ester_bond.GetEndAtomIdx())
        midpoint = ((pos1.x + pos2.x) / 2, (pos1.y + pos2.y) / 2, (pos1.z + pos2.z) / 2)
        
        # Calculate distance
        distance = np.linalg.norm(target_atom.coord - midpoint)
        distances.append(distance)
    
    return distances

def load_docking_results(protein_file, ligand_file, gnina_log_file, residue_number, atom_name):
    # Calculate distances
    distances = calculate_distances(protein_file, ligand_file, residue_number, atom_name)

    # Read gnina log file
    with open(gnina_log_file, 'r') as f:
        log_content = f.read()
    
    # Extract docking results
    pattern = r"^\s*(\d+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s*$"
    docking_results = re.findall(pattern, log_content, re.MULTILINE)
    
    # Create DataFrame
    df = pd.DataFrame(docking_results, columns=['Mode', 'Affinity', 'Intramol', 'CNN_score', 'CNN_affinity'])
    df = df.astype({'Mode': int, 'Affinity': float, 'Intramol': float, 'CNN_score': float, 'CNN_affinity': float})
    
    # Add distances
    df['Distance'] = pd.Series(distances)
    
    # Reorder columns
    df = df[['Mode', 'Affinity', 'Intramol', 'CNN_score', 'CNN_affinity', 'Distance']]
    
    return df

In [171]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFMCS, AllChem
from scipy.cluster.hierarchy import linkage
import plotly.graph_objects as go
import plotly.figure_factory as ff

def load_poses(sdf_file):
    """Load poses from an SDF file and rename them."""
    poses = Chem.SDMolSupplier(sdf_file)
    renamed_poses = []
    for index, p in enumerate(poses):
        if p is not None:
            p.SetProp('_Name', str(index + 1))
            renamed_poses.append(p)
    return renamed_poses

def calculate_rmsd_matrix(poses):
    """Calculate RMSD matrix for all pose pairs."""
    import math
    size = len(poses)
    rmsd_matrix = np.zeros((size, size))
    
    for i, mol in enumerate(poses):
        for j, jmol in enumerate(poses):
            # MCS identification between reference pose and target pose
            r = rdFMCS.FindMCS([mol, jmol])
            # Atom map for reference and target
            a = mol.GetSubstructMatch(Chem.MolFromSmarts(r.smartsString))
            b = jmol.GetSubstructMatch(Chem.MolFromSmarts(r.smartsString))
            # Atom map generation
            amap = list(zip(a, b))

            # Calculate RMSD
            # distance calculation per atom pair
            distances=[]
            for atomA, atomB in amap:
                pos_A=mol.GetConformer().GetAtomPosition (atomA)
                pos_B=jmol.GetConformer().GetAtomPosition (atomB)
                coord_A=np.array((pos_A.x,pos_A.y,pos_A.z))
                coord_B=np.array ((pos_B.x,pos_B.y,pos_B.z))
                dist_numpy = np.linalg.norm(coord_A-coord_B)
                distances.append(dist_numpy)
    
            # This is the RMSD formula from wikipedia
            rmsd=math.sqrt(1/len(distances)*sum([i*i for i in distances]))
    
            #saving the rmsd values to a matrix and a table for clustering
            rmsd_matrix[i ,j]=rmsd
    
    return rmsd_matrix

def perform_clustering(rmsd_matrix):
    """Perform hierarchical clustering on the RMSD matrix."""
    return linkage(rmsd_matrix, method='complete')

def plot_dendrogram(
    dist,
    linkage_method,
    leaf_data,
    width,
    height,
    title=None,
    count_sort=True,
    distance_sort=False,
    line_width=0.5,
    line_color='black',
    marker_size=15,
    leaf_color='CNN_score',
    render_mode='svg',
    leaf_y=0,
    leaf_color_discrete_map=None,
    leaf_category_orders=None,
    template='simple_white',
):

    import plotly.express as px
    import scipy.cluster.hierarchy as sch

    # Hierarchical clustering.
    Z = sch.linkage(dist, method=linkage_method)
    # Compute the dendrogram but don't plot it.
    dend = sch.dendrogram(
        Z,
        count_sort=count_sort,
        distance_sort=distance_sort,
        no_plot=True,
    )

    # Compile the line coordinates into a single dataframe.
    icoord = dend["icoord"]
    dcoord = dend["dcoord"]
    line_segments_x = []
    line_segments_y = []
    for ik, dk in zip(icoord, dcoord):
        # Adding None here breaks up the lines.
        line_segments_x += ik + [None]
        line_segments_y += dk + [None]
    df_line_segments = pd.DataFrame({"x": line_segments_x, "y": line_segments_y})

    # Convert X coordinates to haplotype indices (scipy multiplies coordinates by 10).
    df_line_segments["x"] = (df_line_segments["x"] - 5) / 10

    # Plot the lines.
    fig = px.line(
        df_line_segments,
        x="x",
        y="y",
        render_mode=render_mode,
        template=template,
    )

    # Reorder leaf data to align with dendrogram.
    leaves = dend["leaves"]
    n_leaves = len(leaves)
    leaf_data = leaf_data.iloc[leaves]

    # Add scatter plot to draw the leaves.
    fig.add_traces(
        list(
            px.scatter(
                data_frame=leaf_data,
                x=np.arange(n_leaves),
                y=np.repeat(leaf_y, n_leaves),
                color=leaf_color,
                render_mode=render_mode,
                hover_name='Mode',
                hover_data=leaf_data.columns.to_list(),
                template=template,
                color_discrete_map=leaf_color_discrete_map,
                category_orders=leaf_category_orders,
            ).select_traces()
        )
    )

    # Style the lines and markers.
    line_props = dict(
        width=line_width,
        color=line_color,
    )
    marker_props = dict(
        size=marker_size,
    )
    fig.update_traces(line=line_props, marker=marker_props)

    # Style the figure.
    fig.update_layout(
        width=width,
        height=height,
        title=title,
        autosize=True,
        hovermode="closest",
        # I cannot get the xaxis title to appear below the plot, and when
        # it's above the plot it often overlaps the title, so hiding it
        # for now.
        xaxis_title=None,
        yaxis_title=None,
        showlegend=True,
    )

    # Style axes.
    fig.update_xaxes(
        mirror=False,
        showgrid=False,
        showline=False,
        showticklabels=False,
        ticks="",
        # range=(-2, n_leaves + 2),
    )
    fig.update_yaxes(
        mirror=False,
        showgrid=False,
        showline=False,
        showticklabels=True,
        ticks="outside",
        # range=(leaf_y - y_axis_buffer, np.max(dcoord) + y_axis_buffer),
    )

    return fig, leaf_data

def plot_heatmap(df_rmsd, leaf_data):
    """Create an interactive heatmap using Plotly."""
    pose_order = leaf_data['Mode'].to_numpy() - 1
    
    fig = go.Figure(data=go.Heatmap(
        z=df_rmsd,
        x=np.arange(len(pose_order)),
        y=np.arange(len(pose_order)),
        colorscale='Viridis',
        showscale=False
    ))

    # fig.update_layout(xaxis={'categoryarray': pose_order})

    return fig

def _concat_subplots(
        figures,
        width,
        height,
    ):
        from plotly.subplots import make_subplots  # type: ignore
        # make subplots
        fig = make_subplots(
            rows=2,
            cols=1,
            vertical_spacing=0.05,
            shared_xaxes=True
        )

        for i, figure in enumerate(figures):
            if isinstance(figure, go.Figure):
                # This is a figure, access the traces within it.
                for trace in range(len(figure["data"])):
                    fig.append_trace(figure["data"][trace], row=i+1, col=1)
            else:
                # Assume this is a trace, add directly.
                fig.append_trace(figure, row=i+1, col=1)

        fig.update_xaxes(visible=False)
        fig.update_layout(
            width=width,
            height=height,
            hovermode="closest",
            plot_bgcolor="white",
        )

        return fig

def analyse_poses(receptor_path, ligand_path, sdf_path, docking_log_path, residue_number, atom_name, width=1000, height=500):
    """Analyze poses and create visualizations."""
    # Load poses
    poses = load_poses(sdf_path)
    results_df = load_docking_results(receptor_path, sdf_path, docking_log_path, residue_number, atom_name)
    
    # Calculate RMSD matrix
    rmsd_matrix = calculate_rmsd_matrix(poses)

    from scipy.spatial.distance import squareform
    condensed_rmsd_matrix = squareform(rmsd_matrix, checks=False)
    
    # Create visualizations
    dendrogram, leaf_data = plot_dendrogram(dist=condensed_rmsd_matrix, linkage_method='complete', leaf_data=results_df, width=500, height=500)
    
    pose_order = leaf_data['Mode'].to_numpy() - 1
    df_rmsd = pd.DataFrame(rmsd_matrix).iloc[pose_order, pose_order]
    heatmap = plot_heatmap(df_rmsd, leaf_data)

    fig = _concat_subplots(figures=[dendrogram, heatmap], width=width, height=height)
    
    return fig, leaf_data

gene_id = 'AGAP006227'
ligand = 'pirimiphos-methyl'
receptor_path=f"../receptors/{gene_id}.pdb"
ligand_path = f"../ligands/{ligand}.pdbqt"
sdf_path = f"../docking/{gene_id}_{ligand}.sdf"
docking_log_path = f"../docking/{gene_id}_{ligand}.log"
residue_number= 192
atom_name="OG"

# Example usage
fig, df_docking = analyse_poses(receptor_path=receptor_path,
                    ligand_path=ligand_path, 
                    sdf_path=sdf_path, 
                    docking_log_path=docking_log_path, 
                    residue_number=residue_number,
                    atom_name=atom_name
                   )
fig.show()

In [160]:
df_docking

Unnamed: 0,Mode,Affinity,Intramol,CNN_score,CNN_affinity,Distance
6,7,17.11,16.51,0.2573,4.063,
4,5,-5.06,12.41,0.261,4.109,
5,6,-5.65,13.12,0.2596,4.109,
8,9,-0.38,14.5,0.2328,3.467,
0,1,8.08,13.11,0.4328,4.223,
2,3,-2.36,12.08,0.2709,3.559,
3,4,-5.5,12.44,0.2656,3.547,
1,2,2.01,12.58,0.3798,4.008,
7,8,-4.62,13.67,0.248,3.731,


In [123]:
# Example usage
gene_id = 'AGAP006227'
ligand = 'pirimiphos-methyl'

protein_file = f"../receptors/{gene_id}.pdb"
ligand_file = f"../docking/{gene_id}_{ligand}.sdf"
gnina_log_file = f"../docking/{gene_id}_{ligand}.log"
residue_number = 192
atom_name = "OG"

distances = calculate_distances(protein_file, ligand_file, 192, "OG")  # 192 is the residue number, "OG" is the atom name for Serine's hydroxyl oxygen

for i, distance in enumerate(distances, 1):
    print(f"Conformation {i}: Distance = {distance:.2f} Å")

# Calculate and print statistics
if distances:
    print(f"\nMinimum distance: {min(distances):.2f} Å")
    print(f"Maximum distance: {max(distances):.2f} Å")
    print(f"Average distance: {sum(distances) / len(distances):.2f} Å")
else:
    print("No valid conformations found.")

TypeError: unsupported format string passed to NoneType.__format__

In [56]:
result_df = load_docking_results(protein_file, ligand_file, gnina_log_file, residue_number, atom_name)
result_df

Unnamed: 0,Mode,Affinity,Intramol,CNN_score,CNN_affinity,Distance
0,1,-7.49,21.7,0.168,5.189,5.124323
1,2,-7.31,25.07,0.1635,5.176,5.606119
2,3,-7.49,21.92,0.1391,4.991,6.82162
3,4,-8.61,22.43,0.1356,5.62,5.820997
4,5,-8.6,21.6,0.1208,4.822,5.931378
5,6,-6.89,22.28,0.1179,5.043,5.583759
6,7,-7.47,21.76,0.1127,5.192,3.907323
7,8,-4.3,21.43,0.1118,5.278,6.325963
8,9,-6.63,21.38,0.1098,5.347,5.994756
